repofinder/main_analysis.py at main · UC-OSPO-Network/repofinder · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from repofinder.analysis.language_distribution import plot_language_distribution
from repofinder.analysis.license_distribution import plot_license_distribution
from repofinder.analysis.feature_counts import plot_feature_counts
from repofinder.analysis.plot_utils import build_shared_color_map, filter_data, db_to_df
import matplotlib.pyplot as plt
import os
import string
import matplotlib

# Module-level side effect: set the matplotlib font family used by the figures
# this module draws.
matplotlib.rcParams.update({'font.family': 'Lato'})

# Default per-university ``threshold`` values handed to ``filter_data``.
_DEFAULT_THRESHOLDS = {"UCSB": 0.3, "UCSC": 0.6, "UCSD": 0.3}

# Columns whose non-null counts are compared in the feature-count figure.
_FEATURE_COLUMNS = [
    'description', 'readme', 'license', 'code_of_conduct_file',
    'contributing', 'security_policy', 'issue_templates', 'pull_request_template',
]


def _plot_distribution_panels(all_data_dict, thresholds, plot_func, column,
                              out_path, **savefig_kwargs):
    """Draw one ``plot_func`` panel per university with a shared color map and save it."""
    n_panels = len(all_data_dict)
    # squeeze=False keeps ``axes`` two-dimensional even when there is one panel,
    # so ``axes[0, i]`` is valid for any number of universities.
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6), squeeze=False)
    color_map = build_shared_color_map(all_data_dict, column=column)

    for i, (acronym, df) in enumerate(all_data_dict.items()):
        filtered = filter_data(df, threshold=thresholds[acronym])
        plot_func(
            filtered,
            acronym,
            ax=axes[0, i],
            color_map=color_map,
            title_prefix=f"({string.ascii_lowercase[i]}) ",
        )

    plt.tight_layout()
    plt.savefig(out_path, dpi=300, **savefig_kwargs)
    plt.show()
    plt.close(fig)


def plot_analysis(all_data_dict, thresholds=None, output_dir='plots/combined'):
    """
    Generate and save grouped bar plots for language usage, license types,
    and repository feature presence across multiple universities.

    Parameters
    ----------
    all_data_dict : dict of {str: pd.DataFrame}
        A dictionary mapping university acronyms (e.g., "UCSB", "UCSC", "UCSD") to their
        corresponding repository metadata DataFrames. One panel is drawn per entry,
        in the dictionary's iteration order.
    thresholds : dict of {str: float}, optional
        Per-university ``threshold`` values forwarded to `filter_data`. Entries
        override or extend the built-in defaults (UCSB 0.3, UCSC 0.6, UCSD 0.3).
    output_dir : str, optional
        Directory the figures are written to; created if it does not exist.
        Defaults to ``'plots/combined'``.

    Raises
    ------
    ValueError
        If `all_data_dict` is empty.
    KeyError
        If a university in `all_data_dict` has no configured threshold.

    Notes
    -----
    - Uses `filter_data`, `plot_language_distribution`, `plot_license_distribution`,
      `plot_feature_counts`, and `build_shared_color_map` as helper functions.
    - Panels are labelled "(a)", "(b)", ... via `string.ascii_lowercase`, so more
      than 26 universities are not supported.
    - Each figure is also displayed with ``plt.show()`` before being closed.
    - Output files (inside `output_dir`):
        - `language_distribution_grouped.png`
        - `license_distribution_grouped.png`
        - `feature_counts.png`
    """
    # Validate everything before touching the file system or creating figures.
    if not all_data_dict:
        raise ValueError("all_data_dict must contain at least one university")

    effective_thresholds = {**_DEFAULT_THRESHOLDS, **(thresholds or {})}
    missing = [acronym for acronym in all_data_dict if acronym not in effective_thresholds]
    if missing:
        raise KeyError(
            "No filter threshold configured for: " + ", ".join(map(str, missing))
        )

    os.makedirs(output_dir, exist_ok=True)

    # LANGUAGE DISTRIBUTION
    _plot_distribution_panels(
        all_data_dict,
        effective_thresholds,
        plot_language_distribution,
        'language',
        os.path.join(output_dir, "language_distribution_grouped.png"),
    )

    # LICENSE DISTRIBUTION (this figure is additionally saved with a tight bounding box)
    _plot_distribution_panels(
        all_data_dict,
        effective_thresholds,
        plot_license_distribution,
        'license',
        os.path.join(output_dir, "license_distribution_grouped.png"),
        bbox_inches='tight',
    )

    # FEATURE COUNT
    # Filter once; the same frames feed both the y-limit scan and the plotting pass.
    filtered_frames = {
        acronym: filter_data(df, threshold=effective_thresholds[acronym])
        for acronym, df in all_data_dict.items()
    }

    # Step 1: Determine maximum y-limit across all universities
    max_count = 0
    for frame in filtered_frames.values():
        max_count = max(max_count, frame[_FEATURE_COLUMNS].notna().sum().max())

    ylim = max_count + int(max_count * 0.09)  # Add 9% headroom for percentage labels

    # Step 2: Plot with shared scale and controlled y-axis labels
    n_panels = len(filtered_frames)
    fig, axes = plt.subplots(
        1, n_panels, figsize=(6 * n_panels, 8), constrained_layout=True, squeeze=False
    )
    # The order and colors returned for one panel are passed on to the next call.
    order, feature_colors = None, None
    for i, (acronym, frame) in enumerate(filtered_frames.items()):
        order, feature_colors = plot_feature_counts(
            frame,
            acronym,
            ax=axes[0, i],
            # No trailing space here, unlike the distribution panels (kept as-is).
            title_prefix=f"({string.ascii_lowercase[i]})",
            order=order,
            feature_colors=feature_colors,
            ylim=ylim,
            hide_ylabel=(i != 0),  # only the left-most panel keeps its y-label
        )
    plt.savefig(os.path.join(output_dir, "feature_counts.png"), dpi=300, bbox_inches='tight')
    plt.show()
    plt.close(fig)


def get_all_data(acronyms=('UCSB', 'UCSC', 'UCSD')):
    """
    Load the repository database of each requested university.

    Parameters
    ----------
    acronyms : iterable of str, optional
        University acronyms to load. Each one is read from
        ``Data/db/repository_data_<acronym>_database.db`` (relative to the
        current working directory). Defaults to UCSB, UCSC and UCSD.

    Returns
    -------
    dict
        Maps each acronym to the object returned by `db_to_df` for that
        database (presumably a DataFrame — confirm against `db_to_df`).
        An empty iterable yields an empty dict.
    """
    # The default is an immutable tuple so it cannot be altered between calls.
    return {
        acronym: db_to_df(f"Data/db/repository_data_{acronym}_database.db", acronym)
        for acronym in acronyms
    }


if __name__ == "__main__":
    # Script entry point: load the default universities' data, then draw every plot.
    plot_analysis(get_all_data())