# Frameworks_Assignment/analysis.py at master · mokwathedeveloper/Frameworks_Assignment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os

def load_and_process_data(file_path='metadata.csv'):
    """Load, clean, and enrich the CORD-19 ``metadata.csv`` file.

    Rows missing any of ``abstract``, ``publish_time``, ``title`` or
    ``journal`` are dropped, ``publish_time`` is parsed to datetimes (rows
    that fail to parse are dropped as well), and two derived columns are
    added: ``year`` (integer publication year) and ``abstract_word_count``
    (number of whitespace-separated tokens in the abstract).

    Args:
        file_path: Path to the metadata CSV. Defaults to ``'metadata.csv'``
            in the current working directory.

    Returns:
        The processed ``pandas.DataFrame``. An empty DataFrame is returned
        (after printing a diagnostic) when the file is missing, cannot be
        read, or lacks one of the required columns.
    """
    required_columns = ['abstract', 'publish_time', 'title', 'journal']

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found. Please download 'metadata.csv' from Kaggle and place it in the project directory.")
        return pd.DataFrame()  # Return empty DataFrame on error
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported and turned
        # into the same "empty DataFrame" signal callers already check for.
        print(f"An error occurred during data loading: {e}")
        return pd.DataFrame()

    # Without this check a CSV with an unexpected schema would escape as a
    # KeyError from dropna(subset=...) instead of the documented empty result.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: '{file_path}' is missing required column(s): {', '.join(missing_columns)}")
        return pd.DataFrame()

    # Handle missing values
    df.dropna(subset=required_columns, inplace=True)

    # Convert 'publish_time' to datetime and extract year.
    # NOTE(review): recent pandas versions appear to infer a single format
    # from the first value, so files mixing e.g. "2020" and "2020-03-15" may
    # lose rows to NaT here -- confirm against the pandas version in use.
    df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
    df.dropna(subset=['publish_time'], inplace=True)  # Drop rows where conversion failed
    df['year'] = df['publish_time'].dt.year.astype(int)

    # Create abstract word count (str() guards against non-string cells)
    df['abstract_word_count'] = df['abstract'].apply(lambda text: len(str(text).split()))

    return df


# --- Main execution for analysis.py (for direct script run) ---
def _save_current_figure(plots_dir, filename):
    """Lay out the active matplotlib figure, save it under *plots_dir*, and close it."""
    output_path = os.path.join(plots_dir, filename)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
    print(f"Saved plot: {output_path}")


def _save_top_counts_chart(counts, title, ylabel, palette, plots_dir, filename):
    """Draw a horizontal bar chart from a value-counts Series and save it."""
    plt.figure(figsize=(12, 7))
    sns.barplot(x=counts.values, y=counts.index, palette=palette)
    plt.title(title)
    plt.xlabel('Number of Papers')
    plt.ylabel(ylabel)
    _save_current_figure(plots_dir, filename)


def main():
    """Run the full analysis: load the data, print summaries, and save four plots.

    Plots are written as PNG files into a ``plots`` directory (created if
    needed) relative to the current working directory. If the processed
    DataFrame is empty, a message is printed and nothing else happens.
    """
    print("Running analysis.py directly...")
    df = load_and_process_data()

    if df.empty:
        print("DataFrame is empty. Skipping analysis and visualization.")
        return

    n_rows, n_cols = df.shape
    print("\n--- Data Loading & Basic Exploration (from function) ---")
    print(f"Successfully loaded and processed {n_rows} rows, {n_cols} columns.")
    print("First 5 rows:")
    print(df.head())
    print("DataFrame Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    df.info()
    print("Missing values:")
    print(df.isnull().sum())
    print("Basic Statistics:")
    print(df.describe())

    print("\n--- Starting Data Analysis & Visualization ---")

    # 1. Count number of papers by publication year
    papers_by_year = df['year'].value_counts().sort_index()
    print("\n--- Papers by Publication Year ---")
    print(papers_by_year.head())

    # 2. Identify top journals by publication count
    top_journals = df['journal'].value_counts().head(10)
    print("\n--- Top 10 Publishing Journals ---")
    print(top_journals)

    # 3. Simple word frequency analysis on titles.
    # astype(str) keeps str.join from failing if any title was parsed as a
    # non-string value (e.g. a purely numeric title).
    all_titles = ' '.join(df['title'].dropna().astype(str))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_titles)
    print("Performed word frequency analysis on titles.")

    # Create a directory for plots if it doesn't exist; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    plots_dir = 'plots'
    os.makedirs(plots_dir, exist_ok=True)

    # Visualization 1: Line chart of publications over time
    plt.figure(figsize=(12, 6))
    sns.lineplot(x=papers_by_year.index, y=papers_by_year.values)
    plt.title('Number of Publications Over Time')
    plt.xlabel('Year')
    plt.ylabel('Number of Papers')
    plt.grid(True)
    _save_current_figure(plots_dir, 'publications_over_time.png')

    # Visualization 2: Bar chart of top publishing journals
    _save_top_counts_chart(
        top_journals,
        title='Top 10 Publishing Journals',
        ylabel='Journal',
        palette='viridis',
        plots_dir=plots_dir,
        filename='top_journals.png',
    )

    # Visualization 3: Word cloud of paper titles
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Paper Titles')
    _save_current_figure(plots_dir, 'title_wordcloud.png')

    # Visualization 4: Distribution of paper counts by source.
    # Falls back to the journal column when 'source_x' is not present.
    source_col = 'source_x' if 'source_x' in df.columns else 'journal'
    paper_sources = df[source_col].value_counts().head(10)
    _save_top_counts_chart(
        paper_sources,
        title=f'Top 10 Paper Sources ({source_col})',
        ylabel='Source',
        palette='plasma',
        plots_dir=plots_dir,
        filename='paper_sources_distribution.png',
    )


if __name__ == "__main__":
    main()