-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalysis.py
More file actions
123 lines (104 loc) · 4.88 KB
/
analysis.py
File metadata and controls
123 lines (104 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os
def load_and_process_data(file_path='metadata.csv'):
    """Load the CORD-19 ``metadata.csv`` file and return a cleaned DataFrame.

    Cleaning steps, in order:

    1. Drop rows missing any of ``abstract``, ``publish_time``, ``title``
       or ``journal``.
    2. Parse ``publish_time`` as datetimes; rows that fail to parse are
       dropped.
    3. Add an integer ``year`` column taken from ``publish_time``.
    4. Add ``abstract_word_count``, the number of whitespace-separated
       tokens in ``abstract``.

    Args:
        file_path: Path to the CSV file to read. Defaults to
            ``'metadata.csv'``.

    Returns:
        The cleaned DataFrame. An empty DataFrame is returned (after
        printing a message) when the file is missing, cannot be read, or
        lacks one of the required columns.
    """
    required_columns = ['abstract', 'publish_time', 'title', 'journal']

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found. Please download 'metadata.csv' from Kaggle and place it in the project directory.")
        return pd.DataFrame()  # Return empty DataFrame on error
    except Exception as e:
        # Boundary handler: any other read/parse failure is reported and
        # turned into the same "empty result" signal callers already check.
        print(f"An error occurred during data loading: {e}")
        return pd.DataFrame()

    # Without this check, dropna(subset=...) below raises KeyError for a CSV
    # that lacks a required column, bypassing the empty-DataFrame contract.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: '{file_path}' is missing required column(s): {', '.join(missing_columns)}")
        return pd.DataFrame()

    # Handle missing values
    df.dropna(subset=required_columns, inplace=True)

    # Convert 'publish_time' to datetime; unparseable values become NaT
    # and are removed so the integer cast of the year cannot fail.
    df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
    df.dropna(subset=['publish_time'], inplace=True)
    df['year'] = df['publish_time'].dt.year.astype(int)

    # Create abstract word count; str() guards against abstracts that were
    # parsed as non-string values.
    df['abstract_word_count'] = df['abstract'].apply(lambda text: len(str(text).split()))

    return df
# --- Main execution for analysis.py (for direct script run) ---
def _save_current_figure(plots_dir, filename):
    """Apply a tight layout to the active figure, save it under *plots_dir*, and close it."""
    output_path = os.path.join(plots_dir, filename)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
    print(f"Saved plot: {output_path}")


def main():
    """Load the metadata, print a basic exploration, and write four plots to ``plots/``.

    Output files: ``publications_over_time.png``, ``top_journals.png``,
    ``title_wordcloud.png`` and ``paper_sources_distribution.png``.
    Nothing is plotted when the loaded DataFrame is empty.
    """
    print("Running analysis.py directly...")
    df = load_and_process_data()

    if df.empty:
        print("DataFrame is empty. Skipping analysis and visualization.")
        return

    print("\n--- Data Loading & Basic Exploration (from function) ---")
    print(f"Successfully loaded and processed {df.shape[0]} rows, {df.shape[1]} columns.")
    print("First 5 rows:")
    print(df.head())
    print("DataFrame Info:")
    # DataFrame.info() prints its report itself and returns None, so
    # wrapping it in print() would emit a stray "None" line.
    df.info()
    print("Missing values:")
    print(df.isnull().sum())
    print("Basic Statistics:")
    print(df.describe())

    print("\n--- Starting Data Analysis & Visualization ---")

    # 1. Count number of papers by publication year
    papers_by_year = df['year'].value_counts().sort_index()
    print("\n--- Papers by Publication Year ---")
    print(papers_by_year.head())

    # 2. Identify top journals by publication count
    top_journals = df['journal'].value_counts().head(10)
    print("\n--- Top 10 Publishing Journals ---")
    print(top_journals)

    # 3. Simple word frequency analysis on titles
    # astype(str) keeps the join from raising TypeError if any title was
    # parsed as a non-string value.
    all_titles = ' '.join(df['title'].dropna().astype(str))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_titles)
    print("Performed word frequency analysis on titles.")

    # Create a directory for plots if it doesn't exist
    plots_dir = 'plots'
    os.makedirs(plots_dir, exist_ok=True)

    # Visualization 1: Line chart of publications over time
    plt.figure(figsize=(12, 6))
    sns.lineplot(x=papers_by_year.index, y=papers_by_year.values)
    plt.title('Number of Publications Over Time')
    plt.xlabel('Year')
    plt.ylabel('Number of Papers')
    plt.grid(True)
    _save_current_figure(plots_dir, 'publications_over_time.png')

    # Visualization 2: Bar chart of top publishing journals
    # NOTE(review): seaborn >= 0.13 emits a FutureWarning for `palette`
    # without `hue`; left unchanged to stay compatible with older seaborn
    # releases - confirm against the pinned version.
    plt.figure(figsize=(12, 7))
    sns.barplot(x=top_journals.values, y=top_journals.index, palette='viridis')
    plt.title('Top 10 Publishing Journals')
    plt.xlabel('Number of Papers')
    plt.ylabel('Journal')
    _save_current_figure(plots_dir, 'top_journals.png')

    # Visualization 3: Word cloud of paper titles
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Paper Titles')
    _save_current_figure(plots_dir, 'title_wordcloud.png')

    # Visualization 4: Distribution of paper counts by source
    # Fall back to 'journal' when the 'source_x' column is absent.
    source_col = 'source_x' if 'source_x' in df.columns else 'journal'
    paper_sources = df[source_col].value_counts().head(10)
    plt.figure(figsize=(12, 7))
    sns.barplot(x=paper_sources.values, y=paper_sources.index, palette='plasma')
    plt.title(f'Top 10 Paper Sources ({source_col})')
    plt.xlabel('Number of Papers')
    plt.ylabel('Source')
    _save_current_figure(plots_dir, 'paper_sources_distribution.png')


if __name__ == "__main__":
    main()