-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathmain.py
More file actions
executable file
·68 lines (51 loc) · 1.89 KB
/
main.py
File metadata and controls
executable file
·68 lines (51 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/python
__author__ = 'Ashwin'
__email__ = 'gashwin1@umbc.edu'
"""
CMSC 691 - Final Project
Visualizing Topics using LDA
AUTHOR: Ashwinkumar Ganesan.
Kiante Branley
Latent Dirichlet Allocation is a topic modelling method that calculates
the joint probabilities of words across documents and generates the
two distributions:
1. The distribution of topics for each document.
2. The distribution of words for each topic.
This project tries to visualize the topics, the documents and the clusters
that are generated.
"""
from processdata.fileops import FileReader
from processdata.lda import LDAVisualModel
from processdata.fileops import write_rank_to_file
num_of_topics = 20
num_of_passes = 50
num_of_words = 10
# Location of the data.
data_loc = '20_newsgroups/alt.temp/9976'
data_dir_loc = '20_newsgroups/alt.temp/'
data_sects = 'server/data/info_vis_abstracts.csv'
title_list = 'server/data/key_vis_title.csv'
final_data_file = 'server/data/data.csv'
if __name__ == "__main__":
# Read the directory
reader = FileReader()
#reader.read_file(data_loc)
#reader.read_dir(data_dir_loc)
reader.read_text_sections(data_sects)
# Get the token list
word_corpus = reader.get_token_list()
# Perform LDA.
lda = LDAVisualModel(word_corpus)
lda.create_word_corpus(word_corpus)
# Train the LDA model for specific number of topics
# and iterations.
lda.train_lda(num_of_topics, num_of_passes)
# Generate the document to topic matrix.
doc_top = lda.generate_doc_topic()
doc_top_rank = lda.generate_doc_topic_rank()
# Get the topic corpus.
topics = lda.get_lda_corpus(num_of_topics, num_of_words)
# Isolate top words for documents.
doc_to_word = lda.gen_doc_top_words(topics, doc_top)
# Print the topic information to a file.
write_rank_to_file(doc_to_word, doc_top_rank, num_of_words, num_of_topics, title_list, final_data_file)