Commit 05780340 authored by Alessandro Saccoia

Added 3 tools related to unsupervised topic extraction and LDA

parent ae27251b
#!/usr/bin/python
"""
Usage: opp_tc_createTermDocumentMatrix.py atti.json termDocumentMatrix.csv
Input: JSON file of parliamentary acts acts with tags and keywords
Output: .csv file made in this way:
atto_id wc1 wc2 wc3 [...] wcN cat1 cat2 cat3 [...] catM
123 13 34 2 13 1 0 0 1
And one file containing the vocabulary
Author: Alessandro Saccoia
Date: Sun, 7/31/2018
"""
import sys, os, io, json, random
from lxml import etree
from time import time
import logging
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
input_json = sys.argv[1]
print('Loading atti...')
t0 = time()
with open(input_json) as f:
    data = json.load(f)
print('done in %0.3fs, %d atti loaded.' % (time() - t0, len(data)))
import sys
from time import time
import numpy as np
import matplotlib.pyplot as plt
import json
file_folder = sys.argv[1]
output_count = file_folder + '/count.npz'
output_tfidf = file_folder + '/tfidf.npz'
output_wordlist = file_folder + '/wordlist.json'
output_classes = file_folder + '/classes.npy'
output_classes_names = file_folder + '/classes.json'
output_stats = file_folder + '/stats.json'
topics_lda = file_folder + '/topic_lda_tfidf.npy'
print("Loading classes...")
t0 = time()
topics_human = np.load(output_classes)
topics_computer = np.load(topics_lda)
input_json = sys.argv[1]
with open(output_classes_names) as f:
topic_names_human = json.load(f)
nr_documents = topics_human.shape[0]
nr_topics_human = topics_human.shape[1]
nr_topics_computer = topics_computer.shape[1]
occurrencies_per_topic_human = np.sum(topics_human,1)
topic_normalization_factor = occurrencies_per_topic_human / np.max(occurrencies_per_topic_human)
print (topics_human.shape)
print (topics_computer.shape)
confusion_matrix = np.zeros([nr_topics_human, nr_topics_computer])
for di in range(nr_documents):
    for ix in range(nr_topics_human):
        for iy in range(nr_topics_computer):
            confusion_matrix[ix, iy] += topics_human[di, ix] * topics_computer[di, iy] / document_normalization_factor[di]
# Scale each human-topic row to a 0-100 range for display.
for ix in range(nr_topics_human):
    confusion_matrix[ix, :] *= 100.0 / np.max(confusion_matrix[ix, :])
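# Sketch: the triple loop that fills confusion_matrix is equivalent to a
# single matrix product (assuming NumPy >= 1.10 for the @ operator):
#   confusion_matrix = (topics_human / document_normalization_factor[:, None]).T @ topics_computer
# followed by the same per-row scaling to 0-100 as above.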
fig, ax = plt.subplots()
im = ax.imshow(confusion_matrix)
ax.set_yticks(np.arange(len(topic_names_human)))
ax.set_yticklabels(topic_names_human)
print("done in %0.3fs" % (time() - t0))
plt.show()
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# Lars Buitinck
# Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause
# http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
from __future__ import print_function
from time import time
import sys
import json
from scipy import sparse
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation
n_components = 30
n_top_words = 20
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        print()
# Load the precomputed term-document matrices (tf counts and tf-idf weights)
# for the parliamentary acts, together with the human-assigned class labels.
file_folder = sys.argv[1]
output_count = file_folder + '/count.npz'
output_tfidf = file_folder + '/tfidf.npz'
output_wordlist = file_folder + '/wordlist.json'
output_classes = file_folder + '/classes.npy'
output_classes_names = file_folder + '/classes.json'
output_stats = file_folder + '/stats.json'
print("Loading dataset...")
t0 = time()
X_tfidf = sparse.load_npz(output_tfidf).toarray()
X_count = sparse.load_npz(output_count).toarray()
Y = np.load(output_classes)
n_samples = Y.shape[0]
n_features = X_tfidf.shape[1]
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
"n_samples=%d and n_features=%d..."
% (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
alpha=.1, l1_ratio=.5).fit(X_tfidf)
print("done in %0.3fs." % (time() - t0))
print("\nTopics in NMF model (Frobenius norm):")
with open(output_wordlist) as f:
    wordlist = json.load(f)
print('Loaded %d words' % len(wordlist))
print_top_words(nmf, wordlist, n_top_words)
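# NMF factorizes X ~= W @ H: rows of H (nmf.components_) are topic-word
# weights, and W gives document-topic weights. As a sketch, those
# per-document weights could be recovered for the fitted model with:
#   W_frobenius = nmf.transform(X_tfidf)  # shape (n_samples, n_components)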
# Fit the NMF model with the generalized Kullback-Leibler divergence loss
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(X_tfidf)
print("done in %0.3fs." % (time() - t0))
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
print_top_words(nmf, wordlist, n_top_words)
print("Fitting LDA model with tf features, "
"n_samples=%d and n_features=%d..."
% (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
learning_method='online',
learning_offset=50.,
random_state=0)
t0 = time()
lda.fit(X_count)
print("done in %0.3fs." % (time() - t0))
print("\nTopics in LDA model (tf):")
print_top_words(lda, wordlist, n_top_words)
doc_topic_dist = lda.transform(X_count)
# Note: the comparison tool above loads '<folder>/topic_lda_tfidf.npy'; this
# file is saved under a different name.
np.save(file_folder + '/topic_lda_tf.npy', doc_topic_dist)
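# Minimal sanity check on the saved distributions (illustrative; assumes
# lda.transform returns row-normalized document-topic distributions, as
# scikit-learn's online LDA does):
assert np.allclose(doc_topic_dist.sum(axis=1), 1.0)
dominant_topic = doc_topic_dist.argmax(axis=1)  # dominant LDA topic per act
print("Example: document 0 is dominated by topic #%d" % dominant_topic[0])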