openpolis / opp-text-classification · Commits

Commit 05780340, authored Aug 03, 2018 by Alessandro Saccoia
Added 3 tools related to unsupervised topic extraction and LDA
Parent: ae27251b

Showing 3 changed files with 212 additions and 0 deletions:
src/opp_tc_createTermDocumentMatrix.py   +43   -0
src/opp_tc_overlapTopics.py              +62   -0
src/opp_tc_topicExtraction.py            +107  -0
src/opp_tc_createTermDocumentMatrix.py (new file, mode 100644)
#!/usr/bin/python
"""
Usage: opp_tc_createTermDocumentMatrix.py atti.json termDocumentMatrix.csv

Input:  JSON file of parliamentary acts with tags and keywords.
Output: a .csv file laid out like this:

    atto_id wc1 wc2 wc3 [...] wcN cat1 cat2 cat3 [...] catM
    123     13  34  2         13  1    0    0          1

plus one file containing the vocabulary.

Author: Alessandro Saccoia
Date: Sun, 7/31/2018
"""
import sys, os, io, json, random
from lxml import etree
from time import time
import logging
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

input_json = sys.argv[1]

# Load the acts ("atti") from the input JSON and report how long it took.
t0 = time()
with open(input_json) as f:
    data = json.load(f)
print('Loading %d atti... done in %0.3fs.' % (len(data), time() - t0))
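As committed, the script stops after loading the JSON: the term-document matrix described in the docstring is not produced yet. A minimal sketch of the word-count side of that output, continuing from the loaded data and assuming hypothetical id and testo fields on each act (the real schema of atti.json is not shown in this commit), could look like this:

# Sketch (not part of the commit): build the term-document matrix described in
# the docstring with CountVectorizer. The 'id' and 'testo' field names are
# assumptions; the real schema of atti.json is not shown here.
texts = [atto['testo'] for atto in data]
ids = [atto['id'] for atto in data]

vectorizer = CountVectorizer(max_df=0.95, min_df=2)
X = vectorizer.fit_transform(texts)  # sparse matrix: documents x terms

# One CSV row per act: atto_id followed by the word counts.
with open(sys.argv[2], 'w') as out:
    for atto_id, counts in zip(ids, X.toarray()):
        out.write('%s,%s\n' % (atto_id, ','.join(str(c) for c in counts)))

# The vocabulary file mentioned in the docstring.
# (get_feature_names() was renamed get_feature_names_out() in newer scikit-learn.)
with open(sys.argv[2] + '.vocab.json', 'w') as out:
    json.dump(vectorizer.get_feature_names(), out)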
src/opp_tc_overlapTopics.py (new file, mode 100644)
import sys, os, io
from time import time
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import json
import pprint

# All input files live in the folder passed as the first argument.
file_folder = sys.argv[1]
output_count = file_folder + '/count.npz'
output_tfidf = file_folder + '/tfidf.npz'
output_wordlist = file_folder + '/wordlist.json'
output_classes = file_folder + '/classes.npy'
output_classes_names = file_folder + '/classes.json'
output_stats = file_folder + '/stats.json'
topics_lda = file_folder + '/topic_lda_tfidf.npy'

print("Loading classes...")
t0 = time()
topics_human = np.load(output_classes)    # documents x human-assigned topics
topics_computer = np.load(topics_lda)     # documents x LDA topic distribution
with open(output_classes_names) as f:
    topic_names_human = json.load(f)

nr_documents = topics_human.shape[0]
nr_topics_human = topics_human.shape[1]
nr_topics_computer = topics_computer.shape[1]

# Normalize each document by how many human topics it carries, relative to the
# most heavily tagged document, so multi-tagged documents do not dominate.
occurrencies_per_topic_human = np.sum(topics_human, 1)
topic_normalization_factor = occurrencies_per_topic_human / np.max(occurrencies_per_topic_human)

print(topics_human.shape)
print(topics_computer.shape)

# Accumulate the overlap between every human topic and every LDA topic.
confusion_matrix = np.zeros([nr_topics_human, nr_topics_computer])
for di in range(nr_documents):
    for ix in range(nr_topics_human):
        for iy in range(nr_topics_computer):
            confusion_matrix[ix, iy] += topics_human[di, ix] * topics_computer[di, iy] / topic_normalization_factor[di]

# Rescale each row so that its strongest overlap is 100.
for ix in range(nr_topics_human):
    confusion_matrix[ix, :] *= 100.0 / np.max(confusion_matrix[ix, :])

# Plot the overlap matrix with the human topic names on the y axis.
fig, ax = plt.subplots()
im = ax.imshow(confusion_matrix)
ax.set_yticks(np.arange(len(topic_names_human)))
ax.set_yticklabels(topic_names_human)
plt.show()

print("done in %0.3fs" % (time() - t0))
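Since the triple loop above only accumulates per-document products, the same overlap matrix can be obtained with a single matrix product. A vectorized sketch, assuming the same array shapes as above (documents on the rows, topics on the columns):

# Sketch: vectorized equivalent of the triple loop above. Each document is
# scaled by its normalization factor, then the human-topic and LDA-topic
# matrices are contracted over the document axis.
weighted_human = topics_human / topic_normalization_factor[:, np.newaxis]
overlap = weighted_human.T.dot(topics_computer)  # shape: (nr_topics_human, nr_topics_computer)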
src/opp_tc_topicExtraction.py (new file, mode 100644)
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause
# Adapted from:
# http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
from __future__ import print_function
from time import time
import sys, os, io
from lxml import etree
import pprint
import json
from scipy import sparse
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

n_components = 30
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    # For each topic, print the n_top_words terms with the largest weights.
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Unlike the scikit-learn example this is adapted from, the corpus is not
# vectorized here: the tf and tf-idf matrices and the human-assigned classes
# are precomputed and loaded from the folder passed as the first argument.
file_folder = sys.argv[1]
output_count = file_folder + '/count.npz'
output_tfidf = file_folder + '/tfidf.npz'
output_wordlist = file_folder + '/wordlist.json'
output_classes = file_folder + '/classes.npy'
output_classes_names = file_folder + '/classes.json'
output_stats = file_folder + '/stats.json'

print("Loading dataset...")
t0 = time()
X_tfidf = sparse.load_npz(output_tfidf).toarray()
X_count = sparse.load_npz(output_count).toarray()
Y = np.load(output_classes)
n_samples = Y.shape[0]
n_features = X_tfidf.shape[1]

# Fit the NMF model (Frobenius norm) on the tf-idf features.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(X_tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
with open(output_wordlist) as f:
    wordlist = json.load(f)
print('Loaded %d words' % len(wordlist))
print_top_words(nmf, wordlist, n_top_words)

# Fit the NMF model (generalized Kullback-Leibler divergence) on the tf-idf features.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000,
          alpha=.1, l1_ratio=.5).fit(X_tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
print_top_words(nmf, wordlist, n_top_words)

# Fit LDA on the raw term counts (tf features).
print("Fitting LDA model with tf features, "
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(X_count)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model (tf):")
print_top_words(lda, wordlist, n_top_words)

# Save the per-document LDA topic distribution for later comparison with the
# human-assigned categories.
doc_topic_dist = lda.transform(X_count)
np.save(file_folder + '/topic_lda_tf.npy', doc_topic_dist)
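The saved array has one row per document holding its LDA topic mixture. A small sketch of inspecting it afterwards, assuming the same file_folder layout used above:

# Sketch (not part of the commit): load the saved document-topic matrix and
# report the dominant LDA topic of the first few documents.
doc_topic_dist = np.load(file_folder + '/topic_lda_tf.npy')
dominant = doc_topic_dist.argmax(axis=1)
for doc_idx in range(min(10, doc_topic_dist.shape[0])):
    print("Document %d -> topic #%d (weight %.2f)"
          % (doc_idx, dominant[doc_idx], doc_topic_dist[doc_idx, dominant[doc_idx]]))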