Skip to content

Commit

Permalink
Added mallet train-topics command.
Browse files Browse the repository at this point in the history
  • Loading branch information
ontoligent committed Mar 26, 2020
1 parent 079d184 commit 072d111
Showing 1 changed file with 41 additions and 6 deletions.
47 changes: 41 additions & 6 deletions mazo.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,45 @@
# Build the MALLET import-file command: read corpus_file, write the imported
# corpus to mallet_file, keep the token sequence and remove stopwords.
cmd = "{} import-file --input {} --output {} --keep-sequence true --remove-stopwords true"\
.format(mallet_bin, corpus_file, mallet_file)
# NOTE(review): the exit status returned by os.system() is discarded, so the
# "Done" message below is printed even if the import command failed.
os.system(cmd)

# if not os.path.isfile():
# try:
# os.mkdir(tables_dir)
# except FileExistsError:
# pass
print("Done creating MALLET file.")

# Make sure the per-keyword output directory exists.
# os.makedirs creates ./output and ./output/<keyword> in a single call, and
# exist_ok=True turns an already-existing directory into a no-op. The earlier
# os.path.isfile() guards tested for a regular file, so they never detected an
# existing directory and relied on swallowing FileExistsError instead.
# If the target path exists but is not a directory, an OSError is raised here
# rather than failing later when MALLET tries to write its output files.
os.makedirs('./output/{}'.format(keyword), exist_ok=True)

# Build the MALLET train-topics command line.
# Eventually provide ways to override these defaults.
# Each key is a train-topics option name (without the leading "--") and each
# value is rendered after it, so keys must be exact option names.
params = {
    'num-topics': n_topics,
    'num-top-words': 10,
    'num-iterations': 1000,
    'optimize-interval': 100,
    'num-threads': 4,
    'num-top-docs': 5,
    'doc-topics-max': 10,
    'show-topics-interval': 100,
    # Was 'input corpus', which rendered as "--input corpus <file>" and is
    # not a valid option; the option name is just "input".
    'input': mallet_file,
    'output-topic-keys': 'output/{}/topic-keys.txt'.format(keyword),
    'output-doc-topics': 'output/{}/doc-topics.txt'.format(keyword),
    'word-topic-counts-file': 'output/{}/word-topic-counts.txt'.format(keyword),
    'topic-word-weights-file': 'output/{}/topic-word-weights.txt'.format(keyword),
    'xml-topic-report': 'output/{}/topic-report.xml'.format(keyword),
    'xml-topic-phrase-report': 'output/{}/topic-phrase-report.xml'.format(keyword),
    'diagnostics-file': 'output/{}/diagnostics.xml'.format(keyword),
    'output-state': 'output/{}/output-state.gz'.format(keyword),
}
cmds = ["--{} {}".format(k, v) for k, v in params.items()]
# A space must separate the subcommand from the first option; previously the
# pieces were concatenated directly, producing "train-topics--num-topics ...".
train_cmd = "{} train-topics {}".format(mallet_bin, ' '.join(cmds))
# NOTE(review): in this section the command is only printed, not executed.
print(train_cmd)

0 comments on commit 072d111

Please sign in to comment.