
Commit

Merge branch 'release-3.5.0'
menshikh-iv committed Jul 6, 2018
2 parents 885430d + 6ecd261 commit 006a6ee
Showing 153 changed files with 24,965 additions and 14,517 deletions.
202 changes: 202 additions & 0 deletions CHANGELOG.md

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions docs/notebooks/Corpora_and_Vector_Spaces.ipynb
@@ -279,9 +279,10 @@
 },
 "outputs": [],
 "source": [
+"from smart_open import smart_open\n",
 "class MyCorpus(object):\n",
 " def __iter__(self):\n",
-" for line in open('datasets/mycorpus.txt'):\n",
+" for line in smart_open('datasets/mycorpus.txt', 'rb'):\n",
 " # assume there's one document per line, tokens separated by whitespace\n",
 " yield dictionary.doc2bow(line.lower().split())"
 ]
@@ -374,9 +375,10 @@
 ],
 "source": [
 "from six import iteritems\n",
+"from smart_open import smart_open\n",
 "\n",
 "# collect statistics about all tokens\n",
-"dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n",
+"dictionary = corpora.Dictionary(line.lower().split() for line in smart_open('datasets/mycorpus.txt', 'rb'))\n",
 "\n",
 "# remove stop words and words that appear only once\n",
 "stop_ids = [dictionary.token2id[stopword] for stopword in stoplist \n",
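A note on the pattern these notebook edits standardise on: the streamed-corpus cell now reads through smart_open, using the function-style import (from smart_open import smart_open) that matched the library at the time of this release; later smart_open versions expose the same behaviour as smart_open.open(). Below is a minimal, self-contained sketch of that pattern, not the notebook cell itself. The corpus path follows the notebook, while the explicit UTF-8 decode is an added assumption since 'rb' mode yields bytes.

    from gensim import corpora
    from smart_open import smart_open  # function-style API, as used throughout this commit

    CORPUS_PATH = 'datasets/mycorpus.txt'  # same illustrative path as the notebook

    # Build the dictionary by streaming the corpus one line at a time;
    # smart_open in 'rb' mode yields bytes, so decode before tokenising.
    dictionary = corpora.Dictionary(
        line.decode('utf-8').lower().split() for line in smart_open(CORPUS_PATH, 'rb')
    )

    class MyCorpus(object):
        """Yield one bag-of-words vector per document without loading the file into RAM."""
        def __iter__(self):
            for line in smart_open(CORPUS_PATH, 'rb'):
                # one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.decode('utf-8').lower().split())

    corpus_memory_friendly = MyCorpus()  # can be iterated over as many times as needed
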
16 changes: 8 additions & 8 deletions docs/notebooks/Poincare Evaluation.ipynb
@@ -697,7 +697,7 @@
 " parts = first_line.rstrip().split(\"\\t\")\n",
 " model_size = len(parts) - 1\n",
 " vocab_size = len(lines)\n",
-" with open(output_file, 'w') as f:\n",
+" with smart_open(output_file, 'w') as f:\n",
 " f.write('%d %d\\n' % (vocab_size, model_size))\n",
 " for line in lines:\n",
 " f.write(line.replace('\\t', ' '))\n",
@@ -709,7 +709,7 @@
 " \n",
 " model_size = random_embedding.shape[0]\n",
 " vocab_size = len(np_embeddings)\n",
-" with open(output_file, 'w') as f:\n",
+" with smart_open(output_file, 'w') as f:\n",
 " f.write('%d %d\\n' % (vocab_size, model_size))\n",
 " for key, vector in np_embeddings.items():\n",
 " vector_string = ' '.join('%.6f' % value for value in vector)\n",
@@ -1113,7 +1113,7 @@
 " test_line_candidates = []\n",
 " line_count = 0\n",
 " all_nodes = set()\n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for i, line in enumerate(f):\n",
 " node_1, node_2 = line.split()\n",
 " all_nodes.update([node_1, node_2])\n",
@@ -1135,9 +1135,9 @@
 " train_line_indices = set(l for l in range(line_count) if l not in test_line_indices)\n",
 " \n",
 " train_set_nodes = set()\n",
-" with open(data_file, 'rb') as f:\n",
-" train_file = open(train_filename, 'wb')\n",
-" test_file = open(test_filename, 'wb')\n",
+" with smart_open(data_file, 'rb') as f:\n",
+" train_file = smart_open(train_filename, 'wb')\n",
+" test_file = smart_open(test_filename, 'wb')\n",
 " for i, line in enumerate(f):\n",
 " if i in train_line_indices:\n",
 " train_set_nodes.update(line.split())\n",
@@ -1169,13 +1169,13 @@
 " \"\"\"\n",
 " root_candidates = set()\n",
 " leaf_candidates = set()\n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for line in f:\n",
 " nodes = line.split()\n",
 " root_candidates.update(nodes)\n",
 " leaf_candidates.update(nodes)\n",
 " \n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for line in f:\n",
 " node_1, node_2 = line.split()\n",
 " if node_1 == node_2:\n",
9 changes: 5 additions & 4 deletions docs/notebooks/Tensorboard_visualizations.ipynb
@@ -624,6 +624,7 @@
 "import pandas as pd\n",
 "import smart_open\n",
 "import random\n",
+"from smart_open import smart_open\n",
 "\n",
 "# read data\n",
 "dataframe = pd.read_csv('movie_plots.csv')\n",
@@ -803,7 +804,7 @@
 },
 "outputs": [],
 "source": [
-"with open('movie_plot_metadata.tsv','w') as w:\n",
+"with smart_open('movie_plot_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for i,j in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (i,j))"
@@ -1024,14 +1025,14 @@
 "outputs": [],
 "source": [
 "# create file for tensors\n",
-"with open('doc_lda_tensor.tsv','w') as w:\n",
+"with smart_open('doc_lda_tensor.tsv','w') as w:\n",
 " for doc_topics in all_topics:\n",
 " for topics in doc_topics:\n",
 " w.write(str(topics[1])+ \"\\t\")\n",
 " w.write(\"\\n\")\n",
 " \n",
 "# create file for metadata\n",
-"with open('doc_lda_metadata.tsv','w') as w:\n",
+"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for j, k in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (j, k))"
@@ -1084,7 +1085,7 @@
 "\n",
 "# overwrite metadata file\n",
 "i=0\n",
-"with open('doc_lda_metadata.tsv','w') as w:\n",
+"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for j,k in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (''.join((str(j), str(tensors[i]))),k))\n",
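The TensorBoard notebook routes its projector files through smart_open in the same way. As a sketch of the metadata-TSV pattern in the cells above, the following uses a toy dataframe in place of the movie_plots.csv data; the column names follow the notebook, the rows are invented for illustration.

    import pandas as pd
    from smart_open import smart_open

    # toy stand-in for the dataframe read from movie_plots.csv in the notebook
    dataframe = pd.DataFrame({'Titles': ['Film A', 'Film B'],
                              'Genres': ['drama', 'comedy']})

    # metadata file for the TensorBoard embedding projector: a header row,
    # then one tab-separated (title, genre) row per document
    with smart_open('movie_plot_metadata.tsv', 'w') as w:
        w.write('Titles\tGenres\n')
        for title, genre in zip(dataframe.Titles, dataframe.Genres):
            w.write("%s\t%s\n" % (title, genre))
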
3 changes: 2 additions & 1 deletion docs/notebooks/WMD_tutorial.ipynb
@@ -302,6 +302,7 @@
 "start = time()\n",
 "\n",
 "import json\n",
+"from smart_open import smart_open\n",
 "\n",
 "# Business IDs of the restaurants.\n",
 "ids = ['4bEjOyTaDG24SY5TxsaUNQ', '2e2e7WgqU1BnpxmQL5jbfw', 'zt1TpTuJ6y9n551sw9TaEg',\n",
@@ -310,7 +311,7 @@
 "w2v_corpus = [] # Documents to train word2vec on (all 6 restaurants).\n",
 "wmd_corpus = [] # Documents to run queries against (only one restaurant).\n",
 "documents = [] # wmd_corpus, with no pre-processing (so we can see the original documents).\n",
-"with open('/data/yelp_academic_dataset_review.json') as data_file:\n",
+"with smart_open('/data/yelp_academic_dataset_review.json', 'rb') as data_file:\n",
 " for line in data_file:\n",
 " json_line = json.loads(line)\n",
 " \n",
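One behavioural detail of the change above: smart_open is opened in 'rb' mode, so each line of the Yelp dump arrives as bytes rather than str. A hedged sketch of the JSON-lines loop follows, with the dataset path taken from the notebook; the explicit decode and the 'text' field (the review body in the Yelp academic dataset) are assumptions on my part.

    import json
    from smart_open import smart_open

    REVIEWS_PATH = '/data/yelp_academic_dataset_review.json'  # path as given in the notebook

    w2v_corpus = []  # raw review texts to train word2vec on
    with smart_open(REVIEWS_PATH, 'rb') as data_file:
        for line in data_file:
            # each line of the dump is one JSON object; decode the bytes first
            review = json.loads(line.decode('utf-8'))
            w2v_corpus.append(review['text'])
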
3 changes: 2 additions & 1 deletion docs/notebooks/Word2Vec_FastText_Comparison.ipynb
@@ -57,11 +57,12 @@
 ],
 "source": [
 "import nltk\n",
+"from smart_open import smart_open\n",
 "nltk.download('brown') \n",
 "# Only the brown corpus is needed in case you don't have it.\n",
 "\n",
 "# Generate brown corpus text file\n",
-"with open('brown_corp.txt', 'w+') as f:\n",
+"with smart_open('brown_corp.txt', 'w+') as f:\n",
 " for word in nltk.corpus.brown.words():\n",
 " f.write('{word} '.format(word=word))\n",
 "\n",
8 changes: 5 additions & 3 deletions docs/notebooks/Wordrank_comparisons.ipynb
@@ -38,20 +38,21 @@
 ],
 "source": [
 "import nltk\n",
+"from smart_open import smart_open\n",
 "from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces\n",
 "\n",
 "# Only the brown corpus is needed in case you don't have it.\n",
 "nltk.download('brown') \n",
 "\n",
 "# Generate brown corpus text file\n",
-"with open('brown_corp.txt', 'w+') as f:\n",
+"with smart_open('brown_corp.txt', 'w+') as f:\n",
 " for word in nltk.corpus.brown.words():\n",
 " f.write('{word} '.format(word=word))\n",
 " f.seek(0)\n",
 " brown = f.read()\n",
 "\n",
 "# Preprocess brown corpus\n",
-"with open('proc_brown_corp.txt', 'w') as f:\n",
+"with smart_open('proc_brown_corp.txt', 'w') as f:\n",
 " proc_brown = strip_punctuation(brown)\n",
 " proc_brown = strip_multiple_whitespaces(proc_brown).lower()\n",
 " f.write(proc_brown)\n",
@@ -1004,12 +1005,13 @@
 "import copy\n",
 "import multiprocessing\n",
 "import numpy as np\n",
+"from smart_open import smart_open\n",
 "\n",
 "\n",
 "def compute_accuracies(model, freq):\n",
 " # mean_freq will contain analogies together with the mean frequency of 4 words involved\n",
 " mean_freq = {}\n",
-" with open(word_analogies_file, 'r') as r:\n",
+" with smart_open(word_analogies_file, 'r') as r:\n",
 " for i, line in enumerate(r):\n",
 " if ':' not in line:\n",
 " analogy = tuple(line.split())\n",
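The motivation for the blanket open-to-smart_open substitution across these notebooks is that smart_open keeps the plain file-object interface while transparently handling compression and remote storage, so the same cell works whether a corpus sits on disk, inside a .gz archive, or in an object store. A small illustrative sketch follows; all three URIs are hypothetical, and the S3 case assumes credentials are already configured.

    from smart_open import smart_open

    # the same call covers plain local files, compressed files and remote objects;
    # compression and transport are inferred from the URI
    for uri in ('brown_corp.txt',                  # plain local file
                'brown_corp.txt.gz',               # gzip, decompressed on the fly
                's3://my-bucket/brown_corp.txt'):  # remote object store
        with smart_open(uri, 'rb') as fin:
            first_line = fin.readline()
            print(uri, first_line[:40])
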
Diffs for the remaining changed files are not rendered here.
