forked from caicloud/tensorflow-tutorial
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
perhapszzy
authored and
perhapszzy
committed
Jan 26, 2018
1 parent
38fba9b
commit a872294
Showing
13 changed files
with
843,543 additions
and
0 deletions.
There are no files selected for viewing
126 changes: 126 additions & 0 deletions
126
Deep_Learning_with_TensorFlow/1.4.0/Chapter09/1. izplfhudf.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import tensorflow as tf" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 1. sparse_softmax_cross_entropy_with_logits样例。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[ 0.32656264 0.46436879]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# 假设词汇表的大小为3, 语料包含两个单词\"2 0\"\n", | ||
"word_labels = tf.constant([2, 0])\n", | ||
"\n", | ||
"# 假设模型对两个单词预测时,产生的logit分别是[2.0, -1.0, 3.0]和[1.0, 0.0, -0.5]\n", | ||
"predict_logits = tf.constant([[2.0, -1.0, 3.0], [1.0, 0.0, -0.5]])\n", | ||
"\n", | ||
"# 使用sparse_softmax_cross_entropy_with_logits计算交叉熵。\n", | ||
"loss = tf.nn.sparse_softmax_cross_entropy_with_logits(\n", | ||
" labels=word_labels, logits=predict_logits)\n", | ||
"\n", | ||
"# 运行程序,计算loss的结果是[0.32656264, 0.46436879], 这对应两个预测的\n", | ||
"# log perplexity损失(交叉熵是perplexity的对数)。\n", | ||
"sess = tf.Session()\n", | ||
"print(sess.run(loss))\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 2. softmax_cross_entropy_with_logits样例。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[ 0.32656264 0.46436879]\n", | ||
"[ 0.37656265 0.48936883]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# softmax_cross_entropy_with_logits与上面的函数相似,但是需要将预测目标以\n", | ||
"# 概率分布的形式给出。\n", | ||
"word_prob_distribution = tf.constant([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]])\n", | ||
"loss = tf.nn.softmax_cross_entropy_with_logits(\n", | ||
" labels=word_prob_distribution, logits=predict_logits)\n", | ||
"# 运行结果与上面相同:[ 0.32656264, 0.46436879]\n", | ||
"print(sess.run(loss))\n", | ||
"\n", | ||
"# label smoothing:将正确数据的概率设为一个比1.0略小的值,将错误数据的概率\n", | ||
"# 设为比0.0略大的值,这样可以避免模型对训练数据过拟合,在某些时候可以提高训练效果。\n", | ||
"word_prob_smooth = tf.constant([[0.01, 0.01, 0.98], [0.98, 0.01, 0.01]])\n", | ||
"loss = tf.nn.softmax_cross_entropy_with_logits(\n", | ||
" labels=word_prob_smooth, logits=predict_logits)\n", | ||
"# 运行结果:[ 0.37656265, 0.48936883]\n", | ||
"print(sess.run(loss))\n", | ||
"\n", | ||
"sess.close()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 1 | ||
} |
138 changes: 138 additions & 0 deletions
138
Deep_Learning_with_TensorFlow/1.4.0/Chapter09/2. rwuasmk--fondc.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import codecs\n", | ||
"import collections\n", | ||
"from operator import itemgetter" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 1. 设置参数。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"MODE = \"PTB\" # 将MODE设置为\"PTB\", \"TRANSLATE_EN\", \"TRANSLATE_ZH\"之一。\n", | ||
"\n", | ||
"if MODE == \"PTB\": # PTB数据处理\n", | ||
" RAW_DATA = \"../../datasets/PTB_data/ptb.train.txt\" # 训练集数据文件\n", | ||
" VOCAB_OUTPUT = \"ptb.vocab\" # 输出的词汇表文件\n", | ||
"elif MODE == \"TRANSLATE_ZH\": # 翻译语料的中文部分\n", | ||
" RAW_DATA = \"../../datasets/TED_data/train.txt.zh\"\n", | ||
" VOCAB_OUTPUT = \"zh.vocab\"\n", | ||
" VOCAB_SIZE = 4000\n", | ||
"elif MODE == \"TRANSLATE_EN\": # 翻译语料的英文部分\n", | ||
" RAW_DATA = \"../../datasets/TED_data/train.txt.en\"\n", | ||
" VOCAB_OUTPUT = \"en.vocab\"\n", | ||
" VOCAB_SIZE = 10000" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 2.对单词按词频排序。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"counter = collections.Counter()\n", | ||
"with codecs.open(RAW_DATA, \"r\", \"utf-8\") as f:\n", | ||
" for line in f:\n", | ||
" for word in line.strip().split():\n", | ||
" counter[word] += 1\n", | ||
"\n", | ||
"# 按词频顺序对单词进行排序。\n", | ||
"sorted_word_to_cnt = sorted(\n", | ||
" counter.items(), key=itemgetter(1), reverse=True)\n", | ||
"sorted_words = [x[0] for x in sorted_word_to_cnt]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 3.插入特殊符号。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"if MODE == \"PTB\":\n", | ||
" # 稍后我们需要在文本换行处加入句子结束符\"<eos>\",这里预先将其加入词汇表。\n", | ||
" sorted_words = [\"<eos>\"] + sorted_words\n", | ||
"elif MODE in [\"TRANSLATE_EN\", \"TRANSLATE_ZH\"]:\n", | ||
" # 在9.3.2小节处理机器翻译数据时,除了\"<eos>\"以外,还需要将\"<unk>\"和句子起始符\n", | ||
" # \"<sos>\"加入词汇表,并从词汇表中删除低频词汇。\n", | ||
" sorted_words = [\"<unk>\", \"<sos>\", \"<eos>\"] + sorted_words\n", | ||
" if len(sorted_words) > VOCAB_SIZE:\n", | ||
" sorted_words = sorted_words[:VOCAB_SIZE]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 4.保存词汇表文件。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"with codecs.open(VOCAB_OUTPUT, 'w', 'utf-8') as file_output:\n", | ||
" for word in sorted_words:\n", | ||
" file_output.write(word + \"\\n\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
126 changes: 126 additions & 0 deletions
126
Deep_Learning_with_TensorFlow/1.4.0/Chapter09/3. rwuasmk--fohdrm.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import codecs\n", | ||
"import sys" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 1. 参数设置。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"MODE = \"PTB_TRAIN\" # 将MODE设置为\"PTB_TRAIN\", \"PTB_VALID\", \"PTB_TEST\", \"TRANSLATE_EN\", \"TRANSLATE_ZH\"之一。\n", | ||
"\n", | ||
"if MODE == \"PTB_TRAIN\": # PTB训练数据\n", | ||
" RAW_DATA = \"../../datasets/PTB_data/ptb.train.txt\" # 训练集数据文件\n", | ||
" VOCAB = \"ptb.vocab\" # 词汇表文件\n", | ||
" OUTPUT_DATA = \"ptb.train\" # 将单词替换为单词编号后的输出文件\n", | ||
"elif MODE == \"PTB_VALID\": # PTB验证数据\n", | ||
" RAW_DATA = \"../../datasets/PTB_data/ptb.valid.txt\"\n", | ||
" VOCAB = \"ptb.vocab\"\n", | ||
" OUTPUT_DATA = \"ptb.valid\"\n", | ||
"elif MODE == \"PTB_TEST\": # PTB测试数据\n", | ||
" RAW_DATA = \"../../datasets/PTB_data/ptb.test.txt\"\n", | ||
" VOCAB = \"ptb.vocab\"\n", | ||
" OUTPUT_DATA = \"ptb.test\"\n", | ||
"elif MODE == \"TRANSLATE_ZH\": # 中文翻译数据\n", | ||
" RAW_DATA = \"../../datasets/TED_data/train.txt.zh\"\n", | ||
" VOCAB = \"zh.vocab\"\n", | ||
" OUTPUT_DATA = \"train.zh\"\n", | ||
"elif MODE == \"TRANSLATE_EN\": # 英文翻译数据\n", | ||
" RAW_DATA = \"../../datasets/TED_data/train.txt.en\"\n", | ||
" VOCAB = \"en.vocab\"\n", | ||
" OUTPUT_DATA = \"train.en\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 2.按词汇表将单词映射到编号。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# 读取词汇表,并建立词汇到单词编号的映射。\n", | ||
"with codecs.open(VOCAB, \"r\", \"utf-8\") as f_vocab:\n", | ||
" vocab = [w.strip() for w in f_vocab.readlines()]\n", | ||
"word_to_id = {k: v for (k, v) in zip(vocab, range(len(vocab)))}\n", | ||
"\n", | ||
"# 如果出现了不在词汇表内的低频词,则替换为\"<unk>\"。\n", | ||
"def get_id(word):\n", | ||
" return word_to_id[word] if word in word_to_id else word_to_id[\"<unk>\"]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### 3.对数据进行替换并保存结果。" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"fin = codecs.open(RAW_DATA, \"r\", \"utf-8\")\n", | ||
"fout = codecs.open(OUTPUT_DATA, 'w', 'utf-8')\n", | ||
"for line in fin:\n", | ||
" words = line.strip().split() + [\"<eos>\"] # 读取单词并添加<eos>结束符\n", | ||
" # 将每个单词替换为词汇表中的编号\n", | ||
" out_line = ' '.join([str(get_id(w)) for w in words]) + '\\n'\n", | ||
" fout.write(out_line)\n", | ||
"fin.close()\n", | ||
"fout.close()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.