Skip to content

Commit

Permalink
add chapter9 examples
Browse files Browse the repository at this point in the history
  • Loading branch information
perhapszzy authored and perhapszzy committed Jan 26, 2018
1 parent 38fba9b commit a872294
Show file tree
Hide file tree
Showing 13 changed files with 843,543 additions and 0 deletions.
126 changes: 126 additions & 0 deletions Deep_Learning_with_TensorFlow/1.4.0/Chapter09/1. izplfhudf.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. sparse_softmax_cross_entropy_with_logits样例。"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.32656264 0.46436879]\n"
]
}
],
"source": [
"# 假设词汇表的大小为3, 语料包含两个单词\"2 0\"\n",
"word_labels = tf.constant([2, 0])\n",
"\n",
"# 假设模型对两个单词预测时,产生的logit分别是[2.0, -1.0, 3.0]和[1.0, 0.0, -0.5]\n",
"predict_logits = tf.constant([[2.0, -1.0, 3.0], [1.0, 0.0, -0.5]])\n",
"\n",
"# 使用sparse_softmax_cross_entropy_with_logits计算交叉熵。\n",
"loss = tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
" labels=word_labels, logits=predict_logits)\n",
"\n",
"# 运行程序,计算loss的结果是[0.32656264, 0.46436879], 这对应两个预测的\n",
"# perplexity损失。\n",
"sess = tf.Session()\n",
"print(sess.run(loss))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2. softmax_cross_entropy_with_logits样例。"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.32656264 0.46436879]\n",
"[ 0.37656265 0.48936883]\n"
]
}
],
"source": [
"# softmax_cross_entropy_with_logits与上面的函数相似,但是需要将预测目标以\n",
"# 概率分布的形式给出。\n",
"word_prob_distribution = tf.constant([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]])\n",
"loss = tf.nn.softmax_cross_entropy_with_logits(\n",
" labels=word_prob_distribution, logits=predict_logits)\n",
"# 运行结果与上面相同:[ 0.32656264, 0.46436879]\n",
"print(sess.run(loss))\n",
"\n",
"# label smoothing:将正确数据的概率设为一个比1.0略小的值,将错误数据的概率\n",
"# 设为比0.0略大的值,这样可以避免模型与数据过拟合,在某些时候可以提高训练效果。\n",
"word_prob_smooth = tf.constant([[0.01, 0.01, 0.98], [0.98, 0.01, 0.01]])\n",
"loss = tf.nn.softmax_cross_entropy_with_logits(\n",
" labels=word_prob_smooth, logits=predict_logits)\n",
"# 运行结果:[ 0.37656265, 0.48936883]\n",
"print(sess.run(loss))\n",
"\n",
"sess.close()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
138 changes: 138 additions & 0 deletions Deep_Learning_with_TensorFlow/1.4.0/Chapter09/2. rwuasmk--fondc.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import codecs\n",
"import collections\n",
"from operator import itemgetter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. 设置参数。"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"MODE = \"PTB\" # 将MODE设置为\"PTB\", \"TRANSLATE_EN\", \"TRANSLATE_ZH\"之一。\n",
"\n",
"if MODE == \"PTB\": # PTB数据处理\n",
" RAW_DATA = \"../../datasets/PTB_data/ptb.train.txt\" # 训练集数据文件\n",
" VOCAB_OUTPUT = \"ptb.vocab\" # 输出的词汇表文件\n",
"elif MODE == \"TRANSLATE_ZH\": # 翻译语料的中文部分\n",
" RAW_DATA = \"../../datasets/TED_data/train.txt.zh\"\n",
" VOCAB_OUTPUT = \"zh.vocab\"\n",
" VOCAB_SIZE = 4000\n",
"elif MODE == \"TRANSLATE_EN\": # 翻译语料的英文部分\n",
" RAW_DATA = \"../../datasets/TED_data/train.txt.en\"\n",
" VOCAB_OUTPUT = \"en.vocab\"\n",
" VOCAB_SIZE = 10000"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.对单词按词频排序。"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"counter = collections.Counter()\n",
"with codecs.open(RAW_DATA, \"r\", \"utf-8\") as f:\n",
" for line in f:\n",
" for word in line.strip().split():\n",
" counter[word] += 1\n",
"\n",
"# 按词频顺序对单词进行排序。\n",
"sorted_word_to_cnt = sorted(\n",
" counter.items(), key=itemgetter(1), reverse=True)\n",
"sorted_words = [x[0] for x in sorted_word_to_cnt]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.插入特殊符号。"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"if MODE == \"PTB\":\n",
" # 稍后我们需要在文本换行处加入句子结束符\"<eos>\",这里预先将其加入词汇表。\n",
" sorted_words = [\"<eos>\"] + sorted_words\n",
"elif MODE in [\"TRANSLATE_EN\", \"TRANSLATE_ZH\"]:\n",
" # 在9.3.2小节处理机器翻译数据时,除了\"<eos>\"以外,还需要将\"<unk>\"和句子起始符\n",
" # \"<sos>\"加入词汇表,并从词汇表中删除低频词汇。\n",
" sorted_words = [\"<unk>\", \"<sos>\", \"<eos>\"] + sorted_words\n",
" if len(sorted_words) > VOCAB_SIZE:\n",
" sorted_words = sorted_words[:VOCAB_SIZE]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4.保存词汇表文件。"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"with codecs.open(VOCAB_OUTPUT, 'w', 'utf-8') as file_output:\n",
" for word in sorted_words:\n",
" file_output.write(word + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
126 changes: 126 additions & 0 deletions Deep_Learning_with_TensorFlow/1.4.0/Chapter09/3. rwuasmk--fohdrm.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import codecs\n",
"import sys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1. 参数设置。"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"MODE = \"PTB_TRAIN\" # 将MODE设置为\"PTB_TRAIN\", \"PTB_VALID\", \"PTB_TEST\", \"TRANSLATE_EN\", \"TRANSLATE_ZH\"之一。\n",
"\n",
"if MODE == \"PTB_TRAIN\": # PTB训练数据\n",
" RAW_DATA = \"../../datasets/PTB_data/ptb.train.txt\" # 训练集数据文件\n",
" VOCAB = \"ptb.vocab\" # 词汇表文件\n",
" OUTPUT_DATA = \"ptb.train\" # 将单词替换为单词编号后的输出文件\n",
"elif MODE == \"PTB_VALID\": # PTB验证数据\n",
" RAW_DATA = \"../../datasets/PTB_data/ptb.valid.txt\"\n",
" VOCAB = \"ptb.vocab\"\n",
" OUTPUT_DATA = \"ptb.valid\"\n",
"elif MODE == \"PTB_TEST\": # PTB测试数据\n",
" RAW_DATA = \"../../datasets/PTB_data/ptb.test.txt\"\n",
" VOCAB = \"ptb.vocab\"\n",
" OUTPUT_DATA = \"ptb.test\"\n",
"elif MODE == \"TRANSLATE_ZH\": # 中文翻译数据\n",
" RAW_DATA = \"../../datasets/TED_data/train.txt.zh\"\n",
" VOCAB = \"zh.vocab\"\n",
" OUTPUT_DATA = \"train.zh\"\n",
"elif MODE == \"TRANSLATE_EN\": # 英文翻译数据\n",
" RAW_DATA = \"../../datasets/TED_data/train.txt.en\"\n",
" VOCAB = \"en.vocab\"\n",
" OUTPUT_DATA = \"train.en\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.按词汇表对将单词映射到编号。"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# 读取词汇表,并建立词汇到单词编号的映射。\n",
"with codecs.open(VOCAB, \"r\", \"utf-8\") as f_vocab:\n",
" vocab = [w.strip() for w in f_vocab.readlines()]\n",
"word_to_id = {k: v for (k, v) in zip(vocab, range(len(vocab)))}\n",
"\n",
"# 如果出现了不在词汇表内的低频词,则替换为\"unk\"\n",
"def get_id(word):\n",
" return word_to_id[word] if word in word_to_id else word_to_id[\"<unk>\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.对数据进行替换并保存结果。"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"fin = codecs.open(RAW_DATA, \"r\", \"utf-8\")\n",
"fout = codecs.open(OUTPUT_DATA, 'w', 'utf-8')\n",
"for line in fin:\n",
" words = line.strip().split() + [\"<eos>\"] # 读取单词并添加<eos>结束符\n",
" # 将每个单词替换为词汇表中的编号\n",
" out_line = ' '.join([str(get_id(w)) for w in words]) + '\\n'\n",
" fout.write(out_line)\n",
"fin.close()\n",
"fout.close()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit a872294

Please sign in to comment.