add chapter9 examples

FlyMeToTheMars · Jan 26, 2018 · a872294 · a872294
1 parent 38fba9b
commit a872294
Show file tree

Hide file tree

Showing 13 changed files with 843,543 additions and 0 deletions.
diff --git a/Deep_Learning_with_TensorFlow/1.4.0/Chapter09/1. izplfhudf.ipynb b/Deep_Learning_with_TensorFlow/1.4.0/Chapter09/1. izplfhudf.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1.  sparse_softmax_cross_entropy_with_logits样例。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.32656264  0.46436879]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Assume a vocabulary of size 3 and a corpus made of the two words \"2 0\".\n",
+    "word_labels = tf.constant([2, 0])\n",
+    "\n",
+    "# One row of logits per word: [2.0, -1.0, 3.0] for the first word and\n",
+    "# [1.0, 0.0, -0.5] for the second.\n",
+    "predict_logits = tf.constant([\n",
+    "    [2.0, -1.0, 3.0],\n",
+    "    [1.0, 0.0, -0.5],\n",
+    "])\n",
+    "\n",
+    "# sparse_softmax_cross_entropy_with_logits takes the targets as class ids.\n",
+    "loss = tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
+    "    logits=predict_logits, labels=word_labels)\n",
+    "\n",
+    "# Evaluating loss prints [0.32656264 0.46436879]: one cross-entropy value\n",
+    "# (log-perplexity term) per predicted word.\n",
+    "# The session is deliberately left open: the following cell reuses it and\n",
+    "# closes it.\n",
+    "sess = tf.Session()\n",
+    "print(sess.run(loss))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.  softmax_cross_entropy_with_logits样例。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.32656264  0.46436879]\n",
+      "[ 0.37656265  0.48936883]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# softmax_cross_entropy_with_logits is the dense counterpart of the function\n",
+    "# used above: the targets are given as probability distributions instead of\n",
+    "# class ids.\n",
+    "word_prob_distribution = tf.constant([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]])\n",
+    "\n",
+    "# Label smoothing: give the correct class a probability slightly below 1.0 and\n",
+    "# every wrong class a probability slightly above 0.0. This can keep the model\n",
+    "# from over-fitting the data and sometimes improves training.\n",
+    "word_prob_smooth = tf.constant([[0.01, 0.01, 0.98], [0.98, 0.01, 0.01]])\n",
+    "\n",
+    "# Prints [0.32656264 0.46436879] for the one-hot targets (same as the sparse\n",
+    "# version) and then [0.37656265 0.48936883] for the smoothed targets.\n",
+    "for target_distribution in (word_prob_distribution, word_prob_smooth):\n",
+    "    loss = tf.nn.softmax_cross_entropy_with_logits(\n",
+    "        labels=target_distribution, logits=predict_logits)\n",
+    "    print(sess.run(loss))\n",
+    "\n",
+    "sess.close()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/Deep_Learning_with_TensorFlow/1.4.0/Chapter09/2. rwuasmk--fondc.ipynb b/Deep_Learning_with_TensorFlow/1.4.0/Chapter09/2. rwuasmk--fondc.ipynb
@@ -0,0 +1,138 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import codecs\n",
+    "import collections\n",
+    "from operator import itemgetter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1. 设置参数。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODE = \"PTB\"    # 将MODE设置为\"PTB\", \"TRANSLATE_EN\", \"TRANSLATE_ZH\"之一。\n",
+    "\n",
+    "if MODE == \"PTB\":             # PTB数据处理\n",
+    "    RAW_DATA = \"../../datasets/PTB_data/ptb.train.txt\"  # 训练集数据文件\n",
+    "    VOCAB_OUTPUT = \"ptb.vocab\"                         # 输出的词汇表文件\n",
+    "elif MODE == \"TRANSLATE_ZH\":  # 翻译语料的中文部分\n",
+    "    RAW_DATA = \"../../datasets/TED_data/train.txt.zh\"\n",
+    "    VOCAB_OUTPUT = \"zh.vocab\"\n",
+    "    VOCAB_SIZE = 4000\n",
+    "elif MODE == \"TRANSLATE_EN\":  # 翻译语料的英文部分\n",
+    "    RAW_DATA = \"../../datasets/TED_data/train.txt.en\"\n",
+    "    VOCAB_OUTPUT = \"en.vocab\"\n",
+    "    VOCAB_SIZE = 10000"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.对单词按词频排序。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count how often each whitespace-separated token occurs in the raw corpus.\n",
+    "counter = collections.Counter()\n",
+    "with codecs.open(RAW_DATA, \"r\", \"utf-8\") as f:\n",
+    "    for line in f:\n",
+    "        counter.update(line.strip().split())\n",
+    "\n",
+    "# Order the (word, count) pairs from most to least frequent. The sort is\n",
+    "# stable, so equally frequent words keep the counter's iteration order.\n",
+    "sorted_word_to_cnt = sorted(\n",
+    "    counter.items(), key=lambda word_and_count: word_and_count[1], reverse=True)\n",
+    "sorted_words = [word for word, count in sorted_word_to_cnt]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3.插入特殊符号。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pick the special tokens that must occupy the first vocabulary slots.\n",
+    "if MODE == \"PTB\":\n",
+    "    # The end-of-sentence marker \"<eos>\" is appended at every line break later\n",
+    "    # on, so its vocabulary slot is reserved now.\n",
+    "    special_tokens = [\"<eos>\"]\n",
+    "elif MODE in [\"TRANSLATE_EN\", \"TRANSLATE_ZH\"]:\n",
+    "    # The machine-translation data of section 9.3.2 additionally needs \"<unk>\"\n",
+    "    # and the start-of-sentence marker \"<sos>\".\n",
+    "    special_tokens = [\"<unk>\", \"<sos>\", \"<eos>\"]\n",
+    "else:\n",
+    "    special_tokens = []\n",
+    "\n",
+    "# Remove any copies of the special tokens that are already in the list before\n",
+    "# prepending them. This makes the cell safe to re-run (no repeated prefix) and\n",
+    "# prevents duplicate vocabulary lines when the corpus itself contains one of\n",
+    "# the tokens.\n",
+    "sorted_words = special_tokens + [\n",
+    "    word for word in sorted_words if word not in special_tokens]\n",
+    "\n",
+    "if MODE in [\"TRANSLATE_EN\", \"TRANSLATE_ZH\"]:\n",
+    "    # Drop low-frequency words: keep at most VOCAB_SIZE entries. Slicing is a\n",
+    "    # no-op when the list is already short enough.\n",
+    "    sorted_words = sorted_words[:VOCAB_SIZE]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 4.保存词汇表文件。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the vocabulary to VOCAB_OUTPUT, one word per line, in list order.\n",
+    "with codecs.open(VOCAB_OUTPUT, 'w', 'utf-8') as file_output:\n",
+    "    file_output.writelines(word + \"\\n\" for word in sorted_words)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Deep_Learning_with_TensorFlow/1.4.0/Chapter09/3. rwuasmk--fohdrm.ipynb b/Deep_Learning_with_TensorFlow/1.4.0/Chapter09/3. rwuasmk--fohdrm.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import codecs\n",
+    "import sys"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1. 参数设置。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODE = \"PTB_TRAIN\"    # 将MODE设置为\"PTB_TRAIN\", \"PTB_VALID\", \"PTB_TEST\", \"TRANSLATE_EN\", \"TRANSLATE_ZH\"之一。\n",
+    "\n",
+    "if MODE == \"PTB_TRAIN\":        # PTB训练数据\n",
+    "    RAW_DATA = \"../../datasets/PTB_data/ptb.train.txt\"  # 训练集数据文件\n",
+    "    VOCAB = \"ptb.vocab\"                                 # 词汇表文件\n",
+    "    OUTPUT_DATA = \"ptb.train\"                           # 将单词替换为单词编号后的输出文件\n",
+    "elif MODE == \"PTB_VALID\":      # PTB验证数据\n",
+    "    RAW_DATA = \"../../datasets/PTB_data/ptb.valid.txt\"\n",
+    "    VOCAB = \"ptb.vocab\"\n",
+    "    OUTPUT_DATA = \"ptb.valid\"\n",
+    "elif MODE == \"PTB_TEST\":       # PTB测试数据\n",
+    "    RAW_DATA = \"../../datasets/PTB_data/ptb.test.txt\"\n",
+    "    VOCAB = \"ptb.vocab\"\n",
+    "    OUTPUT_DATA = \"ptb.test\"\n",
+    "elif MODE == \"TRANSLATE_ZH\":   # 中文翻译数据\n",
+    "    RAW_DATA = \"../../datasets/TED_data/train.txt.zh\"\n",
+    "    VOCAB = \"zh.vocab\"\n",
+    "    OUTPUT_DATA = \"train.zh\"\n",
+    "elif MODE == \"TRANSLATE_EN\":   # 英文翻译数据\n",
+    "    RAW_DATA = \"../../datasets/TED_data/train.txt.en\"\n",
+    "    VOCAB = \"en.vocab\"\n",
+    "    OUTPUT_DATA = \"train.en\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 2.按词汇表将单词映射到编号。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the vocabulary (one word per line) and map every word to the index of\n",
+    "# its line.\n",
+    "with codecs.open(VOCAB, \"r\", \"utf-8\") as f_vocab:\n",
+    "    vocab = [entry.strip() for entry in f_vocab.readlines()]\n",
+    "word_to_id = {word: index for index, word in enumerate(vocab)}\n",
+    "\n",
+    "\n",
+    "def get_id(word):\n",
+    "    \"\"\"Return the vocabulary id of word.\n",
+    "\n",
+    "    Words missing from the vocabulary (low-frequency words) are mapped to the\n",
+    "    id of \"<unk>\". A KeyError is raised if the vocabulary has no \"<unk>\" entry.\n",
+    "    \"\"\"\n",
+    "    if word in word_to_id:\n",
+    "        return word_to_id[word]\n",
+    "    return word_to_id[\"<unk>\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3.对数据进行替换并保存结果。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert RAW_DATA line by line and write the word ids to OUTPUT_DATA.\n",
+    "# Both files are opened in with-blocks so they are closed even when a line\n",
+    "# fails to convert (for example when get_id raises KeyError).\n",
+    "with codecs.open(RAW_DATA, \"r\", \"utf-8\") as fin:\n",
+    "    with codecs.open(OUTPUT_DATA, 'w', 'utf-8') as fout:\n",
+    "        for line in fin:\n",
+    "            # Split the line into words and append the \"<eos>\" end marker.\n",
+    "            words = line.strip().split() + [\"<eos>\"]\n",
+    "            # Replace every word with its vocabulary id, separated by spaces.\n",
+    "            out_line = ' '.join([str(get_id(w)) for w in words]) + '\\n'\n",
+    "            fout.write(out_line)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}