Download the datasets when needed

ogrisel committed on Nov 23, 2015 · 4584219
1 parent 2a57295
commit 4584219
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 20 deletions.
diff --git a/fetch_data.py b/fetch_data.py
@@ -130,7 +130,8 @@ def check_covertype(datasets_folder):
 if __name__ == "__main__":
     import sys
     datasets_folder = get_datasets_folder()
-    check_twenty_newsgroups(datasets_folder)
+    if 'twenty_newsgroups' in sys.argv:
+        check_twenty_newsgroups(datasets_folder)
     if 'sentiment140' in sys.argv:
         check_sentiment140(datasets_folder)
     if 'covertype' in sys.argv:

diff --git a/notebooks/00 - Tutorial Setup .ipynb b/notebooks/00 - Tutorial Setup .ipynb
@@ -1,21 +1,21 @@
 {
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.9"
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
   },
   "name": ""
  },
@@ -149,7 +149,8 @@
      "cell_type": "code",
      "collapsed": false,
      "input": [
-      "%run ../fetch_data.py"
+      "%run ../fetch_data.py\n",
+      "# %run ../fetch_data.py twenty_newsgroups sentiment140 covertype"
      ],
      "language": "python",
      "metadata": {},

diff --git a/notebooks/07 - Text Feature Extraction for Classification and Clustering.ipynb b/notebooks/07 - Text Feature Extraction for Classification and Clustering.ipynb
@@ -1,21 +1,21 @@
 {
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.9"
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
   },
   "name": ""
  },
@@ -46,7 +46,7 @@
      "cell_type": "code",
      "collapsed": false,
      "input": [
-      "%run ../fetch_data.py"
+      "%run ../fetch_data.py twenty_newsgroups"
      ],
      "language": "python",
      "metadata": {},

diff --git a/notebooks/08 - Large Scale Text Classification for Sentiment Analysis.ipynb b/notebooks/08 - Large Scale Text Classification for Sentiment Analysis.ipynb
@@ -1,21 +1,21 @@
 {
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.9"
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
   },
   "name": ""
  },
@@ -193,8 +193,23 @@
      "cell_type": "markdown",
      "metadata": {},
      "source": [
-      "To illustrate the scalability issues of the vocabulary-based vectorizers, let's load a more realistic dataset for a classical text classification task: sentiment analysis on tweets. The goal is to tell apart negative from positive tweets on a variety of topics.\n",
-      "\n",
+      "To illustrate the scalability issues of the vocabulary-based vectorizers, let's load a more realistic dataset for a classical text classification task: sentiment analysis on tweets. The goal is to tell apart negative from positive tweets on a variety of topics."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%run ../fetch_data.py sentiment140"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
       "Assuming that the `../fetch_data.py` script was run successfully the following files should be available:"
      ]
     },