Forked from wiki2vec and revised
- sbt v1.1
- java 8
- brought README in line with current code
- made input and output artefact names slightly more consistent across scripts
- workaround for issue with wikipedia-parser bug in title splitting on ":"
cdagraca committed Mar 25, 2019
1 parent 2240ccb commit 6360f50
Showing 16 changed files with 96 additions and 57 deletions.
20 changes: 20 additions & 0 deletions Dockerfile
@@ -0,0 +1,20 @@
FROM ubuntu:trusty

COPY /resources/config/requirements* /tmp/

RUN apt-get update && \
apt-get -qy install python3-pip build-essential

RUN pip3 install --upgrade pip
RUN pip3 install --ignore-installed -r /tmp/requirements1

RUN apt-get -qy install libblas3 liblapack3 liblapack-dev libblas-dev gfortran zlib1g-dev

RUN pip3 install --ignore-installed -r /tmp/requirements2

COPY /target/scala-2.11/wiki2feck-assembly-1.0.jar /wiki2feck/wiki2feck.jar
RUN mkdir -p /wiki2feck/scripts
COPY /resources/scripts /wiki2feck/scripts
RUN chmod a+x /wiki2feck/scripts/*/*.sh

RUN ldconfig
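A hypothetical way to build and sanity-check the resulting image (the tag name is illustrative, not defined by the repository):

```
docker build -t wiki2feck .
docker run --rm -it wiki2feck ls /wiki2feck
```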
13 changes: 6 additions & 7 deletions README.md
@@ -1,4 +1,4 @@
# Wiki2Vec
# Wiki2Feck

Utilities for creating Word2Vec vectors for DBpedia entities via a Wikipedia dump.

@@ -44,9 +44,9 @@ You can download via torrent one of the prebuilt word2vec models:

- Once you get `language.corpus` go to `resources/gensim` and do:

`wiki2vec.sh pathToCorpus pathToOutputFile <MIN_WORD_COUNT> <VECTOR_SIZE> <WINDOW_SIZE>`
`wiki2feck.sh pathToCorpus pathToOutputFile <MIN_WORD_COUNT> <VECTOR_SIZE> <WINDOW_SIZE>`

this will install all requiered dependencies for Gensim and build word2vec vectors.
This will install all required dependencies for Gensim and build word2vec vectors.

i.e:

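For instance, a hypothetical invocation (the paths and hyperparameter values are purely illustrative):

`wiki2feck.sh data/enwiki.corpus data/enwiki.model 50 300 10`

where 50 is the minimum word count, 300 the vector size, and 10 the window size.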
@@ -96,8 +96,7 @@ params:
- path to output readable wikipedia
i.e:

`java -Xmx10G -Xms10G -cp org.idio.wikipedia.dumps.ReadableWiki wiki2vec-assembly-1.0.jar path-to-wiki-dump/eswiki-20150105-pages-articles-multistream.xml.bz2 pathTo/output/ReadableWikipedia`

`java -Xmx10G -Xms10G -cp wiki2feck-assembly-1.0.jar org.idio.wikipedia.dumps.CreateReadableWiki path-to-wiki-dump/eswiki-20150105-pages-articles-multistream.xml.bz2 pathTo/output/ReadableWikipedia`

### Word2Vec Corpus

@@ -125,7 +124,7 @@ DbpediaID/Barack_Obama B.O is the president of DbpediaID/USA
2. Download Spark : http://d3kbcqa49mib13.cloudfront.net/spark-1.2.0-bin-hadoop2.4.tgz
3. In your Spark folder do:
```
bin/spark-submit --master local[*] --executor-memory 1g --class "org.idio.wikipedia.word2vec.Word2VecCorpus" target/scala-2.10/wiki2vec-assembly-1.0.jar /PathToYourReadableWiki/readableWiki.lines /Path/To/RedirectsFile /PathToOut/Word2vecReadyWikipediaCorpus
bin/spark-submit --master local[*] --executor-memory 1g --class "org.idio.wikipedia.word2vec.Word2VecCorpus" target/scala-2.11/wiki2feck-assembly-1.0.jar /PathToYourReadableWiki/readableWiki.lines /Path/To/RedirectsFile /PathToOut/Word2vecReadyWikipediaCorpus
```
4. Feed your corpus to a word2vec tool

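For step 4, a minimal gensim sketch (assuming gensim 3.x; `min_count`, `size`, `window` and `sg` mirror the defaults in the repository's training script, while the file names are illustrative):

```
import gensim

# One pre-tokenized article per line, e.g. the joined part-* files
# produced by the Spark job above.
sentences = gensim.models.word2vec.LineSentence("enwiki.corpus")

# sg=1 selects the skip-gram architecture.
model = gensim.models.Word2Vec(sentences, min_count=10, size=500,
                               window=10, sg=1, workers=4)
model.save("enwiki.word2vec.model")
```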
@@ -142,7 +141,7 @@ pass None as an extra argument
#### If you are manually running the tools:
Pass None as an extra argument when calling spark
```
bin/spark-submit --class "org.idio.wikipedia.word2vec.Word2VecCorpus" target/scala-2.10/wiki2vec-assembly-1.0.jar /PathToYourReadableWiki/readableWiki.lines /Path/To/RedirectsFile /PathToOut/Word2vecReadyWikipediaCorpus None
bin/spark-submit --class "org.idio.wikipedia.word2vec.Word2VecCorpus" target/scala-2.11/wiki2feck-assembly-1.0.jar /PathToYourReadableWiki/readableWiki.lines /Path/To/RedirectsFile /PathToOut/Word2vecReadyWikipediaCorpus None
```


17 changes: 10 additions & 7 deletions build.sbt
@@ -1,15 +1,15 @@
import AssemblyKeys._

assemblySettings

name := "wiki2vec"
name := "wiki2feck"

version := "1.0"

scalaVersion := "2.10.3"
scalaVersion := "2.11.12"
crossScalaVersions := Seq("2.11.2", "2.10.4")

resolvers ++= Seq(
"opennlp sourceforge repo" at "http://opennlp.sourceforge.net/maven2"
"opennlp sourceforge repo" at "http://opennlp.sourceforge.net/maven2",
Resolver.jcenterRepo,
"Twitter Maven Repository" at "https://maven.twttr.com"
)

libraryDependencies += "com.google.guava" % "guava" % "16.0.1"
@@ -28,7 +28,10 @@ libraryDependencies += "com.bizo" % "mighty-csv_2.10" % "0.2"

libraryDependencies += "net.debasishg" %% "redisclient" % "2.13"

libraryDependencies += "org.scalanlp" %% "chalk" % "1.3.2" exclude ("com.typesafe.sbt", "sbt-pgp")
libraryDependencies += "org.scalanlp" % "chalk_2.10" % "1.3.2" excludeAll (
ExclusionRule(organization = "com.typesafe.sbt", name = "sbt-pgp"),
ExclusionRule(organization = "com.typesafe.akka")
)

libraryDependencies += "org.apache.opennlp" % "opennlp-tools" % "1.5.2-incubating"

17 changes: 9 additions & 8 deletions prepare.sh
@@ -9,7 +9,7 @@
# $2 Target Folder (Output Folder)
# $3 Stemmer

WIKI2VEC_VERSION="1.0"
WIKI2FECK_VERSION="1.0"

usage ()
{
@@ -31,7 +31,7 @@ TARGET_DIR="$2"
LANGUAGE=`echo $1 | sed "s/_.*//g"`
WDIR="$BASE_DIR/working"
SPARK_PATH="$WDIR/spark-1.2.0-bin-hadoop2.4"
JAR_PATH="$BASE_DIR/target/scala-2.10/wiki2vec-assembly-${WIKI2VEC_VERSION}.jar"
JAR_PATH="$BASE_DIR/target/scala-2.11/wiki2feck-assembly-${WIKI2FECK_VERSION}.jar"
READABLEWIKI="$TARGET_DIR/${LANGUAGE}wiki-latest.lines"
SPLIT_OUTPUT_CORPUS="$WDIR/${LANGUAGE}wiki"
OUTPUTCORPUS="$TARGET_DIR/${LANGUAGE}wiki.corpus"
@@ -64,15 +64,16 @@ mkdir -p $SPLIT_OUTPUT_CORPUS
cd $WDIR

echo "Downloading Wikipedia Dump"
curl -L -O "http://dumps.wikimedia.org/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles-multistream.xml.bz2"
WIKIPEDIA_PATH="$WDIR/${LANGUAGE}wiki-latest-pages-articles-multistream.xml.bz2"
WIKI_DUMP_NAME="${LANGUAGE}wiki-latest-pages-articles-multistream.xml.bz2"
curl -L -O "http://dumps.wikimedia.org/${LANGUAGE}wiki/latest/${WIKI_DUMP_NAME}"
WIKIPEDIA_PATH="$WDIR/${WIKI_DUMP_NAME}"

echo "Downloading Apache Spark"
curl "http://d3kbcqa49mib13.cloudfront.net/spark-1.2.0-bin-hadoop2.4.tgz" | tar xvz


# Compiling
echo "Compiling wiki2vec..."
echo "Compiling wiki2feck..."
cd $BASE_DIR
sbt assembly

@@ -81,17 +82,17 @@ sbt assembly
echo "Creating Readable Wiki.."
java -Xmx10G -Xms10G -cp $JAR_PATH org.idio.wikipedia.dumps.CreateReadableWiki $WIKIPEDIA_PATH $READABLEWIKI

# Create Wiki2Vec Corpus
# Create Wiki2Feck Corpus
echo "Creating Word2vec Corpus"
$SPARK_PATH/bin/spark-submit --driver-memory 15g --num-executors 4 --class org.idio.wikipedia.word2vec.Word2VecCorpus $JAR_PATH $READABLEWIKI $BASE_DIR/fakePathToRedirect/file.nt $SPLIT_OUTPUT_CORPUS $STEMMERNAME

# joining split files
echo "Joining corpus.."
cd $SPLIT_OUTPUT_CORPUS
cat part* >> $OUTPUTCORPUS
cat part* >> $OUTPUTCORPUS.tmp

echo "fixing up punctutation in final corpus"
cd $BASE_DIR
python resources/fix_corpus.py $OUTPUTCORPUS ${OUTPUTCORPUS}.fixed
python resources/scripts/fix_corpus.py ${OUTPUTCORPUS}.tmp ${OUTPUTCORPUS}

echo " ^___^ corpus : ${OUTPUTCORPUS}.fixed"
4 changes: 4 additions & 0 deletions project/assembly.sbt
@@ -0,0 +1,4 @@
logLevel := Level.Warn

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")

1 change: 1 addition & 0 deletions project/build.properties
@@ -0,0 +1 @@
sbt.version=1.1.2
4 changes: 0 additions & 4 deletions project/plugins.sbt

This file was deleted.

2 changes: 2 additions & 0 deletions resources/config/requirements1
@@ -0,0 +1,2 @@
setuptools
numpy
3 changes: 3 additions & 0 deletions resources/config/requirements2
@@ -0,0 +1,3 @@
scipy
six
gensim
8 changes: 4 additions & 4 deletions resources/fix_corpus.py → resources/scripts/fix_corpus.py
@@ -21,14 +21,14 @@
import string
from gensim import utils

replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
replace_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))

class PreprocessingLineSentence():
def __init__(self, path_to_corpus):
self.path = path_to_corpus

def __iter__(self):
with utils.smart_open(self.path) as fin:
with utils.smart_open(self.path, encoding='ISO-8859-1') as fin:
for line_no, line in enumerate(fin):
if line_no % 10000 == 0:
ln.debug("Processed %s lines" % line_no)
@@ -55,13 +55,13 @@ def __iter__(self):
def fix_corpus(path_to_corpus, outfile=None):
fixed = PreprocessingLineSentence(path_to_corpus)
outfile = outfile or path_to_corpus + "_fixed"
with open(outfile, "w") as f:
with open(outfile, "w", encoding='ISO-8859-1') as f:
for line in fixed:
f.write(line)

if __name__ == "__main__":
import sys
if len(sys.argv) == 2:
if len(sys.argv) > 2:
fix_corpus(sys.argv[1], sys.argv[2])
else:
fix_corpus(sys.argv[1])
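The move from `string.maketrans` to `str.maketrans` above is the Python 3 idiom: translation tables for text are built with `str.maketrans` and applied with `str.translate`. A standalone illustration (not part of the diff):

```
import string

# Map every punctuation character to a space.
table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
print("foo,bar!baz".translate(table))  # -> "foo bar baz"
```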
Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@

ln = logging.getLogger()

fileHandler = logging.FileHandler("wiki2vec_log%s.txt" % datetime.datetime.now().isoformat())
fileHandler = logging.FileHandler("wiki2feck_log%s.txt" % datetime.datetime.now().isoformat())

fileHandler.setFormatter(logFormatter)
ln.addHandler(fileHandler)
@@ -41,9 +41,8 @@ def read_corpus(path_to_corpus, output_path, min_count=10, size=500, window=10,
workers = multiprocessing.cpu_count()
entity_min_count = _entity_min_count
sentences = gensim.models.word2vec.LineSentence(path_to_corpus)
model = gensim.models.Word2Vec(None, min_count=min_count, size=size, window=window, sg=1, workers=workers, trim_rule=rule)
model.build_vocab(sentences)
model.train(sentences)
model = gensim.models.Word2Vec(sentences, min_count=min_count, size=size, window=window, sg=1, workers=workers, trim_rule=rule)
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
model.save(output_path)


@@ -60,7 +59,7 @@ def main():

parser.add_option("-e", "--entity_min_count",
action="store",
dest="entity_min_count",
dest="_entity_min_count",
default=5,
type="int",
help="min number of appearances for DBpedia entities",)
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@
#| Idio Wiki2Vec | |
#+------------------------------------------------------------------------------------------------------------------------------+

# Creates Wiki2Vec vectors out of massaged wikipedia corpus
# Creates Wiki2Feck vectors out of massaged wikipedia corpus
# It uses gensim

CORPUS=$1
16 changes: 14 additions & 2 deletions src/main/java/org/idio/wikipedia/dumps/WikipediaPage.java
@@ -402,17 +402,29 @@ public String apply(@Nullable Link link) {
*
* @param page the <code>WikipediaPage</code> object
* @param s raw XML string
* @param title page title string
*/
public static String readPage(WikipediaPage page, String s) {
public static String readPage(WikipediaPage page, String s, String title) {
if (title != null) page.title = title;
page.page = s;
return page.getContent();
}

/**
* Reads a raw XML string into a <code>WikipediaPage</code> object.
*
* @param page the <code>WikipediaPage</code> object
* @param s raw XML string
*/
public static String readPage(WikipediaPage page, String s) {
return readPage(page, s, null);
}

/**
 * Reads a raw XML string into a <code>WikipediaPage</code> object. Added for backwards
 * compatibility.
*
* @param s raw XML string
*/
protected abstract void processPage(String s);
}
}
4 changes: 3 additions & 1 deletion src/main/scala/org/idio/wikipedia/dumps/ReadableWiki.scala
@@ -34,7 +34,9 @@ class ReadableWiki(pathToWikipediaDump: String, pathToOutFile: String){
parser.getContentHandler.setRevisionCallback(new RevisionCallback {
override def callback(revision: Revision): Unit = {
val page = revision.getPage
val title = page.getTitle
//TODO: add the ":" back in here if/when the stratio wikipedia-parser's title splitting is fixed
// (at present it leaves a leading ":" on the title if there is a non-empty namespace)
val title = page.getNamespace + page.getTitle
val articleText = revision.getText.replace("\t", " ").replace("\n"," ")
val line = List[String](title, articleText).mkString("\t") + "\n"
writer.write(line)
Original file line number Diff line number Diff line change
@@ -44,7 +44,7 @@ object ArticleCleaner {
val canonicalDbpediaId = redirectStore.getCanonicalId(dbpediaId)
processLink(surfaceForm, canonicalDbpediaId)
} catch {
case _ => {
case _: Throwable => {
println("error with link: " + linkMatch.toString())
""
}
31 changes: 14 additions & 17 deletions src/main/scala/org/idio/wikipedia/word2vec/Word2VecCorpus.scala
@@ -33,16 +33,16 @@ class Word2VecCorpus(pathToReadableWiki:String, redirectStore:RedirectStore, pat
* Returns a PairRDD (WikiTitle, ArticleText)
* Out of a readable wikipedia
* */
private def getPairRDD(articlesLines:RDD[String])={
private def getPairRDD(articlesLines:RDD[String]): RDD[(String, String)]={
articlesLines.map{ line =>
val splitLine = line.split("\t")
try {
val wikiTitle = splitLine(0)
val articleText = splitLine(1)
(wikiTitle, articleText)
}catch{
case _ => ("", "")
}
val splitLine = line.split("\t")
try {
val wikiTitle = splitLine(0)
val articleText = splitLine(1)
(wikiTitle, articleText)
} catch {
case _: Throwable => ("", "")
}
}
}

@@ -59,7 +59,7 @@ class Word2VecCorpus(pathToReadableWiki:String, redirectStore:RedirectStore, pat
val wikiModel = new EnglishWikipediaPage()

// cleans wikimedia markup
val pageContent = WikipediaPage.readPage(wikiModel, text)
val pageContent = WikipediaPage.readPage(wikiModel, text, title)

// cleans further Style tags {| some CSS inside |}
val markupClean = ArticleCleaner.cleanStyle(pageContent)
@@ -118,9 +118,9 @@ class Word2VecCorpus(pathToReadableWiki:String, redirectStore:RedirectStore, pat
val stemmer = try{
new SnowballStemmer(language_local)
}catch{
case _=> new NoStemmer()
case _: Throwable => new NoStemmer()
}
line.split("\\s").map{
line.split("\\s").map{
word =>
word match{
case w if w.startsWith(prefix) => w
@@ -149,18 +149,17 @@ object Word2VecCorpus{
val pathToRedirects = args(1)
val pathToOutput = args(2)
val language = try { args(3) }catch{
case _ => {
case _: Throwable => {
println("Warning: Stemming is deactivated..")
"NoStemmer"
}
}


println("Path to Readable Wikipedia: "+ pathToReadableWikipedia)
println("Path to Wikipedia Redirects: " + pathToRedirects)
println("Path to Output Corpus : " + pathToOutput)

val conf = new SparkConf().setAppName("Wiki2Vec corpus creator")
val conf = new SparkConf().setAppName("Wiki2Feck corpus creator")

implicit val sc: SparkContext = new SparkContext(conf)

@@ -176,8 +175,6 @@ object Word2VecCorpus{
}
}



val word2vecCorpusCreator = new Word2VecCorpus(pathToReadableWikipedia, redirectStore, pathToOutput, language)
word2vecCorpusCreator.getWord2vecCorpus()
}
