Examples for Elastic and Solr

o19s · Dec 10, 2020 · bee42da · bee42da
1 parent de15f81
commit bee42da
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 20 deletions.
diff --git a/example/osc-blog.py b/example/elasticsearch-blog-example.py
@@ -3,21 +3,16 @@
 """Example for `skipchunk` package."""
 import json
 from skipchunk.graphquery import GraphQuery
-from skipchunk.indexquery import IndexQuery
-from skipchunk.solr import timestamp
+from skipchunk.elastic import timestamp
 from skipchunk import skipchunk as sc
 
 if __name__ == "__main__":
 
+    # If you set LOAD=True, this will load what you previously enriched from a pickle file
+    # If you haven't saved anything to pickle yet, don't enable this
+    # WARNING! This file can get very big.
     LOAD = False
 
-    skipchunk_config_solr = {
-        "host":"http://localhost:8983/solr/",
-        "name":"osc-blog",
-        "path":"./skipchunk_data",
-        "engine_name":"solr"
-    }
-
     skipchunk_config_elastic = {
         "host":"http://localhost:9200/",
         "name":"osc-blog",
@@ -26,10 +21,9 @@
     }
 
     skipchunk_config = skipchunk_config_elastic
-    #skipchunk_config = skipchunk_config_solr
 
-    source = "blog-posts.json"
-    #source = "blog-posts-one.json"
+    #source = "blog-posts.json"
+    source = "blog-posts-one.json"
 
     print(timestamp()," | Initializing")
 
@@ -39,28 +33,33 @@
         maxconceptlength=3,
         minpredicatelength=1,
         maxpredicatelength=3,
-        minlabels=1)
+        minlabels=1,
+        cache_documents=True,
+        cache_pickle=True)
 
     gq = GraphQuery(skipchunk_config)
-    iq = IndexQuery(skipchunk_config)
 
     if LOAD:
         print(timestamp()," | Loading Pickle")
         s.load()
+
     else:
-        print(timestamp()," | Loading Tuples")
+
+        # Produces a list of (text,document) tuples ready for processing by the enrichment.
+        print(timestamp()," | Loading Content")
         tuples = s.tuplize(filename=source,fields=['title','content'])
+
+        # Enriching can take a long time if you provide lots of text.  Consider processing in batches of 10k docs at a time.
         print(timestamp()," | Enriching")
         s.enrich(tuples)
+
+        # Saves what was just enriched to a pickle file, so a later run with LOAD=True can load it back if you want to enrich more content.
         print(timestamp()," | Pickling")
         s.save()
 
+
     print(timestamp()," | Indexing Graph")
-    gq.delete()
+    gq.delete() # In this example the graph is deleted and reindexed every time.  You probably don't want to do this in real life :)
     gq.index(s)
-
-    #print(timestamp()," | Indexing Content")
-    #iq.delete()
-    #iq.index()
 
     print(timestamp()," | !!!~~~~~DONE~~~~~!!!")
diff --git a/example/solr-blog-example.py b/example/solr-blog-example.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+
+"""Example for `skipchunk` package."""
+import json
+from skipchunk.graphquery import GraphQuery
+from skipchunk.solr import timestamp
+from skipchunk import skipchunk as sc
+
+if __name__ == "__main__":
+
+    # If you set LOAD=True, this will load what you previously enriched from a pickle file
+    # If you haven't saved anything to pickle yet, don't enable this
+    # WARNING! This file can get very big.
+    LOAD = False
+
+    skipchunk_config_solr = {
+        "host":"http://localhost:8983/solr/",
+        "name":"osc-blog",
+        "path":"./skipchunk_data",
+        "engine_name":"solr"
+    }
+
+    skipchunk_config = skipchunk_config_solr
+
+    #source = "blog-posts.json"
+    source = "blog-posts-one.json"
+
+    print(timestamp()," | Initializing")
+
+    s = sc.Skipchunk(skipchunk_config,
+        spacy_model="en_core_web_lg",
+        minconceptlength=1,
+        maxconceptlength=3,
+        minpredicatelength=1,
+        maxpredicatelength=3,
+        minlabels=1,
+        cache_documents=True,
+        cache_pickle=True)
+
+    gq = GraphQuery(skipchunk_config)
+
+    if LOAD:
+        print(timestamp()," | Loading Pickle")
+        s.load()
+
+    else:
+
+        # Produces a list of (text,document) tuples ready for processing by the enrichment.
+        print(timestamp()," | Loading Content")
+        tuples = s.tuplize(filename=source,fields=['title','content'])
+
+        # Enriching can take a long time if you provide lots of text.  Consider processing in batches of 10k docs at a time.
+        print(timestamp()," | Enriching")
+        s.enrich(tuples)
+
+        # Saves what was just enriched to a pickle file, so a later run with LOAD=True can load it back if you want to enrich more content.
+        # WARNING! This can get very big.
+        print(timestamp()," | Pickling")
+        s.save()
+
+
+    print(timestamp()," | Indexing Graph")
+    gq.delete() # In this example the graph is deleted and reindexed every time.  You probably don't want to do this in real life :)
+    gq.index(s)
+
+    print(timestamp()," | !!!~~~~~DONE~~~~~!!!")