Skip to content

Commit

Permalink
Examples for Elastic and Solr
Browse files Browse the repository at this point in the history
  • Loading branch information
binarymax committed Dec 10, 2020
1 parent de15f81 commit bee42da
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 20 deletions.
39 changes: 19 additions & 20 deletions example/osc-blog.py → example/elasticsearch-blog-example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,16 @@
"""Example for `skipchunk` package."""
import json
from skipchunk.graphquery import GraphQuery
from skipchunk.indexquery import IndexQuery
from skipchunk.solr import timestamp
from skipchunk.elastic import timestamp
from skipchunk import skipchunk as sc

if __name__ == "__main__":

# If you set LOAD=True, this will load what you previously enriched from a pickle file
# If you haven't saved anything to pickle yet, don't enable this
# WARNING! This file can get very big.
LOAD = False

skipchunk_config_solr = {
"host":"http://localhost:8983/solr/",
"name":"osc-blog",
"path":"./skipchunk_data",
"engine_name":"solr"
}

skipchunk_config_elastic = {
"host":"http://localhost:9200/",
"name":"osc-blog",
Expand All @@ -26,10 +21,9 @@
}

skipchunk_config = skipchunk_config_elastic
#skipchunk_config = skipchunk_config_solr

source = "blog-posts.json"
#source = "blog-posts-one.json"
#source = "blog-posts.json"
source = "blog-posts-one.json"

print(timestamp()," | Initializing")

Expand All @@ -39,28 +33,33 @@
maxconceptlength=3,
minpredicatelength=1,
maxpredicatelength=3,
minlabels=1)
minlabels=1,
cache_documents=True,
cache_pickle=True)

gq = GraphQuery(skipchunk_config)
iq = IndexQuery(skipchunk_config)

if LOAD:
print(timestamp()," | Loading Pickle")
s.load()

else:
print(timestamp()," | Loading Tuples")

# Produces a list of (text,document) tuples ready for processing by the enrichment.
print(timestamp()," | Loading Content")
tuples = s.tuplize(filename=source,fields=['title','content'])

# Enriching can take a long time if you provide lots of text. Consider batching at 10k docs at a time.
print(timestamp()," | Enriching")
s.enrich(tuples)

# This saves a pickle file that a later run can restore by setting LOAD=True, e.g. if you want to enrich more content.
print(timestamp()," | Pickling")
s.save()


print(timestamp()," | Indexing Graph")
gq.delete()
gq.delete() # In this example the graph is deleted and reindexed every time. You probably don't want to do this in real life :)
gq.index(s)

#print(timestamp()," | Indexing Content")
#iq.delete()
#iq.index()

print(timestamp()," | !!!~~~~~DONE~~~~~!!!")
66 changes: 66 additions & 0 deletions example/solr-blog-example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-

"""Example for `skipchunk` package."""
import json
from skipchunk.graphquery import GraphQuery
from skipchunk.solr import timestamp
from skipchunk import skipchunk as sc

if __name__ == "__main__":

    def announce(stage):
        """Print one progress line: the current timestamp followed by `stage`."""
        print(timestamp(), stage)

    # Set LOAD to True to restore a previously pickled enrichment instead of
    # enriching `source` again. Don't enable this until a pickle has been saved.
    # WARNING! The pickle file can get very big.
    LOAD = False

    # Settings shared by the Skipchunk enricher and the GraphQuery indexer.
    skipchunk_config_solr = dict(
        host="http://localhost:8983/solr/",
        name="osc-blog",
        path="./skipchunk_data",
        engine_name="solr",
    )

    skipchunk_config = skipchunk_config_solr

    # Alternative input file: "blog-posts.json"
    source = "blog-posts-one.json"

    announce(" | Initializing")

    # Keyword options forwarded unchanged to the Skipchunk constructor.
    enrichment_options = {
        "spacy_model": "en_core_web_lg",
        "minconceptlength": 1,
        "maxconceptlength": 3,
        "minpredicatelength": 1,
        "maxpredicatelength": 3,
        "minlabels": 1,
        "cache_documents": True,
        "cache_pickle": True,
    }
    chunker = sc.Skipchunk(skipchunk_config, **enrichment_options)

    graph = GraphQuery(skipchunk_config)

    if not LOAD:
        # Produces a list of (text, document) tuples ready for processing by
        # the enrichment.
        announce(" | Loading Content")
        text_document_pairs = chunker.tuplize(
            filename=source,
            fields=['title', 'content'],
        )

        # Enriching can take a long time if you provide lots of text.
        # Consider batching at 10k docs at a time.
        announce(" | Enriching")
        chunker.enrich(text_document_pairs)

        # Save the enriched state to a pickle file so that a later run can
        # set LOAD=True and restore it.
        # WARNING! This can get very big.
        announce(" | Pickling")
        chunker.save()
    else:
        announce(" | Loading Pickle")
        chunker.load()

    announce(" | Indexing Graph")
    # In this example the graph is deleted and reindexed every time.
    # You probably don't want to do this in real life :)
    graph.delete()
    graph.index(chunker)

    announce(" | !!!~~~~~DONE~~~~~!!!")

0 comments on commit bee42da

Please sign in to comment.