-
Notifications
You must be signed in to change notification settings - Fork 0
/
TermPositionVector.py
53 lines (42 loc) · 1.7 KB
/
TermPositionVector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import lucene
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.util import BytesRef, BytesRefIterator, Version
from org.apache.lucene.index import \
IndexWriterConfig, IndexWriter, DirectoryReader
def _build_index(directory, texts, field_name="fieldname"):
    """Index every string in *texts* as its own document in *directory*.

    Each document gets a single stored, tokenized field named *field_name*
    whose term vectors keep both positions and character offsets, so they
    can be read back per document afterwards.
    """
    # LimitTokenCountAnalyzer wraps the standard analyzer with a
    # maxTokenCount of 100.
    analyzer = LimitTokenCountAnalyzer(
        StandardAnalyzer(Version.LUCENE_CURRENT), 100)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)

    field_type = FieldType()
    field_type.setIndexed(True)
    field_type.setStored(True)
    field_type.setTokenized(True)
    # Positions and offsets are what _print_term_vectors reads back.
    field_type.setStoreTermVectors(True)
    field_type.setStoreTermVectorOffsets(True)
    field_type.setStoreTermVectorPositions(True)

    writer = IndexWriter(directory, config)
    try:
        for text in texts:
            document = Document()
            document.add(Field(field_name, text, field_type))
            writer.addDocument(document)
        writer.commit()
    finally:
        # Always release the writer, even if adding a document fails.
        writer.close()


def _print_term_vectors(reader, doc_count, field_name="fieldname"):
    """Print term, frequency, positions and offsets for each document.

    Documents are addressed by the ids ``0 .. doc_count - 1``. A blank line
    is printed after each document's terms.
    """
    for doc_id in range(doc_count):
        vector = reader.getTermVector(doc_id, field_name)
        if vector is None:
            # No term vector is available for this document/field.
            continue

        terms_enum = vector.iterator(None)
        # cast_ exposes the Java TermsEnum as a Python-iterable
        # BytesRefIterator; terms_enum itself stays positioned on the
        # term currently being yielded.
        for term in BytesRefIterator.cast_(terms_enum):
            postings = terms_enum.docsAndPositions(None, None)
            # Prime the enum, which works only for the current doc.
            postings.nextDoc()
            freq = postings.freq()

            print('term: %s' % term.utf8ToString())
            print(' freq: %d' % freq)
            for _ in range(freq):
                # nextPosition() must be called before the offsets of
                # that occurrence are read.
                print(" pos: %d" % postings.nextPosition())
                print(" off: %i-%i" % (postings.startOffset(),
                                       postings.endOffset()))
        print('')


def main():
    """Index two sample sentences in memory and dump their term vectors."""
    # The JVM must be started before any Lucene class is instantiated.
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    texts = ["this bernhard is the text to be index text",
             "this claudia is the text to be indexed"]

    directory = RAMDirectory()
    try:
        _build_index(directory, texts)
        reader = DirectoryReader.open(directory)
        try:
            _print_term_vectors(reader, len(texts))
        finally:
            reader.close()
    finally:
        directory.close()


if __name__ == '__main__':
    main()