39 lines
1.2 KiB
Python
39 lines
1.2 KiB
Python
|
import gzip, os.path
|
||
|
|
||
|
from whoosh import analysis, fields, index, qparser, query
|
||
|
from whoosh.support.bench import Bench, Spec
|
||
|
from whoosh.util import now
|
||
|
|
||
|
|
||
|
class Reuters(Spec):
|
||
|
name = "reuters"
|
||
|
filename = "reuters21578.txt.gz"
|
||
|
main_field = "text"
|
||
|
headline_text = "headline"
|
||
|
|
||
|
def whoosh_schema(self):
|
||
|
#ana = analysis.StemmingAnalyzer()
|
||
|
ana = analysis.StandardAnalyzer()
|
||
|
schema = fields.Schema(id=fields.ID(stored=True),
|
||
|
headline=fields.STORED,
|
||
|
text=fields.TEXT(analyzer=ana, stored=True))
|
||
|
return schema
|
||
|
|
||
|
def zcatalog_setup(self, cat):
|
||
|
from zcatalog import indexes #@UnresolvedImport
|
||
|
cat["id"] = indexes.FieldIndex(field_name="id")
|
||
|
cat["headline"] = indexes.TextIndex(field_name="headline")
|
||
|
cat["body"] = indexes.TextIndex(field_name="text")
|
||
|
|
||
|
def documents(self):
|
||
|
path = os.path.join(self.options.dir, self.filename)
|
||
|
f = gzip.GzipFile(path)
|
||
|
|
||
|
for line in f:
|
||
|
id, text = line.decode("latin1").split("\t")
|
||
|
yield {"id": id, "text": text, "headline": text[:70]}
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
Bench().run(Reuters)
|