debian-python-whoosh/benchmark/reuters.py

import gzip, os.path

from whoosh import analysis, fields, index, qparser, query
from whoosh.support.bench import Bench, Spec
from whoosh.util import now


class Reuters(Spec):
    name = "reuters"
    filename = "reuters21578.txt.gz"
    main_field = "text"
    headline_text = "headline"

    def whoosh_schema(self):
        #ana = analysis.StemmingAnalyzer()
        ana = analysis.StandardAnalyzer()
        schema = fields.Schema(id=fields.ID(stored=True),
                               headline=fields.STORED,
                               text=fields.TEXT(analyzer=ana, stored=True))
        return schema

    def zcatalog_setup(self, cat):
        from zcatalog import indexes  #@UnresolvedImport
        cat["id"] = indexes.FieldIndex(field_name="id")
        cat["headline"] = indexes.TextIndex(field_name="headline")
        cat["body"] = indexes.TextIndex(field_name="text")

    def documents(self):
        path = os.path.join(self.options.dir, self.filename)
        f = gzip.GzipFile(path)

        for line in f:
            id, text = line.decode("latin1").split("\t")
            yield {"id": id, "text": text, "headline": text[:70]}


if __name__ == "__main__":
    Bench().run(Reuters)
python-whoosh (2.7.0-1) unstable; urgency=medium * New upstream release. * Update watch file. Thanks to Piotr Ożarowski * debian/copyright: Update copyright years. * debian/upstream/metadata: Added upstream metadata. # imported from the archive 2015-05-07 13:22:20 +02:00			`import gzip, os.path`

			`from whoosh import analysis, fields, index, qparser, query`
			`from whoosh.support.bench import Bench, Spec`
			`from whoosh.util import now`


			`class Reuters(Spec):`
			`name = "reuters"`
			`filename = "reuters21578.txt.gz"`
			`main_field = "text"`
			`headline_text = "headline"`

			`def whoosh_schema(self):`
			`#ana = analysis.StemmingAnalyzer()`
			`ana = analysis.StandardAnalyzer()`
			`schema = fields.Schema(id=fields.ID(stored=True),`
			`headline=fields.STORED,`
			`text=fields.TEXT(analyzer=ana, stored=True))`
			`return schema`

			`def zcatalog_setup(self, cat):`
			`from zcatalog import indexes #@UnresolvedImport`
			`cat["id"] = indexes.FieldIndex(field_name="id")`
			`cat["headline"] = indexes.TextIndex(field_name="headline")`
			`cat["body"] = indexes.TextIndex(field_name="text")`

			`def documents(self):`
			`path = os.path.join(self.options.dir, self.filename)`
			`f = gzip.GzipFile(path)`

			`for line in f:`
			`id, text = line.decode("latin1").split("\t")`
			`yield {"id": id, "text": text, "headline": text[:70]}`


			`if __name__ == "__main__":`
			`Bench().run(Reuters)`