From 50b9306f9b4e3cd4b08dcc2f5bcb39ade6d0c32c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D8=A3=D8=AD=D9=85=D8=AF=20=D8=A7=D9=84=D9=85=D8=AD=D9=85?= =?UTF-8?q?=D9=88=D8=AF=D9=8A=20=28Ahmed=20El-Mahmoudy=29?= Date: Thu, 7 May 2015 13:22:20 +0200 Subject: [PATCH] python-whoosh (2.7.0-1) unstable; urgency=medium MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * New upstream release. * Update watch file. Thanks to Piotr Ożarowski * debian/copyright: Update copyright years. * debian/upstream/metadata: Added upstream metadata. # imported from the archive --- LICENSE.txt | 26 + MANIFEST.in | 8 + PKG-INFO | 88 ++ README.txt | 68 + benchmark/dcvgr10.txt.gz | Bin 0 -> 201819 bytes benchmark/dictionary.py | 43 + benchmark/enron.py | 185 +++ benchmark/marc21.py | 297 ++++ benchmark/reuters.py | 38 + benchmark/reuters21578.txt.gz | Bin 0 -> 181938 bytes debian/NEWS | 8 + debian/README.source | 58 + debian/changelog | 391 +++++ debian/compat | 1 + debian/control | 60 + debian/copyright | 144 ++ debian/python-whoosh-doc.doc-base | 10 + debian/python-whoosh-doc.docs | 1 + debian/python-whoosh-doc.maintscript | 1 + debian/rules | 16 + debian/source/format | 1 + debian/upstream/metadata | 5 + debian/watch | 3 + docs/source/analysis.rst | 329 ++++ docs/source/api/analysis.rst | 62 + docs/source/api/api.rst | 9 + docs/source/api/codec/base.rst | 32 + docs/source/api/collectors.rst | 47 + docs/source/api/columns.rst | 49 + docs/source/api/fields.rst | 41 + docs/source/api/filedb/filestore.rst | 31 + docs/source/api/filedb/filetables.rst | 22 + docs/source/api/filedb/structfile.rst | 14 + docs/source/api/formats.rst | 24 + docs/source/api/highlight.rst | 50 + docs/source/api/idsets.rst | 23 + docs/source/api/index.rst | 39 + docs/source/api/lang/morph_en.rst | 7 + docs/source/api/lang/porter.rst | 7 + docs/source/api/lang/wordnet.rst | 20 + docs/source/api/matching.rst | 34 + docs/source/api/qparser.rst | 97 ++ docs/source/api/query.rst | 83 ++ docs/source/api/reading.rst | 22 + docs/source/api/scoring.rst | 42 + docs/source/api/searching.rst | 33 + docs/source/api/sorting.rst | 48 + docs/source/api/spelling.rst | 34 + docs/source/api/support/charset.rst | 13 + docs/source/api/support/levenshtein.rst | 10 + docs/source/api/util.rst | 7 + docs/source/api/writing.rst | 30 + docs/source/batch.rst | 114 ++ docs/source/conf.py | 198 +++ docs/source/dates.rst | 202 +++ docs/source/facets.rst | 771 ++++++++++ docs/source/fieldcaches.rst | 52 + docs/source/glossary.rst | 65 + docs/source/highlight.rst | 419 ++++++ docs/source/index.rst | 50 + docs/source/indexing.rst | 440 ++++++ docs/source/intro.rst | 60 + docs/source/keywords.rst | 94 ++ docs/source/nested.rst | 238 +++ docs/source/ngrams.rst | 51 + docs/source/parsing.rst | 437 ++++++ docs/source/query.rst | 10 + docs/source/querylang.rst | 191 +++ docs/source/quickstart.rst | 244 +++ docs/source/recipes.rst | 229 +++ docs/source/releases/0_3.rst | 61 + docs/source/releases/1_0.rst | 482 ++++++ docs/source/releases/2_0.rst | 333 +++++ docs/source/releases/index.rst | 11 + docs/source/schema.rst | 377 +++++ docs/source/searching.rst | 400 +++++ docs/source/spelling.rst | 130 ++ docs/source/stemming.rst | 217 +++ docs/source/tech/backend.rst | 175 +++ docs/source/tech/filedb.rst | 29 + docs/source/tech/index.rst | 9 + docs/source/threads.rst | 74 + files/whoosh.svg | 434 ++++++ files/whoosh_16.png | Bin 0 -> 909 bytes files/whoosh_35.png | Bin 0 -> 3231 bytes files/whoosh_64.png | Bin 0 -> 7708 bytes files/whoosh_small.svg | 
604 ++++++++ setup.cfg | 40 + setup.py | 60 + src/Whoosh.egg-info/PKG-INFO | 88 ++ src/Whoosh.egg-info/SOURCES.txt | 224 +++ src/Whoosh.egg-info/dependency_links.txt | 1 + src/Whoosh.egg-info/top_level.txt | 1 + src/Whoosh.egg-info/zip-safe | 1 + src/whoosh/__init__.py | 49 + src/whoosh/analysis/__init__.py | 69 + src/whoosh/analysis/acore.py | 156 ++ src/whoosh/analysis/analyzers.py | 296 ++++ src/whoosh/analysis/filters.py | 479 ++++++ src/whoosh/analysis/intraword.py | 494 ++++++ src/whoosh/analysis/morph.py | 267 ++++ src/whoosh/analysis/ngrams.py | 237 +++ src/whoosh/analysis/tokenizers.py | 338 +++++ src/whoosh/automata/__init__.py | 0 src/whoosh/automata/fsa.py | 714 +++++++++ src/whoosh/automata/glob.py | 90 ++ src/whoosh/automata/lev.py | 30 + src/whoosh/automata/nfa.py | 388 +++++ src/whoosh/automata/reg.py | 135 ++ src/whoosh/classify.py | 377 +++++ src/whoosh/codec/__init__.py | 32 + src/whoosh/codec/base.py | 843 +++++++++++ src/whoosh/codec/memory.py | 334 +++++ src/whoosh/codec/plaintext.py | 452 ++++++ src/whoosh/codec/whoosh3.py | 1281 ++++++++++++++++ src/whoosh/collectors.py | 1162 +++++++++++++++ src/whoosh/columns.py | 1411 ++++++++++++++++++ src/whoosh/compat.py | 206 +++ src/whoosh/externalsort.py | 240 +++ src/whoosh/fields.py | 1603 ++++++++++++++++++++ src/whoosh/filedb/__init__.py | 0 src/whoosh/filedb/compound.py | 331 +++++ src/whoosh/filedb/filestore.py | 655 ++++++++ src/whoosh/filedb/filetables.py | 735 +++++++++ src/whoosh/filedb/gae.py | 164 ++ src/whoosh/filedb/structfile.py | 402 +++++ src/whoosh/formats.py | 481 ++++++ src/whoosh/highlight.py | 952 ++++++++++++ src/whoosh/idsets.py | 703 +++++++++ src/whoosh/index.py | 707 +++++++++ src/whoosh/lang/__init__.py | 140 ++ src/whoosh/lang/dmetaphone.py | 415 ++++++ src/whoosh/lang/isri.py | 382 +++++ src/whoosh/lang/lovins.py | 570 +++++++ src/whoosh/lang/morph_en.py | 933 ++++++++++++ src/whoosh/lang/paicehusk.py | 242 +++ src/whoosh/lang/phonetic.py | 119 ++ src/whoosh/lang/porter.py | 175 +++ src/whoosh/lang/porter2.py | 313 ++++ src/whoosh/lang/snowball/__init__.py | 74 + src/whoosh/lang/snowball/bases.py | 133 ++ src/whoosh/lang/snowball/danish.py | 115 ++ src/whoosh/lang/snowball/dutch.py | 173 +++ src/whoosh/lang/snowball/english.py | 465 ++++++ src/whoosh/lang/snowball/finnish.py | 266 ++++ src/whoosh/lang/snowball/french.py | 348 +++++ src/whoosh/lang/snowball/german.py | 144 ++ src/whoosh/lang/snowball/hungarian.py | 268 ++++ src/whoosh/lang/snowball/italian.py | 230 +++ src/whoosh/lang/snowball/norwegian.py | 84 ++ src/whoosh/lang/snowball/portugese.py | 205 +++ src/whoosh/lang/snowball/romanian.py | 253 ++++ src/whoosh/lang/snowball/russian.py | 422 ++++++ src/whoosh/lang/snowball/spanish.py | 248 +++ src/whoosh/lang/snowball/swedish.py | 80 + src/whoosh/lang/stopwords.py | 285 ++++ src/whoosh/lang/wordnet.py | 242 +++ src/whoosh/legacy.py | 77 + src/whoosh/matching/__init__.py | 31 + src/whoosh/matching/binary.py | 803 ++++++++++ src/whoosh/matching/combo.py | 312 ++++ src/whoosh/matching/mcore.py | 622 ++++++++ src/whoosh/matching/wrappers.py | 572 +++++++ src/whoosh/multiproc.py | 381 +++++ src/whoosh/qparser/__init__.py | 30 + src/whoosh/qparser/common.py | 65 + src/whoosh/qparser/dateparse.py | 922 ++++++++++++ src/whoosh/qparser/default.py | 439 ++++++ src/whoosh/qparser/plugins.py | 1413 ++++++++++++++++++ src/whoosh/qparser/syntax.py | 641 ++++++++ src/whoosh/qparser/taggers.py | 93 ++ src/whoosh/query/__init__.py | 36 + src/whoosh/query/compound.py | 660 ++++++++ 
src/whoosh/query/nested.py | 412 +++++ src/whoosh/query/positional.py | 249 ++++ src/whoosh/query/qcolumns.py | 117 ++ src/whoosh/query/qcore.py | 715 +++++++++ src/whoosh/query/ranges.py | 347 +++++ src/whoosh/query/spans.py | 872 +++++++++++ src/whoosh/query/terms.py | 534 +++++++ src/whoosh/query/wrappers.py | 198 +++ src/whoosh/reading.py | 1295 ++++++++++++++++ src/whoosh/scoring.py | 616 ++++++++ src/whoosh/searching.py | 1658 +++++++++++++++++++++ src/whoosh/sorting.py | 1156 ++++++++++++++ src/whoosh/spelling.py | 343 +++++ src/whoosh/support/__init__.py | 0 src/whoosh/support/base85.py | 103 ++ src/whoosh/support/bench.py | 610 ++++++++ src/whoosh/support/charset.py | 1379 +++++++++++++++++ src/whoosh/support/levenshtein.py | 70 + src/whoosh/support/relativedelta.py | 437 ++++++ src/whoosh/support/unicode.py | 527 +++++++ src/whoosh/system.py | 79 + src/whoosh/util/__init__.py | 142 ++ src/whoosh/util/cache.py | 375 +++++ src/whoosh/util/filelock.py | 163 ++ src/whoosh/util/loading.py | 84 ++ src/whoosh/util/numeric.py | 317 ++++ src/whoosh/util/numlists.py | 373 +++++ src/whoosh/util/testing.py | 130 ++ src/whoosh/util/text.py | 132 ++ src/whoosh/util/times.py | 467 ++++++ src/whoosh/util/varints.py | 110 ++ src/whoosh/util/versions.py | 165 ++ src/whoosh/writing.py | 1272 ++++++++++++++++ tests/test_analysis.py | 532 +++++++ tests/test_automata.py | 372 +++++ tests/test_bits.py | 185 +++ tests/test_classify.py | 132 ++ tests/test_codecs.py | 621 ++++++++ tests/test_collector.py | 229 +++ tests/test_columns.py | 280 ++++ tests/test_compound.py | 65 + tests/test_dateparse.py | 356 +++++ tests/test_fields.py | 597 ++++++++ tests/test_flexible.py | 104 ++ tests/test_highlighting.py | 282 ++++ tests/test_indexing.py | 702 +++++++++ tests/test_matching.py | 556 +++++++ tests/test_misc.py | 161 ++ tests/test_mpwriter.py | 277 ++++ tests/test_nested.py | 361 +++++ tests/test_parse_plugins.py | 650 ++++++++ tests/test_parsing.py | 996 +++++++++++++ tests/test_postings.py | 87 ++ tests/test_quality.py | 172 +++ tests/test_queries.py | 574 +++++++ tests/test_reading.py | 397 +++++ tests/test_results.py | 635 ++++++++ tests/test_searching.py | 1737 ++++++++++++++++++++++ tests/test_sorting.py | 1053 +++++++++++++ tests/test_spans.py | 339 +++++ tests/test_spelling.py | 353 +++++ tests/test_tables.py | 215 +++ tests/test_vectors.py | 103 ++ tests/test_weightings.py | 81 + tests/test_writing.py | 430 ++++++ 238 files changed, 70642 insertions(+) create mode 100644 LICENSE.txt create mode 100644 MANIFEST.in create mode 100644 PKG-INFO create mode 100644 README.txt create mode 100644 benchmark/dcvgr10.txt.gz create mode 100644 benchmark/dictionary.py create mode 100644 benchmark/enron.py create mode 100644 benchmark/marc21.py create mode 100644 benchmark/reuters.py create mode 100644 benchmark/reuters21578.txt.gz create mode 100644 debian/NEWS create mode 100644 debian/README.source create mode 100644 debian/changelog create mode 100644 debian/compat create mode 100644 debian/control create mode 100644 debian/copyright create mode 100644 debian/python-whoosh-doc.doc-base create mode 100644 debian/python-whoosh-doc.docs create mode 100644 debian/python-whoosh-doc.maintscript create mode 100755 debian/rules create mode 100644 debian/source/format create mode 100644 debian/upstream/metadata create mode 100644 debian/watch create mode 100644 docs/source/analysis.rst create mode 100644 docs/source/api/analysis.rst create mode 100644 docs/source/api/api.rst create mode 100644 
docs/source/api/codec/base.rst create mode 100644 docs/source/api/collectors.rst create mode 100644 docs/source/api/columns.rst create mode 100644 docs/source/api/fields.rst create mode 100644 docs/source/api/filedb/filestore.rst create mode 100644 docs/source/api/filedb/filetables.rst create mode 100644 docs/source/api/filedb/structfile.rst create mode 100644 docs/source/api/formats.rst create mode 100644 docs/source/api/highlight.rst create mode 100644 docs/source/api/idsets.rst create mode 100644 docs/source/api/index.rst create mode 100644 docs/source/api/lang/morph_en.rst create mode 100644 docs/source/api/lang/porter.rst create mode 100644 docs/source/api/lang/wordnet.rst create mode 100644 docs/source/api/matching.rst create mode 100644 docs/source/api/qparser.rst create mode 100644 docs/source/api/query.rst create mode 100644 docs/source/api/reading.rst create mode 100644 docs/source/api/scoring.rst create mode 100644 docs/source/api/searching.rst create mode 100644 docs/source/api/sorting.rst create mode 100644 docs/source/api/spelling.rst create mode 100644 docs/source/api/support/charset.rst create mode 100644 docs/source/api/support/levenshtein.rst create mode 100644 docs/source/api/util.rst create mode 100644 docs/source/api/writing.rst create mode 100644 docs/source/batch.rst create mode 100644 docs/source/conf.py create mode 100644 docs/source/dates.rst create mode 100644 docs/source/facets.rst create mode 100644 docs/source/fieldcaches.rst create mode 100644 docs/source/glossary.rst create mode 100644 docs/source/highlight.rst create mode 100644 docs/source/index.rst create mode 100644 docs/source/indexing.rst create mode 100644 docs/source/intro.rst create mode 100644 docs/source/keywords.rst create mode 100644 docs/source/nested.rst create mode 100644 docs/source/ngrams.rst create mode 100644 docs/source/parsing.rst create mode 100644 docs/source/query.rst create mode 100644 docs/source/querylang.rst create mode 100644 docs/source/quickstart.rst create mode 100644 docs/source/recipes.rst create mode 100644 docs/source/releases/0_3.rst create mode 100644 docs/source/releases/1_0.rst create mode 100644 docs/source/releases/2_0.rst create mode 100644 docs/source/releases/index.rst create mode 100644 docs/source/schema.rst create mode 100644 docs/source/searching.rst create mode 100644 docs/source/spelling.rst create mode 100644 docs/source/stemming.rst create mode 100644 docs/source/tech/backend.rst create mode 100644 docs/source/tech/filedb.rst create mode 100644 docs/source/tech/index.rst create mode 100644 docs/source/threads.rst create mode 100644 files/whoosh.svg create mode 100644 files/whoosh_16.png create mode 100644 files/whoosh_35.png create mode 100644 files/whoosh_64.png create mode 100644 files/whoosh_small.svg create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 src/Whoosh.egg-info/PKG-INFO create mode 100644 src/Whoosh.egg-info/SOURCES.txt create mode 100644 src/Whoosh.egg-info/dependency_links.txt create mode 100644 src/Whoosh.egg-info/top_level.txt create mode 100644 src/Whoosh.egg-info/zip-safe create mode 100644 src/whoosh/__init__.py create mode 100644 src/whoosh/analysis/__init__.py create mode 100644 src/whoosh/analysis/acore.py create mode 100644 src/whoosh/analysis/analyzers.py create mode 100644 src/whoosh/analysis/filters.py create mode 100644 src/whoosh/analysis/intraword.py create mode 100644 src/whoosh/analysis/morph.py create mode 100644 src/whoosh/analysis/ngrams.py create mode 100644 src/whoosh/analysis/tokenizers.py 
create mode 100644 src/whoosh/automata/__init__.py create mode 100644 src/whoosh/automata/fsa.py create mode 100644 src/whoosh/automata/glob.py create mode 100644 src/whoosh/automata/lev.py create mode 100644 src/whoosh/automata/nfa.py create mode 100644 src/whoosh/automata/reg.py create mode 100755 src/whoosh/classify.py create mode 100644 src/whoosh/codec/__init__.py create mode 100644 src/whoosh/codec/base.py create mode 100644 src/whoosh/codec/memory.py create mode 100644 src/whoosh/codec/plaintext.py create mode 100644 src/whoosh/codec/whoosh3.py create mode 100644 src/whoosh/collectors.py create mode 100644 src/whoosh/columns.py create mode 100644 src/whoosh/compat.py create mode 100644 src/whoosh/externalsort.py create mode 100644 src/whoosh/fields.py create mode 100644 src/whoosh/filedb/__init__.py create mode 100644 src/whoosh/filedb/compound.py create mode 100644 src/whoosh/filedb/filestore.py create mode 100644 src/whoosh/filedb/filetables.py create mode 100644 src/whoosh/filedb/gae.py create mode 100644 src/whoosh/filedb/structfile.py create mode 100644 src/whoosh/formats.py create mode 100644 src/whoosh/highlight.py create mode 100644 src/whoosh/idsets.py create mode 100644 src/whoosh/index.py create mode 100644 src/whoosh/lang/__init__.py create mode 100644 src/whoosh/lang/dmetaphone.py create mode 100644 src/whoosh/lang/isri.py create mode 100644 src/whoosh/lang/lovins.py create mode 100644 src/whoosh/lang/morph_en.py create mode 100644 src/whoosh/lang/paicehusk.py create mode 100644 src/whoosh/lang/phonetic.py create mode 100755 src/whoosh/lang/porter.py create mode 100644 src/whoosh/lang/porter2.py create mode 100644 src/whoosh/lang/snowball/__init__.py create mode 100644 src/whoosh/lang/snowball/bases.py create mode 100644 src/whoosh/lang/snowball/danish.py create mode 100644 src/whoosh/lang/snowball/dutch.py create mode 100644 src/whoosh/lang/snowball/english.py create mode 100644 src/whoosh/lang/snowball/finnish.py create mode 100644 src/whoosh/lang/snowball/french.py create mode 100644 src/whoosh/lang/snowball/german.py create mode 100644 src/whoosh/lang/snowball/hungarian.py create mode 100644 src/whoosh/lang/snowball/italian.py create mode 100644 src/whoosh/lang/snowball/norwegian.py create mode 100644 src/whoosh/lang/snowball/portugese.py create mode 100644 src/whoosh/lang/snowball/romanian.py create mode 100644 src/whoosh/lang/snowball/russian.py create mode 100644 src/whoosh/lang/snowball/spanish.py create mode 100644 src/whoosh/lang/snowball/swedish.py create mode 100644 src/whoosh/lang/stopwords.py create mode 100644 src/whoosh/lang/wordnet.py create mode 100644 src/whoosh/legacy.py create mode 100644 src/whoosh/matching/__init__.py create mode 100644 src/whoosh/matching/binary.py create mode 100644 src/whoosh/matching/combo.py create mode 100644 src/whoosh/matching/mcore.py create mode 100644 src/whoosh/matching/wrappers.py create mode 100644 src/whoosh/multiproc.py create mode 100644 src/whoosh/qparser/__init__.py create mode 100644 src/whoosh/qparser/common.py create mode 100644 src/whoosh/qparser/dateparse.py create mode 100644 src/whoosh/qparser/default.py create mode 100644 src/whoosh/qparser/plugins.py create mode 100644 src/whoosh/qparser/syntax.py create mode 100644 src/whoosh/qparser/taggers.py create mode 100644 src/whoosh/query/__init__.py create mode 100644 src/whoosh/query/compound.py create mode 100644 src/whoosh/query/nested.py create mode 100644 src/whoosh/query/positional.py create mode 100644 src/whoosh/query/qcolumns.py create mode 100644 
src/whoosh/query/qcore.py create mode 100644 src/whoosh/query/ranges.py create mode 100644 src/whoosh/query/spans.py create mode 100644 src/whoosh/query/terms.py create mode 100644 src/whoosh/query/wrappers.py create mode 100644 src/whoosh/reading.py create mode 100644 src/whoosh/scoring.py create mode 100644 src/whoosh/searching.py create mode 100644 src/whoosh/sorting.py create mode 100644 src/whoosh/spelling.py create mode 100644 src/whoosh/support/__init__.py create mode 100644 src/whoosh/support/base85.py create mode 100644 src/whoosh/support/bench.py create mode 100644 src/whoosh/support/charset.py create mode 100644 src/whoosh/support/levenshtein.py create mode 100644 src/whoosh/support/relativedelta.py create mode 100644 src/whoosh/support/unicode.py create mode 100644 src/whoosh/system.py create mode 100644 src/whoosh/util/__init__.py create mode 100644 src/whoosh/util/cache.py create mode 100644 src/whoosh/util/filelock.py create mode 100644 src/whoosh/util/loading.py create mode 100644 src/whoosh/util/numeric.py create mode 100644 src/whoosh/util/numlists.py create mode 100644 src/whoosh/util/testing.py create mode 100644 src/whoosh/util/text.py create mode 100644 src/whoosh/util/times.py create mode 100644 src/whoosh/util/varints.py create mode 100644 src/whoosh/util/versions.py create mode 100644 src/whoosh/writing.py create mode 100644 tests/test_analysis.py create mode 100644 tests/test_automata.py create mode 100644 tests/test_bits.py create mode 100644 tests/test_classify.py create mode 100644 tests/test_codecs.py create mode 100644 tests/test_collector.py create mode 100644 tests/test_columns.py create mode 100644 tests/test_compound.py create mode 100644 tests/test_dateparse.py create mode 100644 tests/test_fields.py create mode 100644 tests/test_flexible.py create mode 100644 tests/test_highlighting.py create mode 100644 tests/test_indexing.py create mode 100644 tests/test_matching.py create mode 100644 tests/test_misc.py create mode 100644 tests/test_mpwriter.py create mode 100644 tests/test_nested.py create mode 100644 tests/test_parse_plugins.py create mode 100644 tests/test_parsing.py create mode 100644 tests/test_postings.py create mode 100644 tests/test_quality.py create mode 100644 tests/test_queries.py create mode 100644 tests/test_reading.py create mode 100644 tests/test_results.py create mode 100644 tests/test_searching.py create mode 100644 tests/test_sorting.py create mode 100644 tests/test_spans.py create mode 100644 tests/test_spelling.py create mode 100644 tests/test_tables.py create mode 100644 tests/test_vectors.py create mode 100644 tests/test_weightings.py create mode 100644 tests/test_writing.py diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..b026632 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,26 @@ +Copyright 2011 Matt Chaput. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+
+THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are
+those of the authors and should not be interpreted as representing official
+policies, either expressed or implied, of Matt Chaput.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..259e54b
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,8 @@
+include *.txt
+include benchmark/dcvgr10.txt.gz
+include benchmark/reuters21578.txt.gz
+recursive-include tests *.txt *.py
+recursive-include benchmark *.txt *.py
+recursive-include docs *.txt *.py *.rst
+recursive-include files *.txt *.py *.png *.jpg *.svg
+
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..84d0d80
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,88 @@
+Metadata-Version: 1.1
+Name: Whoosh
+Version: 2.7.0
+Summary: Fast, pure-Python full text indexing, search, and spell checking library.
+Home-page: http://bitbucket.org/mchaput/whoosh
+Author: Matt Chaput
+Author-email: matt@whoosh.ca
+License: Two-clause BSD license
+Description: About Whoosh
+        ============
+
+        Whoosh is a fast, featureful full-text indexing and searching library
+        implemented in pure Python. Programmers can use it to easily add search
+        functionality to their applications and websites. Every part of how Whoosh
+        works can be extended or replaced to meet your needs exactly.
+
+        Some of Whoosh's features include:
+
+        * Pythonic API.
+        * Pure-Python. No compilation or binary packages needed, no mysterious crashes.
+        * Fielded indexing and search.
+        * Fast indexing and retrieval -- faster than any other pure-Python, scoring,
+          full-text search solution I know of.
+        * Pluggable scoring algorithm (including BM25F), text analysis, storage,
+          posting format, etc.
+        * Powerful query language.
+        * Pure Python spell-checker (as far as I know, the only one).
+
+        Whoosh might be useful in the following circumstances:
+
+        * Anywhere a pure-Python solution is desirable to avoid having to build/compile
+          native libraries (or force users to build/compile them).
+        * As a research platform (at least for programmers that find Python easier to
+          read and work with than Java ;)
+        * When an easy-to-use Pythonic interface is more important to you than raw
+          speed.
+
+        Whoosh was created and is maintained by Matt Chaput. It was originally created
+        for use in the online help system of Side Effects Software's 3D animation
+        software Houdini. Side Effects Software Inc. graciously agreed to open-source
+        the code.
+
+        This software is licensed under the terms of the simplified BSD (A.K.A. "two
+        clause" or "FreeBSD") license. See LICENSE.txt for information.
+
+        Installing Whoosh
+        =================
+
+        If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install``
+        or ``pip`` to download and install Whoosh automatically::
+
+            $ easy_install Whoosh
+
+            or
+
+            $ pip install Whoosh
+
+        Learning more
+        =============
+
+        * Read the online documentation at http://packages.python.org/Whoosh/
+
+        * Join the Whoosh mailing list at http://groups.google.com/group/whoosh
+
+        * File bug reports and view the Whoosh wiki at
+          http://bitbucket.org/mchaput/whoosh/
+
+        Getting the source
+        ==================
+
+        Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/
+
+        You can check out the latest version of the source code using Mercurial::
+
+            hg clone http://bitbucket.org/mchaput/whoosh
+
+
+Keywords: index search text spell
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 2.5
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Indexing
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..94be5ec
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,68 @@
+About Whoosh
+============
+
+Whoosh is a fast, featureful full-text indexing and searching library
+implemented in pure Python. Programmers can use it to easily add search
+functionality to their applications and websites. Every part of how Whoosh
+works can be extended or replaced to meet your needs exactly.
+
+Some of Whoosh's features include:
+
+* Pythonic API.
+* Pure-Python. No compilation or binary packages needed, no mysterious crashes.
+* Fielded indexing and search.
+* Fast indexing and retrieval -- faster than any other pure-Python, scoring,
+  full-text search solution I know of.
+* Pluggable scoring algorithm (including BM25F), text analysis, storage,
+  posting format, etc.
+* Powerful query language.
+* Pure Python spell-checker (as far as I know, the only one).
+
+Whoosh might be useful in the following circumstances:
+
+* Anywhere a pure-Python solution is desirable to avoid having to build/compile
+  native libraries (or force users to build/compile them).
+* As a research platform (at least for programmers that find Python easier to
+  read and work with than Java ;)
+* When an easy-to-use Pythonic interface is more important to you than raw
+  speed.
+
+Whoosh was created and is maintained by Matt Chaput. It was originally created
+for use in the online help system of Side Effects Software's 3D animation
+software Houdini. Side Effects Software Inc. graciously agreed to open-source
+the code.
+
+This software is licensed under the terms of the simplified BSD (A.K.A. "two
+clause" or "FreeBSD") license. See LICENSE.txt for information.
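To make the feature list above concrete, here is a minimal sketch of fielded indexing and searching with the whoosh.fields, whoosh.index and whoosh.qparser modules shipped in this release; the ``indexdir`` directory name, field names and document text are illustrative only and not part of the package::

    import os
    from whoosh import index
    from whoosh.fields import Schema, TEXT, ID
    from whoosh.qparser import QueryParser

    # A schema declares which fields each document carries and how they are indexed.
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    # Create an on-disk index in an (illustrative) "indexdir" directory.
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)

    # Add a document and commit the write.
    writer = ix.writer()
    writer.add_document(title=u"First document", path=u"/a",
                        content=u"This is the first document we've added!")
    writer.commit()

    # Parse a query against the "content" field and run it.
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(u"first document")
        results = searcher.search(query)
        print(results[0]["title"])

Results come back ranked by the default BM25F weighting mentioned in the feature list; as the README notes, the scoring, analysis and storage components can all be swapped out.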
+
+Installing Whoosh
+=================
+
+If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install``
+or ``pip`` to download and install Whoosh automatically::
+
+    $ easy_install Whoosh
+
+    or
+
+    $ pip install Whoosh
+
+Learning more
+=============
+
+* Read the online documentation at http://packages.python.org/Whoosh/
+
+* Join the Whoosh mailing list at http://groups.google.com/group/whoosh
+
+* File bug reports and view the Whoosh wiki at
+  http://bitbucket.org/mchaput/whoosh/
+
+Getting the source
+==================
+
+Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/
+
+You can check out the latest version of the source code using Mercurial::
+
+    hg clone http://bitbucket.org/mchaput/whoosh
+
diff --git a/benchmark/dcvgr10.txt.gz b/benchmark/dcvgr10.txt.gz
new file mode 100644
index 0000000000000000000000000000000000000000..e0e2877df16338eaa0a41738db0f06e58bff148b
GIT binary patch
literal 201819
[base85-encoded binary data for benchmark/dcvgr10.txt.gz omitted]
zO%}O$u;rSCD0w$cA=pCm`WgufT+CRN>d<)~z(V5ZycmUo#KII_=9w`I&wZ28PsLxZ}X7mU}{$s>z}3uvbcD=6$ij&NQIQZ^YA- zRgssSeh&RtT6U%(6ViOW1{|RBv07O$@>+&+8ptb&kC_czKhiyGib%>*`N&+(dQqX^ zRyI-Ne$eB-I-?TyKmYZA4xGwMHk?$!?66bR&XT7@aUtJPpJLUuLG3#<=AQaGj>vKE zGH(v&5c3n_YVt|KBVT?BlLy|$v2U&{><7vWkj4NxM`1k>ZJH~6FnB?Der^X^ zaKX>LzOc-Z6+e&3wGS>eU(qM-uNzB8+6vHSs-v1)MDC1d`sn>BjF#HjbfSGLjPIVr zG^A&bR{PHo;#D#>UX-qJ$iM&ZmZh$l7-FT4>hsfxl_I@HstJvMH2FR~+tlz%AQiNP z>Ze>kzWgq1kOuAf>W8ppm=sg@dRTXKc!yxJQR(UFbx!tlW*MT1Q7l4<5d^!)3jA=fi_jP8( z9Sb8Uj^Hsv>ht7vCr)ElnPX2$pdVq^aa~(rm$OZ@BD)!9m$qu4qrdg+KOVQj^V;L9 z6kSsHKMKkyD2KHv@rHrGlKa)SoL;svkf9C~XX5O3b~gcKg*bPkxf$~|NSr47^RZfE zLqT|4$u^WbXC4~$(^XfHTNFKm%0i^PAsfGjHDdEEva_|EgfZG=*scPPxjfCQyLI?a ziISG1>2x;NZRu01y-pA@FYz0)^ve=``f>Ja1|gFw)$|-eL)?e_kmqp#y=N!RZ5T8} z^n6bcfhXiOy-?E7%m?x9WL0F>ZdPp4L%8mDlLZ%(0Ye&MRRy!Lc{YTl4Do>j4YE_I zp%lZbppnw|(D=9HhFQ|MxkfU9G^nkk(n*cHM)L)}C&#NO>Cg1W3ShEgs&|VUvSP*5 zAyO~!E0mNFG#(`}wgjC6^@Ok?)Lh{gTC409ZcMf@K4op=>ayYRb43f>8Xfx=m6&G; z*r4gpEmVTx0Wysm01$di0-TaR=9xNJ8v;G@;1O?7}5m(g!5I}*So=$hH@oz-XX%g=3%%r9{c2} zol*xaw6U4#u}u`o1f*S%)%Ov;PJYjHgHz0FV_FWOlDM`9wAbL-7SveIdWNQBN+F%X9Rcx}@41=Av3Hm&|96cN7 z>+7)rlIo4en^e*4_4y9f`9<%|l%e&a9nt7GXGEFgq3rI4;2NRVoq8) zYLG8-!JRTYg(|X4i$EX8`J(C9YDHqNUhga(eQbzC(BN4LpKn=4k-^pz&N${WLTi0c z%H$o|AQue@KLcdthKq?>Ax*W0LSS1#T3w!591ZplV3Pf$MF1Jt(DXuSDDtqFC^i}C zTsE3iEdh6g+MqfU@j}VnWbr6Qy-#NcpYXb`xUF5((A?2MB>%KA1k(0m9)n@FyI^D{tcJ{~b!}ppxo1W}{*RHR1hQ%}2uOH8NNVQ|F}G{r)U!u4I>hvjdB78rhsUcHoKeWCWn z?49kOD2fiWMmjJ#?Kb8#Va&Kx$FI-ba7C8_#F)nsxlUcxkh=J!FPTw@z4wEd@n^ z$yma~1m(+m0L>ENN4QdQKbp|llo{1)7*Xo+tairq`S`cV_$`^;GpYJGPHrHnnB2Zj z=D;$+oA)a|LqBce^jHSHCV!)qA}~i~)<7AQa0lWH#0!?yq7FZ_l^Uk!2Oevk${{lkPBW`2EC5%s*LawRooEYws7`|8N=qXV?#{%!B`I$1DO zxly$*#a?RjE+~aEOMozGQEN^?;mWDbL2IdJq`Aui=#sfP^P_7Ou|fBZg9#g?2+=QJ zo=sbe4}3_Kj2pUfB$L*@0V}nL$pePp=x&(t%R@Tr_37z5>h}a3XU+Z`8iChv63{Gz z)=ey=HkgORtD{Cc-yIbME@!M7o~=THKrT4#1N&?F!BjEXBAA-y(vA(!^b)ruaYc^! 
zLPfsL)e=^8!Z+zh-X>n*Ve#LbMr_4dBYV!5iTe18?2>VRVv$&$@_7A4Ug<$1K5+vO zL+-8KqF$&fjww1-^>c`wF}@AfB(8y~C8g28y8{4mrK5}DHrm3JJU2(r)4C@sLkKK8 z(puXQFY%C`-a8G#{%0w^ZiQO|z8i#xHf%HNmu?nZL)Mb2;e}i!t}jIeuq=4fKRwIN-ODKU_$22)ELW%Q3#?{E19Q*FI=43 zfO_U29wA(8OhF-?Y_!EqrBk%09;}5Zj*u~^05L$$zkBNe4DuR1F@a(fbJg3pdhrQa zJ|luXVRKXySg$x8;8_re8EBg2p+7ifemUFdu;Al-{Q|QGX49PfYL@8+Fg>so5$I77 zpHR1sW{cmK9S%s#`>bd^8%ZDv((Fh|X2tf}6V7&*=3Os-fc}|$Y3qoYNfg-In zq97?MzLYgr)5*N}K!HDIM*YXLv{gaI=R2+@z|LGzOLwDp({V<3xjsA zQ7_elm};}>43hN~w@7mf4ivIec^&Q5qY{=xS34Q@CNQbL-hLy;+cG-3L2DimZS4Sq z!MKuJtl#CM>YUqGCnQ4#Od0jl%`#g~nemuf5$H3kFvSKm0i|1S@agvW%ObDqbUz4U18m1afj7j!ymHFxvT_)n`-|p_l0Sc;!>acT zTrw#_D2uDD?yH$18d4*cY4$x9B*>Qo_vM2O1#spYRX^S;A99wFbQb4GDJ3R3H%#%S z^Ta+_OKF`QUkfhWj@sa!-F$`gtW%J7(77;<9TlUh_{Zcg=si*lxAB9!D3~)G4MC8$ zF@8yM+_st#0*RY2E{b#s>#^0!U`yKEJ~a6@m6VofAo~@JNiICPq1-te@K~%hVolo(n>rIFBF`Q z>uOiz$~Xj2-ugYZ5x!1eR2f2pO+fbIg%orHQZ@|EF}2c2*wq|1p}HxMR+1N&_t~8j zvIuy9Y}EngY$9SyFy^Gf-yx~Jq;@D$J3FV|gfJdz z=apIuG!8^*5L1g(2MHhNV}m4t(SdX3-0B`(Vxm{Q{t8aG_!g(3LYdA)WGU<*Z2CnP}XKmYST0zI+f zdmGZBLa18P1IC-^WF=Kz44Zd#3f5_(SpX**v*uBRmrVR&<9regahT?_0J&dOJI(JiP6!PDbP7Z5S2orH z4Sh0e=tpQ|YZ!GzkX9X5aweTWmYGFbdO&(Mwh;vdF0$pST{%0SNao4ep4g0W*uZU3 zwD}1Htlq5$C^Y<4P&J5IVvKIHoLphmHu{91Ff+OV>~NShxl8h*pY&oR7muo1K2lF0 z`0|zeg~-^O;AvFmk&;Y+z*-(&#V=?{O}!WeRoSM`}rE#)&447?x`)m@?#vl??p@qs%MWpAf(# zPkH8iWv?$m=4Cz}Evc{}ldfZ1i&~Z*kMZlvaPf%N@*bbU>&wjy?dpo0ebG6J8Vk~A zUcIWS_e(lbWktvQ97NFqC75Ae2~DC;XITG66;Zk)k#5)F`%IGb0AAD`a439^$=x8i znNFVX68?i!7FG}Hi_djCaQg!fXTJB9nD}FEvByq{;0x+WX?)U~X$&5$!xmT2yjB0E z`Yfu`+tC|Yci>noSbu?s{kgQaWqq2&zg9Z+wtW?tNEG*KM+b50kU-l*;0m@{OCHDo zDHB&`lm;hvk*^H+q+lt=$F?6K$G&p)3@b&*Oz!HHo4Or~q*QKCX@NEwA4PozoMp}X zc-Jd)og00Mp8iH^kEImhy@7GHDCSlGnIjW~RUO6KUWtg0lIIs))YnGBa0U zFH@2lJg|elbxGv0sZLFxI+fe5qADmf=eL9Kg8?L~_Y1#qK2AnM?Sk5z;VTpn1S?Y8 z*n6Fq4#sGx*dE%p>pZG3z0QFSyA@aA7|NMirO=&G9;0Xw#oJbgW@Hs6zIddk*Sm+Y z5`CX7I@8|Lmh#@1H+ zCbXf)Mx+iHX)LYGM2`J}__}a4OT>e>BHv^dVksUa5X4bHPR!ZjcD4lW)mlID(uCbG z`%zx?{~B1q-jZem&B7%$2yOvz(d91=XR3`v4i!b2!|EQ)8Y{w8Mc+&C)dYQt#ky(> z<-fLg&uTXtPs)==0T5Q#3`vvr%${V^wshU?NHWiLC3#so$Uyr<{h=NyY^gC+~i;NDthc@isDoUdXUJb=w?PO_nw zU=s^m0BFH81Aay-W1?5_cFUp3h!!@s_Rw<2*`+`TSL)zDKoT$jqGaOl;w zwp~cD$sEm!N{e#?6;8|HS!4y4MjgUo8y5eHO73$zGI2CE7FRI?#t>wP++E=4qn$B~ zyp#D+@NLg*SrS6F*q7h2O)%VX_B5mjiL9R}*wPx`RX~#VxflnYc?E=Zo95V6*&Z1( zRly6St_5qLTqZ6tq6DK&0q4c(X4~oq#g#T@*juCVRAN*EAdu47TaS2@a;-BAqg*QK)XTZfp=`_}po2I1c_3qSp(LFn6S&()*0nd~lwGGZ-1UCT7NTcWNNt>Gj( zmJzTo&nB3tCLw!FDFcmN-iGuOj$XE4?a_?)jL)lgEdX+}Rx|W-1ojdn@k6yj#fQz8 zwVmzqy8XA-D?`-682+q#{*pjGxkZb-Vht>f-39(L^_jG39RyNrzC#qOe97jt(T9KG^#n_ z^B1sv$}?`nUb?;vYqpHAn2HO>s!om{9F9aILBB;i91qKhZU+S@KyZgAS=SOhW+kk- zXz_eqF24_{WVp1CebL5KJE6JZ9M4ui;%X(#-=|3PPU4Z+3XPVJ^N^qvmRth4mbL8` z3c*97G-oAE595>FjkJOy2I)vOmrc4NjVRdMjj7jR@aX+n@u5I9wwHgM#OrchHfngc zp+(25?qu9h8^FozT8BMN-MbM{1*giOu*ot%CI_kor2vtFbQu3`!x0yQdpEkfn+A<{ zJ-cgkR&7!MEUxu~9*pS2$mBjxZVRya3rrfrqL;b=h{ zF}nPal?U0n2D%q3_Q1O6^jeeo1+}oHjz7`Rg-i`bPKhcykg6|S>X7PPwf5waNtz-b zD!j;F^tyh}#dod%-SK}1IP$Dfjpj^LWRf@BpdrhkxkbKh9i2e5DyU8JKqrEFUB$9T zrR-@D<`FVwuIj7;1IJbke0OlMEq%6clUs$Z&j6;cxk41so0p}p6ebq6=&cQ1$bQ2! 
z2r7~0vY;ur0;n!fvLe-Rl_lo1z{VCjRhm@bo;sOBS7t+vF z0ks#ZPUv>?PbwQdnjTe>Tsu05V*a3o1=@RHm~_lIK8{3o#W~1U;Z3UbI&FfhD&t^m z#zhAupi03pL=9ObQ{~XEV(WgWFCYw+d2r-6!lEjQ)E#>|ESdP3@~&3vGv|Wz%+U^o z;*r(<0e2u6DBvG$34u0=OYD@MFK%Ej0&?cp7xDR@XUj*G=jp?o!$DYc3yFtnAAx5W zVpLhaG|ov3I~;TG|tQ~jXt2Fqhtx!9oS6=y;_AY1@gY3mTe81t8~}9 z*~@%n1Zkx)5TW@$>5G~5QfxAoa@vDQ&2KFz>0#yQDmJ0b$P@nD^+pc*?de=ktcU=N zKUuq}K-sH*iGi2r*nS~q3~{SWIwH$HpBV!5a4K?b)hT@6hiSiYOP{?lA~X;}DQDe< zj5@K>5zI@)Ort}^C0}IG^id)QMe@pU4H2{rJXrDqz@2>Q3F+Q`A|Am{GXLh>H$O9x z%8!6XS`oQNuBIE8%TYKqX%He=6k`FyY*g$LouAriLAp>iRH0k&9PTwXl5R1E;)BX) z)TQQMSH{C{#4Bh=XjRo+S4=IBV%goQ2$L}?x}lYv>8s*=&E%ecs*8K+6(r!~A|Dvj z4XnOjCunEBm-vQCcdx-5w%qU{O5wBg4wFwqD!+nS4H?rOa)`_vjcW*~EJ2)(d{BFe zpgF7UFB_7)Gx>1Q&WvU?=E-{76UaCcC8jDp8l3>yg&u6`9H;MCtHcgbJSBw$r|7>a>G4IyqBHq$J<>RJ8gZ2cZe z@(Ta86dqG=M!1BwM1H$3gR2 znj;-xJj$Sl8r&+OU_YZ3q3sCb6$aob<;L6UiXejL)RTrv)C}$_t7AjWIn&%Qb4U1J zlL&2962z~IGxb|`*hj42Ig;(*LP3SzU6z{p1?{Z_5wi@W>@R;3JGn_LQ;7YBP5^GE z^Ih{NfA)emaX0(C$gGnjAhtlcD&GL zUd3C%X!UTWrtnS6tTTfGAF9WD=2JNL7W%%6O}`nD3%MOeanmup_F;HQSJfPlyLWX0X+?Z#S1@>M_8K(zh3PT5K5$;$nwM!`*b zT3uTN>!jc&`xj5q<2_Wd`;3TV!1<2lD^e9PDKyPlIHUjF=zB5+Zw!8FaX#Q=C*i^- zmY39bXueqNMVdj_QsY!<&!$w{glCw3PoBrWjpyQt<6%@0klozmSPwd)jA_=boiH#a@pZ@{O~ZoAyRuZAN#H6vKxK}r(vyJ-xnRJJ#@lB1HULC z&S#8ah+l8{=*qE|^0QXoha)V*s?c(?)=_hbo^N5=#x7R-^&iR7@@^)Qt|8bcd78Zp z7t4vY#Z^j!pcJtK9sYE!vUD0`C=K-AJJ7cv*Cn6J8ir=e+0*y0!=W{iSZWciB;E$+ z0VJa4K-p(cHhvjuhdaVwR-}eUcIMJhFonQo1%^Pn&FKSO(UUs{NGP2`1|HSPp`1WW zRz^vGH@;u?LV^;RoX=6f_+k94rKr)vIEwgwO`h|Et-T#|{Me2e;vhZbPSze3fbKe! zTn$;KzeVR(iBi84WWMTtfVLx}!Y}l1j+qloE1xb=ljleWxwn22<@j$5>e@93p(6YydiG&tC9aI{oL>|T~r=M(3BH880qcDYzX zB;-z&8gM6Uhv`2_1BgQZLUM17YvK5rMCB1_ z^44-+*1=F#Iz6wvgA!~Xx&S(?SLVi6h&S3Ubu|(<4LE(!Q6z^g#Tn5|guf5}KEgIZbgG`btsb+?8g`oq6 zWa7FTw&0j$)=ud+RRK8Dwmuw8aadxS~K_YlPP?0k<0E zc7>DZa^pwi3C*7V_)cZCD#9hOGHHH_La(~s8(~J1t!bi-89-fNwMQnGtjKp4tagXj zW-m}80B5?`jW>*9??TsGgTbV4ea3`tdfEmQ{C{lb0haBk9T20pne*^!gB7z| zK_FNM^*V7(oyNb~d**C6aBB)S8R|2wnKmQ~2cC|}2i2_snK+>>U*;WP-%gAuVdI~^ zDCcOj7!1X7@1@qiK?AgWcX1I#on4-B;QPFLkcL&2Js(;5*K?$X;?7&YX7{K4h~6sX#+rd)hY?Y;r3TWnxDX2)Y#icMOy z)`YUED?n?nur{@V*D&RE(SpEEkW4FDZC@8r=EFWd4iciX8Tf&2QG|*=h=aAfeRK>J z|8h5<{MJWO1tR9f^XrWsb8+Fj(X051IHM;wufT&KkDD^k6-%zXY3#zVH5Hodd7x*( z`xTB2JA|{S*RA9x7ahyns(5dH`B?Iw(A5RzO{F{TD8+nNeb^O~$Fx2H_^3bk<^?`< z;0g7ozzFjxL5!-WjIgMTo^+G1b?4FdD+;@;1UFy=mb+!eM`grMN*f{vUHeLL*qzn! zBDp4e8{v$`CbT9}a2oPheYSnN%G)#){-Z`cMK9ngR7RQR9Ex>+M%>LuZ~Fa1P-m$5 z`f`9@QHC2yOu3c!$v5f6uMe*(TGMp(M|Q5@U&+3hm%UOWo>&{v-4U$34(%G*dCd+I z-+j=NB4HtEUiEczQ}`2Spt_jYlo$9-A|O(XWJmOIyAx?^WL)kTOvVP@^0?z|TzP;BNf77>^Pegz4Bd^Aw4T{3;OJ;LKza%Qry{p`;Y(pcfHSKG)-QnqvhBs^Vf>Gf{9i8vjG>vQE4fR zQ6O)$mZC0JNDl_05=3oWSlWLk%U$aWkf&3UMYf}@^YZTic-u>Jr4`Mt@B zd;I*HUYkVavUY!c<$w5VSBMSy^hd^WGS1))u0Q`KtD(-?#u_^6fJKL=FRSG8=p71U zjf+V&3cm#f3Q~JiNIxB{-*Gn!yHbB=gqX-*T2551PmapYam2E3^>OlMwYUL4mwz&t$QT3b`9C;P{+u%mBRMjm?2mD zZ)Id&x(P*x32=Oxs6i*7ru4IIo&NtA`?u!Cu{28*d>^y<5A5^enPsF~*RejQyIl`y zltf7kr$iM=g_m+{9zYT#;Q|2`E|N2T{p_{Y_iccRNSo7Eo#B#j$Hv~@WnJ2BuLZHC zW|F{nF;6^F;v`J`S3T_zX;u`FT3KMtKQs-=$RQBiAAQwW$P;O}2-$ zy7XCt{9)bU-F9I#C6NVdM|~YUj$;$|Pg~s1zYhLCcix0`c01HpSF8DO@LyjC;ejXd zPPZv%ohuW}Vhl@XHgpAU$FtdMw~x{}tP$uVVxT7Z*Z(gG*lCf~BT|^RNs+dlGA!*= zkC-aYgXb{D%E2lPLDSd4HMnYLzv}5dExd0<{@i91UTTIh_K5LkazYXx#-6OM`tfZ- z3PZX@mDvv9hX<00oPUHVMi{>8*y`l-p^MeY2aCa7`xGAXM70B=b~fkNyIfWj1|kSk06_FYpsv%s zSrnB7bc8(-OtIqesZ9Al|L6a%yETOlAbz#DhXbbxdLSiRX?a>BnHiZ3GB5q?$v7vu3Q#aFYBrZc{L?at3z}NJ*u6xB3c}#oBSCih*odc}Sr<@Z@6iYzc@LV>H zNJ7Esf3NgX=J%*0u?sn~WD|8YpP4iBEod=7ZZg9(>5EumaR? z{I3MNkWe<~Xxq#7T9muQzLVF%&BJ24TunulIzo4#DX${WxyxW3$VktYFa$=1*o|r! 
z2^DyJ;511!T30Pl{c)jptWc5es}i7}a;wU$ue+qWFXr>dkg!VN69ivz)l9(?+sn|y zONT%qxk)pVP?L!>sR5fk*C}TX$#Q8B(2ux1X#tn;b}YK%fbKOt^kSLGdG(46vikMb zGETthYF~Cx3`4A!QCmQz1SqN)9;^qCnm*enbFvQDN4BT>_YrT8FZGcNYX~bohKT#2 zJ#JR2BG-1FL^PzOz{e#F_e8FY8+Ef|KMHYMnWITR)Srw9xr@!-SPVKl9q!1$Ib5o@ zgwap2Li-`{(j`F?J|Xg1*u@UmRDqlT!mwy#FD5QqRc;K}r=@m3mze{4~56uzHS13xp;Y6&G74a70PLs7!_h(fr63671b6o%$;Bku{pPEav}&bv-5>O(tB~&20YCp^=A7xa+ft zTR26Ol@RDqoJ>i@+P}B}P`H{u@kL|Z&&zqH_KTqgYYedmPf?7RM^O`AAtH*nm*(g? zZ^Nj+qE=~F8#80km^SdZZ}6(Mgwu~~A}pD4JdRn>@M%mn<>Po3 zhURMZYF4^+k{{7Q*mswbqcE5Jxy@--fZhiZ@5VuMyPcnb>b;Yt%5Nc|JGRORv3s^j zJ5PsjRRHmk`_^1A+@x;nY`8=#f*n5uT`EdZY}-$NpV(UPE~FBq zMH&C1hZ+B5Gl5tPk(XDhDQ{Tc4XWkRtmbAK?hpr||0Fdb;-c8lr!VrHs*piB%dggl zSPX2e(pB@!g&#l=DV?2deGwW#H1dli%}eXycDx_UNzT6-)wV@e;f4Et)7$t&3}J03 z=1&@RSA7s@Y-hF&*!n{k`=P3v!<{lR`FkA_L7mBvP`rqhM6-Krmx70*OPia9j<1ea zj(TQe1YxoWrPIz%I~F7Cca~UlGmmIdnIOP|&q4q?<@({f{^r82R^JZaXjz1$CYM@y z1##!XDT6zxZd-?To#0(DP=exQm`cvISL;vI`T|N}2}6KS*NY?lmVSKw;^l?;V9u9$ z!#S?*$fd_L5r7=H1B>5lVjrRAM|_k3U1;u@ehhs6j3{qoZfil6a7Yn!Yi(h6#B61o z8(2KdqOd3K2HX64`ZQQf?}6vH0FtJ27~))X4@t;}-Bt z81P=yXc$z*))13L?ob$GaS#%I9Va7h^ub8|JTI9>G4UlW8t zV4#kLB$g;CI0rIjB2U9%^xEibIl^3gu6qqw%tF!n>aS;_8(6r793g+}%bnZEGQ3{Z z9)~9K7rxaH031_m;dmjyfmuYPMTnq-bcyAW6_Kqsl?6Y(}H{f@ua^X3thsoDsDmw^&cx;vj;$nA2{Rh_>@r^$pQgZAY0t=sa~CCh0Z()*XYJH`mNkd(q) zl)5mgIT7RKZ_9mT>L%MT{cC2~p%74~aWvH;iQ^USQkxp86ozY(8Ietm4_n}CA!Ht} zWeNc6?fm6>W;{Hjp@OLq;{B-P7cw83@}O^BoG(gZ;`@cf4aj?5waZu?+M=<)nzflL zI!0sAHDqU`IM$IpjCH3JzQP-jzr2o}sa(AgUbuEkZyp8SL z`F#|Lg_alU5MlNaB_+{<{qXiE@7 zDPalq5wPO0)mQGV+p^s&XxT+I7N#YQ=^~V5Cm&3jMapq)s{I)y;j$9K#|5~DD+C8Y zp0E^1_F~@;qd!acrs%$HlC=4EbS4fNOIQ<9^U9NSqQ23s*fq2+5@u|1-=OBQSDY!C_FuoL_@azSHeWmtk2$8W8x&0^%lfC zb)E|Y6l0|uakRTW#z_SAo?6B`R3#xM2#H!&rQ1)7kyukgdw2hmRqNxu55{BeNO$S(V(&k$&>wIkw9 zQG=;^L}G`ru+IogP@a?Hn3e?~3n{unW7Z5~Ii@a?Qx}`qjXX}DCdemdAw8eTun#Do zh8cWR`Ri@WPF!bkBfX_#BA_Mp+#zXDxJ=`L(4|=Z4s$#D%b4rQxr7`n#<_^sTd?j@ zw>)uD>HP6=vQX-}rp3ucf!SRY!diyziHpQZ7@^_RtC?b)@6moVDK~iyv(|uW_$XXa zg}U3SMOD#0VOWnvh}aoN25JNb6sl~t$Mr?d^E43%CH%t@v<0(ESLW^! 
zqB0yh;7-9sqWPAJuwk)PPq=Xjcg6Z(=ztMF!+iygU58qx%uaTsMp6+1`>4n6Kym6B z0?LI7n-~^x49;&=`K;0F?*w+Mu(*}4nvVF?%1@wG|5Lp8|8#VEu+8B#PysE)+tmnH<_NVLwbs=zR!Ydv7 zGcX?}4ToR-4pNdo_unTY2s(0bc2aItx|eu^`LJlJ-epZ4tuch|rqx)l ztKVN6j2N3WY)%MXi|mhxuF0>jgCF75LF|#h>wj1@`kr#t$NP9P>xnQm0MQ~O;9EB^ zU+!n)C1;>BfCl?KafB2eV0unH3~D5s;@bNtNic$B4Hh8_ii!RZ>Fb3kBB_ux3conY zO%Jy}o_{Ut(xc z7}q(9$kf65d$KE#BlAXh;H@x`S^SMaC{^gIn}_)`TLCT+(KwDVZo2r_tlTjCZYD1t z{#`O<#jf$cGRDANCHk4bA2aeZ#AEJ5862`x@pQ0{Ijb2P8 zH8i7CuhaU(W|%iwx5-{mhL3+SrAHM!>o)xwep0T^dAuTC(|pdB{&Z!N9ajRAy02NX zSp#o#h$7yY6grvw3&mGF;u5rykii@4y_?ib40oG3gAlrBV4}b{j7K`CJ?5tMPO)6kem55AUd_V;_!3^)yVV91bGt893aLnX1XP$Y!;#m?)@$xd}yWW@sa9^6AFtq)?;P_8g00~ zI1Q2;Dmf~)Z_sqh(p`5%pHsZ%BZ&?aY({R(48wNDRZU2t*xo1?>>BcUN2Zoudfxy14AYwgz#}z zCUNbgOQB-)sUq=3f(v8~lj(ry`eN;#Qh>e(G!THfMtwe^)-wjh27jUzG?)7+k%_}ix zg~Sh|ExLj#rqwCvu}W)lf@0iT0>497Y2RJ7A<-#yJI;?K?>d5T+{*lM;`mQ};#v(q;|z*Ubsfi-T24TE9#$7%j+?uxUU>M&u-bvX(~K7WegHugsB=<*wih4 zdU}bP9X=`B{8dJfxmY;%f*6x^fFKbr%Mw!ndCQ=N>w`wJ3nmo&!H&3-znc$8Kx{SI zWXWjFVHC0X9Pxu+JC9`btlgon+-+^Zb%j3By}Rvx7dE`RdamG zFlQ75=nDu!UORD;ANVIj6pcEHOdB6L3ZQ>Bg1&`$@BafqTu1+|O%2g!Xer`mm1Wks zX##}BC;5aTPeP30kQay#AC-WYaw^bO9+^?5qD25^c%eIRxUI*9n z>nJ@rpfg>QriN9w1_j1YPt!6zX8iE3@Dkt{Ut&-8`?Xt!Da;@AAF6lViE5M zi_+O^9Q=-i5rc8&Plmf1_GS{i0sl~0FJfo97ANY~-#}1^1=LN)$>#z&t@nqlVa~DHQ0f4?qnk@RegMD>00GZxgs){Dh=0FN6weYGfd{KDNr)Q4tnkgAsUNU>n7EM$U8XvY9FGFm&YsV?uhXF;z1Uq9J<~>wDil;&>txn&$l2tcWgaLLV?wvT+}?(ToVWPn zVY&>fI-^7VI(#H@%8L&5d8ZZL$^!h7PCzt9eQfu8j0rO$bkQeRP(^vE81cXk)rt?{ z-P9yQCF)yA88w2a#ND` zsfB__l-@%j=OT<(iHI@oCQdHAQPEE#se@?}=?s{6q_oyKqUY&{&qZ$4)#W=cuxz;z z3d(+CfSBHsKM{S`V=%~CtiK4ELvbgoFSgYWe283#hAwKBQj6wB4ggv#{a?{5hx*%hH3>t+G{6to zP-NY}Ja3r?W?=Y8N&^stz`IvdR5MH+WG z1l8vXR7L{$!gZ_B-+`ERpEysCOm%|{LLb@raZ2#LjVy) z?L-xswHvDtnTAHGoX(}5BZMZj9w_5wM|!*-j2Dk_jZeHa z8~!l5o4-8W5^xbNJgmj|7%11HiV8znS2V_f?q@Hnl@KgGs%W6t#K;X>{Xjts+r)5l z-L6aj>h@Kc#X~+yk=qJVYZ?)b(a}DF%hOVE!(y6|Hi|kKIa=gSwk%Ckuy-JL(&6W; z;#b-gzrtjA7b0?@ljcBg-EJ6X78<+b5dQ}zB$b9ByU$1ZJd?k;+}Fb2Aa$3J zqiD$Dr?P@r46YH1T7rH?o6&yTl%audPDZKiQ+HO|iP<4y#&s;o&X|3j-=B}`$7f<7 zKcdPu4j7&_XiT1t!v7Z``!#B#SgVN)2FbG$6I~m`}K&+K*L(#`eP;n16He1McsVOt| zuV;%g@Kz3?gaGSNB0{^cRtIe3Y4T4GCA(|52R8+KUt9MsB;jf7BRKlMPxTIS)rWPL z*I8wH%PrMF#jL;|)>vOI(U{dJ$Vap75&ITAVv+{VBI!SZ|RkmhhVzrwVE z16dp1O--DM+PE(q9g$3#%LElfk|Ki8Pz+Fd|8Y2*E|2v-A^n5$^FKOrYqyP{L}5+{ zx5bug}t~Qr&!2Jd!&iB-jaf5+rIyu*Z#QTF(WAq=`MCDCKzOZ=xTT01WJXMs z!ngi}tQf39l>e2J>>zvhPge>LV+UvpbNAr_m-%~K<(?+@?k$LYVT(!m`1LrS0jD-*?^e0i>8dFko z&MDqs0+e#h*wgkr3k4etQ`O#@>{1w8yVhSM(LL-8sH7W}9(z>Cusk&?HR7RLfj2CQ ztEIn}r@4wA8#2>gdTpo|@vlM77HX^d_7_FV`C$G`7;5;>Q|vgPSfmpy9>$N54JL|& z)tI*;#k$7uCqtw7D0j$b7fsjt|CC-O3n2P(@Nx+YPl%L9N6yOysBWwq` zET%yh`nx34RtXtNvX7EfyY}eJG|V@fjUBv`td!0s5OCLv$>g6Cj0Gx^GIY6;>ye_d zczuuhL@2mU!UV5wmxv}^+j%2Y=OsjqgPZx|-F!hec|=?H<<_X{1*O{S^kQ4s9j$;- zVuzY_Z2|~#&XJXU1&NkdPb$MI*gwkMWW0>WFiyV|Krz_?cp+?45eami&WAn+KH!>klRam2lDCH=n%`MM+_7+^@JnVorSLZOg{A)5k+5H(uQG zAot_PNt}-kO8?Z`2y+nLmCrwdi^xIjUrt3=Ts5USKyd@Sx{!?~Z>Bzv!LofvV~VV^ zm@k9m$tBkMjZsBZ1U9c(kbLiId0bRG?m$w=Wi&34(u(a09#BBx1DK2#PZOz9__vi6 ze0S_ICubG!@C2pRv`(t1HV}4RJq)hLD<($wC^6aZmmx2HjPNNmu}DA!R0_NnO0AV7Tolh3Y6If@3 zw=0o%P6>v2X7nKR_MRVt`m{fj4+FMZtlm-b$v_VOZnx3ekuI2`>q4@y(}LkFE8+~j zk`s+epPnZ)G{OOXZ89{Jfk`1%!J8BB_<`Fcdz_E^5U^l*>@PG7tp)HSoE({TLSc&G8 zPGmFaTuLGgbbc@SKGzv5MD<>7DHI8mic|r%Z+#dP=i}+%Ub&(ueX}XjRGl#mRohMp zD};(+(ob@d8GH2qy4-|u2!m=7x%$`ip8fHa7i<)`)VaHIth&s6w)H%Uy?V6(nEKWk6 zSNTn+5=pM{S9jA=NDE<99p>pCvZ#aWm&N@HeG#9DL$^OJr}s}#_%xb_nF|e~271o2 zQWbe>8l)S|K8KPYj+kla_s0B_v5`*kA1@SPS%iH0%Gen8RRLFHEa9*4<;q2Elt$dH 
z)wAs23`BZuMWOF;#hnfbhEL>x!Wq;WJ;yLj8CyI8!Xmtpx_;Q>s~%g3Qck993HnE( zI3NTYlm8sdjK~(I)ZNu!5-#JHyJU4it5T*8CV3$JpQ`!!+1D)jB|s;IWx4JY%hgms zt}Q5Az#B;n)mL1dl&!@N>|P(tG?H$$T~o1%e&|Q}eKM#K^FI zmy(0Bdgb&Fvn~sZL7hZ4nY%$K z%@?6)=(GyRYG|t;d?n$@hB++#Ad{2s<{6ZkElo7WZTRz$V^lI-$#ISH{y{9pqHi!D zj-5ps!mI4FO7w@xe@<>*z?&T03LH%$8_FQ|=a7T3hW#C!bdsF%mdl#2Yx@=zc(hm* zc=t>W?1Jk>qeq^}a`( ztmth~z(lUFVt~kb=oB=qN#ca#jh6F;eF`C2?f{zo*dIU~ zMOGd^M=nOD+j1-nq$+X$up#UfW2b+IL@-Z;b1Qi{cjFu6Hs7iCcb)E2nXftxQvRul zDqK`XGTFQ^*fI%(CB)a#_vXpAr9X+V)89TbTss0$XV!6P=pm98=BtVx&0C*(9o-r) zT=YHgz9_9H@br!Y2c<*KvvOXM7fVr6xvj4)lxL^NHhRxQG9)>n{6tQ;8*{D2+acR< zmnf?ApSPvf!OAJ;jB)SA_dOi!U1EVf;1gE@T}4%oD<}O8c%1!Z;RQ-OiZ~+c31}Q9 zVz})ngBPNfT5uy%C-KTGtN&rL=FUSW8he_!V*+q=JvM|g056gN%T;}D)6hsy_C&ofk*p*_3SMJyrB3;6n|<*Fi|n!B+phRFSn*nNW5_sdr!n* za4g`=o+e^AWEOcHtx0z`o~|D5UX1nO&PqvG(!t;Alh7>A{ph0kEeH zGk558&mf4=7Gak>b=PP}{dt2r8fC7lkbFfk$8aesr3z?Xx9f&&$+RREt31Xo${hgE z9)W7t?~_L=(~i{wNvmC&HLNKhMIri`RvD0*UZtHeL)1RL9fY}I40pN}f3&aOCrT>fA@rec zL}Q10oX%bcR)y>E;}Tb@q^gPeONavBM>w|oQVb+!??6J=$1o^9i6ja%5%GNM_H11H zbi0+*lA8#hlKkj;uGHu&or%6p+O>y*2L&yS*@V8szqzgZihbewgy9MQxSla8!vn>~ zaU`9GV3I4I+4-0IynO%6t_Zu|FUCt%d+P(zKE9c6@)4$?8{q%$eqg*1=7*}7M*JGj zy$`GZDY`;m*&%U17g8)yStVYJy(GXY<`8_Fr5{_38_y4t@W%3N&mvWg^#ye%4T_t-9vTfAcZYbEAtYS`2sdW_sMTBQNx{UATPh_%4F~xUsm`4KZ z^rK47LUKf*zFxoQIjeP$or{y>|1FuUCe5JJEN35!%nZz{M??qftd3o#MT#XaZ`W{D zJ0=4Ls@%j*HidqU7?+$Eu?!}oI)b8c{%^gQ%Yg7rMfA?Lm!*aDvWAy!_oUpLhIQm5 zSEK_86Q*s{FUy2>aptUa;>ou%WAQ55r^mwr@pZEp!<0IFLD&6nN4fWHS4vmO-JqHfbG0IM_+j~x$_tHctg>JJfRa)dWxY}Tu?c^4rqm|^^k=Z=aOc$DE2OHW;pb|FF+gdnwo9d z8t!S%;6lSQc4>aX^P*dACqpL6x_nAun(|2jh$rkRe2J696P}XdLQj};PUD*1X5UwP z`0-w5a&S5&{@IV0;j>>2!hgb&5|$Vvl-z}^0;Z;4dn*ZIzHpo{>Dfs2*A`M;H8U&H z%sY%1{)VFvM?|QS{rYA0-G)WeVs;KOnxqqqpbV+H^wmwsIb_ zpmq$*c1fFW$it$tQVjP$Ek$*VMQ#$IM6Ib|pr z@d(QOhUkO0Z`>;q8`c$439Wbr%-d%G{X`jYfSBrz7~?Su${E6Jc(7|?WKpV;MtM>b z1b)zU8#3i-xila#M4CD}657tOCU&%TztjmE3Vh~6IVgWM!c>p((t9L&<(8n_O^i%{ zmP-PqlceXRj0k+ssZ@(3|Dy6_bWAYWvZ!eN$pex0^64OjeW3Ltefi0`CR;DQsst;i z4gnp6J{=~UfkC52MOF|WsVSH|`*3BWExY{G`()x{PKOkMVIxcQND@c9SEq6gZ=N`R zG-U4dC^D&_Ns0C%j37C#Kz^F+jXou?lgC8u$sh0czjBxekNXZL#FGW2H@++fYDA$E zQt)%^WJ~=x*m!h3xh1bd%JS|0U{h>(0>H}Jwv!ZT z&vZ+S22hO=v5joURR>;&d8De8oq$vATvpR(+NVtvYhi6;Oe9_i zrz&}AQoJ_yyjUJ`tr6SKKh`3|S1JMGLZ-@Ys5~E{+eC~G_O2NO85W7)N%KJ93?cDx zPfF}y@MK2g9AB`A(S7sp{$kmY>e>Q1KrQi{!A*#st88nwGF6lk)ijuP?BznKui=ZN zeYurVXss2~OVxRrRO0!LdDZh@J6fJ!Ufc!qv8zb?dHZ!23d^EbQx>l2n4ED(@!|*n z=l}KJ2RS@bV7$@ammt+VrAuj;EfN78o#AQp&M=qM;R}lP>U{HIInoEosw8|&JG}xm z&B}E~=pArZ-vL+f9rfvQo1YzTBZalx8v!(_M4)6NtC9qMh}V;Oe#;yk16_mQ}x zxAyXGf>sgBAG6{;Jf>uwpSvo`#Z{XCb^qG^ncN_t{f=iN8p%HSBP-OEfvyWnKQ^*$ z-io(tj38u6UR#>)?8Fe@~Y5IO=v@Sc|(?p?arPTC3!CcD3Sg zpv8DG#)ldvl_ablA!e?Ut53?S)<>ZQ`p9&|ax`(t|FzNg5T?`ItTL1Bh#Tvk*iY%Q z3%my$+pBZ<5HHVfS=GioI$TCQ46h3a;XY5!3?{zrNWqOw|(;D4OB+s0=>Ou$bdE-5S9pd?jI$w1i=+&%kziS-Cvj zw%i+6=vxQqy{p*heNWm3zJ^09j)b3c|3&ONr3nTRx&y46DqgehuXi~xy-ctBrRr=Q z3m{+M{07!)gn2`2P4XJCjt+c7MCc+Piq&nKLTp!a@9FP**#l?+3)6=>8im+ATu3Jd z9R|i3sug-l(;LxV?&hz{iSbLrSBzTVIi&U*;JJYV0df!tmBs1h`knO`2TGJ;5*KD= z)I%hNU|p_2zsstOq|kiLl<|8I?!r(aIr7Hc}J(U5Dc1aT=)j5J+WmWj!IiG5~ILI zWt1?2wswjB!z}b!ZY`V5+ml2LR8$X5x1HwNRGMNs*RSbJDOi6Tt6d_{J@Pzxi$-?YNI|Or2 z2);)L5@ouji4S+NR1T+w$^Dz_Z}Fcl<7p8fBymG*eI!=84Vo&Xue2lN&j)&9gLGo>s8XC|)` zYSo8H@FZCG*0)?t!%)4MjtPFm%!<|rTiZkxN|e)>@rHqtLyGaLi~9-q zLFLH)=>B!)oJVxIIl&gCgT zvu^V@oiBgGmOmJRI?2V#AZoh*EWugt3)8TkCQ7p1jx}f??~N)8ne%mgr5}>WA+fQs zyR}MI*MHX`*iif;1*Nh%_d1^a`(n_?)r1xaA}57M*&p1ZP=To!-i8p3>`in_OP?y9 zm%7}OcOqJ!9LUb7Urm8mNwW_rdb|Fi_w{&sC63EwTU{_WuBn6 
z1GUj~bBG!??w--lkhkS8Qp-)ir8bZjbd;L2;_Nhjg_}whzov9H0o(nxWM=@FPSM@C zio@P1nV19K+&b2tj9Ubr5{&F#nPV8M$0Sse4q_8cG9b~ij3iwz&eGMGpRE&$z7A$R zZ9#kPaQGWu9@dU|P}rd3V7ZB(aLCTr%xk5IVJGTAHSLn1_~m{4=@Z@CHNOF1M^d^ASS=nGLb zrr88r;B^v0%M^{VgCsm15+w#kr(6Cl_EWkV)D1i=3&lEiFRHRee8!ZL1mnRyxRlcZ zu-|s03JOb()L8|ogoHn8cc_X#aFIzvWw>Zv()WMkA%-zLuv@pec{q3oPw&4`g$dt^ z>!@O!w#7e2BLJ1?N}YO1tWoz(YuFw{MniP-)HsMlJWGkMl|?Uq3Tyayu$;Zzo9rGr zG*mzICovg3FWm3=_i$A9Mu`K4y%fvQY5mvjc&#oL>Q?DdY_G-@C%Q8%A7q0ltDwFD z$7jQv{yy{H5gDAF%azE0uf z0v4hs4E;p=B3RQN!i)mVlO*cwOKV!T900=u%~4Tfp#LY~3L;!&=zLg(CQ@C8ZINBd z3ZU<-kt;`s>Vh1pBk<^RMPs?D+wXTi;n^$<>ZLyE+WghI7}W>E^7m#0XnPkHGw&M4 zuD+E8MRo*&5?NH^F8gmxr(k8-{n2ZF4U%R<4NFoProYtAzUD~lL_zsQ<_YY0 z!+lSUElW2U=om@?ZEt=vJ;{Uw-ZCEp@n5E58JqdT#TwB&BG)_;d$YdOD1NHpOZ6wY zQHc=S zG=i|K8H(d6{iSY@29paKJkD2xzujk5{&!x>cbe7@{`QI~>hIzM7*Iq{g=-Kf+&j#P zUrg?&e~p8`T~~Nj;fO?`A>IrB`a))r+{XsejK`pi6 zv-$PlZ>uo<@aUAIb-yG37zfTw{62Xa{EhG6?>#II{-2(~5?0l{(LBcdpV8!a(H=Df zuDD{jva%V-KX!0VU+O_OMv7VaVRPqU_|jExQ-OHXi(@-pk{Kgt#{Ot|pNJMA?QU)tVWXeGR>h zu`11tKw?o7Cd$iiuG`!YXS5_&FQ&&x6)_;p1`r#<7QW+3`>fZHEhdH6*QR&RaTtldXt?ljEbX3}XQknAdG5ZC)b7$eCn$9-sG=f5;0P-usgRV|6?BqW!_gkuF z5SFWUTbrN#G3=_;DEu}43Hp8SX|AW|&#BjEq4LEZ5_zuw;guxa+k8xvH{W?z=^_ZJ&KOaO4hRCs9fw+=8nGs)->QJj1aF9j1E#6R@EmbnVLK+L_0 zFHVpE$4(BgWi^Zy8>2!QzV*`qhzVYi*`0U~5NA7?8g+CHyWDTNqMqN1X<0_ex)Dkj zy<3{gKufe^P;BvWm5!Il_E)_>MGs{`TKrrui>WO{(x{Izx19N?Ll7%JoFEQx)G+M< zYN9p0UC&+^zni^UY^N)I5oqUUYI}rRMk?=*iju$KDbJJUNW_m@iw)LgX1eT9w~{fM zwP)#6n;BfcF!fTJpc(f<-fxBOC-#VDFJVeG@Z@t?sL;{A>vJ=ER7?TScJxhjtZBKn znvOL%>q};5JPBI4pH{q=mA%j^-AIGn{8_6o%O!3N82>1tJe5TovXt4(p|atHW-_1g zi`GF~b~fc7sk9>a5GwSA%%dBTxx9DNE=6j8vH7u&7(fa4&=0rD%}O1wIewk5y*y^T z`L7mS=2$NHoXJ?HIz;cAG~Ie?BO)}JXE1=nqs-4nQHJioQFjk3&Dz-e!rF8iC3pHb z<)d=({l1Q_Uq??M>3r7P(J?C~(s%|R zH44Rj%5+0MwE-qLytnjMc$!!_p#2yYuXU@Eob|J!jg1KnI%W^056WZ$qH$5VJ%(Ub zb%!HKHtbQaNzL)cFj$5)T-Ns4E*;T^Xh2N0NB?l6&6=i@ALN86DFs!fSe6Gr=))CO z%($qV%tOu*TXR&_XV8S;Y6riGzLTvGE`ELH0@$BJ{$lQNB@}-6)|4_a6i0PN;gEGK zguKe#;!wtT`E`U;{uy89k1(q?ok0(To7CBIH=_#dPJmN=Ury&^%HLhfeQ01LXmfYp z63n)1*)Xd6yWM*Ma1}3mq%L47)!d1(7!A#Y2HB z>SrO?at-|*gL$maJ?&i^B0U+-UV|3_Rv1nd-42JHQI?jRGYwXwfl8i8+(`KA4=H#? za2_z%!zkX0oegvbBxi_f)VB!_`2<7GeqGhzWqnH#v)h*Lo-4&ryG2xm{^JNsH^|On zW`sutR2O>>)^txk-ppg>lonED4)5xs-1PcrKGm`Nt3OCl-6nYh7e!f~cNOWTT_JPL z1#3;=#GPuRSD*e)V7wK=Z2FdUj+dhASWnuOeN>KG8BW{rDiP~1%8b}maXq>_1YpZ! z`F4TFo8RLu=V6Y$EB7fy*otsZ5v|6k<2|faTjy|olUi6$H?oI}qT=DMZ1-vVYe-0d z#s~g&_A^F@$jdU;(Ecn>CQbaia!)6uFfX;R^!#LF8n%9+Liidu(3wLHjb6n+0DfYByU8FSLnHIbM7KmZD(XBj^d_Rz zAx(2*$#p=Q8G1=!NCAg2bnZFU-H4}%$c=+ku4_~!51`1~1+G!ybR9VEa)=6RxZu~p z?PU2py&0RK5^lRQMoqw`J%y06lZn2%@KLhwDtuB+>jzy-aA3ZeXP{l~F}lMumNHyP z#|ZR)_x;~@4F0fxc#1^H!8_rFale6I(qdoHV<77FqBCdfg(Zn94xzN%34T#}!o8f; z1@x1BcQlBDL1tl+lC}w|+u`HR^$Ra|^BdK+qtO?$OTzI5go1BR6esktt;9|m7Dx0pjp zlY{0Xl?TP~>zBdP*K2dML6q8eK5=bgqNf_k6hk^4hD~}^+M}0eSPSaHs6!bH!pt-b=|L@BaZK$kp z`m;5Xg*wv`a^s71hkG}p!9l|#Dy^4T9^G*A(pG7AL1i0KWd-)Dx+ksfKijITjM0L1 zpF|Kx%!CR059_MT#*f#_?;*W?q^c$E2T}sa@ya$g$F9h0Wm?lE@RB)qu=z3WKY9!a z`~zaXj;{HD$yN z>#tb)^72dFf> z?x9btEIZqq`A@h3I@Iy~UpTjR3kdI%w1ypY$4(%QJVc4gP3&LeVhAnu`076~=%!&- z{FQOI#9yPo9U^0}RaNiyzX)~$SD^5cqqDo#Bz*syMD7n>$C$qw0x2#L{nEV|_2J9< zfo26(|NF7^zfgA3?wv)WtsL>tliLBHa6fqS%v9=Hm0@rhPqZOKBa3*x>IBeM>JFA! 
z5mM@gpOe}&n6W$KYn@~<)9s?x z%irb?PjYgSSfMSfC~^2K@lfVQG~$mx2yCSGNJK_8boYklnG!QwLvej*mv}6PGh>jb zFp_+5t*gxZlNp|xHsQx_J$Kb!#8?~mrSh~%DzL^B{mfY7qM1iy7KQfdvAddw`HZpn zL)sT}LWV$BIbqr&7}H!*_tRh}qvSwj| zP2&^W_|xigZVdjd94gHw5Q?k_X2ae5fg{>UO<&V~(ua@$Shh{*+oo|?#K-mo0pi63 z3EwsQCRcwJCE$b)W>6Y$X>2Iwx6`ErD_4I3#Txz@;Lg!a zoqtbn??6^ezj`RXmmZe^Dmh*xlp6`TiAh<#+e57jj7(ecJ)mc*I8su{bE3IKMo+Ep zmHe2s8(RG8-_vX|{=H{5Zil4Uminb-gasR}P-+RL$8{N}_oTvl)kK58{;Nt6+%!~M z>`IvK&`|d-b$U0|0YhSGbqkW96+3-#-W=6_htL7Q|prEgbwDXK}UT$`~J1Z zVWkdaSjr>Jwnx?Nc4Ih+%;#q-aY1o#(M(%+u-QND>E5^@IS;R0jMFgX(9D$)!@7jk zpbsA@&Zc>~t{88D%am-Ip2apWxx>zuy^_l0BeYl02v^N@1v`~t%ooDIc&xqxv5FFu zU)k;Wbp(YrQ@K03{Y6@62wyYo)kh7abf;~%YizJC(pZ@r!rf(y?mTPk_!k0}HSmxW zC$nYpCt=x1wuh$16dq}rYKYxYksn%z1;_9#<|aG~CgydOME$HM3=Y+~n4Yf0}`t9m3 zO+;l0M`L zIlK_Z3DbBhs3?jKR>_F4#o96vBV$T9fHugLOg>nt@Poy6P8}l0gRN!BFNbxJbB%D_ zdWIBn_shMXjlHsV3;lJMzzu&|u3iWFM)2vM2m$EkC8Gz%`F0N5?H?p9+-H7_m+9*6 zWj6TTxD0(4C(H#jT#G3_@Dn~hj_)Ur zj|7_@N|d#RtZAU0h<}C!sN6{JM2$WhE4|w0%QXwy&k)e(+TE_}Fbqrt5qW9&17P_H zJS&8Wh@A*hmgExOXoVw zq`!fxA#H3SnlZ(yJK#PAsFPf|J?Dj}4%>vvq8c=@!;xhqX$IWVZDyAw@&*kv$?=O^ zH})>zS2n*JOdg+C({DV~X@jONJ325G4pgJYoiS z(QA+tuCj7BSn$rdnuMp(sp;OmJoTC_nxeyEH@Gd05`d@~z1g&u7>3*Dwop3}ZiPeWX(VpVapFd-Ea1%^wKycNdAT20>=a z6zy;D*# zMknT*oef*tbI&Dj6HE0fmVV^XEDjAE02Zo_9T8Rr2n=2HlYC!24lv|%4*fQ!vzqP2 zxYR>3=0M1vv#1f3*uxNi(;KfZi$`Kak={h<(D2LY$`F!^IrKMQ>57sx4y0*MJuHn= zE$R$4BiC-;wbc>4FS%oX(;IbyB;e8@jaTQ+dYf(24wLpGClge4sEdX+!=bH68^ahZ zzKY2%UcAY z-%J6LGXNIFC@+&(+Y^YOX14GQdVOX$P>I@@z z-$H~UFOf!B=FeQvkKw8o;nmPf4FxG6gFau#aIjo0tg?%o#(-M)B9@F}&#o7O-j8ob z%egfTyEyt^nf=*Qn1%tJCGyt@PwtODN;}wHG#JoBL1mdAT}^>l_0WEnD3jMcNu+(U z+i6unnPsGUbX7;tXU`27wSU*Dz-jD#>eMLHGnc|3RcIf+oI{fDy884zX){F+jHO`S zm1EZisRjb}&B{vpTo@{=PIRiwjka|2aK3)4b&Kz7SP1*ns+1Rn_B>@G4ko|kmXs_^ z8!5TFQZt7|CnqeM!nuT*3<3<+&d0RYTR!e-c0!pAHo3=tYFyh^g0(z&j5DJ&Dq$e% zkpN3-N;(lyU~zbfav0Ql4f|lj9S|XwG>lQ@j=K>c946j2H`yyPV;p5BZ9&y*;fkxv zvn!oVAIC3u3&AQ5Wk}Rd_@AiZY~Nay9t+U9WB4DY<5QH1%E8gB{D?D&EyDsy{#MQ4 z3?EStk&0er)YT@{AnGP|j6r7q?iQ7qONGR%fy?;W6&vnpnf^7lGNSzwtalR)*ofBU zDb4&PP1KDis?hL#iGio-#gb6Pvu0Ms@G-3qBPm57g%%c=ggGsd)YFxt*B(;e#TnWfx=M-oNou zkYZ%NvUpcMi9`!?Yat%od!T3c^0S0pxFPv!Q-n_pM>E3G-?hYIdb~3YONH`)upZLR zmne)x<{v6w+k8vsanEj&>ga7THnACGY~J+2SOJoLLskUzhb=_Fka$fRT*mw+Jo)g6 z^X0hV}mX?529morjzGZK#8;%7-kkd%)(Sx zex!AC#46gGSXsAy2vK8(Nc=q|k(E&95D8{`)#c1DS8=PBf4;FgcWX8t>VAxl_-B=5 z{^?mJ0>GTHcxsrlH=Qpg79Gv#!OaG0!NSm1(+%c=IkD1lnzI!IoNOwW@q%3@~Ib&^EL&3>p>&S?;qxt!+$G3Bwb0#*S#GPx2 zBf^U~*YAhE92;LRe3USdU>t{;9V+-RRJ`ipoD*8}IQBZMZKxY{fbZ?mOgh8mfcRUc z4g}Zg^_7Kr*0D`}_R(!8nCCNsbEy$k2gBMgM;~=<{D`x;Iv61HzTW}%u@Gou1Q%`?>*b6SV6#+mZ)7{ zs#zG1p_M;KiIhMGP)q&Cdq$^L{3F?Wrg||FEm_@uLU3KsZcS} zWQ_gM&83`m2(e1LSBF3t;UOD;czHQRV}967fNJa>D`0G!n$8et4e`s_1FClzlz@tO$wFUS8xy)WU$-8&Ky_tC)xtDWabwb0lb) zQ5G;kI@>S7n`mMS-ZIs02@ocP7(F3ADC!0$1fKhJ?}Odec(XlR$& zH9QZNtGR21@iXZWjl-WO%L%3M5racfDTs}W4%)rA8Zuiz4P;BZ3E7fbII1FEdX5vtos{4gNT%Fq1;w}x8;xOQb>q35d5 zEcMm|Ta_-SgZ2pu{5(WkL`%V|hz^i_7h@JWd1ea>WkW0M&Rf;Y zn|riLzR{B!-bp5eC4UX8KlB^3P4msosJ7Xb@f2Gk4ocUY=p(_V$MB7*T)@U|ALuIT zUgA8<@a#LvCd`M`FI&rEhpq3e+0%qaE}c*ZEwQA6XWKm(B~D!rJUWE(L;ML@t@z;@ z%gw)410{8+X{q(l-@ui~qod;e_{VpZYZ`)%gg~-hj;(QjvKhS}lYch&_xqABHPTDQlL&Wt*Y)%~RJ0=G~59&iWO3ZnA}eJpw!LV{b` z5;2+Gs;U}J-4?AEUMDm6fgcX;Ckp^W)Fp!d2?lDc0EOFu=HX&|x4Mv-M95iWEHGU7 zhADJX-!0iS1lxp73FW+SV6m=ZiTNmgn+7NXUiK!;MJMY(>nZ@^|CfUGS1R>8bVboF ziVIyV*Xh!Wh{VQUnxdr>Md*FFyAJ)u*|~;1K98S#n(4i*Zj(odB#Gv2A+AtqDg^0?Cl!X`)uRADFZ_4I11xfL}BY)J`b3q|JBvaUsg+tQ!sgX&?$Rb z2PPYG%<8qi38xW=(gObwQX#LA@n)ba#OLIjn=PGR#^Ti|@#F4U_v_BxCa!`KHvv*CsbG(y0J?Je{Ws@ZACF=1a*of-WM>D$ 
zhiQ_w4@tI5E(C48!N%OAEKusP-B$67WO9!Hy;sNyNqBv#q5z0{acbY`HQb|Ep}@S0 zoVKIJQOtrgKn|I{dZc~T_gNlBdcz6-nC37&DD$)AGWXdUdowET{-%^Z9=qq%MOmJ( z(tRYYz(}S+@7&BJ58oCnX+U*m(VTHV;@4tdnIeVbX6Tz@YJU}&j|h}%vc~2|R9=e= z;4#P@g!{^xkXnF&(_mXu%v3~e?D!B;P-s61p*cX!kg!BbZJ7h!JXhvlo0JuYVeCd! z43(L>poJ3FFNlP}a;*))$~LU+N4S|aB6Jj3?;5XgOs}sei>UF4zr+Tk=HE`XYJWb} znG5*pp;SX8LG2+RH6MToOJJ?Ci%iEf5|}*mTcM zsW)1fw~4cmRH!zGWV11y6#HB))CysxXGV14QXRZA6H!~)FV#zuDN{Cw#(x7J?#?a0 zEJ(ZScDusg?-j4y_kW$lLOr5IS6qAuGsITFjcxf4{0K109>3r}y7wM+LtF{~_@cT? zP&Q%EC6!H^BHPMo%BT(DATvspLlvXM1fPYc)Ym9icSwCQlgd>{bx773na7qH+DUe4 zH;HQ{0M#iA;d}I$X6yIV=83V1!$mCTPn>v`(**(e1Kgd~(-2*c@{|d9_(HM#v~^S~ zpf*9}VYZUUM5+x`8m=1?J?-YQJKV0cYGgeP-t}AEvt~Ad)*>XMb;L%CZdOt4c%SVW zm2wp2*Q}Fa7@j{%hObkb%Uqp9`zGe|^#@APrNUg(i>thD6t2|iiTC4c^`X^x@4D`` z<^8=TXRJxff~5{=y6v<5{)(=|*(u_wA-vQ|HdU1vY=hFn9@q}RwDEE7_dSv|v14Jm z81Z7wj3BEGM4McI|H6Wm%8>;}4u4$2?{k->Ql^1ayUAxBk#g|cRZV>hdfos;AaQv* zwI6j4u)KxGSUO~(Q&Gn(KQww<$SiK#wSBi?+*DVwtF3}Hwz%hC2HO^&qFVJ>G2T)e zLO(s74Yz^w%!D+@crzOS3RdYrIZFWw9k9MhLmy zbkF(SW`2?3zW#YNxQd3w-K5-GQshH`zYPC+dAS3@+8@!~>_s6{$o|?b5G!vi0KF&l zMFbav8KKa&wv+MkKI78vG0@RH-u)G}StWhOc@u(9f5I#atF%59I=>-gO#k#(eJ6Hs z6f!9(hfaGP7INtIgUA}>e4HAG~a zuoe}mVo(#u-od;yVlO!XWQ{ihYOoa5Y^>5#OCjD1N@qQE8IMc75SPZcGnw86%L-ta z@rVw9ru+*2DK`JmHin zdyf>T35Y^<^%hd4d@Xw#ET5-O=HZE2QkJs~Sh9L;P*U1N(B@q+!r?q7% z8|BqQx2qp%vKT#&SCdX@!vw@jZFM1 zde_-RCxgm**quAf?eaj~fRe;E6x(JY2&EHB&RQ8m@yaQdI*Q$_!T8NNMjsUZO#Q+u z)j8Sb$l~H3&pY@*oq>CU<(RuAih+B1|ep<&z3dB???&HA4Bv z>Ng>v%*e2J8xqKbiKiJQ_QPq+cG*`b`)JW(AAu{M^z7+79)^}Tw86qR4cR5s6;eb! zF?z$zLThVT?udqDo(;S&P$iWqvF=NIYE<{LPVO5ar07i?PffJ(@%_QUNLrV3wR1ha zen!@TCI~OQxLV#<4$RSo}` z{ysqvU|V9>M?xR?%>DQxgxa(9-%a~AHUvG^wn-f|ct=zv74K8CGe~&STncMa>13!H zy0L@%JozJIh(}{JmnJ80)b&Q#(#M$4wpVI&*9=UO`Yp9wDo=!!N8RFZi>Hx#XGc+!mpzKZpdk*IT4}hCf3C6C%0WvO!=M9-c{kFc^%1%8Ex2yV%nbDb_moy!G91ZX3e4S>NAQ;bU#8^b8IKhZ(y>Bs`>~4a# z>5@*0(7~xc{~emeoDNt;A}yX~5Nzsx+IaArt*m(nW=HGt{ORwH7@~K)#LvIyH9Q-} z46-{}(WlUZq(bCshhybjpG&R=KDb%cJwlL#Cg=D;P9N!-);(AfZ!a!lj+(-A$Uo4a z>Hq%s$72Im{A?eLQh9!)&u{BItX5`?KG#z zwL##KyBP#wE9Q3wk4BVIq+B<&UPk=&dr}GMr#3CTkW&9)oe4xjdQ0Ns<)+gaJ9fQwb6UxhGrJhkQ+A&f zJ>^-V6sgWt$ev?l@P8x@# z?J+kt>iN3#)!rcXb1I*8%F#|=Rfp$ajk=H2W<$);o1}^zTF~;X`VlhXs+T*M4aK;|?TVk2!} zkqLbO{OGVK$U2Vme3Xp#Wf)9h0!XM;j645@F{csp}ond!ExgN(n(CVW! z*ZHBTprpH9wg)vt4^||QJ_Oz+<0YAT54`k&U;&lduA~Ug^NzBUUsB2n?pbpPmzql_^ z)MU7OW%I(R0yYmbnu1_a_V(ORCdiH3%iPa_e0X`DOWpeR9d;XSajf1(8w|(iLCWFel}Sw4qesD;Tl&=ofq-CoxPs{8L1t&DlF{3l^d> z`jZQ7qUB&mjdaUW0DHz;18ZZbAVf!37-p#i6WLB=jADs|>V9aL^IE8CJ#+`f_pnBa zhHw-svR{dkd3nVTRt5rbjG#Pa*rRBO9;r)VUZ6X+9uNU5RBePZ3v8I|+i+Vsa+@}? zDyA*d(XFk-rs>9Im{YgmiR_ZhaS<4r0{I*IwKDy2zBgC*w3G}V)A5v-60W7aq(L?! 
zAv1coeX_O(@cmcOp5BglKVLnJ2lItQ-)G~S2^u_0 zX!|4THxLajWCbv=!hvuD^ItoPyT#%m%l|U0l1WjWQH`gp`-V%YQ;99$_{f_2&w4KY zh-(O^#!C(HtW}V6@hdiYd#o=!RzN{mK47}wvJq}POAhEDQRjLvoCjM6oZLE!W#7-|x2}0eKz_J`5KuCXkVFyM1y1zo(PRVz(=$_&T%SOQ z`{wfi=_6Sa%bOK-zRWx%Jobk$7v-tD8dCL3P0FqxLN2kY*dZGG6%v2j7s&Mx-Ivi; z(IY{&HnM%mWypH@4Kx)y9Ys^s;zu23ld2fTaKs#a=!T_~N;s!ZB*ai`m|O@tT+>p% zA}JncuB1fE1j`=okGVbG0!+v{tkb8aM`)6oJ7~X01Mtx@?y_&xDfg6=uxA&Rcxdam zn4yukhLcWI0oU+KEH0I1Xq9;nb2s@r!wf)9mxSu$em=}>3TaqYZ=xS>IFh8xWtj4% zmLsSPFH1Re%Wc?N$Xp6)bp=1X4^LSP&HNY;;XCK#m^Bg=rOGL4x8~$R)U#?ZPNguu zrIl3@yPTnRWp~N?AE%PIs@mtI8ghuP!{vUaiO97oX*{HTv+x8tfGh%fw;E;eFbL}d zn5{BaOix*T_}ufsGaD?Xh6vPQJnRWm@*6G>iD)f1R95=h5EcU_Y+tfbPHJ5UzQhnO z8rc2=_Bg4S59}fXO=GGypD2iB(U<%OW)w6(Ot5HnV<}bf*5$Y~_3q+?_|4IrJFQd@ zrrI<^*Hf>(cJ8Yb)k?`EiLqyg2S$3AR_n7!IYP?%GST_WL^fc^{qpR+izmZFy`C(9 zdwxij<5Bbgi5xFwt;QaidWZ|IjH|l6pj?Ny@RdcK(iFt8$dl4sx7#q8yoo>Qnn)ven=ilQQ|H< z)IWqj+kcD9xK05qdP(2}T$8)8t49(R>qB>noAI-$q(|`a=lpzO7=Rv%Px4a(*O}t) z9<7b?gad|`MNz~)_;9{>bW_MMr0T}ybL;Gvy-|gSGDIDc z7ok27COeu-^9#nCf2VmRRvhLwDnB!}EILJZlL5f5AvsbH^`+at1q%7MfBQG_x~Vvu zNF;+5me2McCSM|i(K0&h?1AWtxH&vVFHeqbi z3%T~|FvW%Z9Z^NNwDB>d7MmoZNfu*C=g|UAa}c^S$$D+Z=xYXZ_E%>R$u-d0gYm_O z#{&>H6|vgQTNoXfC&iNx4In(_0uwk`Fx^;uAXZu>$W=J@FJ}%pe zrx0G?w+kNUP>0d>XVR3k?YgbtL29|dV*K<3$;+E?f!%Lj9sw*2!8iOjMattnf4i`u zap=SvMS*=s>moMhc;WYbVr9H8tL_}?>V3wJ^!9a!B2#>JorExuw;5C`6bTtR+6a+2 zWf&_)guo~LRK9;5{QLjlm-6rb^WXnBLS*Q2Zp`TsCoJS^4yX8ZzjxT3eswV9*azKE z)r8zMvG#60dBh>&#n7E-EnH%lM~X|$aliZL=h=km$6`F4Ev*yKs7T{Im1&jclFF_u zk*?p;bMFWwHZ11Al|O_HH|=g0Pw;z4_zjmelaS9IY{>vkK(fE^NCd@_@()F&Msyv& zgjGn`6nus^dSi1EG*@$=4dQ8h3A+X=e*{`qjFM|rEHGH^=cyFC)Bi1-?+Uwi8$9@3AX4L0sSm-ybaOo=K z{z2J8I$3^*xVmlnCK3=83g61NezrUuOp+!N?>n($-1KTNeOgT)CDmA4n*nv^5bHG7 zB$dD>?xVNcZ(&J(mW_5>H3SNFYY<>BNJt|q(tbU4@W#V{f9#LJmb=iFokvBa$#AcSuY z6$r%3G8|(Qh*k?(%@%*{7K<`L!UCpZc}RW9f`2?4oYS-v5Qo4Cr#_fsCrv-C(9^&0 z{tdm#`8?}arC@=TM0>MvoBJX>R0yy=lq8Zt3dPde)=Zjl0V>SxNp-y>#IlvzQx+=$ zS_AUf4{e}YkhgnBJ3)ja)AZXvsL{j~y?DbK5>!x%Z=H5p=^+V9 z`H!bw?^V=kzrTG*T(2}c65iqbm%V#B$s9&z9dn0o^yf; zvhyhsjRiyrlrDO=_+xtW5XR5qnx*1J3ga|mBiGTAOZs_8oSG<}K#V27#g7a?)i8#> zlK|@Bdz4|2AF5P(sx>2?z5q1hZ+t$rMPO2DD7LWB`h20WViEFbK6nnMcopG$lrFyS z76Se0_-U|Wva#yO`Xtd|5w{x*M+kWhM>i$KC4c8}3lZ^Mh?FNEMrH|n!a4N^k*@4j zEYp2bLGA%CE-Dr?6hce+Zz={z7fX(RcEXYzJCt~;B;Wph$WVta8-V6Fegwa%*TMCK zD5XgPb`i9w_Vp(TDX3QUs^T7ig2E=3 z?#;r!5d~t8GBzmikku=>4&T#nAqb)y!}DzXx*THQtsdsH`Qzlr;BGu!ICY=@pBlWt zAjF&OWNQZXu{9nrTDLuwu`)5{@gxMYzeLZ0&~5%E{5I1@lV|O%Oi9CcBwnzbpM&gkp@sc8G$|DmfB5PX&9v`13EfhLf4(J6!Z>63tt zT+YLQg3~FI2nsIfmG|mKi}~1iPc^nD1E2yN;Y$1@8vX1D6+=;-5}HwFGu|B-`{DrZNlgDa&ToO{)8`%gm?E_y2Rgx+FZYc4)L;%va8+*GO>Z?;YyZ7Z zPJnS&$aMe5f9DO2x5o^cXTOITqm;jsX+_4dEVU0FlEymjEoSNgJ+D@-auy<)%8MDW ziNeVX-V2zw$3JPA-~!uzo1^R5e7T$(vanjW#l~7XKn!)**c}USgfV!D+x_h0vF<0P zPgRndX7gsEe1b^K&H)eGKhV$7CAoT8(_BsM>s*9|!hzt~_9N>Occ7LA%O4X#X_!)9 z{N+Dhri;a7rc;<@eHNzAp z=?=-}4PLGy*$y4#2ledm-rz1@@n99Td$o)%3mV$lC{h6fR6+Y!nYa;L z_*y{ELQoB#)GlA){wU#T9j}j+={Ig6zN>CKI`&QgM}4LQdPp_QN*QAYb|7kSbI&}D zNxuXG%-`l7`uKBmFbOLU(2ZS0sy70oyi!S0{lS^UXP`Ch5W(l7bgJ~Yf%Ev*!C{^Riks#*3hnKHdD`nQ;Jk#8FpKGV|%#Xm+wQDcJsd-`d^3roAE8Es+$>P4^}AYA~8 zTmmOvK!ge($?f>%{$XX#<3v8&3QZ}0JJmBW2&fe$!J9Dei| zI6K5P49ajdN9ohGbRXPj7N0|z|9B?HAc4>M2DVb+7u=4r{I+9*2=W_IsXbiqC;wk_ z;HCjb19?#qTBQ9DFP%`o`|;x1^r#tmp@4gV{tc_0;c;7LFz8HCZHhhiUPCgnwNBLH zWG|sbkJca$;e%S2A1Qd7U4s|KWkaeBqxOkJu-K((F7@cHM*6a%z2#(q*LQdiP0s&-EK-bl^?=ymkp}eE^pt4i4o#e0PS0t>}Rvf_~$oqYSzojw* zL|h`Bt{CUO&L&GEQW6*XU@Gw}*wg9jf(?f9irQrw!{55M2XM#1uC{<89eFQ;O$)lvj} zPgt1}%y-yz5mO1Gg+IoO1MA?a1ph{57k9g3;VdDPzsrP7+n+;yb%7B*DE>N|4MBvx 
z!P)V`k{~RgC}|HS`|}C*5GG?!tDRo|QTxX<*HBSV>k3I=m`6K9pL)Y;ZEe-?PVU0x z@U6{hHozms!m|77GpQ-o3MBQeN?I-C-Y|$Dv=*=5JViOe!~4YZ)5S8PX=L9XWqYE5 zd6-ri)b@5j*l4Xm!ttRFspwF23D;&hr5L-3F~wu&Q*VIM5}<9$IkD)(wE3#=)PxU+ zPM4p?Lzt7Q(rc#T(jWu!;_rtwC{m!Og!PY(!(?fwl86Mtl?MAlbDc(Aq`M4YnNmb~ zfGXX{8mpGj??q{R2R1P#zq9jd6f)?9m-0hls)r84R*^r)up2D)=ew{Pa*Pa3fPP0cyXHQRKvsxd;-vUTe2g)kP8tXa0D)-nG{-Ss3b7LOS^N zkxQ8u63t>+%nVZ;M{$wMWTu=Wa+Nc}&9+c<79`~J9T%O|iJJOUw@s{s`S}@o62P4y ziIz06wn8Q~s!07t$)%@a_tNa)O^1UU+IZfC$pw(@Ei#r=BO0Ya^pj96DV5{zLF z@e#AF!_SAx(x8D|lwmjJxmid9xb{J58mx{R9ita*~^$zIr;2)=i z?IKVIxMzqU8Eb|=O*t<&s&i+X(x|YkO0*Rmq^|@K#_fki%G&MnO_P;xEvMk{-zj(9kW0{eC59hr zo80`BZP@fjjD>ibbm*BfC8^b<`x0k zz5Ji1-L4ybRVXcmFQEK|M*T@D^Z9w$seVyl)jPw3ZHxNdF{ys?)8Hf&ezB*!7yf~Y z4+Uw>J_i?-tu@OzU93g7=p4tCpw;x4pW=b4fR3C(rjK|w1!p931PSJ5DCtX`OV`QI z8@%=KIO;S#QVZx4O{#~a1OkyKP@$R?iQL;v&^1h4CsrF+dDdD$W@WFjheq43&B;+f zP7EAKH$@NrMv7kK`dkEXHUuw%I5hg8t}OxW`tP00$htH*;Q=AAPj?QSSIiUk3t>U{ zdN9#lbu)cr*fbGc|*_EUFLI34At)#!#anmloSD#=$s1HHV zL0cc(Ix9RiF*V75v<6USY>owUG(4pps1-i14BqAxB4`@Y36&}Fa_;MiEv6sSFAKb`d)E=J16FbZvi_?9)S z__X1nMQFG|%%r@9Hrb77<(kt}^pWbvEC$0a@vNP8lxTY0hHNJUsx|?R4PcT}v_t4q z{DcQ*$^+tHhG$BV8Vl75RkOJ$5sy`3t%S)DX?-c`VVx+H8P{_lW61<8Fhp$KQ)?s+ ztFJes88~M!`2-#Hc*5H|Qd*}|M9_1}c7F32&~~IqW4wMK(Xey{7i9rQWmjl!8{d-Hvks^m{ z?WcsecGKMEyMz1?^2RzcB?q_ZukcLx0z~DEegv9NB1p*6=UzrF8urISmvlIlQm)G=4> zfL{jAhSZy0VeLxQ5U*CCgXv0elBfZl3SbUPCJJz5Cd?@9xyBz6+b>dmiw0@DEvKcB zEKfOA%HSsAGAQV7xzLxphOE!9?jq8N{GzD*E36UG4K`~Xsh?VQl>(kkb1!Rl60%Yn zcOYuYQ$AddTcaT{HRL^u9uyk!iL5K8S+e|=o?;B$7hVf9x}AIIAP3z%rbUnpwT$ax z1bIi{R|p@)k`LqS$>Mgre3&f2u61Ik}OvC>B)&NW=f1kPbqe>}Q&&Yzx;xRj*SO{s1U~%y+~wDN$$p%y$&lz87(n z-ifuHYv0@3S|}+G@X0_w5x~1L-)0uSTPT_pJ9^9*s@L{(u3}_tM@Aw^OMbqgK-o>S zstEHD%?=|e2~;UiSloLn*;Hli;N7D!GONl~NIx*OsjU2TNy@bE4r_VY_}K3~TjNk3 zyYQMIA%8!2kW&L{RzKmT->SaQ=W=Lrp4g7^qj!7O|gwi)+dlndtBKO2OX&4^qSt30s0& z!O}<8m=AW1F|Wk23G4kR8R0*I9zb0cqw!~Hil?|N`7q6IzE4(8&MUls#CB1qB3 z>Xd^=jn7E=fF?TY`R>Z9JN)%{i9^j!v zuV~VS#TB+#NNljOV7;?Lny>{P*5Ok;dz)g=?)wc_JDR|&!sYoHXnpcc;cr+a)_oZNCbO`RN9&ZD z{dxzB$q(rz9x_qdlkGbM=4)D2qHMxGh&Li3b%bKp$vbA@orGgSOlB}%7`F&c7z6Pf z@tV>a@TmBl?gwspIGfMEyWH-3$oz+b#^ylU&{ZJv1iF2vPlKBLNvsd{kTNc0$KcopD-S)|>D}p6!eusorSWHdhzCNB58Gw#!}45C=s#BFq;2 z^0wWq`*bghm>=VNv#P8ba?y!}i)vq%0qHe-t_4dkyAdqF}ES-PqxkP* zH6a#ck}CWk{zy_HroD*%xPF<<2E1awbi*#;M6r_>>%mvO#f9#8zctMVQdVR!3DJKN=cd1R}`2d>i1?Po=sK>kHMNf z3Lir|$@kIgWJFBJkMKdA3>MPU0>Evvifu%6+t1`<`az{b#q`&~^+d9x>kxna#ovvO z#%ghW`PXL~VAYAETqqgJJD6pB-O|(=@4@u+ne>Jfe8{FGan+_6c4cxs1|N&mB6PJs zYK%XQXficXk}U6(Ya7%zbV}tOM!PoX z=|i?g<00`prZ2~%%tS#?xC$$g16*JA8%J!IZOW`?jb~_wE9XyH2V34-l+(@X=wPlR z_=l&D%ak!Y6}&3i0xKDKqX@wc~_-fCGoMypkTW<`Pc#;Nmj%+Ys6f-5XfypdQssrrni&XjDphf;6F{9-7Ht_%^zLE z4%hywK`nfU2U8~g406x|Xrfx@ZbQW8=?HME%NzVh%8uciH3~ZJs68T_s8AH>ldP;S z>l@9{6fmr@I8nz`c6Qz?C=7<<$G4deg_>lOVN>8X#QN1n4FdYid^1VD}&Bced zT!q_x4zW;nVthJuZR4bgBCkc+S^7OB94Ja$%BuOSJX8}zn3urrcf6s$VrUTsZ$}Gj zWYUv9>AF&&S9Qc9Uhd%e_Bg|ng&`Sd1(11(?^s~m?*JzfA7`cZhSM5Nusr&I!?gD`1h%R z^EB!0kImm-!%x45@%aOCXbc%)JFjjn!`ZVS6uu(;_>OIv!G4&a0H=b+Y*k2Br}6MP zBygqHZ;2v<6OhBE{X~U$O!OabTHE{lzUYds?y=P`Y*5SL+d!yO@e}axkXud$ygI*3 z!uV$p_2#M&zJZj~`>M}d-e!av7INGzx1ybWHrzSMY3Hh&kcHC8{V+xPQIAf^NwvxW zZ;VkI*2LY;%D~_oQ=~&xadbEidAdLLk(TpUm{~5Pw603_AKVdb09Zn7nFoTZ*EI+< zSA=)RNX&#?4<7rPw)qvc6*Ue1#Y{2YuylbJ%XTXY`?~Wk0srS@2E^*El^c^zG{tIq zhvbhG&FRdvc)@EQK!)+v9k-sxOBKWLkC2!Rt{JGy&i*7t+`Mrt7eMeyhmLa?=m+|y zpgOZQ$uL0%%U=Fhz32k_XZ1kq))2T$;&@M`a+4fS-DR7Qi!m(0_j-Omo zp(Ek%ESpW%-0B5>n((Bb!sSk9^X2^c;Zh}3oY1f$pXEH5K z0EPU86PX1AF4Yhwin1Y2ME5lQb`DC4D`^%Z8ZW_HGAG)D;wmb8f>Ql|w;Z=?>% 
zURi}qCm=YFEZpqxDhGwoTp?|$2O7D*K-MNvb$!s^W{rF9ZL~(u`|>@q?7I?|Haz)1 z{ZW+#dV+~A_r~wmp#-Lo+P5%is2_Wz;(3UOPb1R^=^glHHD7$CC)-(W9fg=70fPA4gR8n9>p zR2fxAcmou_$zwG=OxC+_1A3RIwsCbP zYMO@$tYsW#yCB_AB$OScx)GvAyxG)hSCYfoF;ZilwCwHAYoW(hD?Dy@A@*Be65a|- z6ttRZk1!N!A_b7)c%F~Y$ExFf4pvfDJf-`{QV$2W*!@rfb^=Nxug4%b=ILPaAzSUv zVGA6ho3M(M;{7j55O{;5Ux`=b!g#@3$N=DtR2+%<9y$1~SeNFbCh?_DEX%0$S~(Y9 zLWx`rny@kyLc8?Cmrb$`W6V@MZpPQ*fz3o#JEX7yOzfi}TZp&=69x}=!`^$iV+HG& zzNGO!`Y>41bqXQ+aJF;2)Y0{P0a+vYSLEpj4Zt=jxL?x9lX_VJfDr;&*PIIz%LW8m z8}wZhC>+#HnPD)J?1@;0!tXClL0;nnm%>Y5NfB##Acc?b!&J_X?Dy$^PM*30Ke$PT zojqAoLk5+@j>gC>@1_d~zTH@C6p1-NRK0gl_f@#laKooVEa2j!MPuFh79AAjrIV&M ziwYt8d_v$<%#Lo`$QDom118J#)VNR&^vgWZ01a%g{LcSs5pJ<^Fo1)i2JryI&6I|6 zkcPbc$1>Muq6U~9^6}Y(y294S6N6~X{(0UPX6xQDOn`jH5cAwkevb`!feauMZTy7wRlcC9tT94x-ioq>P7!9Rrhq36owvY)mxpk}>4X(rcp=a(kks@FGg~_9< zzhZ7N&NJ%h^_ktWVje{Lb^p$OOmF8u`f;3hN4q<;&7VdLG^t(5Xu;FBaFn=qNeR2_ z^{DS!wfQG;cqQn|5V><#<@Y$!-C}w_UQNFBwBbd93-MqTLN9al0Cbn+qaKf0)YzfL z+yZnkVkChj+4{r;)8j~}9bO1k8ttO1^2yU&sqVLzp=yc)b7B`c$`0w*A8lT<$f5ta2{kqC`o; z0|U&9t)J(Vevx{&2xq`(*s8`ehR7Xwrl>uF0PhPehLm$^Z3o$*rpt4@oYmcYal3rL zH1~b_bSv?+Tc@u!A6Z8)sxQcoI)K}4SrripHRp;bvbPD-X`fq-Jtj}r^VI`%N}xaV zK)f(O<0X+qdArRv?9jM4skN}$0!6?gx>wX8=E#%2aVn38XBq;7b;v^)Q}+&UwO(p? zSM90k&C2&!WD2X#hDnM$C9Us*Y!GrC-95a9Gi9e5aO?1%>ukuIZ@XUk!}M0eZYdji z^>rvV1T{bx&aNeZzKz77nj(>(AT=HyXDo$Dl26lF1q5pcJx57Ot+SI<1hy`~h5Fh? z)>Mg`trBquK!J$d4J4oC$zkTZQI|{{Cbv-xt4fP_R_Ia}m^VV|_m|oC_yn%C)Gp~B zZrvi}o++BgC@&-FBmzW1RaQVk$CVF0trc|Y4tGd^_Av(e^sl~sMJOj~`>kC{vcycp6GF861s$VfFjWc_f0F;vw#->$*hhK`>!B#y%WkG)kP` zO^`<$!sBC<@v+wYd&0uGr4Jg}mV4n2{aC8)MFy!B-)>$4=$2a`9)?s=tU*Tprggs9 zpOyb5noVwTEbVC)n%HgUy_9YlXbTyYt%n`3MBPBt^bAGTt^tzX#FhmvOW7ery9gm| zdb%5sz*MRu26w*g{mrY|oCkbYvGtWHpj-TDZ@`2#to8QH@YA(Hv*Kr4Y%ZL034{Q* z`N|)k3m_{U-fpc(RqL}+#Xju(f3$_C5;_QJK6Uf;|G=Z{7Ny=xixm#MEIeiNps*~r zbM{W0V~MSQwpFRXGu$U*Picx7;G~o|CdR3Xy~PF7JPk3HYHNu(m)NNNZcwRADW6JR zP&f&fbn!igHOEbpDf&ea2XT^oH;HG zE$&rszIm9=Zoe!C*OTQcgh5y%S94hx3Ma-z8;+IoNT9h0e$S$;tGZ#60$0dI}K`X#b_j`@h=kkE_AY^35S0 z1qx@eXZrFqMFn!Yf?I^W$@#Ugrt|CT>C=41pre}!=hv`FdV;@S^t3xjd7udY;aqr3 z|Iu^VwKzzcWb7+3b;KP(wgKjpMiA>s#w5f;&`eJw=i16e~hdT`_2tt1sGQ{TG>`WiCCj!7W~< z(+yx~PzO;y9Y+!z)$vV>@g?^ugA1S`=v`4$Xgvv>Mr!esti1 zn$Nr8q7{?a<^F@gwDP^QM;$(3^q{eF@!NtCJiiuD9wrZS*v%xlOWukFmus?RfgbkL zW-%@Hf>EL#Xj2*DpefXFDLaPej}!nuoG4IZ)%+4Z6<02V4wpfQzIg7g1HIl)q-u2%^uy)U?!E;$R6zhj9JWhF*tX+}G2Pb?h$2!D>4r-=Y zL5_8`?hX|umxZbbYIdtm%w{}dV@U`g@(&Z|*~4iY0j##-ysp?dZTbMOq_;|N!FDS= z4lm(=xTD6T+1G(UN}nT)J>#IOgb=0wosswHE~Yo*S-)Jw*1tYs$Rr3)S3yi921oJQ zYdtzA@+7Cmap8plc?P-$N71DHVfi&Zjn$~{MX=igu|hrBJQ}#ku7F$h z6jRi+&C%29JnAA(BfO2Uvh||da4REFI~}y~PCRMDGDlectrFj4j2}eCWuL;u6bx5L%ixg(Q8gpTlwKrzuxkx{N+zX8qhZn^XUyGwUS2DL6MrvJwEV< za?N-s>6O;R9AUcJ(=@^xc84-Z^O?~=*96Ci1s+{m2Z~3-<2O|p(kXp~8TK?F1qBu> zR!$T)y8t%Ee!plrB95kYpuOw^(qDNTZ!bI(^y3MHnY zK>jwiyG5o&pTEd(Vy|ms!e{J{q^QB$_dUi$kqc(sH!xen`uszzZCU2}g}yAokb$S; zQ^+`=o)~VB#hf~t7yX0=xkNqSjbnCy8h07sJSeb>KFOM#i*$yFrxGAJD?o3SMe|_H zcN2mExC`Usl#;6zNb5n?L}8mQdF)D0y--7!8D0Y#XV~P_x_G)bP|FiFpiqctLNx0F zOX;SH!ouMI=!}ms8$SR-`cQ6KI>-ChX4p+h)P=#37HEqbkhf=ae!z}b7r*Lpt>*Bh zkr+Za#r8_l`x&*VdSYqebKktt{`*TUyw|#fpy#*#!oTg%a8ZAf{uGpQ#1U#QF@ecx z8jj%-ZeItt_}}3m&+0kf(~_gqeo*{{Z=`-}FN3X6V-S|MS_$8aNaxREd8 zx>Q->_jcxOeK2?@&2#|4}mmjDQ2*p_LK)+`hR znKp&&+sv7Dm?rHYw@HzO=htyOjrynn!#oqYA|SiKB$$52QaX0Y>hb)}s}!3$#{1H_ zhE+1PlnpiN)~8$<@!jY0ol+*$Sq+4D1b;5c|FWc2!7xePt&g`47AGae%mG_6^VZF4fsZ-LkHB7cVarl6Tlo(|FRMNgyLM z7fvE1IxguHF(I&GItto@jX=rktzqFop4b-v+otJm)0Xt!0+m8Tp*VNthR#abjNxS` zrG>n&KE!-BTK;r7yXZO*(5>0!DC#v|L}YV?p_1$YA{mFTszgd#`wfQqGmAM*R8~Wu 
zQi?5+N=5e;6xgtqCRTdLQ}>7@GY8UUk+&s6t{w6-y$_!jp!j82vfPg^S^zpR$rKp+ zGOl4sI7!BDot(qFhm>3!4Y1)Rse*$vq8a0mTc}%g8krS=?GbgQ0UrfJgA@VB8ZtJk z5~W850*1~fq>AIHD=UPlrp;X_%1qWRM%0&FJ3U|gcxmE@TPw|~ZDp1UyKLu;4s8OVwVh!P6GjzsGpM2 zIFbWItAm_EFW#hCA)DmJ2k__8NdZc9XBKNViaLa-xQMP^a?;|U*GKjoV^C9S7XI!0 z)zpqrww&DhWUzW2t?<8EAhYH?YVf~<50Fi+$bj8$&u+cl6>)V4e4rRW!;gx0L5k&1O-mTe}lt@kYb=%owxc&-??Bf>bk&_API5Un{id`MmM5~>8aaNU?wX6 z>wln7U6AK&SJAz?Xj;cmP-hO$gaqlQ^kYx93rE@Xc5I^+nn;MhKXL+xfVi-qqW1`j zWGc9D<13=y#Fn9;X8%A4G2!fCN&<-yi%$r(%BBI7fNOkP7D@I1lzWgn2e=;cw4jeEfabQQ5m|pN$`{#qckg z1&mHyu8iX!9>oG-VcgtW!4J^Y0B#$qvZkV~qL%#{*P!>21BuymV6WwaiiJRGe-I;p*=KU?^7 z=qh4-v_Iy2xWv%YT+MY(G#y5PF`0^$X7!V9RSk+wa`r+~1jI4mj;A!jwv?8m4iY-D zgGVAGRxw%*-wq~uQlVY-iq*AJ&-NfON7A5=a#65`*^u2Yf@wROLc zx{LbYV8r9J1R9%v$fZt^P(>`W z14go#{M3@J#?5~gd*zjSxm%EFNf++ff6d}s|2~<7#bfr}EL?ho^b5Nbd%qDWV9I}x z)-b(fH;bZDK-4}CDTljg^JHP%&co;!RVlkj!vU9Ix)@Q9>>CYJj+E>Jryw%5cXFip zI{3Hn|0pOOLbfoW?W3F;4iA7C!-1#!8Gf)oJVzk64N?PPn{UDq<{+tc&c6=s!WBEm zo3eEXCp&#_9mv5xvNPoe_Tfq=`jsuSL4= zI-u2#PZFY+O<%H8w^mk%6kdA}87 zgkMhC*apR_5hv5g+Vf$y7BJxp5LqeX)KZKbQ?nM*dB`n1*Jt=;f3I;=E1vPV&zu5R zKc@F_7=Ok(g>4=ui<<}hg2pxB7Wn?D9>P!J2DU%lR<_${K6(t9>cfkoO;V5Ah9QJi z_%Do(*w?Z$fV$Enu>HyreVN+hI_l{NY>C7qrs-v&`?GRMICRDiBIi`F`C~c`=%@R2 zhKu}8(JJDb%hQ|LDs@G+Wb5opbCt2cFwc}qv(g+13vJt-(R}p7Hq>8H`>Ctnpwe~>xl$pdyJKjiOxK(B1OXU$Y!-fKsHM)Pq;VavTCxfeLOWAW zuCdag)HjSIe5TyBIVwf>Tv&=Pt&fv@4Cs=qPEJ**l@k_9eB9DZs?*Awh)x$A#Q@D55S9qdy#a5s!G~xb+}H!3diA zGucQ&{iC2xUP>NgEJcR2nAB+&DQ?qK1E;1K2OUwQj8i=|1&WIbNq4$L=kPsyj_qLt zbdSd19aZs3LILIeTB19pw+&HVtWEK@F-^j(G6)=ne^Dlc0fx!`6?K0xg#3v<*0WY; z11Dbzt0H+jj2ck0g{h@EdF|ja9&hy*GH#rlVu8Q2pD}o61UAcgz=hCo@mGWI*bwci zp{p8o0wTb*v{j6i-lRco)2U&qGka*^Nn<-tBb|c_Ru|`fRo#;II#_Qv(GqnloIx z-KGWAklPA8VZ-PMTOkegt0=q&O&TlAyqReCkx0E^BrZ9sv0THgR&EsV%o4r{{s}wd zu#{qdwZ*n$#U{J7Ia^ei(48_BUNpB?N#0`7bY+VoDl}W^GB;DDKYiV6lEP}w`g_kq zZS}h8IH7u}F)cT2yJ>?T(_I87aT#ewLW4SE~aS^BikC>>myD{&lCUf!T>lmU0%>l#JP?9^Js^4y*{Vk$6 z@onF1`peH&H)rSDsVVR@r-zmwG+%6jrtsD7<+7@0Govu+cGbi z=<(vNF3f*ywJ22g#p=6|dPeAVkL z?+7W88e_O!<#_^1XQI}GP&eJs=C7<%XTH|T<%KO`H)X(6xan=5+hh}TaoJt$aC~$} z@4G7Mcs5T>5>?Bm#g^nsOSGcA7fL>P>4LSgW9vPY!R=(Qe0aIL>$5$j2Z-55k0-q7 zke{@>-OzxZ{9A~mtRxyydDwM6yB$1EXVcYq@v7#XpY7#ZbgB#!4e|-oJb}&Ux>Kxv zx%nQh>+zMPU&zB__DT0v#?_ZL{Pd}XeJD1DIN>kV(IE4k(G0q?{Z{N5a#`~eE4C4= znHARj{7AZT%!}w}&`Ip6p;NJ-?NB;J$au5j$|pzlc^F`-y2JgoIWaM)6!i)^Jkgyh z@&K@u?<%uWpkvU9l-a_G`8scFZ|kkO{6SxjW>b}mEfI)s;skh+bmgV70F{X(*EVU(AUJja#F*3ocg zCz_R|4v*3V+U{AvyXbE6mk>Y+lhn7rbi4o>re{BgCi>2U{$y9#=6UHlt1dSDat}4` zNB$=a!?TtmMA&w5h}C@=Q#|AoW~ae!ER-8d^1a*})#c`7D70&eT&1QWW zPh}O7(>J%Yy5QKM;d_qS$IjtLpPzh~?$da*`+B^#@wJsco(gc(uyCa=g?0>wED^3K zInOG$sZ=OEGk~cNSUmWe>`3Bl&1q-p5s6q@Tf;R)MJSOzXcqIU6BMfiLO{ zU;{Php0|mhABEeTV4#{tdD%&>NqhO_aT#^`Ny20j!7)~)eYQ{0x`S-E=DKg`&^G8@ zHlKG0*AV`(SFO`K-8#CRd2l`dPllvzeu`hnBWlZE2fwq}pKoQpEd1LAJ3_*k7^-C? 
zNDwnQj|aM?5-wkhKyh)54z^(Vgt;n{nnhJ5kayFx6N1$=<`CxBZDF=NM9ab7uCWXN z=6bIG?ihYG<iw7?+J*mgo)hTc@#MaY$kp)Rw!$n8 zJ6TRzcp3WUK<+W{z^u(A1a=YyKB`64W6*6)(*ZH_XikKj2R=-|W7Mat%9?nQ;S*d& zIYFZLJZq<~5GMnlt27qF7gsw&(+ibr=Rm;N@c2b3*nRlu#mBn?>NTG7QHha8ZGM)& z6-{3aj5kIRJ8q4-{wz$O5Fosz_AHd`hL{h+qjOPz=SAJ|`)0xv} zoao8)*SN>Q$_6eKt!alBL%3s9k3N|4)TzgW*4~F!C5Z$w`BP=Rvn;g`!Zf?6o z_w=TUo$c_QhTLP1|GhGmRUk*z^gQegm{F&{zS*1s9v5YxCD9ZnX$6?wg7b7Eee9oQ&2}v# zAjk&T6^6f$uE$bqdI%RZLZTM$JdD5V$>ffe3|VA_$qbUp8OXB8YSq8kwm_7MgdE}^ zBPe6yQ;E9Olp12PgYAL3;ahr+2n^F<#%F3(mdjJw^+;v~kt@piow~a1J9Nc$k1u=K zHD9<<64#hEkJ6Y?L-sm+{lCFi(O=yYG-Gc;uu;Ixk@(OS4H^$6)h=lKt#?8kg<4gt zj1{EZCTHsUlu-JC+7xydMcL{XLVcLGn9;ToYht1f>C(y6&gdkUu(TM~;Ml!kT|l73 zmU_nMkc22bIh$3Z&5%}-09Qb$zx*)wS6$*zrGpQAEqGEXJsHmf;iN6C=_&jYzz2QF z57!LxGPE5&Ovq*aZf!rx*k{M?bWz&nn|I5jjtE|k!31pEZRrB2z9`P|X{0~2CUN!HzQ|9L6!RmIn!ovJ01)VVpxZ||q;AB{QTO}V> z8fuFVcG)bvLJcLCUHo2UpFywduKn4aUxx5}v6#%%h;tV7pSYYMv}_0`(d-=t!e~<_ zn}b`O_v(3sHF+0DBRhO#BbDPehlKfaIE)*zP{oBBRPru&1V63lEn`R35c8c7O=0>v z*vL!)NMK^^iZkM_b2u|L_o&qHc$LqmtkRh9G9;&(9`)SL3;9v_fSRGVhO>D&+a;T@ zcN%8$%<9h?oGh5SF6c#NyDge-eqAkL+_W-h3()vH@GY8}54Sd2OV_Qn>fXU+S6ykx z7j8TD;`ViLkN;KqTGiSebcwhO#0n&r`QFDB3q1QUM{N~dAa-Y@V27Teyb=Ca6)R+0 zDu{%DzY(Ew6U$WE1yr!`z1fSy(r{(=Ts@<;xb5Ik@->glHY0lBfc@K(9iO=N#Mum- zFLsURKAi+nx!a|zDwLOstAr80OWrSHaFqKpfV5d4iI8-^y6>;E`Adk%5XiM5ze#xL zpOC!n84bQHm#^67&ooj7VCs92*^X*;q_1~{@9d0r<3lHrdey=>aVZfYIf5B1%%_<(P~({lUmn9emS9F9Js*XDFJ$JU(nFCt zZ)jK#`$OdJM0Bk!nVIta^9%VZeTKvNRBjGs5hjmh^X(!TvG5{naROY-vY{?HLV?WG zCWW-Yk$)SDeo5_Q2#%f~%F5`>IGCC9+M(wh7|^E!FVyO)@A?{}{ZNtr5GXrXGeK z9;LHXx^v|>yR<0QA)9K=d!Kzm?E2!x@dPX>|I2gOBZO*Y7$0=iRyN|OpAZ!>khOxF zD>rzPRg{ui$!(V}?WmRQj@Ua2*w^t~gYoScIT_Aambuw~jl=VT{7ZO38Ek8A!hzq? z;=|E2+H)0ldY&?r9~d?~3=z1zke%W|D>t4#g)OYw5e+7g@JYu?@G+~B=GGSC?(;h+rY$9wo#(V(8RaSMm}JDH63aegfBjNge%&&) z33u}Yp3>gkHW<%!l5bh$g*0GV5z?cu+@M?vTnt15)HII4n4y)&A@o3;#5MfX=QfEc z)t%4>nP%&!X}O^hO$;-B~lXi9ySz zK1WkDU|XJYE<|*O2(p!zUhD>trBz1YHb>&O!J%)c6aHSl(V$`5ffA6@x7F#wvBDng zU=5N;t+0B-W*xylM2;lFxozK}w}EdPTJ*Hdg1r1Wb$A1iTw)bUec8~wZlCs6l=QEN z=Oh<7j1~*>*$CuEzLnq@T&g6RHtf_1yKblOp8__aw}};8PdH)oS&I3lGIU^Lk$A_V zb6o?HSE-y0pl)w`U9O|X87vIYv*t!!i?p@XhHRTFsBp@kP>9#%?V{8^cjad9CCW*8+F>s`$&C))+fPY1P7u~|9KjOyrK~Ye zpt)e_OD2J3-G*s!CxyXKx?5j9FC@di4PV)=GkXx?x);22>(zMC^U#jJ{xO-}Kkz8K zq+@02PiCC)C;SQeJ8bWWe;oH1KaD9^i}3oS*euM=LwX($z0UUOx=`X;o={AXuftDP zx=j1bH+r0ykJWGE1%HTpzjo$9;ph3aT3Nd$Qm<@bzt}+a#c!Tp>HShc7-gg|Ngl0v z$~Jl@QbND8o{Y{<^kRNu+SkW#;avDOdmi&OHhM;Iz%%?e!lp=NsxdZq{)g@orzO^d3$oV|NWK3-?zbhLID_#1;O7yXlQtY1o7 zJ2tNFd69%^Ztv6WeYrIVIK?LY*%ekbn${H~_B=IK2Br;J57#jhK^zRuECINJ@pt;K zp9GzM8{bYM^|F_KQ4%7`$^SOKf0>AQfX?mZ`~3-X8{2F;8(dEp(D8Yud5W`sej9%m zTwmh<==f4sfW|t>20fyz^+@&cQ9Vl<3v_jy-^NeVe~#f;V^bTD&0SeM*557sG1Iqt z*umawD%RWFprTE8fzRWoC-R=EZK7{R^3Xq%6SQ2U-^-esEq0|@dLXM8plY3>enQ#P z@ExCtB_q72aeNK=$%6iTyI{gdyun8k&j~HbwC9}2eeGfw-z{*=<1hFruTag=1i4Ef zC6&;Q1G9UzOS8r-@P_Wp)>&|XuM3-IvW)5YMN1ujI*Z?@dVdy&s~i?5HL2y@%_b9t zw41Ox{Q)^YgyAx}V0;%=LEGa8W+#DhSPE<@$#{wnVJ4X3m2k!ZAX*8*r0a+ z%yS>3V>;ThubnVoq~&8*az`@qfi?y;=TQo916O(W#Hy-mT(hX&#_Z;wk{Vu9wN_ZT z=`Bp!=4LSt&HfzrL}8m>!x;5yrv3gj|1pDq5{`<@g$JG7-MxxfZLVt6&S7BtGu`df zEBf=_xUh0EJSC=z%|#Y}RJP_7n%^c5FFnPqpXs3;As4=~Djya$*~`Q^jzT(n(_qld z6W3!*NW*Az5<3l0jmyd=wRV}YDXs6|WF$+*uCujuI)0n5DYqvipX|Zzqn|uJhD>Q9 z;yI14!c<(JDjL)ERy=ISkyqgbvjE*oRx=Yrc1Ag;A$rh00~U5-^-2g+aa87!;G_`VJz@ZR|f8hTiNgJpeqF{A7_HC0J8{(phjE2 zzA*X0H(o{}EX}ovaFy$uykThA+<|nY&k`0ij{54G2`ZKJw(PZX$a)0#?1u1%MpX&R zH0%>u!o6fGCFeLWKYWJj3GsL+CGV`dcm3}9XDYMi{t_lSV55j3h$LWE9i=(o2N+ya z74S+a8d*v03->?5R4 zkOM-{Yum7VCOJer?TRg0)3Hb?5i^4Q77y?V1rg46uUF1SZFnA|Q+Z+D`P+21iWd}J 
[base85-encoded GIT binary patch payload continues here; the binary data is not human-readable and is not reproduced in this excerpt.]
zO#WD|AthL5cpl=kM6s9-m5&!p%#JrsQJX{QAKD4T@xWPwm}C6uI>OQtX5NJ;t~lf0 zs$z9{kw#Tc(jh8rt}67Hs4%Q$!_~hM;5~>>2LTZh)Mccv{8Uk}r|5b(oieJGX%rkYO!5T3v4H^z47D7tXXN*2lz2QtxO+r$6v=>_$|HEQ($Rxcp>S| zp9as<$wLbUtAsVXtIqf?C{v|%%-b<9t2fopl+UZ;GW~l!8zSu$G zw^(MDy**JVz(&~cjcja z2yhT;-h}6vM3Unh45u)N*vU4170N1a=)LBSLwGd5W(Kx@E0EuRL@2E4afY$`N)07! z3cm0M6cOl6sqe?31i}v6hms}0EmN&qCu$Ay8(77}Ou9$3$)o74f}*1=Iee^y<)U=3 zA1I9rHC^Px9TD+{#p+;u+d0R)C8gLvz-cQ;47e0AIvS?!BlUrKXjC9x_J ztQa68R|$Ex1c@otAJU*o_0X3jou9#PyX zuhAh0j}bM;(rG&Dn3j(3wYHJa9p=XGdeC^gyZj?=gDSnC1z~-ezOu*`FWG=LVjl8CG+z)6dBG?sz6&R%rho#-hTVT>VuVM-#oRCSVK|vg-zM`2Es;8n zGgzR_?xZd@N5Vw0mR9bXMv)|NB{C9Xw{ulz>zIY& zAoG8`j%V)g*yN=Pn&HQ={!iy4pl-A-S(_IWTD=+&L7F4d9)CPy0Yue^&g6(POdsmY zI*%o3-8S2^pO&fMVl4C1$mo?qsN1@u7ZSXmn?bI;=XQeyi}skR`4Ug;T2jhqiUsfS~!KM{HwmL4P%P+L3_hWCMT0*%ft z#4@pmff|5QC^%hhyo9^UwToYfPz^E-=&Vm^BtV)@k9vx}G2H_~n$N;&XHl|K%-gj^ z+1^9fW9^?X!6*VmhOJG4gCu9p(0)v7W#E9(Puhnb;k?bZI>gH{Fc#-!*u|6EA5C7$ z%JlO&0jl4CH?i9_rd`y;EhQ9-sRG;lxA1?;j{lqEbi�ZRHA+LAE+ajoECqD$sF* zclTI4Zp_=RM1|o&mC*Z zjO8yXUgOEvC{CpA2Cwz=I!1t*+nfWtqS}(j-x13*rXn~PU^|moB{)X{W+vt#(H4uR zHolb6?)Y{Vm$XE4S3ic;>kc|QlEd%_oI1)bc_yj}UcOwQwDSvRtR-W7r4N*(G?lWb zIEI5A@}wqA*d9hGH3Wfu`N(4bi613)h&&3QWIywol$dyCgUctsBnUJ7-n*L(-^?x` z(G{#NLHp?uVm`7)2m-?o+TjYU1GWql8|2hC8{rP*zYvzg7NH7DVKo4=5&@?m&}ZNDR+5!bHB@^VTv-rQAoytj zObHYp}jVm}`1v!Ps7~70X>dseiWH#Y6R;M-y-WAEt{(ID0)*IG74i#-G8Y zG^tnu3_nSYo>6BD^}7$u`uyxttPvwO7xj)D z+H~$9Km z8iTRzNQ6}jGz`e*VM#bS;pV5uGG9!ehseKx$$(P@1}o;3j9>93xMDiJGhOvlSTwGg zHtg0CyAdBK`y;pi#O&rgIf0td^gI6X$Ii4`2viA)RHGuKb!%MfKng=l>tUiwBf*9F zlSgKdt^&30W|g~E*>QI$BCaMp)v%IW^~UPG{K@{f4#;ydafIg+pX>ZcSrxdXcIQQ- z#Dxsvsg)upWPe1h_4$|H>mhalYc)xqpf&jA{rl z8eK*t5NZp)g~g`nM(%9E@yRnMS2cf`qMG#AREToHf?v4p-;ZTM4NQEIQQqMVc8O1b zJWO8}GAFr0Mq(dA9XA*brLu5g6>5~ zDb{Joh2hR@nKHm0sw$9}pzeY7t(xNd#kC{6#vt-rWwc=eR5QJKOI1z0RsPcR%3w6x&8|CdV4ToDF02jg=DRJbO=oWgQ814sfl5P_PuT4FL)C|kX`f!)CExf zDcm>87?>P&Q$;H%V#FSBhi*|>!CiV^x@ry~{acp5T;k!H854E75VheBtba!qz=mKI zQpbKjc9mXZ^h+MNE;71mvR)HUjV{{p(ArM>zCD|Wd?e0bDHv8RhK~)hixiCIHtSD} zt%Cd0C&r1)@=F}yC9CNU+J-f}ZkxTsaWhF#IG+<5#=U?2U;pD@(ocH!N~}gQcXAe( zOlNPy_a3BqZ=XUic`>QH&K7hj!(0cOdi(KtIC1@n#q>?KX7R5tPhom|WdNqap>wj5 zodGC?o$=%A9NkiG<%4B{!9uc`QkyjAl z3!5TeUtWKR`od)cEKc7M=MI5Zx3?0a(o*nP0g3}T_xzb(i?JBHc|CZSzgj1I$#y09|R zvZ?B&@*9q!Dhl@E{7<8Ius7ZWuc$UHY>3uB#L3f;y+6#TMK~v6YKds74uROp&}kQV zJrmF&IviR2$&vV`o%2Ilgrj$BcUVQt0IlrgRt><;19dxv7x)|o=cADOgd8DpM>U?x zZAUmEamDPg214AQh#Os2EFl{btgl@0W0y7Y98Y($W{3trVwGEi``U%Yt2N!j2yaQm zbo?mKD$8Ms3KznlM|f}xa4(F?1hKSs44HSvr)(<;3J~u|OwR$!W&0=cnKDyT_KVXt zQ7_tgD9a;0HM91N`@tisXqJ`3*)2RksMCgF`+PGS-#v^HDWw%2R`0K!E|bNpCx{9B z&E@5?5`nZ~HB;+g!8FZk%#kF@s(56T?M+c9e0en&bo^%+^j$pD#G3j=m=o6GH{at& zIt?Mr9%Z5N2Z`B3Rk<(@)<_N@m!7ydU))o6)>rC=}D9L{7ZqsDW#gTR(*k$5HY4d=i#;Ez#iqg%b#PA|cM%WfAulc3VJ( zfDMq@bMO9XIGF&-aR(V{y%Pb#js}pCn1+SksASh@?^DY)TIhA+X?VECo71!iU(bL5gAM~qtV$P!%pFg>ds=qkxzQC-wcCrJF0EZo=N`%$V}79aKUVsf;_F z$U8bBhqZ5O5V8w0FSoxIi34p?r9kXZ5N~@1=X95lIK!SipAPDNMDmo)252gC(myBL zN0}K4qo34$+jHoGnL;Lza-^nl`-m`L|1qA&~KE4b>tXEj#{R$N7;B1A`i z!8qbdU~Ai>xjgFa)T%}h+M&TYdM2l8K+|$edYR-nqU10Aq}3k8%*{&Vu~eJD>C7ur zM2V+)5vJ*I&Sf2@V3pTHNG95mhr+wLQK+i7DkSw1p5Vz!Vu(k!@(Eaa?gXZ~Ns()n z-$fAlrw`U}EzmW7E_R>E=y)FuIpQZF^)MZRu(EyTB50)Dn|>+)x+3xgj7X;QA<1+)lO_jfgc?8JH-q@tA;#jA3dKd`sufxJ+RY>PrREAqvp-*$2M>3f)F9*A6O;ulQn9@sA8ScGXf-yKws zCb%m6U6i+`@`>3bR|$l1MO;y3;L!(-hJuWoUbR|fO%y|XAI&Y%EJ-r9)$UBPi_dNE z)2~1qJ}Xz0cNJuCaCc$+lCr(wFBTLT0IiE2VR`q9R z{3JfGByo@XJ;N(LOT% zcbdo0kkMgZVSh1EVGCd)Unu@A z5;ubXzrd-TxBuk~h)uuy?e9J=rf*MiT;QuxGO=|e{y)(`z70{~r`ywXb_BhDdd>m@ 
zcR2SS%)aW>Tlx<{X?}2lG_S~&9hXe6?gcCVw9`+_ob~ry>E!QdBOH!Wdtf~(OoNz4 z*5u~67mey=AbjdLqq`@TCmzi6v};|RQ2vVUict(sRUq9a0j$TRRX-H4&#&?P|H!pO Zgcs27{c+R^)Ess3|35mM&9_~12LQ|H!`c7< literal 0 HcmV?d00001 diff --git a/benchmark/dictionary.py b/benchmark/dictionary.py new file mode 100644 index 0000000..77feb48 --- /dev/null +++ b/benchmark/dictionary.py @@ -0,0 +1,43 @@ +import os.path, gzip + +from whoosh import analysis, fields +from whoosh.support.bench import Bench, Spec + + +class VulgarTongue(Spec): + name = "dictionary" + filename = "dcvgr10.txt.gz" + headline_field = "head" + + def documents(self): + path = os.path.join(self.options.dir, self.filename) + f = gzip.GzipFile(path) + + head = body = None + for line in f: + line = line.decode("latin1") + if line[0].isalpha(): + if head: + yield {"head": head, "body": head + body} + head, body = line.split(".", 1) + else: + body += line + + if head: + yield {"head": head, "body": head + body} + + def whoosh_schema(self): + ana = analysis.StemmingAnalyzer() + #ana = analysis.StandardAnalyzer() + schema = fields.Schema(head=fields.ID(stored=True), + body=fields.TEXT(analyzer=ana, stored=True)) + return schema + + def zcatalog_setup(self, cat): + from zcatalog import indexes #@UnresolvedImport + cat["head"] = indexes.FieldIndex(field_name="head") + cat["body"] = indexes.TextIndex(field_name="body") + + +if __name__ == "__main__": + Bench().run(VulgarTongue) diff --git a/benchmark/enron.py b/benchmark/enron.py new file mode 100644 index 0000000..80650c3 --- /dev/null +++ b/benchmark/enron.py @@ -0,0 +1,185 @@ +from __future__ import division +import os.path, tarfile +from email import message_from_string +from marshal import dump, load +from zlib import compress, decompress + +try: + import xappy +except ImportError: + pass + +from whoosh import analysis, fields +from whoosh.compat import urlretrieve, next +from whoosh.support.bench import Bench, Spec +from whoosh.util import now + + +# Benchmark class + +class Enron(Spec): + name = "enron" + + enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz" + enron_archive_filename = "enron_mail_082109.tar.gz" + cache_filename = "enron_cache.pickle" + + header_to_field = {"Date": "date", "From": "frm", "To": "to", + "Subject": "subject", "Cc": "cc", "Bcc": "bcc"} + + main_field = "body" + headline_field = "subject" + + field_order = ("subject", "date", "from", "to", "cc", "bcc", "body") + + cachefile = None + + # Functions for downloading and then reading the email archive and caching + # the messages in an easier-to-digest format + + def download_archive(self, archive): + print("Downloading Enron email archive to %r..." 
% archive) + t = now() + urlretrieve(self.enron_archive_url, archive) + print("Downloaded in ", now() - t, "seconds") + + @staticmethod + def get_texts(archive): + archive = tarfile.open(archive, "r:gz") + while True: + entry = next(archive) + archive.members = [] + if entry is None: + break + f = archive.extractfile(entry) + if f is not None: + text = f.read() + yield text + + @staticmethod + def get_messages(archive, headers=True): + header_to_field = Enron.header_to_field + for text in Enron.get_texts(archive): + message = message_from_string(text) + body = message.as_string().decode("latin_1") + blank = body.find("\n\n") + if blank > -1: + body = body[blank+2:] + d = {"body": body} + if headers: + for k in message.keys(): + fn = header_to_field.get(k) + if not fn: continue + v = message.get(k).strip() + if v: + d[fn] = v.decode("latin_1") + yield d + + def cache_messages(self, archive, cache): + print("Caching messages in %s..." % cache) + + if not os.path.exists(archive): + raise Exception("Archive file %r does not exist" % archive) + + t = now() + f = open(cache, "wb") + c = 0 + for d in self.get_messages(archive): + c += 1 + dump(d, f) + if not c % 1000: print(c) + f.close() + print("Cached messages in ", now() - t, "seconds") + + def setup(self): + archive = os.path.abspath(os.path.join(self.options.dir, self.enron_archive_filename)) + cache = os.path.abspath(os.path.join(self.options.dir, self.cache_filename)) + + if not os.path.exists(archive): + self.download_archive(archive) + else: + print("Archive is OK") + + if not os.path.exists(cache): + self.cache_messages(archive, cache) + else: + print("Cache is OK") + + def documents(self): + if not os.path.exists(self.cache_filename): + raise Exception("Message cache does not exist, use --setup") + + f = open(self.cache_filename, "rb") + try: + while True: + self.filepos = f.tell() + d = load(f) + yield d + except EOFError: + pass + f.close() + + def whoosh_schema(self): + ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None) + storebody = self.options.storebody + schema = fields.Schema(body=fields.TEXT(analyzer=ana, stored=storebody), + filepos=fields.STORED, + date=fields.ID(stored=True), + frm=fields.ID(stored=True), + to=fields.IDLIST(stored=True), + subject=fields.TEXT(stored=True), + cc=fields.IDLIST, + bcc=fields.IDLIST) + return schema + + def xappy_indexer_connection(self, path): + conn = xappy.IndexerConnection(path) + conn.add_field_action('body', xappy.FieldActions.INDEX_FREETEXT, language='en') + if self.options.storebody: + conn.add_field_action('body', xappy.FieldActions.STORE_CONTENT) + conn.add_field_action('date', xappy.FieldActions.INDEX_EXACT) + conn.add_field_action('date', xappy.FieldActions.STORE_CONTENT) + conn.add_field_action('frm', xappy.FieldActions.INDEX_EXACT) + conn.add_field_action('frm', xappy.FieldActions.STORE_CONTENT) + conn.add_field_action('to', xappy.FieldActions.INDEX_EXACT) + conn.add_field_action('to', xappy.FieldActions.STORE_CONTENT) + conn.add_field_action('subject', xappy.FieldActions.INDEX_FREETEXT, language='en') + conn.add_field_action('subject', xappy.FieldActions.STORE_CONTENT) + conn.add_field_action('cc', xappy.FieldActions.INDEX_EXACT) + conn.add_field_action('bcc', xappy.FieldActions.INDEX_EXACT) + return conn + + def zcatalog_setup(self, cat): + from zcatalog import indexes + for name in ("date", "frm"): + cat[name] = indexes.FieldIndex(field_name=name) + for name in ("to", "subject", "cc", "bcc", "body"): + cat[name] = indexes.TextIndex(field_name=name) + + def 
process_document_whoosh(self, d): + d["filepos"] = self.filepos + if self.options.storebody: + mf = self.main_field + d["_stored_%s" % mf] = compress(d[mf], 9) + + def process_result_whoosh(self, d): + mf = self.main_field + if mf in d: + d.fields()[mf] = decompress(d[mf]) + else: + if not self.cachefile: + self.cachefile = open(self.cache_filename, "rb") + filepos = d["filepos"] + self.cachefile.seek(filepos) + dd = load(self.cachefile) + d.fields()[mf] = dd[mf] + return d + + def process_document_xapian(self, d): + d[self.main_field] = " ".join([d.get(name, "") for name + in self.field_order]) + + + +if __name__=="__main__": + Bench().run(Enron) diff --git a/benchmark/marc21.py b/benchmark/marc21.py new file mode 100644 index 0000000..9a2bb9b --- /dev/null +++ b/benchmark/marc21.py @@ -0,0 +1,297 @@ +from __future__ import with_statement, print_function +import fnmatch, logging, os.path, re + +from whoosh import analysis, fields, index, qparser, query, scoring +from whoosh.compat import xrange +from whoosh.util import now + + +log = logging.getLogger(__name__) + + +# Functions for reading MARC format + +LEADER = (' ' * 10) + '22' + (' ' * 8) + '4500' +LEADER_LEN = len(LEADER) +DIRECTORY_ENTRY_LEN = 12 +SUBFIELD_INDICATOR = "\x1F" +END_OF_FIELD = "\x1E" +END_OF_RECORD = "\x1D" +isbn_regex = re.compile(r'[-0-9xX]+') + + +def read_file(dbfile, tags=None): + while True: + pos = dbfile.tell() + first5 = dbfile.read(5) + if not first5: + return + if len(first5) < 5: + raise Exception + length = int(first5) + chunk = dbfile.read(length - 5) + yield parse_record(first5 + chunk, tags), pos + + +def read_record(filename, pos, tags=None): + f = open(filename, "rb") + f.seek(pos) + first5 = f.read(5) + length = int(first5) + chunk = f.read(length - 5) + return parse_record(first5 + chunk, tags) + + +def parse_record(data, tags=None): + leader = data[:LEADER_LEN] + assert len(leader) == LEADER_LEN + + dataoffset = int(data[12:17]) + assert dataoffset > 0 + assert dataoffset < len(data) + + # dataoffset - 1 to avoid END-OF-FIELD byte + dirstart = LEADER_LEN + dirend = dataoffset - 1 + + # Number of fields in record + assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0 + field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN + + result = {} + for i in xrange(field_count): + start = dirstart + i * DIRECTORY_ENTRY_LEN + end = start + DIRECTORY_ENTRY_LEN + tag = data[start:start + 3] + if tags and not tag in tags: + continue + + entry = data[start:end] + elen = int(entry[3:7]) + offset = dataoffset + int(entry[7:12]) + edata = data[offset:offset + elen - 1] + + if not (tag < "010" and tag.isdigit()): + edata = edata.split(SUBFIELD_INDICATOR)[1:] + if tag in result: + result[tag].extend(edata) + else: + result[tag] = edata + else: + result[tag] = edata + return result + + +def subfield(vs, code): + for v in vs: + if v.startswith(code): + return v[1:] + return None + + +def joinsubfields(vs): + return " ".join(v[1:] for v in vs if v and v[0] != "6") + + +def getfields(d, *tags): + return (d[tag] for tag in tags if tag in d) + + +def title(d): + title = None + if "245" in d: + svs = d["245"] + title = subfield(svs, "a") + if title: + t2 = subfield(svs, "b") + if t2: + title += t2 + return title + + +def isbn(d): + if "020" in d: + num = subfield(d["020"], "a") + if num: + match = isbn_regex.search(num) + if match: + return match.group(0).replace('-', '') + + +def author(d): + if "100" in d: + return joinsubfields(d["100"]) + elif "110" in d: + return joinsubfields(d["110"]) + elif "111" in d: + return 
joinsubfields(d["111"]) + + +def uniform_title(d): + if "130" in d: + return joinsubfields(d["130"]) + elif "240" in d: + return joinsubfields(d["240"]) + + +subjectfields = ("600 610 611 630 648 650 651 653 654 655 656 657 658 662 " + "690 691 696 697 698 699").split() + + +def subjects(d): + return " ".join(joinsubfields(vs) for vs in getfields(d, *subjectfields)) + + +def physical(d): + return joinsubfields(d["300"]) + + +def location(d): + return joinsubfields(d["852"]) + + +def publisher(d): + if "260" in d: + return subfield(d["260"], "b") + + +def pubyear(d): + if "260" in d: + return subfield(d["260"], "c") + + +def uni(v): + return u"" if v is None else v.decode("utf-8", "replace") + + +# Indexing and searching + +def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, + glob="*.mrc"): + if not os.path.exists(ixdir): + os.mkdir(ixdir) + + # Multi-lingual stop words + stoplist = (analysis.STOP_WORDS + | set("de la der und le die et en al no von di du da " + "del zur ein".split())) + # Schema + ana = analysis.StemmingAnalyzer(stoplist=stoplist) + schema = fields.Schema(title=fields.TEXT(analyzer=ana), + author=fields.TEXT(phrase=False), + subject=fields.TEXT(analyzer=ana, phrase=False), + file=fields.STORED, pos=fields.STORED, + ) + + # MARC fields to extract + mfields = set(subjectfields) # Subjects + mfields.update("100 110 111".split()) # Author + mfields.add("245") # Title + + print("Indexing with %d processor(s) and %d MB per processor" + % (procs, limitmb)) + c = 0 + t = now() + ix = index.create_in(ixdir, schema) + with ix.writer(procs=procs, limitmb=limitmb, + multisegment=multisegment) as w: + filenames = [filename for filename in os.listdir(basedir) + if fnmatch.fnmatch(filename, glob)] + for filename in filenames: + path = os.path.join(basedir, filename) + print("Indexing", path) + f = open(path, 'rb') + for x, pos in read_file(f, mfields): + w.add_document(title=uni(title(x)), author=uni(author(x)), + subject=uni(subjects(x)), + file=filename, pos=pos) + c += 1 + f.close() + print("Committing...") + print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0)) + + +def print_record(no, basedir, filename, pos): + path = os.path.join(basedir, filename) + record = read_record(path, pos) + print("% 5d. 
%s" % (no + 1, title(record))) + print(" ", author(record)) + print(" ", subjects(record)) + isbn_num = isbn(record) + if isbn_num: + print(" ISBN:", isbn_num) + print() + + +def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True): + ix = index.open_dir(ixdir) + qp = qparser.QueryParser("title", ix.schema) + q = qp.parse(qstring) + + with ix.searcher(weighting=scoring.PL2()) as s: + if scores: + r = s.search(q, limit=limit, optimize=optimize) + for hit in r: + print_record(hit.rank, basedir, hit["file"], hit["pos"]) + print("Found %d records in %0.06f seconds" % (len(r), r.runtime)) + else: + t = now() + for i, docnum in enumerate(s.docs_for_query(q)): + if not limit or i < limit: + fields = s.stored_fields(docnum) + print_record(i, basedir, fields["file"], fields["pos"]) + print("Found %d records in %0.06f seconds" % (i, now() - t)) + + +if __name__ == "__main__": + from optparse import OptionParser + + p = OptionParser(usage="usage: %prog [options] query") + # Common options + p.add_option("-f", "--filedir", metavar="DIR", dest="basedir", + help="Directory containing the .mrc files to index", + default="data/HLOM") + p.add_option("-d", "--dir", metavar="DIR", dest="ixdir", + help="Directory containing the index", default="marc_index") + + # Indexing options + p.add_option("-i", "--index", dest="index", + help="Index the records", action="store_true", default=False) + p.add_option("-p", "--procs", metavar="NPROCS", dest="procs", + help="Number of processors to use", default="1") + p.add_option("-m", "--mb", metavar="MB", dest="limitmb", + help="Limit the indexer to this many MB of memory per writer", + default="128") + p.add_option("-M", "--merge-segments", dest="multisegment", + help="If indexing with multiproc, merge the segments after" + " indexing", action="store_false", default=True) + p.add_option("-g", "--match", metavar="GLOB", dest="glob", + help="Only index file names matching the given pattern", + default="*.mrc") + + # Search options + p.add_option("-l", "--limit", metavar="NHITS", dest="limit", + help="Maximum number of search results to print (0=no limit)", + default="10") + p.add_option("-O", "--no-optimize", dest="optimize", + help="Turn off searcher optimization (for debugging)", + action="store_false", default=True) + p.add_option("-s", "--scoring", dest="scores", + help="Score the results", action="store_true", default=False) + + options, args = p.parse_args() + + if options.index: + make_index(options.basedir, options.ixdir, + procs=int(options.procs), + limitmb=int(options.limitmb), + multisegment=options.multisegment, + glob=options.glob) + + if args: + qstring = " ".join(args).decode("utf-8") + limit = int(options.limit) + if limit < 1: + limit = None + search(qstring, options.ixdir, options.basedir, limit=limit, + optimize=options.optimize, scores=options.scores) diff --git a/benchmark/reuters.py b/benchmark/reuters.py new file mode 100644 index 0000000..aa20c74 --- /dev/null +++ b/benchmark/reuters.py @@ -0,0 +1,38 @@ +import gzip, os.path + +from whoosh import analysis, fields, index, qparser, query +from whoosh.support.bench import Bench, Spec +from whoosh.util import now + + +class Reuters(Spec): + name = "reuters" + filename = "reuters21578.txt.gz" + main_field = "text" + headline_text = "headline" + + def whoosh_schema(self): + #ana = analysis.StemmingAnalyzer() + ana = analysis.StandardAnalyzer() + schema = fields.Schema(id=fields.ID(stored=True), + headline=fields.STORED, + text=fields.TEXT(analyzer=ana, stored=True)) + return schema + + 
def zcatalog_setup(self, cat): + from zcatalog import indexes #@UnresolvedImport + cat["id"] = indexes.FieldIndex(field_name="id") + cat["headline"] = indexes.TextIndex(field_name="headline") + cat["body"] = indexes.TextIndex(field_name="text") + + def documents(self): + path = os.path.join(self.options.dir, self.filename) + f = gzip.GzipFile(path) + + for line in f: + id, text = line.decode("latin1").split("\t") + yield {"id": id, "text": text, "headline": text[:70]} + + +if __name__ == "__main__": + Bench().run(Reuters) diff --git a/benchmark/reuters21578.txt.gz b/benchmark/reuters21578.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..cdf0677f71a6ac2e7cb5598a3245abeaf4ad5409 GIT binary patch literal 181938 zcmV(lK=i*KiwFon_F_x|19D|`bY*gLGBGtbI4*Q}bO7vq+i&B@)#vlK!2S<~d6{lN zR&=+W9W)@zvaPW!c_g`=&KpIRB#tRk;l&>3um8^PoT?%z%adgHVX?p($+RsM>vHbb zV*B#qywz)6{MTsy^f;fX(bG!NKlkIw-Eg+No6OW=Je;cWay49y)$?$%7|vGeYCgMG ztDRPUQXG<4(f@M#tJ7EeC{{s~D*cZl%-_|Um!@8vD_x|?I*9|dNm6yM!<`pryD(M@ z?L}&w0Eia>YTOza-j}>PK)giUaAB27UnxykiP5jcVQfa zzL$qd+)$DB0=4&IZ>v*v?S*fi8q@j*>no7&H}rZ~*;>PW>>`>X3%M{;>E(yQSXg z&jZbr?PziJwhR58O7$VtnU3>JdGx5>hs9nc8`Wv{yS$&|m7d6xz|J}Q3IN40llcSe zo8Y880{6D?0Dn!NuHSiS{Q;S2SccAu3Um)cXg+RZgrz0!yAI$stp3nqYP)fb1>C z#i1922z;HXq{sn`0|5t!%do$9LP0>-q))skdRK|>7l#ly-|1KeL|zg5JDgL+`i*AT z=r|)-&|h?FoH3)BcRD4sB`Gg)8 zk3!3VaJ0}K23vb0TK7~IZsTx66i2lE8v1#XzB9b}xPB7nVO-$EggP&H^6~L)SaJeNK3=AdE9*8e!F|UYnIR}Y|L*&IeYhZVwpBxB1M7R(T z2ma7|CE%uc7)9EEIVal9WFJ4xi_{X7cLe;M4pR$%TAF4M3OW=iu^V6zc?9K6lH|K2 z3VtfVWk6(M*#Npq!SMy3TQp`>>UhB>3G6A+3C>D#C`qvtjqf#Dt(Fi}RuT)zzI7Od zS-D{lwd0BAR|oAYZ@UY;&q!2YNVv=rk_Q>>=^Y!PBZo1qL82o!SmaZYRAMf9V{VAm zaQEu{lSB&WBPder#v_4bS-PnvIZYb_Hgpt=XcB|qFgjjtKBk@gPST5|*e;=GlDD`* z`n)}PN*LICCDFAe(uy>;2B63EC39~c9)u1x(NMH$n)&4!kg9;uAV1I~2Lj%B452{F z1M)iEhcS)MAnw3JcJD^|bS~T{>%_~M%=P||Tkd6|KRIQCcXosgIV*8M0V=LZ)aBE| zWHo=7lkl9~tKsBfsfH`{+tY+Z>0&&Ze;+S?sGG(7K|PEYw5` z&ApJd1F09yln6yy&=E5At9M`|?-L?$k}8{!=!{^z##A1(7U?hWAb|i+?sW=n=``z| zhT2>vTUot9q!AMvkx(XBc+q=?)!cdKnfKzIt-XwpYIog`$%q!2m@};8i4^K|o0r5j z#4ALSP;m`e-)Q!7?FK~co`|tY+FK5M}QDWecpcNAR=juM$(f?`1tRT=6 zw+O0<{WW7Hi8nHEM}P57Nl7TOhWr;&KB?XUX8?y~aU;f__0JJ;0x(4M7RK&-{JsfO zpfU1Dj^W@yJ4~0B_k%1qdL4tQAO*LkyZGm zt?+Z^NQgvLE;3keNdJX8_TO1L@(iyePU&&|#{nZMkFSZv$ZU#r{+6VCJS@ISUfvVh z0tu@_LKD-jNrZdQ^{EyOY0R-yLw>z#n*2?BFD2SekR`MLv=*c`WAadc54KEWp!=`7~3aCIHLtXDIi>`8xD zYU>g}+f*B4L0Ad$U`ronFr!%11Mq@H0Fl@y(;hL722GhqqXG+HW103zW8@(T;dMcZ zjOG#JQ?s>R92iIefs}aF=Y8uu6k8qU3+rSOpcy_gv%iHIJpeGW4aDGtl-tv0tmdaf zz5h(>@4bLd0$${AArTKWt)^|i=WKX_oxnUzH6scLbBhWXn4$(m^%8^wuoGkcKGI?Q z2K{zL1}0GwB=84%Q_FBq-z|wscA4eoA$_zxVjsfOUWhXa_bdRgBPLN6zzI;C2#D2F zbJo5q!=HD}5teZr>7y&xgq!eiWFa}Fx!a)v9 zOj1E2T0+y|RRV4+35WKiNyI~BzmcRMLWq*&MbN?OG4g27Bn4k+U}m<<#IVrY!rE{=)2duQo)+l4?*J9OT(l9WC#t!LC?fgab!dqf%nZLc}wC$G0sZeLgrXr+Rm!e zCNlAfz>4(!NbksQ?I)qJMX^_7ORXV03GPIl-Wq&IDlQ8lzsM}eHT1P{bx%qZl1hn% z$&lOupB^0at)#6m3(h~b)H3vm)~q!IK?ap-o?*b=5(8vs%UpX~lGQRk(~zv#63jb9 z2#82osvdC|ovcJG%PpD%IxBzELc0u8c1+TOl#SS@L7fO_)Gj&DYMX*hU(yh%L$|>; z8iGP@9GDC;!8VXqX5>bQJ zs^i&3WjSL;AfDktxZ>O|*arc+eilF|&YM5+wx5`+GN@6(R>mG!S~0wMJDbEO%B9D7 zs8b{-sU)jfyIZKPSg?iY69Od`1H{5Q-qKEEd?E&hFc*o82OYj7Rhp9E6oWt_K(J0o zx(oUl0S#Lob*dMD7Y2?;hrS|WxRYz2HCZo%a+-g4u{k|zf&i<<(=lU+GK zDgw(YRWH^Hnbk|%+@NX#0rkah=BxM@Y&{ysj2M7~**4T4EkSy+Zs`3q+!!k$Z-`y# zIZKuu-gO2;#)zEXtp_W9Fk9M|8bpcz0Y}tldsc7Y1H6I%HPkta0@)h#XBuvIIY*ih zGlWDv6~=vP-Z$05d^Y}}9)^qi@k(8fZziM3O5MyCR$QGl0ASSsXDOvs{z?RAL=Lemn@!KMQh!6qaV#XQVF4pPr5 zuzss=ciJ${unYwE6WJFj=_68xCJboy&$FbO%{6o6_GJ$f5hi00zj10jLQck}Sq$3t 
z!_>D55PJphk|_I`eu-E!6S7GZ(Ef>f%`$AW%8u<)Z3kU};Uk&e5!;KI&U3>8v@9u* z4NV7kjf@~oEU{+IFJf9Yx&>eV9$no|zK>_B-*tpETrMZe z)o?Z%|0zw8x)xvk1HCXC(}Z?+WT%mKVnyZ1D`oAS!F9Dfx!Q3YhI2((8PPH>xQ!Pv z-Ii2rCNw86X}a4gq2m-k5(%LOrM!lj@o*sz4FgmHS{h?5$KMMD zV&kcVgL{K2JFSJxp&`f=zC7!i%ooFweBJoN;7wAbGPd3ROM756;eUe7o2SL_@gQAyyevfi=VCT(XN+4me1YN-mcfT>jY}i9@l4JUjNECLd-f1@WU=k!YK8PoxW@%j5zPNU{U<5s^1pr=KaA#! zZ)!Auc$|(`xWhr?agVPY;jkaXS3_@SEzb?$mFqg_;p}%(M-vt4^cex#Y~wYlc94rrM3x zV8EY9CV)QgUp57h_`TBt-=wi^XPuS_5mos*{Z38o&X}%i& z74|n-fA1W;*duG5a5nWV)&dGLs#8Wbk;UXwYlrsOg4B5)Wcv0tX;FsX^0|DzyX$pV{c zL{cAc0XC8NI3{a|fL59W1p=<%k#G-Trr|)6eM@qi=0Iw5Pcn#nd`xM^t9|$f1J>gJ zwGH7_bM|J#F8g(2e?$#2b9;(dC(=)L%k5sL!WF3+bd=A<2NDEKVmlCQ#Ke!-Q#BjX zG}M#PBL0r|O13u{9tZZ1;i1GCOlr2ZX#!Dq92`CzqEI|Ra(cFLBCaY}iB{i}k3g=h zPn>9gA$B>3<(M2gQWHUspI^9B9OYD(cHc)+VwD(5lheuN5|Tab_sR*a4zGc<0p;92 zZw|_;439l5l8H&f;zYJ_mt-PP*ln_O_3ltCpgpyU;HwSMWe(cw4?a8TP3=iHsS9g} z%FJxy8kV04y3)LZi^_qm#)6{s418q1--(s!$LG6-xH(rsr-=(V4X_8>zEkPW$o98|Gi^-ncNjgoxH)vj3iPgL4 zH#<&Z(G*e)(&;t3PN;Rv@1;ybq#8oUlNs4SP`*(DoL37Fy$IKZ*twG3t+ELXxzX#i zKv@E_A23au@5^>8!>-BK+^=bV2*OVQG`^kB0HE=R;P5cN9xqTMwU{i&r1NC{$Hn|+ zLf>8&IZVfxj1!tho}7IrkdAL>&EfJ}H77o3+L9&{*{1FAE@G!6*gfwub^)So5k>*$ zk}77Hi{E&}(E&^Q+LU%CxIEaaF*(@8NMTPFQjDa3)eH;#`S8c%!u4F7Vi0ef@n2YTfyP+ zC85QU2~k(&N=Xp~5{Q7m7DH4?{7vj>oEf@JGEX)_iFi0iDN`HjSQu!r4$=Xgo={fGonymfc)?dvTlGB<$Y60Ad|dJDrb6+VKNT z!v+Bn0AQLCxrQ4tXfcg5`#Dk{bGB)TLP46|(q~v9FNXe3z{CQ)4o;9VAJMLB(k}?e z(N|`tpO6xGg!TC_j*Cj%y)u($zu{niR;+*5B6m4vjd~oAq=jT9jS$A0s4#I=kSBD- zjwur{LfHcyJqR}&jVLF9m5*xtUk%?ru-dT--qIKTGwu)$icM%yMgkBrl22|Wg*dRN zM)-rnmc|Aw&%&62X{E~|1fQ!Rgm@$*QjuaLp}Tbv_5rSy8DhPagmVRCNqs^osfr-7 zklSMVZJ5-f5Eh7BtTL7$V%FKx2Tr+*NaQLkXU^CVlW%Mzd`q+z7%k3R2(fnOZ42!v zS2*1>Y5%xjg<_sADbrar+7!W@;%Kuzh_F!sPm}E5lyQ5!e~AUTxmn5uT(t_LtO_$k zf!>JnWZMr3?Ni_lYaQEX%AXkBC01dzTt{|O^BXenO=?`s5?;hcrFHmFB0?K;b#|FH z8|P58P#KLewLff}Tdi(GRH$b>xVn@RWFt=PTZ+QS_zX$aLI1q2Eg?OUBrqwa^!VtN zIcXt4%oR10dD1*@pHy)*!=d3k?{2d+1l2mNDLq7HlPeljhGdgK#A8-w_dY?B4TdPw}XX^ zsva0hbZ$D%#is|HFYm|0TPcq;G(DM(aL)VNZ>qgqx%dm5j`J3q_GE1i&iOYRMj14E z?SYNR&@cUS7dsI@0)vDIH3W#xy+3OCO!lCdZ?WZmO=w<2kug!mt67 z-eTJT@PFyFTIVJ8%9l3(M8ZMqJ+nVyt<$nHn`9pIQ(Snql$cdLYE4tNBz`Son!jOj zOFv^Sw8xAnq(y;~n9T=N&gNW7;EK=c?HF<%>637I(l{nl1decwY9&{7^Q;51j|1n(cxiU6^ZZ1*`wIkVmq45MbIG<61P=l1*RN7iN!r{HMnZ}$; z=aAEW2E~XRP$?q90Uc7waD|w)MJcxPAUh;4nsi+3Dy1AqL@d+kZS6+igkXnE-Zv@LDe!Mp1Yss7>{}RAk;7s z_@ao=^dMAbaI|N=q1@vLhD6^p)R{tXR-SM+J4(n5wgT0??0$$}K`moH%-|IwcUh)E z|HCsx)hM1gIzY30yt)-8q;|7cFP0|#ZNa`N*ygmG^u$IUG<*FrH}TUvW?2kLma%je^{Vxf z5}gtlmMMRiBbPWXH;FO&^s7|o5$?%W%6Hc7Cx7J?RuD#2A)+l8af-R*q7OuENZIJS zfa?=XusJqeRjBP{@6ptDB((LID@jNX64e111{wzUTpqpWz>qCpuAu$RJ7!mheiT_E z((%G+C$j2IwP{44^ii%%%ds_)8qVirqVyj{62UTs5GPnQ2m_Shz)06ZlwnC>sO@?1 zM|AWZ3&#hYq5(lo-%&YjQ%MgJd$j@@rzwRrm-}FtqKUpX4YBk z{t;P`5p{-9dlYNI3?y{dn2?y5eR@-hQh@@(2@D1U0YS;%nW#4Y0%!PHABHYg|I zaK@_8?qo-pz2byQ5;MSpn$BsI5&@-!5f_|P~h$rD)sT%3^jyoPFjm zS(5demq+=fmGsu-<+gtoDvDWBlq)Bk?Dh^s)_*Xu5kR^}`p8vlL5m1W`;jzw)-?{i`ZE~1=KUo&3W)m^uZVq-xU00yQ z%0L(;%o}!UXS`v1K5SPIGn2iy*}wdDlUenyDKx=@WVtbMNS^y4DFJBVAWaU^1%OE1hdr+KmbC$xB8g3TWI{;fL9V};i2vE+YRsHQ z?X}L?oV} zHgCf2KAAgcT8sn92tavYTH|41Ix&zrreyboV$hJ7+=o7Qs z&=49#X|)Er=foJ@ReK3z;=u(u0T;-aEGUrAU9mfR$4+)08Q`R`7%QZL}~ zXAfNe(QSVQ%Ej$?wwlbw9Krs}#ksm&3@0-+pG=oya_bl4+sSgZ7y_0{H6+s(2_;12 zA+Oim!A1B&gBZ8}Z^A9tW3f|gr}QCVNyftkqeF&V*lZ&ZJd^Ny5=I$|lA1-Cm@v*o z!x)yCX)KW5-W=Uw^y|m_LY$w)^OQ*)*GMU!OK{#8v&QsVjEC3i`EER1Df<)C7C!uH ze|s2C{ytW7o8iw=pnZyQw<^I}JVZ<7zyd z)gI}#Y`nAU%Fxa`E(W@Mn%zw2&*KGS^Yf#$FB@(1$f$G`EH+Q#&PK6nSgv9VL{^RH zr}arI&-edwF_7nr$`qFklU4zxrMQg(mKW0i69}P}(`7LGnFu~d9^lv*Ty`2gzR{uA 
zXb;-W*4NK)C*!-VezVQv`;AVkWyhaFoIKG%2u|*d7_tYIgHNUX~X;G4% zPa|+Utxa-ZI+;yqC-s>cYRH9pD-i5uzk@r3p2yd-@$!23Lk(xwgt|`qTusq%u)2F% z;PZweF>*a7a6BzWTsbO;Lj3o595j0W)n2`z7) zAqJ2in!oZRGU;k~HyNs{De3#69q)#ne;mK!6zO6y|1n>F7=LkMJe_(nt)1ad=l}JI zP*y*4x&QXZ@yp?CHB^iFa_GJ|JF~wHK8_#F7uWNvVQv2IN%*uY^WTiCAImcGh*_BW>YXHvjppS{09bC}{-*{n*M=M1pFrd+8 zwZ|1Z()O?p(*WjyF@f;?@l^wHR+tI%+_S1o&Op(K(Y=^-95BUi*FQ{t(E;@S)jlu!AGW#C%Te7Tm5$S3J`SL{DR+!0G* ziD^}kG2e+QJTw6i-t)re_KPK1;j_?BB2SFT$3mwtC8AetYqIBF?rx^4fUG^Z_#DIr zH$sMT;>x$YlEr7G_OC{3y;hBWP5Cf8LY-Fs#Db?e^l3?c&SE(leN(q63_(#Xa?ID$ zg(7i54-C^H`{>`Gk&-qQaH}k*6HHJdHV1k}q8D+-x{qY-GZ#8UnK?*-t)VhU4#wA! zKJ0+TO2v3?IyzBD!&T=FrS+m!CXZNe6ZCm%?yyexDoFzgEkD?J=!ZhmvPp(Kqrz2M zm?txL`^52v<>oh2bB6#tvWW-NI7@Kg4nFQ!iP=`QrJY*ZB0lq^mczwpuAUy3QbIL_ zX~iauFvDmC<=Gj?@1&iGX#AvByMq9NaFFeU#U)F~N!W69$(2jr;wD6(jq6p`@0@Jd zdC`Wu(3rG%CEwgy7_A#*Y+T`_5%^ikHGFb<5Fv~bSk~p`Ga+t@3prH#>1nw*iA+a6 ztaOvzJfXOOv>ca2Kaa=vwjP{NQ)~9y7f246NI@wz$R7T9snb#Z!$sWqkljJ^{Ji#v zWCkW>$)#_Aa|z?|=aksFS^kv^bDa*RFOdAzaZX1Mp~Nnq-|OFFHMw z$eRs?fZ;nA{bsi|L%Vg}?6!}l@VL+P+|@IgDkd`%>Qo2@@GkoG-E_(IKZ>ZI#G4q_ z6Y@vMgkhha?7i9YS>3|sojt&$CI@OnrZ%nueTr=#>rB6qpT&7lSc50Sr9;x@HITXy(G)!1sk|5 z00n1B!8gN|$i)=^90211NA2T@2sAE?vK`y|ko!x`#u`q_n8Yvb9=psoVKvIY*Kx9* zH}OR2Ei#dF?E6LA3w1UGL(cdiU1jrkQY2;}=pMcg6llMz z!mIBjeqLgCQ6aPny2^@_1O=G|AsxyiY($}F>if91M=TlK8)!yhr4>iBnME@7WjI=$x=pxMKQbwX zy8ZnAnxrMZIR%w6Yxs6B}hCE-k`oii+F;nq^mC+*6eh z)>m~Vy2qS^j%&JGB75D5+u;}N;j1mxCP5@63u;uJ6Mp*WE1g`jGcpxd$(kxB5vP12 zQxDfQJ5Z|I?8TY5gw1lL}&{ebD zwSYMFCqQadz_D9yGxZLpbjDm8UbByDTgW;4-mFLgGs>oSp>k|74*683>toN2JIK~% zR%OT)Y;vn70l?OVaXK5CwN#1jc5*Jj@dBjzq(DRxN|z0HdW`Z5Y4~-ZaRrcNLm^L@ zNu=mkFD~RhrGEphD^}wfTwaOe{s~s$0f+G`r>) z`mr>D{3QpN=l4J64fZz>MqeHwRMhL*?q2FTmAB_|U+8#FcW0H0EZV(((_H1x>1l6c z%NPu*_-ur)7{%$;A_;-luYA>AS$=|%T-I7CC9$!q>@D2DrUo(&+Lr__mkOE^Ouo<> z{Y#ue=2p`$w`U{)QEGv^%Ta5=wj3K~l_Cf`d7(RWNC+GTob_*r0!wWNa%N<&rG@-*a5D^e0 z&snP>#_cE7djCML-R&kZ-$QO2q0H+l+-0kHl3-n_`v<5|Iv@pIm6(}psj^K0uBqzt z2C9nRT-?;2vxP-ci z7)@=JNiN6gkTa*9UbuaKLf{f-oxNGWJ%- z4Xi+w2ci_ZJuuNToXl3Kh`GAOOc}1za2H0jyXH=u`iSZry~-TY=fyWLaNp)1rrC=x ziRd*hI+fTj+a;A^Xo-0;>AJpYTTm+(Aabdo=Q`cuOn(N}@$?pPwb9dJfhgK=G@3ur zzssk^PJvFBfYcuI;=$K~hcPdrI)k(V- z708sg;RhP=lANV0Zp>rjP!fkWQU)2_Z4N54A}-J_W3A?LK6A&EL|R-YSLI#^)I)Jd zl+0*-C}VJL4;!NIR^=iMvwvap{-AOIceBRlRQU9urb9SAq~xUIkc%g7VMUv_IxQKn z_eA67=5<4l+yl$++~J;GqoCx%S+ggo8@6MZu+WiWE|oCve=;wR*_rm(;+@DK3Xa~? zG!jj*ENEhVL~Sx9t^&_=ghF}_|9PrQMy`E1lxx{TT(9;jq^5c_QZW=-m8eIzWh;_6 z+sScE*FRSsa@n(ejbuUhMRxGX&HavaNND_a-<`LqR_Nk#r%$y@7D*K?-}lJ(K65pn zfeL`TP4QM9DTL#OlhQy(L0_ZSH;WU~SFXzwzLQ%oOvn?xekBM^#Wl|<(-GPHohC4s zJarnaUjHxj-*Y=ZCoL;m`hS?d;{U4z_W$YY-=?oq#=4;|TqU?%K8@={)u@z@{ct38 z)F53UrV?m1N?~fM6Ns19mhDOyobZ; z{ME(y`+WKx4(n(!zMibasjuc_K@oL)U5f?REs3vWhCXJpuBFwzLAmJZKUZ;Ue)7mU400b zdB`H}Eg2rTam#l>m!gpFVn=w+cPRyGAvu}p+{w{i)zOVc751h3)Uy+703 ziNzh0pXmGVlO=hH^v^fExNJN@h)pC#urQ3JI5G;_7X0> zzJ*h&b}`q&T$TX1drzI8ZK51UFA3qwfmPwZK#2_Fe3!)1&#`DuiIBW^0gQAe6~Vg= zn3<4m`{LszFSE329+x z57+83)&=yLpP9bMleBO%c{p3nzYj+eE)(OAsN#D3@W|N~qXPoxP%>C1`+b6}1I@=z zH&i)Oqw6}Wl`Qp!{`Jqu$p@RM6=6Hs<|T~SEG3`$!YwBR|2wF(d-@NuSYdEx!o(aB z_?_j|tGUD3w$Ck#SltzP$_vSP2Bk7L%3BZb?jM_X_um+pzaRj+f@=Bl-|YQsa~fH; zH46Wg41cjJ0%3vfCmenQAtX~qpcbgyt`8?71t}@pK&nZwtorq5jWOq3Yb6&~b?@gr zah`LkWACypLFT%zc^PwzetlH`W$n`+h0yXqf)SEtORHgpP7Pge3%wE zES}S`81jwAe(mTKBu>eL*eWcHesN|fQUlX$$=W|0al(7={x;605+wAa=lShD-7_1C z$^3>1?P-}byZ-5O?}Br};f`aU>>|k;e5`4gDRbMp2B5=P_AJo5b+&z{*B-D4;_gP# zkq2hsxJ##9zW`%bN?y2|>G)$%JdE}*!e%@x4K%42u<qeaB*X_2 zoZuHW!e%dsFNAxh3_2dO%%{I(Pg51TA<;|9H!+;=)|z!g>j7o-(}gLesXQDMu#b

QAAjJQ$Zavx>-I|8T$1r!haP5WhF@gpnF?+fW`xA^Y*z3%5u?<>=xs z+#CPM>3)ohagl2AfJi$^HbYaYwDP=Xp8ZHcn1&&4^LsZ^>b(Fl9UeD!mr47kg`h7S zGQ@bFM23>$dx7a&z(~STt+l6M8er1pzTo047Uq=$4J#qj4L5dV$-_thiP@paN)x|O zSeaMAVyG-&|JAjTjb#QF)hL;M9FMu~AC>K89%(r0uvuokjAT*#~aoM!6w$&xpHYO&tT6ypjpX2@!+DTIZ zD&tK5tAZDIEQ6bt&9%`5prv0^N=Zkt?`Y;MkLtF~q(G<2eCjUUOJz$IJmLl9?3Th} z-)^-#|E#%_=sUqCZa5xjzUZV@=Heb!_rFw;N0u9B*u4p}i#GZ#s0J6ZK!WXeE*p}1 zc^b{M*R*xL70$J1zv!u8^#W(Q3jS9y$7cZd!R7oQ|3>JB2Sqkl2^3}V%%=2VWjW@~ z*;832Bp!I-YL=XAhnQ&So4GR$bgpi`;g0ECo0`>D)5^|>(a0IO?c2dk!V1k~Z!ftU zI`>>l?PQs~EkjLgv1BHn1<b#c3k7+__I3ZRF@!M*pj^M|*jA(W zyEkf@G@^er^;>rof7c?erK;j&i#_U(bEg(-k~#jD93?enD*7Zx8YR{7aD(2g43G*c z#VAetz7cDoXA3f?NT5zaon|5_97&Xpe~d*MH<7}0Q;~vJaarM%x9NN3fLg}dcx{z_ z$%AgetSFc}KiU<{C@CS1#mO;K0|Vk$i;k-VxU@Eld%==bgz3eT9!U|jWg&X0!AEvY zuuy4^S98bsMplmS$`ump8~kLiSk?2_Zl`xKMEt6s7|3^R)7WB?h3Q`Au6J8K;ks48 z_=9%cEzRQf9`qF){xYV~77UC6A>AR-tP{rAtT%1NJX4rxyd(v~|1CR8y!+;1O)1=L z0ww3G1p;h%QyP0{E5RQs;tg20u{f#~Re==cA^lz$#epn%o-4A(JZG`pqJBYxL4@)s zioeg4wnC7FZvR9K^`0GM$JX*TgLLOWVV-gqdaHFcS2{W%f$kT1_Tb(=1q~Yx_~=n3 zz(y0nG^3b^qsi*FlpeGOhja+(kC43<=aLLqY;%`O2L5Ku&tzVg8VWG!Sh&*=PR5;; z|6v5C8+%r)nZ?{6Tx+brUe3jb8dcX-388@A;}SK!=pFC55Hg-HsYYjLCLS9>E%k~J zi}1w*w%Zh(uwsT3#vm?&jEy8kbthqj*1K{~89doiYYHdYyfEfMd_+!fThQ^L=0|MD z@n%o;b8qKD{w&e+KZmz`KnKY1~k|_jh#0b3>a35$?t7^k1y)!=4BX8TQ=D`|S3KYBehB zLRhhJsFy3Kv)XWTl*^wU-bgjM^E>e+`=9{09Q1lh>}o&$(i$QpjN;Ra4W*?)`eu+^yJxlr z@{!)4e{tLG%dde4c8%gcRG_0fKoxucVvxB18{YNoMK`VCaKIu(eUMbm;{mM%3R~CR zo^$m&AP;b+^X=X5Nvr2Ra9Te~uCMWN=L+6+e^0Ik{oh;em5231_lx=tx3bfBcieTi zk7Q-XJ=l%h=g#eI(z>|l4xJgPq}LzL-O`+-K;BqP%K~oAA+Du&uk5OY22>3cttf@=I-{zN2ffKzN5H!P@H|1@n8Es50_4};vqhehPuX^4%=A>qIhPd;f+ zQ+Ka_F0!YoW;VaT71Hdyn6MR@(9o}oJj0&(l$WIjJ*v=j>fkUXCZui(HkTZpTg6vy zGcpbs)x`mxU(%exl95srf_|Cf#ExA>QmXOTc%^zFwFOq4QN_7lkuoN3Et7fkrKt95 ztTmEjgliy@Qih6L6ON8ajFAlA_I-9_aaJ`42m-%v~MWd92c1x+CDE*sq zbw2^VUZvM(&hwWPsvE%tG3Y8W<5@rFsG7N7NO=DKm8tRtDKmFZp+&i_R}!##EQr|` z&IQ_GYGW>R_i8n_zVdl>KLwcH(go&}DqmK}3s@zE5 z5Z=ze#Uzgv!gwySW$`G{HHA}#>9<&0mdo##>EoRK;zHtvX(;!sYWayw#fi}YfZi|E zF4@Y8q!S^-9_i6C2OWR@A{cq@uXMR*2^633a0bdAh zD~f-7`RxvNYz+ zN~}UfC~}_)Am$|Iw$LDjfdh0Ext&R(2yutESIuBGHTN4$yAF=MrBqS;SkM$?=9S}E zu>wEm6Rymg%sFq9*A4FI5#2NYN_c2-)7))b#Bh$crPoJVYWDp2z{_b$Sp{QK3yODT zqTZI8P#;gLQ&su&jP3F$l~F-arr6~=?XGo8o#1R~2*$`{6( zdre$Z*X}|^AG)TcoD2n-AKZ!ab9`_JY*?zHkDH=cye(B_>!(cof)q=q1xCnuCQ^@x zXOD|L!(v3K^sRoz-R*Vj69`P9$2k-OLyd5eb?;$r`a zc92~QPz~1v$gW1mBthzPfC8rxC2-#mJ)_KRx3wb|V=>C6-->x8ffv@F$y*xq3?2-( zyPlO(FoNZYCXQMDv4a5*9j(YAz z>;Ib`3)@{0mbh#5r(z{B*e=aO8OlB|f>Hb6o(d<|bQcR6kS&tMmvEwc;%V(9f^k%N zk1|otdG?9fJS8E3FVU=ym6(+ZJ$TeM1n%CFQ*y`{(8wUA z7Xeaw-n{_!$DOSMxni7&7N|K=z)s6l7;kxLS zQh}V+Pb+1Zar)VPZNKtMd#8|LueBrBvx9Fut2fSey|8((>xHf{$3K1FPnM9Z916Wj zf_I2m`|SvI*+>ky?fg~e?X4AWKIda5Sef`)c)<{ypg9D|?x#>cqu$84pt&DH0&igm zPjb~p(!Ai3ilA_Oo`HVS5)0>e<&q{D7j*XM*` zktzhdvH{pvUz<`HP3YZC{epGFv32bm@qbhz%E(bHWPkj&vNoNQIJMj1%_nE`2@I4# z7y?T(w)f(@h*xvMP+nt=_tmkq(XuEkX+{zrW`H`w-TH=?Pen|zE#h~uyUWm z0nKvl1v8%1+HDF46XU?}CRWRW(EI=?)RRq;Nt(Yta92V&R|3Hi=1WKY*0!1cPM*vW z(JiXr0tn5C)RJ+w+8d!g_e!BDjWqhb^F|)|$nb}`Q*?E7S5}}Gw1VuR?$4CWZZy+3 zPGs8ZmE(AG<4U~5PTF#sllKM8%p2#eKCXGNMcgXd9t_avNv0DzK z0|kH~t-LfoCj15V7V2X_WTDp;j!`ia>#A6Ow6RsOyu=kT*ht zn#D*X-&?Q4Y~jwgSt>D2y`pGeE_1R}WDI@5tb55xO|W`=L~hg%od&@Fo~8eNW=XNv zuj->fa7{vk;zcU>JG8Ad8c z^{_dLN7=_2%K@Yk@g+U?qDUs2tII7*=R6;4CMSx^I&}JneR)&itA^A5AziGE0nLMQ zscWXT1y)tUzkU?OHi?{Rm5W5CVDl*hiC=);Jr4spH(09>!SJSksS=6%ZrHL57KV?< zy|}ynt)=<^0m5Y)OkM@3$IC}ZzR9vG5q}#Tw|~5{om~5Gy{Qn=r)o18E}>t1Pm|#o z4k200SI$U4iN!-?bbLwC{Nz{iu?wjmW&b1@3aWN->^%+=%LM|R26muD8wE=(dB|vM6a|9N)@}qXI*#X`&d@T 
z^)pqZ|G5^|TYhvus-YKsNfV9*Td4*)oAslk4MkRM6x+fhF({(@O3<^@dh_gf%?&=S zA2jx@WDS)vQunO4OQU|czwxfdX)}7+OI9HRiC%nm?98UqwfDKZ?C@;uePra5F^j%_ zqV=scNRXFqoK7Isg0bAv(4#flxP#-vzt+ZP{p|Q`t&In6rTx`59tk;$Z=#Y1uQ$|3bGR#WTl*_hLhbsW4M;uamEE;-uaj%{-*)n)+aC(xPk9#FgZ}8Y z)gC49MtUX8l56+hv64D==i^`OkSz80!)2CDk!k-{vi3Tb;P7Y!UNs`wvc~04A>mgt zMNP@jTMD@EoqPI%iBtDVAr?f5Ax=JGQgM@Xkq<5lXF@=W#8Y=ikNV6PkKXo@5AI(`6_19=&8Kd! zdvim`yHs#KIz~}x#B&TF|(&W@XLAy z-#AOgd3OX>8uk_=47LWbuYRIL;z17yJ>fFBFXZe?moP#k^8{kPBB^33M*&5NI;$8* zxnuNhuev={BlR!4x7Wku%ct%L@!PA`EsGS%ozr)Bk)j9pq+CbJ*IdUJkC?G{sCxZ{ z@g1eJ$pYvU@B?9T$Cdj8J?3)tC?;>$-KweZ7g+I^*1sRUC-Dg-ZDL{Em@@Z<2Xuuf zhvW>~$Z{A7BHGPL?_47ur3S{QQarL~Qp`Pwrr9*D7uDez=MN^5i`Gkv^o}9cLL{Eq zXGwNMy{;?UhiK;#dWVp_kuVv^0>eQG<)1RTwA=0E>JA9#5wMe(tJ~H;lHr%`sQsyP zk?ifUIEJSU@&^;7QI)Hxtwbd_4d1y3{UWJXC9gw&AkJ1T_myuxu7F2hX4H@|c~y8x zGf{5fbXuN=Mwm01grh5ae3Tu3vi9icnfUrWcAQ&aI2Htw0D=r74&7_c5vApCFg^| za8wh-dosM~p_&@`0>7+Lpg`aNm_vjY`-6MnP5q|zF-qdVhL;mo4&T7NfEXFy`YdU) z#^JuQg~mjXjr{{n)Em7bC0SsSltws=n{SfM*T4pPt_U~UMBIqHj6kT)2&rA7k;v|6KCwr_*)+(X2Ya1JY?ltT0# zOku~5g53Pb+~+*qVZ4$~lS%|N+t%W#A9N*l-xCEwS`dBHG+tfyv2)!vH zqWn6Nr7mNVZr{;}7ERxyoLaKvlpe=Jx*Nw9Vo%-e6_Mnm6T;{i7#yc=!=uD~b6A<) z;`4@}A%CrF@j2gZ+c)s90gs7r(UMj;5A7fBudZ1+(Y$%{QB$G*)- zZ0^Pp$T##nDm^e6#|m%Yvj$eygzDT&-j-{POqZS@hZd*sNJim)nQE<%iyBcmX(Yho zIViNFK@3lJ`Yb*b!a8tY{V*(CZ=DT|WSEFJc~&lKFE$`v#Q+#W!KYfG{hSrgEQ4YG zEe1Y}Dk4WX23<$RTB;0!M|BCZXId5tU$M-V9#TU@;P$SHdqm+r46i!8DWdmfqbl)YkwFyfa5vKav|BbfvNq>E}IC2b@~& z3)f0>m5~uR%RWX`$-zAeH%HC05$B7EzY;DHY{ zy{Kfmw_2bbXL`c%>W@A;FXH;9*BPxNt&q5!R<$JK4pcd5+z5WjruWXZY}0HXp#n2! zi9?L20(dMd-47|;+!`_4ctBe#p0fx{eYF3lVCqfG^oc}&ST)BNFf_FZj~>o2#^>o`p1szR zPoS=eO5ACYRf)@?FCs=_?y(1MjAp}qt6%+qx@ z?76jf@Yhh!t)F9_nRBXN){{>wvM_|9;a~60hq08ze+GAM16qICZMR0wcT|Fs;b_qQ zJd8!cCdNcUzoWF^P0SY=DP8cBUd^IkBu)GJ!%Ak)N3l}cAtsxq%s2sab#PcaK5hVh zfrQTlbUTMl{dcId@xfsU7f1l!JKeIo^*e<1NLe1ybA(LQnvK)--%qlj1t&(y*3tXq zM(%2WKobVh?KAhcErko%bZ&>aV=R3H|AYAd>szhy7xvnJTe#5t4+|HroMGbr4qfE& zQ>jbte_72!^FOOuaNgzd!T<82g_q)IB@26vzgV&G^Syh?YO(Wz3dx=^vq}c3`}sfE zw4b^?kfT(QmwgDvO&%!0pLn>Z_2a0q+Y#I8ST#pHqQC=IZ9_;|A{Mpa;H)w+A)J^&`3BW7`~+GG zr*UA4Dp4tb_Snq8?TfJAO1Rdb1q>*5zw-sh-t~3owhf}to7Rmp9xvVD*z1nICLcZ* zFH@i?Iz+kWtne60wnCU9VSul3e!|&#a%Wu9qZcU~477a4E!pnfeDZvsogY7JpMCh} zp#QO^66W)a@)zn65fgbEg(F{`~ko(w_FR~mRw8f!tN>3TJPBxsp!5Fg-c zKk1Qb4>Ybbmp_jOe>ON*pC`k%{U!mxTOyqffW)@9spo zNY1||H=QBG8!o!TPHWgnZa_gJS;QO&dQbBR?AfAtrKFUEZNL@}!G2pT>emEuru|tRJm?+c!dR3Xz<0faZ%F zz_S(QruTCIf2N)u3p7?*`$pN$ISR$S%}_osG$1Q4n&p#6p=)f`4^OJC0U4RI(0}C4 z@yg~j_1qAy@R!#7X~iGkXE}%Y*W8_f<;qfE{tKv2DijVSpO*Kg&ri8BK}k0;u!m4C ziPcNu8x^zaX-b22P^yY*$sd$+nTRh9B6KA(K>eWa6WMOsOf+416G+<3wa@oBd zj=&n+>0b4d^8u_Tu(Nr5aJqmQSx!#0GsL9z*;1l;*O~d{frf+DN%3xMbdnIe;CJWE!#v z&`mzMlCwY9uka!wQfuuv?hk~d*^llqL~|T z&x1LLUkHTWI5I1QLCnml7v%|q)XkC*ty|2Qmd6E+(MI!Vi$#G$VHm{j9^GR9wGMYq^ zl%VnTYyGCaUL5-2*PHIgpihByOkc+x7Q>JTNp4}=f~N|B!b0$dfh)I=h+-wll#BsD zSwix6wQ!@jRNSr52~9F7Sk3}{dG0Udd&wIL&ri9ge<688IMd zsJ=vPQEg9v+F%xhU*!H(YR5ThBIYPiAF$$t@~yI9>;Na-G=JX9#dqW1xeBUtgeNX! zMs2D*p}AWckB>B#QBd=sGEm=b;?ie4D(3lLFV;=fW?=X#@OcY0eH>Ls3-7aWiZWqa zON@(P+80&?r?^~dF)%f(^4mPKb z4?qT@ru|yGe7ftW8ISO)&ioPJ3y1``3qF^HwEfW@C>X>Df=rL1mhX*7AtE~Wril6+ zGSvYm#ao@n1$Q3`E1ltJhCR6|REz_%)j^Kvoq8qb1_y!F1|O zTcs6^fCmyk(TRkWfQ|Fw5(y(3I z1#GTi`dx@a5kf57snROT!qjDLg!G_Kr{BtepT5*m)r}`~=jBQOGn^`opRTV}jl%u( z{YF}cwFTZwoFgC%VDqE_WJa%reexy z14bvD@3Gzm*9)N{V)g zw`*=xoHy=PxN%zy7OLot8(M

CSU>&>QrruhZ(cuWp@HiX-*>szQHO0mP zC9df0IBOgUT_KNSj((X5gP8Ffz;*+=_*JYq(pXTl+q=%52r-&* z$P!yE;_sA5d@T--*sUfS0gobb|B52dry>$SUu#i}6!B3d9H?qo`nn$7*1ix}9P~e7 zlo(uJv_>sA!WVtLv#vY0R|r>j`)>THj91L4mEF&l)TW4UD+w$58%qb8_teEA`;kpS zZZk(BvzNj~TFSSVqNv3Z-wI8Gck8S2U2;8>q*~=vng>Uf7(B1^sPs4TF5^fpKtRe` zF|kvPkv``=PAQHsW0o%fL?gWnH*FujWYhAg2KaATFhqc9Q8Qz>EMDBb{@X{JMhu&u zGLE6&oH|l3LWVCq@lsp8Uh7e5vU0`ZGnJO?p5AW1J-`MUeT3)l-2H3Fv-AeHRJRhZ zunpg6Tm)f3IX^KP;_m^zEf}>a`OPPR62!iHn;BMTON1`wFLLX3YJ)2Xc+U!G<_b)9 z9JEg+90mIYoPtuV!`)bZ6W)IBxA}o)bLinmd`~k_LSs*PISnt~p&c>y^1w{xNkF|? zP{DF2pNae#Bd}z=uATP?g}K-mpf^4Dec{TfU)51ycE4IY#vdj1Y4zE8Ji3+~v0lfM zmrs2H5!yEd9~%GGO4m@dVf=w43vxOQ{w~e>pxL)c7^`QTy)9Q^rB^_aG{?Pu? z`H0Gd&QKD#pAVg>-*6_=KSsAnzvE1&rv+>Ylc|rCXDOOLa#OU z8?~dO27Y&X(5M}poyso^)0yRr7vShjiZCOPxr=^Bo4z< z19HtRC{U^xtZImY41JMs%X#uhkG{~(2`vzUG-;p^OQrxlqp zIXzVT4p2~zlwB^&8N@ZWa5jqR%3fEQdtYt)oC?ozF{1~_QBzvPPz#evNeZy%*5!aY zoPQ_34y-ste)OjBZ}uF$>%IWin23~NLTLcgkuy$aMeiOGdT?CLq58#d?SWr4}km^uD_!DN~BGXuC#H2^EUY zV#cO@L|&7lQT}%+9)01ZV^4e{SL)v6i=>&9mThg4#IX4Qk*AMp3$-gy3EJ}28sU3F zsfwrQk`UT`2`Ni>m1c;f&i%Tn8SnCrNgQ=eAz^Ui&g4(st50q=-8ps($&=h&~0_E?nk9D+ECtM@}An;R7NxT60s*yCzO|BHGwM81PkhE0$L7;JHU^gT7!}`M#bxlZ3ou#Z9NW`mM?hl$ zkRyCV)Yl{_ir!Caf)UlnZ}<}HP?PnH+oB>3hx&b-bBD*MfPNLlnm`+_mwlylMfzR} zh@P;zpB7jC7ZWr5(j5+y%UkDOOFnMPylf0I$jWrChHgKVTHO)P)cQ1>C?wcVLNE!u z+Qr)7%cJ>n4_c37@M}Y}G0wH4GY)etKtWh8B#2Z7M#AHMK4)jg_0z+6%Glwm%0^yB z2j;!rebfU^Nr?O=izW01+&II5%Y|RjcF#Lx|6$e zi}bq~3~NIyhvW|;CV1-&?98R^DDhI_ntwNDP&owR zBU)=YJ$-B??GBf$QpHr+|B;+KrtXD`gA6ZA5xwg7oFeCA@;&Z@N0CapO5De10@xVb zLo?qbRsv}=SxsCZJ}Npt?l)J=m*<&O{vek-4r%V|0yV-a`yP|76haa%tqqJ9Fb#R{ zo93DojY4&n1;CQu!`HPGqLLl=p!%jtQw-qLvR*`}r{tA!5s+0&oYl`%*QW#BWJNd_ zM?)BOrb^tzc%NB79Af;P5^^q=svmbgs#2G6ax7=@&JGKm_(K-CXrV!_$5c;*Wofs_ zLvmPGxw@eGauVXgzO1@3T-LwAcvK7-3`t28-b}qx$CA=6Rc%>v2yzdtn2e^d7qBI$w?)21k3>ZKqcY_JQ_HX;x_dg#y{An7v+lq%Au<8*2=N(2*_ ziquX1oFSna8Ej|^2r%Z#eDxr)R^!gEw%oUOK}K-b38iTi%|@<~XMdF{`DR(G$`U=M zsKcFS5^gYcKZ&FGF_s6KQekDh@sjXC8FG#$`S~E1sV$Z%Slz?*R4X?SW-H&?qp9Iq ztE?*wD%F$og&H>DT56ot505m-w0ee9O?h0mO%`rg5-&Wl9jqS+g}TxFU#+GOH5QL9 zrEj``T*IWe_MW;fW1%p0EjTy`ROR%7M%j;4WBqCtvvp#h?AK4iP8=fe^a!|qSy_W< zgGmsjfl7fP+k+Nwp zJ*Xf3=?b`%Xtf;~IvLl*9srV%0ok;6+R*($SW&qBpyI+{ytamZliaA}i$J@?4);ac zpI3-y>QsobRajZ4^@9`bp#bB$t{q4^NS%Xxyb&!#s113mcMvTnu1coBH~PTk(;p`7 z?q$-ex9V~`{cAM?1+AZcx0sxhxIV5;ErXH9o;H;oYi=je+CJW42VS(kqNMQC-32os z7oU*KzHNUZ)6ftZ5O-mpu>kcVeMOPfr`4pi8V;JUB+{QMF-gFae$SM6Lm+i>)n8)E zapq3W8gW&Nr<8+#cyv^c19W=X0k$xhlVi)*f}-Y2?`3b-YfE$;OxXUg=PRw8V|=W> zfvoQJgDU4QM_doT~8e#8EQZU4bfqvRb%K+)Y){tr3H9eirt=_pa#BZJZcJaA) ze*Q6O_Ai)Xx$K-L%~PLgY3^I5MOU$9TAEr^gx@GETe4#K!NIZq2>&=a)Q@tc@*l}H z28RE*aBnzi+8a)rl{Z+4l>5z@60rC;(2}v=Hjhtr(zo}v+qx#o_x9^>&M4ZY_@AvWs*dGMAX z+{~9zGQ4m<;~YoIH1?ia@`h?0;v_+cJ<{%7m@;;z$#cLEo!urHKF)a*b+Yw-`W0?3 zBLkhL>pgrF37tJWwhAQWNNm71;(_1+MN0~{zO_QsuEZoY5Sp7Ch7N}X^qSG=*b z%&0l$4D&2RqO5=oA~1f~CAqt&Atxa|?Z(tf_G9@fY)w*g-d1s^2#~Zk`sT?aYrboH zS;5NPXqf!oy*_W9f9WLWoz5k&2;@3)*6**K!A52+1;9OGc)id}IFX2%JuvmPeZGE0Maut`748(77%7gV5boXDVHED6D8zrA zxt~5+wGFuk@PikUv>|`6Oz)kc3ewqH&~%I$bmEmju3n)Y%TRckE?CsCBD-!j+1~fI z2eaxl_;wPqh?}+)TR|qrn*;8f-vNgfAOK3g050_y1Pmzo$bwQ^Q&LyDGnZtT2RmM;|hb{+XWo!x_A4h8EA z83|?z+};jy9qgOUZLLzh#04rJwrSAvglSO`&oTwo4@i;Cl*zZeb-Vy-ZxYertt=`z zLOe;J;xpPJnV@K0H}N8wSW85;e#&#t9hw)J#qdnq5EtvJBq8AX^1?sfg%$KmSVH*K zoGWgUY3VuK5h*r=?lWO94tO7?i z2;x%F=~(Fs3Gb1Rf_Ug(vTYf6i7eUf5dl~eCoKuCMkYWqm{bs@z?+P_twi1ukE|Ne z>RK+Sf3J#VD!NjQO0^LT$-PGr*uX%zr+3<(?LAk4fskPiC&z?L+_;2kF7auS*>L9I zLhH?JNEOKDOlm-Rr0V4PQBmFLeY#v0xzraRLkxcvs+jKSt}%<%)SQ-~02=HtN-L+U z7xanSX;+gQ{jpe{-dtdRA>lh+EeTu&F>f>F=A7XZ~Ip9VemGhq+?@TvFp 
zBM;2m;?-TV<(vSFSuWhEiVtNfP*Ga{3B1WFS!P%;DGEMfIO``~`9rdd(w>qv|LiOZ z(ndZAJX~n;s=O}4r*yUe-sh8t-GLu_nm<5gAeE}1bge%gK4->^vom-sXrWv_2?5Ac zG<2ITWi4*@+cK64113L<#D+8zo{Dp!aS?{ZQlKFtK%1WT4~WluU-ZPp+yvnNMh+t` zbBpc;!jtTc47SQ5f%0s%jKc!@G^q+toogWBQB6Ccff-`Q!g>chaWpv*L?QI9I<|Im z!iq%tuTet*W&9AU0$MnSphAMDB6blBB1EE0FIei}ggr_hqyUD4J%tyX)WWJ*n&k4Z z(_ULJ0rN4zF{wZ@yBZaqBsdxFzv^s9dY1>`7izQnix6l}46v;~;xcJB@CXF`kT8A=DT07iOVWZs826}yax%~Zuz!4a@p z_`0FES;D>e#yjyU0c0CgV9C4WHZKO9!=y)>oJmMLGg^F}%;gIm){ih!oGW)vT6W7j-!s_$%j^mijZ!&%;8XF0^QDm8Xko z;Trhw6@tFuG~IrK2quFym_6dEoZr)HwLkYzklSHC@bl%}?MT>#1f(!oX3YK1a0NB#mQYJL1$Ks#`y8p6tIr-51DI&FyJeI5E{X(=+aK2ggV6 zPqez{bAhYG7x#c-zO4A;>_r!}*ISxl}8Z_46g#D0NmgZd0ueZ5=QZ#m z@hi@Rn1WvF9+Vd)GsQ|6rnhp`9ue;>pNqL=>PX-%Kw@tYEHc6F`4xz{X17Z=NosOu zf|F@aQ|DNHE;9EPNsTh_ex5;*Ugo{pp)g<<;NV;&pYpN{OeN7VfP@9=hzJ?RRi-IL z8896zUz9=A(0E0uZpLr~{pl4j1BtE;Gp@)PmAx=7Vx`; z0wwtSgZk@S79{49a>94YK0ZD zcVgrWc)hvaRq8Wnzhen74UqzCFxDfm@xE~!a&}?&u|k+HINod6hfJ`N`|Q!~EHwvG zq#w?Xn(xm5x#nh;MGL8#{r6|bM{=divE(Ic9HR&oAmDcda$U{+`oaEt=U~a>o{8Al z*Bp)eW%KC$;eIW6u0ZNR&J~SAPM1VkqU-7|yR-eR&vCzO9KAm{THjZl>!HboN!9a< zl)eh3uQZyHj(!32d7+PF&D^i=AH3f`uD&^H@_9CWM5_Ct43pS?dlXl!e{ z`{hBy{qkT3*h5Xd%N95=&pXcAb1V5E`P%8Tx|vAO$jD=&Qw#xo?3LOALtE`1k-Dt5Jaj>RFoUqPURhEa8} z?ck4wx)xll8JvF7vssC{So+SREtu$iO5G(kPbkl^3e~LL9RqgM3t;H z!g00ark%7RG7CnGv952N(Z!l&RLU*#OheuuZrbgSS|}y9(V^>R=VBDuSXMtO>MLq* zlTGP0dTh+tEJJDB;}cO*a1UQ=95eeuJLU8FMxWxVLV2S5*HoVD$A*Jqev*%c`Vc7w z=ed|yhcRR|k7|v^aqaZ18TCejyfuW2RjTx**rRAdobFpA3aUwbHFU`G&%6|z?LJ%0 zHl=Yy<7?>^!nAI~H$_MrWmk}dq(V2dK2Vhr7IdeQ6S_6CWq~10K~#imQ1ojBfJ#B? zgLCFZ=FqpwopB<+tcxk&OT0pdYf4lc=oij$?cktUJ3MLZSUWN4(!tRd#-&T|jB`!R z+!3*{>`hdM89LOSr!lC5qv+orYPewAO{=MZjW*`M>`XXeh%w59%;h{qnJ>!AeZuV7 zon7Pn86ljA*pGfpM3%R~!@l1X8cT8?Sr|=+PwOh_$A-pn24Ms(cpYudl%FVH9#U>@ zC4tOrIRRt~G){#HFz~5upTTI?0O%E^TDEN&zg@ql(;UvKSIXlRUdY0pEYcSZQLIh% zlNO(#>RZncRkQ+Oj9uw+Y!XF{6bU@y+4E9DaPhY13e@j&&#*HT{b4-4c&KRueIFFK zREvjlM^?1cCf9X$$nyTy-RRQ*ioF-e1)Sd@Pul&xLwwI&UrOC70G>Qlmn#vD)?$Lf zOt@YQp%GH%vo!xhDfTPZEfIhLF$e|ZNuqsBeq9j*Zq|w9hBu!vH|U1M*?o0cEpF~s zOpj(bJk1Yr02F6dQ>xPjjjCG*2%TPsL{Hm8Db~O}9`**D@d_-{fd@(ra_lacQuMqG z8RLgYVq#TcO;bnG6u!sV$FkxP=eqx1Bvc4-;nIVWhR*DP(WxymRe|c1&5QXSVGU@O zVoifO=#S-AdC(0JxXMgPI>#O&Pxe`|=6<7oc$Rd=D=%NUF;Pp2DQQ{=Fve^~qr-%s zEKtj8eE(&;{IdYF%2i~^c@IK4h2xD7A|&d?gH%3lKR0N2r9X7ODq3>NdTdJ&ChJ5> z8RRitCbDz^UNTHc5i6pKM?Xt{&=gr@_geYYI5>qr z?;BwH^HSv|ipT5k(+HEUQm`xRa|zXui^Q>_3aUhJh0WhZoL*rl_n>@Mp~`fkw8$*O z!M#Wt=d~!u_EGM&?)vRd;5+M&x_cLQ*Eb;Jy15;kcTiLOYwM;(Qs?%k?#-5=mPEFw zj-L_Fyz+i2$i&cMLgSD~C1 zP$(8ZNJ<_UryoD%&n2_cGZWL$V~w_;okVUzIzQabqJ0`)1U#7vhHc|1Fl~o)%Jg>u zCIuY%D2oE20m zO@he3@gKs4J|~;&tRQrceRi)|&4HGd%8p8Diny|Y;hJHblhfGlX`J|x<>;^-vQ-0` z|9D@^{*k;ptlLmZuk2KR4fayNh^cWyJf0PH$xa8H!wgR`5DxrXSWRv4jcggwso97| z<;ZKZnB!`k=Gk{Wmg4zA1LZ0&S4@7Az+tM%8Ig%4PF4_-3W7Rgc<1n!Fnf4J7CvHR$(40HIlwDUeT07MiRw56PJ@WbTeMS&G! 
zx=5O=;U_OSanQ=k-N`|!-PjWF%U0IGt{6gFfpVixABFQ|i^4^M1Fk=CCt9n2MU(Mn z(CfCpvS)U_dw4F7c@ML!N@cq@sL*z%befcBp8F&!2&D;%O^mo0RPc*7rBDTpIzW-+i<@%)tgA!C0XUAgKSVH1kJ}s2dw{coK zIErsHwn1y#Byn+EKRrH_2W-@d1sXgXs4_r?aWBZptv7O@Ycx?>8OBl;pOPf`rZfCy z$Hx?;EY9-FG&LuKOTIdcBQ(LC4XsF@9j4#CP9Gcu*PrTm)YRWBll}VX34bpQ*wX3c znytBRDbh^Dqbxd1qmUUIxA$dF~R>3MK^B1i_5r%O+isUs<2 zP$=-aGh3_R{8yet6W~QlttD@CEh$CwIHfbH$bPzk1Xzh{i5Fj8RQ zTaQxVeWZBrMK=Cc*g(|Svn4Nh=g3W8%VoTD+{0Oruo8}WmBx84mQ1nIAEfcK0Ka>% zzwIyvO^DkKHDxhQlh~v&CSG&>1T!b#ZiY~%tMNkE)P1=C^9wlCJTg|y9o$aZkvyH3 z8zE^HKFn-PS@)o!j&%9-k}ix0&6C+y8Uj3{HbH3B#tcm@H)ligA08)Kdn5af=^(W& zQ69l98d@etbE<_u(Ts4~YHzeoY3=(l?}S_aZmNVYQ~EV2W*M2~!qc814(C?;q{t+y znND9ooBfncpCzj$fhx%~$izMnAQA)Xs)r~<{fE(ej!kdj9v;3@ijGy;$S+;GsiFE!3p|cv-?Fs zR#0KnLRu9oS~+ZZBV-Q`H0A+hlD&k;i!f^dvXp~RpoG-K814C%WzR+ui|&8pMXti? zh-Cm(giAej)B9?q2Wn!vPx_w(S698cMs}cp%QFx=$H(@zb+cQuT8+#VyiKilN zj;n3)xH3b;E@M%`Fn)ueu#0y~TM<4)%Rd+2R&Jdn7jC+qs9*uPEFl9j+Z4CRte6+r z27d@Y!CDWx%OEZd@HCYdbPOXDGYa4>B4<~Lc8|3Q?xsO($eWIpky|?^;5QiP^pP1o zwbOw5L~gNqdOu?&5qd_AjV&YpR!%f6Rdbe_h)9u8M%Givz?8ni%$N(8;2mBnu|#l1 zy;3s{iD)*df&qi-JMeH4$+MN?{IKb2A)!1s>OgN#Gpbg>HhRtkroClIyrm7P=HLxm zK*||$3$v1lWA$qe?t5v~e@kgBtpK8ggK3XRifn(^9BcEK2DUA4=2if>o-%#DyOr0N zC}=|8jNX<)ExiDLF^2?{Oc4m#P4bJn>=Mw$3T6Q@Zya-1-+XyWm*IqvyfRVva;N*d z&W+E*(pDqF!^Ojo>A^U5J6TfwzJw8q|qO3#U4(G*cDn(~l zvy`T5O}^(PF3Ctd)u?rRB+;f@@_HN7?Qe8{0Eu0o)R#&(Nh^DrzLds%YUo)GQBe@t zybviho)I_|bX=`{ zddDU9i@iL#ARWUr`SL_X6jLdIuRP7?vBJYMAlVPAX>1IrE{IOeV#O#*56bB7w-?#0 z;GsRH^vG-Su6J?sOEOV)U**Vqm&|eonuwvOF2NHiC+rfEn3hFne5Q zWhk9hGhJb6iSO-2lwpdys0e0#eH3hUu*Eo^2%lB>wQr>B$&8NVCeZp9~x1@-qA97w( zHD5fr<;LUTgw@Ci^IUVj6iBzanz&M0YZsbHQW{GHkQhF+r+F@)1cDO~X?6@r`c~Uv zv$I7o>oOAqGI2=nSKtumDTsWY970u0GNXzmQb@inHMNB`U?e2u*3+ph#JtqdP=uTn z7hF;ri)?1Hy<5BVwjx?2(K8R|RIU)-{rcHDIV%=7TuEl~3`=&7APD_i60Qv96 zOV)XD?@6jb-au__II#XFZYZ#$YI4bdJ{=lP3%@YK(trT*YGUe7ZJY_?GEu^H%o5vS zPggY(7RRg>4>h^J3_xe+BX1`+vr9>G3b7&AxEVQFEDH3(LX{iTu}`$VM2U|5Nkak% z75Fnt83iHb(xkn3p@2&U0Dz2n3$aov81>CrfLB87lx6yYv0`V0g`^}^rIiy}KNU7@ zK9(ZfZ1J3`PV`JKUt_ZcZ-H3!p+?MmpWyXbYYQF1-#L)KutN+&Q&N;|HE zr`sD+frQEp#`8)cpRdWOvIFqaK0HXi&5~y&Oq1Hah%5m@H>5-egz|vP0P0^8Js)Rd zzJ^T{xQoM_4MGcWVLSoQJ_-2KBW1qG+**11_X@M2*`0JotN3UiP4ww!#89odxP{To zf34<^Q}*>C&!(cc-Af+}GDQ}i#X?35c@oR~ou|{ZFw_1f9V1;-(UCof-YvBi^KUh# zLGGto*Srwdi~ye_y&`BhNTd`wns@illV2jWrTbN0t@{S}T<7yy_@SDI7QaiH_SENz zfn;e%`T?t`k+O+W{XRzQZ@fMfH>+&n{-A~t6+v{4H69799$}=SJPk_j=?cGh=Rmwp zq7h4qaXhG_%x(7~G%DI}h;hth+bVesx1VH;nTqFqtCbL3)6XE*!ON<>keTxh63!js zQFN_gDN7!kF<&djpkA9qi@J3WsSdrki5!@JAv3I12 zl5vyrG|60hd?*sD_IzJ+-7Tl-%&mQ3KUu$ZdW32wWPIPVGw!2mk!S&LO>X+}YAGZs z;s;4;|80cRuteDzp!a;h*&#qB;n8|TMa`0t@SXRiimIE3>mcr?MR;r;0t_&#bZiX{ z8p(J@n;MA+sL_;Kn=I~@+XPE{Q~h&Y=>1^Og3JLB3eeeedy&)r zL+9TsI9}MspNje8H&m~<#qkmEWE@vk=7FF<KLO@$dj4L<~b zIA@|`w9YvGp$e4wl+GEzWDt!_z}WBj5Pn}c)hO+YvlN9C zRegDgkGsXkL>B1iGb(eKHTGTSRg(_<*1gn$2s|7fGucXP?ML8fH}fhbWqg3u(N_8$ zcGbNfB^|%J1D3RMVteaAMKwpSBJQ9YC#aVR&4_)0#{tQa^Jh9%a3t?DS=`@qJwp8_ zmh^0C4vOt2gBNYX2f#t0?&+x+8b=Dr09Zh$zY!X5j#pXHq1?7O$$Z`FE1q!ZJ_Ok? 
zw`4n`-d8MB6zHog+CT0(TTp%G2fFKySAiSQOp95qz&{lJYZa&`HxSAwtq6`aX)0Zt zGcjyGzkjwOV#98lTJ-_lmH&B)0oGfhsx8T!RjkFq0uQ>>pe>`U5t41h%*lK!gV>YphcfObvtq8NEXRATlpiX}bW$m3 zzZbXj#f=LQH#A(JyOXoA_q39v?yr^V}AQ_^bKa2GgV$WrmlfgA-+sO+pEFsLT-JD zd!TMZk|)klET4W-1MGE{nF3wRY9`%2sb0&(Gsx*_H;P**s8U6ZRP{PBv_*6<>+(Wg ziOVc46C*reNQSBMHTEAj88ZKnPmBC*IchDWE0~UGsVRy;k`2fq@$i>%2-IjQt?Z{> z<>!qtG#)Tbl02Y{#*^8g?VkK)jf;m(VbJ4^9q|m5mrGv>C>dMlUBWO=S9*by6ln@mnr8#Z10%YX~*= zG8;e5nHDs&LP@ZUTO?&+_L27%qf1eX^y;SvmCqiBqdHu)ZlL2!Sxy=r{K3AMtkaII zov@LS4w8r|Ps=GOvuBb+TGFqHOPy4PuFNw2>R|Qt@gaM$peu(tD9WgMQCO3y5gQ4Eko9Oc*%cW>$5WnJA{WLFn%{0Epf7~12lhl` zp(~zMu+_k@I-Q4o;QQvCgu;TsYutffF85@NyH5xd8dbX5HZ-`W?0nBpe0j1?BLai!npKiF{ zz(Pf9-F_uN5aur2J9t zs$w+wa{fK5P$nez`J*60Dws>Q&m=hv*U%;2DOg26wbzzPXe<%bM*ZYO_AX{uSBC6p zf0h1NDs!}~|LPQZ)-#E}p=pJ3jRr<$N@~}(atiti6Vf2g)jCGLQgno+6A>C!Jv2O= zUYeE&tl!#^cL74GX*Od_#{4~*h|+NGO#GLE8Me49?<>7m-FZQQQ@bz0tUOJHuUQb7 zD1Tvu0d9`7;P5boMQnJ&a@K=POQEoee~nq{YG}LgYNb`kE5ToSQ42K6y-rthYg)K3 z498C|q;zwh!T2c$S-IasuK(*|lCJaTX*p#LZW);9Ozl8m^RMa0Drk+yDFr&7i>7A-SQ!dRP=g>~L_cfB*5 z{|K#kC{XDkX8ucJUItsL=(-pPR)w+vjV?$*UlSN8|6~{QSm6Pp#aiA*)2By8PULy` z_bMNMi|TMZ@cl+Rx)B~=fJ5z7mLqGCoIjGyl6es#1oCrMZzUE?Jm{ZnL70MZF%#-b z(Wl{&S2bh-Cu9hMW*qyW3>dDzvsF2sYPd(toArpv-!ZShV_r)0u`SRQY|@H_guv8) z7w-CdvHiW+{tmGI9bo-C!1{N9^?&mKD)!#^zXPm)2U!0Ou>Kuj{a+bi z-HKi$c~^2i=-&-1aBJryc5hq%NcQ&bh8HbQH&amHpuK>KGy{Ew)U_n<+U@o)%7Ve| z4lytZa2c5LzL=XRph#$$;;PD0{RF-3zSLc$|E@*E36BeCzVmRnfJBnq`fL2U)nCZ9 z##W@b)H7iWY3v_)spn^rtDgr&`U;3#Vs;+q?X2)uz!vub$UHp>FP4|E!e3wHdVpWt zxEsE_U%89PbdqAkQ%$Ium9c)m$Kt3k=kc>2H zrrn{L&i|LK)+{We6!s=HJWlKTr$1k_x>{kVv$FX*eckKh486x;v~9uiEv5pG1IP8n z-30DI7mMum!F__U{dZP9pkwh%O?^(Po#u1KKgywZ2gc+`IRpD*{gAY=WoFd4V< zzgouyxDmzz2|s9XgpPNv+kb)*+x_Qb(uU44mM(DYC?BIpTFv9uhpp}ftFe~BZ>Vs0 zL-|x@yz}FSDr1AzivK|GT6LkvbxVy&sUV^TDiT&oDk%&|itbrCIbOLzV+|wL07i#a zOg@e2+p)~=-LLmeS`B4-5RRG0>S~&KStm)#q&*UCWY0#`wu zZIV$LF4I~Fqn%fod1b>7O&8KS@Af~dP_x*GzBsV9?m7P13=Ri(qfbfe(#@MYbqJMi zwf}L~z3p5i=UQ;x=Nd7Qyu{Dj~StlJG1ZWRz$>9K71&H730WB zo2~@H%;;mFYn5-Y7l-u&8=ppfzu^mMI6h~`hd&=@u+r`yMjWEJ34`v?pAD&urpjHq zA2k{}Tb|;Cg$ZD41CzPPWmyB2akp>P-R^JrmWGV_b+^0)28!R3{cdd4&d-^ZZ7*$sk zvqMxeQv?r%I5?`i4#6O5sR%X831i6csmVm(IUmT0ILZFKN+sh(ZSF~iS|!~Au6wbP zD}B2ud}sK~t0g0iQtg6ElJhSG{~Ha9tLWIYgVUJA8~dkra3tE<+G@#Is9TP-xRl;2 z<%C(mpa2V(^sxAzyH4EAMLTIB1-q~foqL^XfJ)37)ES9BE=&}jv@K`BxT9SOkA#bi zjOuC&wNHXJokLx zQlY3Ne1YAEc~jH8puvGe4Y{!5H#lHI_Jrmf^d$jYTXYY{<9N z-$T73@^!ZO)|TQ+Wm;@aOGCqKSP6PnlB-#3G!805uQlmb52suv{$S8UN0Q2w#WT?iuzh=P)1}ozmwp6^(-Egn^^He*LKv& zncOXb=!WM*{!~cJD~d2d5<%&?!&X_}Q9JVgm+ir?$@Sp3&UL3hN)Bs>XUDapgM;L% z)$0xV?r>}kN6D8?=W}v7Ef!z{YUfL5p20sFlQ^(SSj~BMci11INh5g)(n&~4?q4gI ze-@up4onUXZnHT`$@Dm2+*{0}mYPRArI3gV?GzuR1R+=HTID>*qtn{i$+7>%=F{Z9 zYMyeK4r`6WM(yb2bo~oZn+^)8SmJx2H?WI1)QED;9-f`m&W=y~LyB-u8-F<0B%7ga zJ#a5>p6>ha_+|=)L_Js>n~&g2n}@ZNlS42$ffyhRDkD|36ZiPW!I{0SQ9C@wuAr0KWn7n`+%=4tKV_|#`@n|Ms^;Pk-0sYZ#W0W`y;yKTIhtt5C~eeB6u?Wl2F zJ2~2zvXkT5S>w#@?z6y24W%vroVi=V&)~}O`kC@7Jofas);v0_ogNrtoirN06-TwR)1%t)*>UA-+bsv@>|M9MCasGL(7tqTe{)s@ zR83laR;UlZ4o979nhTfNgjv7a42(b-6HAuQ^eDHepgvN7(@clr$EX#9WdIj&Ax|Ug z8%3SnZy_9Op8R&L*jfx}+|Fj4T0!+v+OACda}9nctHq;4^a}(MLp$_tB{fMzPPxam zii6kPk#spVW-r9}xn(_7ven&Bh0vDH!$i@fJbh^*+9dh#_Q`yiXFh0>1eheu)?i|6Z%1(wp5(k7&4D2mQE zMKX*(Sx6)~Kfn8mg(Di4Lk2cg5EOcrCZ@4%K?(WvQY9wj9*Fg3UqhZtQ;b|yX}L{t z5qtuBS`e+$mxuM^eS3&W43gW-WY=}jY9ZaTnIl~b29*%e@H}x*a$oL8KhLvbA_-d} zG+_rT<}1}?S{`bfJOd7xGmIT4HK zN`4E=P^fHP3dA+w`Odv+x>-`1qEm{E4gA>?4hAe?6Dg!XXspFK%O?_X+p)Ylx2=&| zWEY*A*6oN_{BSgY{N9Jli{VBTLlzfCD~dc1MLWss^ZiLEL+wo{>n^t_p`6@f6^~g`F4tsKXy4To0 z`0LZUf4B!n=Fg_pFPqc-z2^SWU!T_f~PTQS^{od$faya^w{9|;R 
z^aevW*H03lpz=E$>hGoe+jRl|#Q*y4D&@}_O>lVa`_DMb{>)j?Cnws1@A%Qd!9nfN z*&*&92hP?6ng;*-inE1W9Zw4%YF~S2qkecIXOX>DKiF_i!0AcTf6y80Cym;^Gi?2f z4{OJb(^~U@A8OXlj?Uoo>>ihXyQ5+X$Vs)XAm=scyT#@%5?Cg~QR}?h!xFpP02!)` z@plhq3N-&D^fbtemTMFSK~Ujdy??U`ta|BDY|1slh!}1Gf)pdF+n^dE;TQER<7z3S zIb4Gf;aV$5F^yqZ7Xbu9G^FWWr=&dDdm;^<7XHL$#|V*CWbA2;$bA@0gIYw|S%E@* zPp2zy5E(wiW2mB`Kp%;E^y>KMnwi{XQumGzQM5uBNjX>i8&WThuq*D#VNRo_~p zqZz!Ym}iad*0sw0oQZE|9Za2Zf1fSD+M_&jZlK;gB2J-$^v1*h?iZ*AgLMV~ibb#| zR&?hR?hzw7M@sbfo~IJH7M+S^X@MwZy1@wwSko4yJx9)_(ltdC^~%jyx>wH@-@!GR z%40Hh+x49B$VY0x1!+IizgGVsDlV$W_Dl@Gti_rShSwF(5HXg>^t7(!%UL9YGcJ^m z3W;hcO@a|hw^?d(98BDb(Q6<~l`iB5YtUg3G^2os4Zp4IvX+e}DL5lfLTU=^mUeZG z+gFv&5h&GNr{QqF`LZ8wFiWH2-LY>K#|73TTizxcP?FYMpm7YIL@OK!&_7}I=Bbm< zl&=~3G)Dd&a~j8!g*)24{|!)uDItHUs&UXeUiwY*dXA?sW$;#>nQ&J;Yb1 zMQXUg=hgkxH4QXa5_X4<5}u_4z!jngk8$4@d%EwgK(37+4|)FHYXE&&JZ5YAiCKsk zVjb3*XVl+zgd@R#;Ttn-yrB$PlToi#$^dHwfub_k(+302CL`8P=jkNfxhK{R%-wr{ z;B~L|%SxmtE5q^QH4lDYvl(hS$+hd%=B$n|zKnKc+;nN6wq~z-L;0)3LV8;HYuuH! zYAFd)8LKG05|>N(lN!f~sZ5!2s6HWJTy*hQ531-Ysa7uvV;`zN?bt)WU3^Wja=%C* zpd$T8kRwc60fFJ_WHy0h+<7_Eg05nla;Wft`uZ_TK#1V!QtM~c={KR4Smh$r_j>KL zbt><0^#`q~a`*bu0rF10$UJH^!d~r_f1>x>lE5veR)sKri*;g%-fWuQ7a=y)T~=h= zwG`vc+PV%j*E$z#=8s&}v4ynA43H~0|K$H}82@flvg^!j=cWwZe@0&hx1Y`2?sq>X z&He?=ZTG9>AIpUrOktCrdaF1xhb1W-8{j9$&XztnrCHr@R`%(s*}lnXT~bE)Rb6s= z49|QwNp1j2n(Hmi`3|`8c0^V`w-kD>gh*r$Zq66f)g9(jXvTv| zUoG}q-~`>4qSQ%Xr%e%x$zm~e7-RDS3Ej%`DX>(FpsJguCUwpV|Uw%xk(tA+Ps#WEqpZU@VUIQi%XE2lW zsp2Ty2x2Fnx`DY+{hs%2p)^e{aia5avg;wvd04L~=;}k<6{mu0o41F4nL$my8pz7b z6z%gs96X-pEZY;cINt{iM~BM}8D(dvDAle>^@+y0K}u{$^dZ2McvZG4&~|&TnQ;%y z=sH5$73F7FN%Os9&-wr4mfIwoK0oEC{lybHYKK)@rIJpe>l9c*-PFI`b+Tk2JqEP4 zS;5hD-LNNUb!>#SB>RcGOusSz!zb=uUc(VHfbMoXAAU4w(Qt^^$@y*T_inG->bsw` z`a?)&l}hR?DlLq!0}rNNu<1F2`mPF)%4kMLo@T@Gb zdr?XP##>nx#K^@TQk{eFDGRgnEdp^ksfqTWf=cZZSWF9ruG!HxUh!-^j^h3im^>AI z2+oyX-C&k>#yo)D6mD?V)6>*BK5^wsX@c~@HkFexDTH0#~-v)cPn(DU(D0-cmXvM_NKXCVSY)r?lFx5}NwFQr2gh|H zM)hrn$D7|1U`2$75AfmdB4S4J`GN~8Mt<2WAWCy^6m=|ad*i4nEvVM**>~~Ens?^3 z)Zb0Tb4F0J)xDT$ps8#2aX*IC!je--A>^KAxDO6xO;uZT*r+!)@>U)kA!wWb^Lpnw zp)dh4h;d{z+;7YthOMa!&l!Wdvdq%-OZ0mgRw1oPnTw{gd(V6d3sK)ARDanIr0^Xp ztZ`63t9;O1MF%1Iwd154wLhKT-CiYEE#wieIv{vFzNYiqEj#DrJ5y}wz8$42 zq3)aJ9e-dqr2bzD9i(v}Ji&%p?J4C@KryvezM(HpH9x2^X z%*kd|xy1d2{}2OKQ34U5F3-;D&K6i+yen%$vF)0rJW@4%8E|DwMQQryhgRp#k-B$> z9IL}6toH1T@oHmyzA65+S!yj)3&jp! z(Eb|v>Qv!m7jlzfVCh!nwEA4*u}(XykIHJ2BNHs%6L_JA4D>SnoyyFr$r+@>1>VmHO~6@Z&b9-VOOr4Oih zh)%<`uhbGL5C`ybwQ$ycEg3xJMU6HZ&tDHXKRu4l#!8>aoc5C%mdn9l^r_qHVZU}i z){CIJ(Uqs#&8$IIsL*Vjzu`~`cJ#umbvoMDv>65va<%?uh{lv4=iTK*{ zOfzgGZqkEi+Xz}K+&ZS;9L@mqIJ(UlCVpL_ZQW%;4?HKyxwd>OVs1D(@n( z_Sfi<0Z`{=!Dql`f;=Tno#?`dmXPY#TOU?)+7A!8l&ehgpt3j1b`}=gB8B>P=x+I5 zuXS;En+#j-U>db$Y>7T#`*`Gso7M6e~=_IYd?_t3_n@S4A;{ z=r+(yICY4fqx)}R-K3ClX zMV)jf0MZ%GS$aXxeEycQ#z|RdTIG{+Uxqfb*Kb${J%Q#l-pIN*Xwb|bYj3zM86>LA zMx3tTwj{TxyC22=;D{-;PDoc2Q7+oEe$x1#Tg+D@!2(5#rc3F}e6C{WlI--`pOQ#Wz?qVV%$TqfL;Gk*~`Z%(}vBquxl!d<@Xb1OK#76lT^$n-FC+1mi(k3vw|HS;Pw zDzt9QDU0o8I3-3{5taB(3g;*>mKhFKv){1@y2=D~Wsy}lrXHNEp9^Hihe*}n?E6hsz}92z3@W^UqI z5139tM0rw1RJbPdrJ3FtnDfMUcBUAHkGL7(+7iiIQP>VcX=2LiVlwKS3gLi9T+y4i zGY(az&c{eSLsIs`_1i3im#4xRd8Eh2eGr+y?&SRb3Yw^uRJT#Bn{|qqYUZQ%lvo>T zX6vvR?Wk16?kJu$4Oq3t)-(#Mv_YBBQE2g(o6(7Q(Gz6HbKW|{wa(;}@!mbV#O5ERZFoC~I#SDkYK@wcvxo@NN3I`FFjrj|7*%L z{of6i6PTrt!0VjRBXFnDlMt5WtoNI8`0el^Qf5i}&F%KteKE;Hw-ig!vIt7FKK>to z$*cVolowD=qGM};f+`MG2kN;ffi)52rKjJ}-7i!3Q=pz}rl57OrvDUsSH<-i7|^hN z76_8EFF#>kdDmkPPF&F;qpo8=5O)Pw=im2MmS3G&Xzjm+T4y5h@#|Im2#u(17$49V#Effq|uENhOm| zw&yLf2jtlTLN^=}t! 
zs<`f@;tq^9)}qbE?u%wsxd!HeYFz4>*Gnlt?jV_Am2AO;*Q)$P!Z@pR(v>L;N-RlS zM-oMd!vJn}9ZavJED?8A(c95L2cDg-z7GoywBZ;9-SqE#(&d+$`Z&DVbqd~A&fd4M zz*WxWd-{mF7U%8P-c-kbBxClrjQJTK_~RnE4c?_BhK$vG*x+RX5T;wao_D%1JcYRqIL{VjtG1or5P z&WJMelzvkp0<|JPm)VDQ;P_fEP|4^inv(QTyoo?`JdL?0WwHayb9g*}uJ=n*2IDcg zjhaVj{ZQN}LpRpl6@<$CqavD0Z}m9ix>}Yt1oszh*wdX`C9!St&2JUG1%2 z>z`lAS^42=@XwFQd544rogT^r$$i+m?))>ka=yWjS5hCw=#DtC89--9h7!x`yglGw z%hkPlqu}Vz?%jNIeSia}advhX#rVKfTB=1Yv6~M{05x9&34W~XHjBv23&JHbspA9p zmH-e?$KW258nlqTd7$`1`zd`U(5S(Sda-(5j!B6mfs$eVV+Y9CiYt&cMO}o1s=VXo zoUms>zN<6jI`X5!9)Xuwn3%ZZwV)AMH=Kc;y%6}MpGFD(uGmd$)cMl-YO+4<7P65; ziFRMPUu;5=8i(tVZNnlRA05<=jvIn9YShk-j^fJG#vu^#$5MIf=CIa0INi~v&ULrn zB0{%6V9O9t8uZ<|I=>wZyDj(A+YXaepNgra%Q?y)uH43+r^)a+{kV>BjR+Uq(l8=z zUN4&NQ!*0E)ia=NB%Nu}kRs(0iEPZ^hyJSgUI4l|F5FBxf2DK}EckA_+T0ptZ+&Sk&g_^G9BOR#yqI!gP0sI2CU=Bs zT43b>q3@~BWjIX9ukt7~hy>~?W)O8eWM%c~Zj=z_)sT{zaH0{|4K-M`_#vMD^C~aC zr}KI42KbM{pvLB<^7u6`76xbOb(}o0Vt{`r8A>T4g?eRUyfh=V?Ud{EJGZSKb1Qu$ zR7lu69JM}o0F-gRZVzs6+}{Bz3L=iy4N?I zuL9LLckAF1KI`BTZtLjwt_@NJP&zl9jo}NBNlIO)- z$m58AAp+|Tj@3c}qiI>WltgvZ2Q9s)?k>X2(8&QNyipgxPw;q{M%WNj6M=#CLO(-7 zL$2|Q$jbLXUj@gy%TB{ncUbp&${<@>(ty@@#jvy}bTlcq4GHUfc zbG?DZ!<{?s8B!$=tKg;Zx!O2bz}om{I5%)xolQ;BA@%%4EGaAmANFH_j_`Rs6yrjF zDX+tDc^VR?{XkrHZ&jaa$}&Lw6WXV9e7a;*L-1rXVN7pN-F{9!7unO))GTBKpRa2+$Cf2S^%L@- z)Zw*NBx6#LfCjZgfr6wF?c{nNebON2=!@ovcA^g(X33>{O(!jv8#;H78VZ?;P=Y&X zA0@Qwg$m97j@Dh>i>_3ZXv%6}CyUp|H^FCS$pPg5u>3f95uftXE~ceG^$0_Mw7^@@A`< zdL9z2FK*1r_uIprH{)n=}+E2GWrcsQAwetg*u19wSO*O^pJf$ra@e!p5 zlT4Lze~{#9WX_qf66%2DlLCzV3pHNvqZ|G>g0T}sy18-o_LO*kmQl3Z{oXm|D3K7p z-4HloLL9!Ja@<&^y8tt*XDT&;^#&rN2PaVFvCK3-|pwI`Qg(8DGJ$Bk6PO4+4{ zUQ9J^WPx5W23fk#AwZIG{pfcGQeFnt*O}Z|5qBGcn|59Yu6SAura}nv@YUm^pu>a( z5M9a^a)P2<|2x4YnyuMb0@a1Ly*@{8>FYt?8G{Oy9NzT0bVY?JoHz81^GR|NyvPGxs` zu_s(8)J?1$mYaLBZFA-N_17-)HE@#Lek)V%3fxST(9~rs%%*zB|Jlthxe3>v|L(%p zYiD6MDI#;L1uO7gys(+xB-K0IEyXAe4;R;s%_Jvf3;J95uX6~NC zG5%gZb*^#s&E7V#L1P&jJTuXYtXz`SYtRgsO;JbQ?1~Fhq?jo#Y5a~)BH>gVskPj% z^uD_LLgJ`ai7{tC;Dc9<_4VQ6@Icd2mfDHd!8}yoAm@u`7l?AvF=NfhYl(#qT5c*CM-LCgD3a8fisqjU{;SE)($v5{U1ZuprB{?= z%VPMOr>N$^ysg9sDF`DxflEI#IqO}EKe8KzrdTY-HM>rnV}Rj;K&y_4jjk*uF3~3@ z#`Jn0qv4Vc$(J(f8ZfFJQKqccY_dQX(O|P{fBs_MKC?-6QM*@U(}(KOA`=(^Bn8wr zSFYy!OqWG4Lyzjs^-ZaeK(`BP5y7+AQ?8!{OTNkAtezUSpyti0hVYs)1)ZyT5KUEM zR!bx(f&L&PK^6k@8N5L(WFV7_5fkI3&;YAI=P`mW!udY+0tWiQ$R4+d?Qx@iv}wGr zlV-06ar?B7H zYrJ=RlncA&MM|~qzz=;Wj8H2GCPL>YgRN~j zKvbJER*R2Hkpd%8H|Z3VK-xPs>Bb^FRlER|M?uW%2*{WwGtF z5ap##d(aI zl5Y@&BV0)}b?jbTTOKvyphT%^kan*xDc6uzFowCTlZMjBB>;?t`O)U4tBtK=-XQ|>IyC%t$ z851SKToXmq+^b`bne%Q%&8IIUBj%)CY@WMKwLl)lV%S+TnjHC%-9#c0PoL4sORKMr*7Ygj`~uwwU(Pw zrzDTiSrL{#}0t9zprlNKwS?n+yyd0LRi4uckKhhbwY$R8_T9n%e z-W5EUi>S_+oV|)%RfGtRT@CAUzAN3f((3e)H@lc1#wk#ZqQvKgKOyW@!U4<@z_}rV zz=FAY28q#9(gNBbK8OjYGV=B9s;Z7hajy$a*QD4i{&X8CLPEF}^!XA-z-jMhd0QT4vyKOy}h6`1yYNQl8-!;S-3+SvWiVXuJoRTD54Xt zBG(ZQx1z_$IT=NV-|*~>T8Iz~TPZbyEs1D`yP}X@53FXa)k*FgubKZZQFlQ8h9gvXaroeqb~N z;HnxH26Y3(i!z!tuEUGzLdq2P3WA)O+NHBWYgj8`-MJKdO^LJNTVPz&s-qA`&s&YCkZr-c4Fcg_c`-Cy_=oAsVyv2Q$m? 
zzD^KIVi^xjwkYt^b1tv}04#t?Lr*fwBdb7+eX<-dtjhChesb1Wd9BSHR0$<6&97W2 zKlhTChYY^Nt*u1vNY+fW@Vwx({wf}L;a@|<7z~h=A!1u=K3x*CCJ9AEUP=Z>$pY6F z>(2hM(Y5U^xzaE58G}XKiDI$Q?wX7cPum$1OG-NQzzZBrYAVb+V!tF?u$xDMPNS+l zE63s}*W^n1>9`;r4Qs?xsm`?WYUDpLf&QPx#h5b5cyIGU!8A)o`gzeP%7sF>%crA8w zNQ({y36jA+>#Q8v$_!xnMK5GI!t1kF(%gWkqKtIlNHC~>S^5?D)7D3#*ushveMkJ) zQzI}0+{0Hu^InSzwzS~cHE~*~5y85|)ZGL+10gtoFT`VGt)amZH$hRR@_nB`3!)$T z^2)QK!N|3+Ra3{=oB^~@ve-OKpvHsUBZ+6;{f1~(pLlC5a>oL!uDlPMOYh@x@Be62|7U5UiLp&d6bqPe(K(!!hyB2qfedWZaBP!g?cx1=U=2O+&n$0om6P9 zcpo|MX-*pT=b&vF%RTQ@QEqPWz4*jf%Rca_2IxRXh`xrYr~|N8K-XrvwD_6cJB|Hg zs25nHt({g)$R!~ux-}i;d7=DOjA^{ygJKKc7E^k%@|-r6Z_D7}617|}^ukY@M=>{j zJaWvl`UF|Crq1s)r(17SrUmX}??c-_?eS%9h3511y0j53k6}Xv= zD^oVio|gi(L0%&zgv+xbuXT>&R!fd+r~9Y1M&(yrEMyj|9W_qZmXi*1`KQYjhop!} zv_=UEaeCm&iY5=GK{76Z-`~pSMB`k3E-6%H_$QViWftZJ0$#-UZLCoHRdStSf-$&N z7WhAF`j=LBq)kUYVQ|~)kh0jBP;rylw7A`k!=25A1vpqtF?LBd&At`z_}L3wylt(}PaYDC@tE0rFHmrh;Z|G!$-E>Z3AF*1k# zhgO?mwuh=UXchD~DOn5J3jxSH@#Gg)TL)hzq$BqB;L@1_%Oo2E$+)+tl|50qO4=nA zJ~hMKY${vRm2;>1cUCAzEM2vT{`B+%*KvWGMApHeo+%HU>8L%?!2A3WS_5;L>_#(W;VqUac`g3- z*nU~t#v+Wt=*$qU;1a%>rehzkmo-j6itzfe-k|*?_Gg<(wQ}hTTqOKa^GXt%?4F~5 zG7jsZ#@f!%1tA=-1wkI6(PB)eU%##=x2|*L_$i-?13ZMG3r`gPFugTl%c`bs@tKM^ zOVA{6O8{AjFaeSB$SVoM1VQJ?_X2qW9eXKjB&j4J4-&0bIw%ImSXS+NAOR*!r4Mx4 z*NzN_q)`VGEYi@#`ViVUp?aYB7tmmSap>R?c%h^-BpV{LoPq$P)&-#}4O?}6*lM6* zt6PTe-}SoZgWF$fDpOR+TpJJpE)!yZGX#3YF|0D;i@{Wy5y3H8hWPih?F3a$sj}}q zFS5O9w%o$Qoj+`R9Um@n2n!K{y%?!iL0Co2szsbdIy^ROR#O54?|ri}To{g;uVLN3 z(l1GwiXAFa6zFg4?`SxPO#Ff`Zum)AYsoXqdS&o^U@Ck%Uwa^;Z-Hgbu!V&YV3bIy zOsYBRMWlWeQ?w~&WOI@c0P`3cKfPM-TC)7ee1Py}uUH>>S zO=MD%z_ku0TO&uFop?lf9YymJO|CC-+*nl2YS24c?f(2=`o=FH%=7putC?H77K7iq z+IC*hxR(c)c*>VfuRXYiKvatwinK&!&(0Y9`~*XE_=?T=(4J3-|&RH14eBp<8@^$Zri2EsOo-sZeN{2FKWs zh?FcI@DK;1$c+46E=G;;U=>DE9)KVPk}S34)?5lZ1hF#|Cxo=n&E|?$3{i%AMbR zO}hQj+a8L&MxCzvmHP|)vqAgwT6=Clv57zq84I?PV*^(y^$_NtJj3qgU>_Vv@Y0BK zJe5PEfvR8?m=eMS7Fr6|QLq=?O8w-nBp}EafZVuSVTX_!)Ro41GlMi$ zu#(olt@GezF1@cSy>{<^NN0f1n}SJA{XMs-+!22crU(skwhIXio@r#6+zUDLT-AAE z^bJwF1Et~!m3e~R%plpDZEqI?a&QQHP90EFUP z-F@cS(s8No$Th4tKwVe6YNYnG8UQf6r>q5`j96O9pS3ool60MWx0nH7=jZao@K^on##?w;qGQQz=Y{+Ijk#p`u}Y|cD}T_ zkcRB^I_=SIx9v`#jz56>Z@c~B9ff6F|6kG^`5iB(7QqtYqbHj$74bP0p1tBh^BTsS zg6{jrb?4P9K)JiwAF4XAhKWjshPJW_Ih(8Z1ij9mm zr;E8zfy~?$%ME%M$85B`W@8{XQfrp&(IuEZ&piqSBiw8rC)w21iP z@49a~t={Mp1t$;-JqLP+Ok8dO+}=QOqC4u6jfi>^p9q{}_imj(a4%Hzm=BHUV661a z)@}f1OHea~Q6ZBqS`|3)CnBmLDiy{F6xSIG zgqQIE`9sP}eLG`ok>9VDCf6JdmFc+1!^Oo>CCr-PU3NV}l`NNwpsu=g?a6@oS!XOivEK@1(&_ftZ<}%iz50?S< z=!RQdh2}whb7A7LQr8Zv5&&61roTxvU4wt3Zt$rCb-b%i5>)JwmFTve&*qNj+bwFH zhhDAo@Swtw*Qg!sAJ8mtzdJiP6-{@q{qDBLLG9q^w03-as62W44?e(Ip8tm)hJwP6 zM`p_!53=z={7s9aWHs*S@ZgFkQM)x7^*YJ0*Sfp<1lZ2iU1vBPb*}}U<4!(E*&+*v zK$#m?pJI5J7MNyazpM%-BpoQij;yg%9kaY&!efssXM-`pFX=VmPkt)g&`eVVz+u`f z?V@#CLvs8S*rurkfT0zhoHh9~CcV*4?!u=0Q1w}bFxm8*gni&o-K^F{JGV=7$l?C=ewc%*Im*c^ndHzj=JY=B06`s16IYg z2X{AaSA!W!4o9hJ(>mKAVe2OOXwPa|0=H(iS`;jn!(RMUeEa6Mc9D$IZ>#dD__mS| zHe%HtHso z_6~sBr@^(`1+CwhHxXIyPuIiq&24|wux*bJGH8tFfg7{7e|ly+|D<-%*!O$Cy02fv zWaefj-`ICK)WQvg$NBrnODG3vW!|qRvL$E+xi_j$>8&F2;j)}V4eMAO5#K(pe7#va zJU$IS5neTwNx?ZXkDCO_HI3G}>Cr70OfA|Cl`k+?W}jlyv?3)uC~$tUY$jB zCNMg7?x}v~>z7Wi=S=8ckGU{ppbsO+bU;gg0#5=Bl(tmGUcoW8sDpbTXmO zmN5zNWR2r4u$vGSvC1{%wlw(mGjuJ)(u=(~7_8-}R{}AXC=${2Ar5D`C#h|48!0?J zz~r8U`C=QWT2AUUM789L1hb()(ewVzpEaJGWfC>Rd4Thx0EnTiM+u}itvA$n(PWY6 zJSb6(;+>;GFD(UOzbl_nut^^D{2uI82&EnP`cfgTk=G_~&TDVf%U#Vi9v&hVqAwR= ztegMoYps^JYTi`fwUtI5RzEH9BAChf55*bDK9yKFzjU(p3PvOK!^RO1FJB&bKK21B zsHLo=c8*0@Iy1-}!$AJ<(;wXaxT)W?KGx!W(4CJJ|K9Jk8PWRaSJ=+u@^0917G}TG 
zQwfM0Xx3f18|%8&hats)6+>q?U3ah1>!VT7qFTdH>o%hp9v1EQB9a1x3;GbU{t-}_ zQbfi0+cTM(aeXgX`$AZ{-8Rlm&E{oR0li)5&pb1B#r@ERFbEr3GIfUMbo#n?pNozD zZ^$p%P6FeqFkmw5BsefnL$B?s5eWdstZZ5FgFR^&ETdHRi3UKdr7cxOmyLan#Vr1) zt!rs$Kmf`pGfG1Id89-ga#PFbatf$--Z z*gizG%%(`iR}j+wS5~+lPoV6`I;QN=U5DU}qF7 znpgfJvem5gNn<^^`&Z4yNe3pHGMm1-BP8(tpb5xLy}7Qey~;a6Wd`eV8~eJmyb;;yxzXd~s5cY|OH^d$%~5D^XXt`)Nl92h`+#9*<)RP>j8 zDt^x6?&~@U4PGmBBkGg#Qkn3#3Nsxrjx(h%kprC^t~@C0HPIrhghDIRS=Ujr7GV~^ zqa?6RRaxElR4G!z^040UPpS`L{N0=`*j*Yef`CIdMT+mTbKOFFWa8JB7Pk~JQYbi+ zIy}oPxIe8fx>_LU+HVUakKso6-~eTUJOS1KX6GVyWyqcY2rhkWKQ9FPO{Ta7N^v*T zM8a~Z)Kk+l#ZZ5Tev45MZ(a04227g2JuO#rR#^P1?FQiA*U{hC(cjn6|K`^bC*$us z>F+z~{}p%A9j+edXWd;hyEjC9;<|IuZMS;K;PR5O0V!2QM!oxU$Krqml32Gk;M)Brjw>*#{O} zn}>@$>80~;saO)e?#ewB;c=#XWLNn&$VnPVpLXaD=$J7+$lS+5Ln}=aK0vU|kh{^1 zY0Fr`YeL_xO79>D0pnnf-9$N9uo}Bn1u1V~Y(%1XAlPpM1BXnazD&bCvrNK#VGQ1# zM2Xp7HrYAeorXAJ`Aftw;igUlg<7u%EWzFbqA#fix{vk^6wr-AkKi(pINsv|tJyMm zE8T1YoESerS&c`_z7j2|;l#~y4^9vFB@x4wk=a)UfBWE;kCE|_6X+E!s>hR!S2q8i zm+7NXh#->V)oI(P8O?=;F3PU~^X1nqE4uDg>}pX||MC$%5Rn_B<`g z6qXI6JPT`8c*H&Nm0zHtS3!AO6@)N*stO)f_1hA%YVNQ?vsJECDd=#j>4w@7g5p-b zrLiBjZDm)tZ{2A-`VY3U9b$r55Iv}9Gu!g3Z0T_2rAtqu6EwID;PdBOUXRj?*2dm< zJ5NP2RN6?TAGq|f_sexQFU5@pf~yL$*0dZXaRLg|GJjO^>af{$tJ5?s3Xw|-$3`vY z4pLdiF*vQigK)zaB*}Th^8t@6U!4U%Te~%uv<*Z*)FZ^lEFI3tPaVtiWmG$KKR4s| zFYJ!UL5X$=H-YQca{pypZ2cxyx6-=7r)dZ0&P!& zjjYZ4(-VELEUUQcSJZbGLDYt8RZc@?yHPbGJVy01$4mU$Xab2DjS{GhqsC@a*$9w! zlGrYvDhZymAjo4a6|wl}5uQu#?<4r8*@89quw2U3{az3k^6nRRTYa6P=P^}sTj{v? z;aHd8UBV*OaDQmIJNmzi&Vc+nYB!X{+~G*NG9(v^sSp~o&ME5Z9&gO#_$YaIypK8- zmNg%qCGQT;enD^jfUqvUM15avTi#18Jm;+6xF$XGfEv zdTrN~8I(3_k-FU4wcxKHK@WQAihQxiAC(QDSgNjlN}5w8`$J-5ktin^8Npt`(ZeBx zsmx<1UsjGCZ`D5_D-Nlx1!$d|S3<88d` z9Eb30OjPLnN!JnGi~0N+nji0YzlUfBL`kB3=L~%u7858Gx|2&|mU*Uhbuw$;b5voJ zsKs+u^eUUw-q?Qp>GeD4_J*C#MKZV>-Q11bZG78lw}vA)bhq7M$KSa{KIK_qd$@{? zUgV|nHr1O)J1#NZ2p(UBj<~7JtTawtChxOCAdtzpo`zA=D^;7UZ^~=>AzRN^J!R9O3%Th&P zqstpK9QmHjcObtnE7tdotZ6aMl-K4!^YV+;b5gvxG)n+N8zgt6$gyaOjPi8H5Uy~ckX*2jkD-;l}CCF#-VBL zC}&jUs9xC4jCJ?8ZciRL1hvgY6eVD%zx#ADk1UxCa|Fk&C7CK}8j{auyjFaHTL2 zL}64|mzvg^46H=ZNMai?20s_h5KCEfk&%x!eAs!fYEn_{`6a+@)q!%OVaWP(?LIrm zXh#@NmilP;ZQ(UBgsFF9R$gUjW1>`Ps2_U78&-ynFJJwmwMc80_=;qDUt`$!GJZ z$80ekN^{UMu!avmvY5LPln|j7x1L%( zIVl#4#Vdh+fr?C3TJFJ*JEx@5tCjvwRL3}qW8Y`Cm3>Jf5NFa!uF6?)&*2ZOa!aPc5stW9wxwhec^qEwEuL-(TdVs@XJAfjC_ z=jn2qzogTYefbPDWxg?E(GY%5r>htN-rQe52hc12>d<%TXyZfLEk?{}aj$v!AHO=7iby^!V6bcbsi|3RE{hruE4kjnJzB07#5vmCrtZ>@B$x|s2~=`;Z*cUut-oS? 
z@QL{%p<}0k>sE-s6@ZcmuQUC3&aqFoyBB$JZGn>0C!@aG+Epx)aU^fYR>;>(k3ZQH zWML?!xb=IctgUZUIM-HhH8$e8H8edz*z=BS|GTgnoBuC0>#1acH38Bu%oHgW6-$J@ ziwF?t#gY;VPmu_K8W(gh0MYQ`I*kL>$d6e;=d@|3IFG{ zat8x0&yrhNt!V^!?ZhWhP1SDM3T&vj=3!Dj!WtmRghOZL3|l`LNTWArF(pGG5|G*K zd1;|l<)RAW2J9TPVeG=;zvtFr;vlG6ArH10GjsZ28o7hOsZ3s>B}P{mQQjXJkZXYn-_T> zS-Zy2xx>3y%JRZRDF+~~NNytMrp+pV2y3*6`qi@tHfU=UF(jk(^R!)N&cHOY8PzSYy}e6XJAkPDbw9 zcz(i~=;x2=EV{Z6>Zb?mEeP>kYfCB+J$xY+B&1Y1J}i!laBR@1zfJQg>~CvaTlvS_ zx89Z(mo8W3y_*}ODqjk*&{6$p*N{ZbXogRDk_y7!pmu2y-LOoAHV|?XljwX=Jd4j; zkZ`KHVC7QTe(-n;UySOXxmNiy3Iw}ZgormIb2JAww;DJ?Q{>+$lcbjOmN-S-L$KC5 zEciojHNSZPXObK7T&jjesaQ9(SWYUNAq&&;qa~nNdUdNZLF_sx-tW5VF_&9gBot?& znY6wM={JEbE?ju}p8HoR+oI*<#d|CQr-C>}or|zUW+mK6&QC?I=zg`Bpww^6qVRNS zI6?^yqt1MIew?|8$SDwd*2>QPwLn6pTgPYhv*wnQ*a8;p@5xF0A+7gx|3A$U5iYhJ zin|aDefJ~KK~Z7+qI-Gi{@Diy09c|2$?&?>>(y9|=)O8^eI0@tC{{=#JmAKA?l-Op zZngE?wYCBahz8P}V?7~sM{O@wN_u5_+W2ZnuIbf?uzD7gD2a>ewY~2bsWQvR*GRX* zIz4!5AmwsfqKqhtyw9;qoJaNod_VG?=$o^-cHf zOi2HVgeq+LG7w7v0I?9x?}s2G@9x%o{K^p9OmU>jDE?iUQ{Es1|AOPhKWbHG538vN zD_YjFGSvCJIuTWWXXi5Mv&g^I3yiWKP_jy7VXOU*yY4Mw$ip?Di{~``1pl%EGc>Xg zQ34B+)S?t9cp&^n=yW*6AFfmPfo$)SvvI~x&d7RnS4A1h$2^t?L=wweSA5+iDyN&;2N7^J__#?O8!PkYXUWbIu-M&;&T;3s*@}b(p(Ni%4ZP0)$ zBBcZ}gPwz7p$zBZZ!7Cau3|EB-^Lo^C_mRRhwiB^ zMm_08=FZvF&;rcw=U!@*v=593lsl+Ws3jxKR??N~^{2pFCTj^|A-4`NeK=Vq`>j_L4bo*&U>*b~|>N*4#Z&EFSYzgzd!aB2e#OKfv@NPv5O&GRw1%kwGTC z1s{R&NK3uTTrw4HH(Qgb`}>Xj3~cdRSPQ%wu<;(wJ-S{IAf?Fx({Wa9KOj2h6WkN1 z1;BQiWT6DWBs6I+A<>`>tVG4`=FG5jVvXxTcUL3AQ0<7Bu)X2P-A z+G9a~P9jiF2$C^c?@sJl^iqvCIMY$Hl3MISB&0&gVKcqC$MvV+gEvaD$%_=DXNM_{ zTJ1sX8M5fxS}RPbo-M+izdE(uQaBHObzR9k=6se2Wxbbxz%ay&cB$@yb{h*|f$JKo*OXkBp- z`vAWc6nhcD0VbJJ4PBDxd$bVJP6U?4$-yOpA|eitI7P_%*t4W?B5rqxpSeAPb6i-X zYp9iPN{S<;s%xZV9Mraz6hvvfvq*3IIbBpEQo2RW{s_E!(e(iLoTh+#K=S~4-4$L$ zJz{T1_?&!S3mQ1J^ITLgD`3kW>$wGS$8z2_X_MC0Mbr7_z?yd#LnRO&dmoc%j zbQL_i!U?-v>w=O-NNg-(4KQwQjipdyOg_|FJ>uwdC+u0}wnTIAo`mGU%W7Nms0>UZ zg<^ZTV{wwc3bkbOz-ME&F|JT)7mr4q7GUty9j$;K?*Hs-HNTv~;Z)(EG{eb5do^2L z(x;eE4Q=`mg~ima0fCb^1Vd z=ots62Z33F@ccDNcH0ng$*(P#EwbyvdTvd6$g;qW#bZM9VoeiU*Wgq%qJh4gpAThU zG81!+)Do&p=35b8Sl>x>t-I5#)FCY<*d|HK!+V*s-hm6tYqA(k$++ zQKWYF5{X>Y5gVbP(6@9_w(-_RhUzrd*znD&xg~85ljUy(yUF7LMD@Yzb0$jB5NjqS zRm^>Fu@$mO!!*Ln#!|Dycgn^9DVHRz=VkyzimhJ%+hsmtI3n|s~HN+quMgVWLV5KDHS%}9hIx)=&4}Or* zC*oy~nt}L0nY=q6jLKhlJIsIr(;_fXc1w`(IR$ajT9oacfs{SPKkz{+mrGT}85Mnu zriR?e>%kD1l3c4yz6RtvPitJgfoMwCzDiwNXI`Xihf-=0zu;ko*PJ>3lm@SqHy9Z) z2-?=hC{$+VKPo*FY2D1uX4F+{hB4K>OAfZ<`*!udB1sP*irHrk|76>Ry}bv zN68nt<#|?jp0cPVYYg?Q-jw83h)VB?W0$TXqWZzZu706$HdQ*ABD+x2vY7+W3wZUs2F^V?J$2tZ`jcKnlA0Vm zwKw%%1`vXMm04}p4o^29C}(D)b{72(t3~%RP7*DmrvrEz=g_?Xj}K$6=g6H%)$5caUFX zIH(gCgM|sFC>Equ|5OJLc_FmbAI^JiDFGUoGU9*dY4GI%8~VqXFW@d9I8}{HxW%_RSNTzcuJEr32ueWhQD|c=!Rrg*oc6_66>!G#v^hm+K zp{*xJ|MRwXDd}X&Pn{GY68)@{T?xKI$1&22&&3y&;_rpTmAq%RbB=G^&3S$|?Djju zVbbfgE;_eE#K%Qfq3yQTMbaH6Uk10gUqJ-N8rdv0T3W1K&yt>AFk>mQ09KXX_*zHpwS~uyEc}vGA=NrQa}e&WV4@ zooknFHdKZyu(e=Y_~u^3h19-*Iz2uxqPRJ0yu#%KEvM-P4fB6NdV}9#Bt0)w9Q+R(!4SbC;R z!}@)laRwj7X1VKcaDYaC3x{UXR zUy@sm^}lxD{C(Q|ecJrLV-4Ur4$SKeMaiAA?Fod5{XJ-s1ds{mE9eJtGkq##cXWJrcKBV^3qZ$gU{XufmqvY>^B-$6< zVS8}bSFlBuMR!e1OeFWXiRMKpd>Nx=5USK971M1O)tZn4O!A2&(9yV^VzuXmAZl2! 
zJ=YgUDcr4RyotH+Dd9|~DDjJAKh~SWrbIHVyw!obadU`(mmVz-8IEjc2>{EXaZq$H zrDjt|vW(r#7rs^)fpY{47J#O?2lUdHl2>)5=bEJ#a(8J3G2&j`i}G5C^N4%FD?Bze z_L3R5jMav6+<%XOzLM4}mMMeGsxkJ~n2wW~*7lcC&#Fs) zlOCQOhQ@_!hj3|u=#R1aR~WSH2&wE!k;@#fB+xopE5xVm?*M?dbI z)-E)amP=nc`I5!SDtXO&+~Z=`%!Q&0PyN?w@+dTcYWMMxCVW_T}kk?!fF#$KFERd$z5)?vpNI2(_H7VZ^xSmW0s*r6|vNGV3BQ{L% zcP*?|B={{aB*O8?FD=uoB%3a?`vsxWQgx}J1fE$w(%|15GT0HE5G9^opxJ$~y8EK! z`lbkL2kwgz|EP0<0p9zcdue+Dg$)3*`ynMr}HGaO~#Fm9DANzjckxurIK;2}bm9IYo=lwePL>0m7)@byp^Y!fz zR>!E@zq?jp?weZ@{ry@zQN`>!UCNDIh;yd(q*)-A!HMa1!D^m{2{$x zNN&WrLz4SP)``$ETnUT9olMA!xCY-ipMyoB`gM@Iq*zB_T*3_awz{R3BuB?h_uqF% z$IkLdZd$#bGwj{(@Tes%k4pub9>Oyj=wj(^}+ zcW!|C_Q2$nx)|?H-LI+xdXsef7k9(aZ5P@*zut5|LLlX`dwb1l`J2J8)l2ZHAcBT~ zI3T7uxqMX&kAolp@7JX5yg*RqtbOT2yVv>cqnvWpM`GiZt9wQ7r%LZB2bMj|La{M? zZcV-pR{lZ#$Rstq?v@7GHMH*5Msibv?!>j^TV6&EsCW!fX>O&NwY3znm#TC7rBtSv z*)#H&^X#!$=BcN;iUkLyDor%X16w9gP}0%2DhRc4-qzLz&C99jQ96m9;5HQt&C&&J zR-tmz{%?JVl0T*|sU&QeXI@GXe`oW{t!{8FKdGNtQxEFLhg+I@P(L`S z{@9AKfQn{)jKCBBq|KrqB=42t)w-nz|7@}bH=VwU@|;5w@sm)h4M&4p_qWfu?yD0f z#Z1pU!U0jA(;6TNDTe8T(2@#@9@u!^nIF-yX2bF*Y&<)A=^UVZlJ3aL(8RchCCm+X zbRs;dE8~f#>mY3~qhAUz1Gje6kbbs=Ywvdz2A|zYdwb&s{pPbfbnh^_qfYx%pOm#1oy*o;Z{!9%eJ;Lj z3g3$SI2nUT**3`@Qh@|Rz0c<$DZ9!tmvhy%mt5pKm(#Rf z3*zeR&uyj!CA7P6L=4mKTn)6r#@|Y0F0b89U$6PDuzC`&xB<&ua)Tl)KwzPTNf+sE z;nt0_7_TRFK`AwAjffU?*Bsv6{=WCQHFQVY-OUhLSTk5QSuz9NGON%g~JqKs80-n+FKd<5HL=btyd!L6t;j&56Rw?~{2>n=Eu z5%&lY_7+F~=HfwPv%8=kmpW9J=M$X*9>u zna0t*1VZeEYC?<6>ZVtYI_;0RF~=v$t81~G+WHHd`#wxM;5N1HvX+|*!W7_g&7QZrA3{dWK}5?B$zmKL4Z00WC;A-2C4ASZz5wnEc~tEOezt0W1ebkuZ1!RQdbDhW}Gq@*l9i!IFXdEGA~e(F7$4^W=Wr|)VoOjyi zHSxSn5uJIB0L5#Ocmt&oiS4vQQyvQ!pz`f?Sgr$+o@%kR8?DbdMJccI*YpqPSA>YVZZ?T#v)vlz6mA=5k}XiEBY_t=74q_% zlya02r6L2AdjD;yr>X5IdW3b%>@^Xmj8UX>8CP;$x=r!KqzJ-VjT5bXh0m4=`8VS} zxRA>y%&xN=1Echt?n8)iq6#uKxaNJnG>ogxMol?CGS(~`vkyDFPRr6N5wljDaiy2W z?c|{ZG0|&vcMt}y)w@3O8F6Q!?Y=c>$W<6~HHmJgRF_?^s-zxvk(KkeOTRyXIuoA9 zjVjFT^-pg$)N+wUkP*=z9iCMr&<|Qzt zAicCXuc&GuRji^G)-cVJv2lby%6;kI7(X^r>4=Ye7B-RkP=af0LFp4I{xFc@86HCn zvA&51D%=`AINNOc*q-FZ6#_zz9pxdFs|e@dA6W%_!qJ=W8RVb}?m$hGjIKY^wEiVx z5%9;tD-TKOL}31mLWUOAo+!`qEM*40nhmmZFraW{N4Q<9rGlnXpdS(}G8vVLKXu3Z zTxTQ>drD3|-U0xdzmQ#MM;e3B|`Zo4G2GTU$PhaXh7h25RX`tTBn0T{7kT4c7FZrZjZ8g2#g++kAOml`XKTUn24^XTKj z7mPw^j((K9*DtWvC{8>5iJjW%5U0@o)aqY#q@4M4J-K$S`lxmL)j65HZu_gh*#B}m z-VB&Bec{OvhG`Z#Fqk8I^g8E3YQQ8T_*ku-aGg=`DP z6(S?IdY;O)gFPh$U>u~i7RLu>-&wO=!#T*QDaR#g;wT-oZ9s6$Y@Mu5>%y`)q?2|t zciHWCdTwS~y;i^7aliibb%-FDbNm70(R2&XUV&k8MW|rB(&M;mWc*Etl*N#A#{k?W zW|<9#x2RZuCTd#@34)Rk|3DS&J%QYd z!hL5}8i9(4lB0M+$r+3sSE2XxOj+4>4GF;z0D9NE?)+l?Pg&fvDwR5YRlH&fc?Vg- zPhjd#C~T9_$oyH=eRoSnJ5 zx@LL#%r*F+s*OAwRea6)gVQWlNg#nqm=vR-1G^B8G5kAF56ETtkWHEBN+ae-UWYXR z8sJMh+i*@c5@{n|%5YH4`WPlE2VEROv~;)xEv6e;Pr`EBY7{A|7bV@EzUq~05fp_E zgwS|MYYU5n*Rta|KP~Pv)4Q0aui3;y05y_)lPy;0{WWR|GlX@kIpT;LInoti-@%1U zufxW;mSBnTfCJ5jI}+VdPm44lSpZ7}daI(K(9RK>N!p9mAE-l~xoOB)BHVMAx-k*T zpakiN>ta#3JDR_{a`*2zQ(ZH)9hNb(h|ZVDLr(IAXI3=oV<|1ingg|U2%$_C>5Jl% z!b;vK&=pQ3fL5$Q_#Ho3t--Q;deYZARrTFBL*^swf8oC{k^RV0YVb(Af` zKN$eY=$O}o&AHAB5nkpbEV*3?PWD^& zEV#bL;eHHOeOA9FId?~D&CQuJV<0Z49Tt#kT}~PVepHlW5g!?3($T|4Xm;=|+aK?3 zYL$&fk@dc{*Gl{#!{(YT1TT}!d`<8+DxvDy3J0ymBN@x}pjp@nC-!+v{d=ALAV-iL zNoGeY+1@N817K1DT186Y;son`jj++s^M??$^P?Y~UoYumQffO~oXvaddMJ+;V!uO9MZ8KINUQB3C z&7CYucdS3NxwplVhVV@HghtqcR?IVv*2r}xECy$%`?cg>K!9>Cg(*RqkKCsC;`%ev6R0GYl$G4FJ0Z~Mrb69Wp6tccAC|a^;Ly08r$~C` zw)UomYxqi}A+c0E0_3lUa|96(!w-O{sT*aw%giHRi1;dJ z%c$KbeUsCFMNPQ?SW+Y*XJ9RMri>ffTu#={ISjUG=1kx7db^Hf_B?yeJB=S9CDunDEfTiEa{gVq`sAB!XKBMq35Dwdx`V;b%mG! 
zM;*x*G4go)I}`uATd)>ZBUn<@mU9RvYFhsf*F(LXpS4PyJeb@B9$_4)Vk?Ubn``T# z2QZ%8b|!=ZE8m@iD~(Uvy&@4cQyX%}?)!rSJyGYrqzVx%bi?wU&a_>?TLDgZE+Kn7 z1(*@z`7NLPSqX8|6%HB)a_%odj#NtaVprUvbKdHAoC|z2 zxa(gem#tn8@Tu1AjdO-uSFLV;I7%)CyB(> zJ1I=c{)<@yDr3qP@BCjc{$BFuR-+F+Y{V#OR;Un735r z%O#3D^2r5W98t!LX&TQ(Q7M6}y^)+ChiQutz3+EibAY~|R-yY5 z$2=KO#N4W$3RM(>R`hequ?#D32ZXlQ`DZfh-2T>Wcci{-*lFDklk34~aH}BNUiKpy zihxscGaV-%+;@8Q-p2^pGYQ0qABMD(uL}^oO5_T}esMZt+@SvfMzeZIu2Z$QI@90~&b;z*WYN_=6gk!dN+D zqkR4M%&oU+7S8Y|aBsL>AVOq3vc)daPnMck6KeSn=+1BYjGiVC$>2yp)M5Q#eW*x? z=FUISkI~f)o|1cTjp*`}_WL(_$t54yir$)&8#Gfz8qxoY)}lChA9|1=SxofKZflmv z&Ckd1L?3fjj8aS`w_KAwm%u$wK=JyOOpo9)KleNwW4IqS_2K^hGtZIIzsXmiq6_)M zYXoCuIiYJD^)<+N4mk&bv#vFL>w;O9>q}#E=Pfc#5tNKg{i~?zOA?2L->sH~TT{3w zgS@olBC|O?r-^P|dQYCR-&YN|hPJ{p1Q>%PjRaO08{x@>3P3Ar0sk~}J z?VyodIBUfUsgAjMnukrlE9BrvJ$ukZnR(W?bii%#7$wNzXg}Uuz*~}`Td1;-(2km~ z8bMTJnQy}H(PftGs(J6u{AUXW9yZ+HXtPKK{2n? z4dsT>PN$^l1T}$G0!Ms<)webd-MO~!-vILc??kDIP|Xw%m>T+Lqg==Q+PLP>uP!6l zfiCvK);Ewmo<2z0B5}IcY#!V;%Y;`7{a%WIXj!k8CWj^tOgTQ4;>Tw&1Fw6{NbIAE z>QYaeAXDK>ti2EoO7whLwNOynij3;0?Wp6`S*w;+N45H}ooW@TG4-2huuJ8ra(02| zi)^J_Kk9K!54yDD3YTiA4~L* z1-B3cBf^ECA%81|u&BA%%@Hw=6ngL_?itZCWk7Y3Y2MbE64Tc`ty6EcT6+u8@-Uh_ zGS6gG+532PqRKUa)ZN9N%GM`g8 zf`bG+Cb$IW52QUKvEbfbkVfY8(?sSuC>p=wI1wiojiF#0Waz?PF$Dm2rCP`zi~xCn zB1Gm&rcaGHX9Yy1MV6JA=I*$j7~`n@Hu7;In#42&@o6KtEy&b?95g{xD4ao|$)N(? z#;kwia8ft?DG z!g8~)&eBJU-JMnoZ{cyGRJ{zo<%G!zc~>S_C%StL_@w%`qa)2`5UHS5gW@CePCE>A z6hX(E&Z1A_kBagszfafLBn%_@E zDy5kq4L7CSA`G?$Dq-dcQF-L4f_vC^&UcBf*0;P|7x1kYOlz~(Z=uS$Tuu-LL z4CJaZyb{U+hR)323Wjo9B8CAr)(K-Cw?tn-BAO=WL8{hHeCl-l}mld zJ}NiY#E*QOnvmoJ2wjCYgr0?55%S3^k(5nHM`h)Cy3fTHO-)K?lH&Rfdd z)dmZ^XOh7@0z8A#IO1Fd_NTH)8O><<5&?UFxI?vV?v8Ujg3UjQhoKwFqvYe5 zu}jEbJz3mcV?H|EQ>Gs=-$bWUY0r&^Ai~c@7JqmY&ZL(@Sj?TFmYQ;tC4zW3IK$(6 zm;hNL7C0u=&66tYzKYg4LQiOtoJT+RI4^HI-K$TdL?tXPJFO8*Bd=~--F`B-8zqBF zP1qOYGlK=8O6ZS^lv)8lNMgjCu%@4jmCt2q)|!Hrp-jg(;(Vk?0M3tdBvz6A`Ch!H zkC}%q`7|Y($cloXX>b-a*YhjU3OH#!1ogRr=xKX~(Q_mzQ@Y;D{EDWg!ybCitT;mGh z!!K4@qt8t62$gZ;Osmg}g-t!)-Aj`jR78&%0zY@2Tl$?W^{EQq)~&?zfq`j1P*+bH zgzBfxCoSLqmd)`HmKWBOj_cnG(k@gu{zD0Fz+h_#N6(hX@zL?q8ZCj1jGw4*pOI2V zoWdxu>-o{d&`gUmGEW6M7})W~J`UKs6sp8Gwb60dvL zB!K2NQA+wuHR?j_Opc4o1w&2Fki?ddT>0t7;rgn11#aL6(CL$SB%S@~aRYr5J#6ym zh7czd-}^0PBJfLU!gGJ#)Mllr%3|}Xd(YW1@^M-6{;Jtn`)N2Z@-plMr3Pm%>KV_8 zF1B6k<+rZp@;#HZ0<-n!F` zRGD6}O}HKaMOrT0eeg+Z*->KtQJ254y5P_`*dRbxx5%|gRJJN|Z;fiO(!}GW>f9`u z%1210ZuvgiA6^OWA@PkQk%DTj!DoVooQlFhbS?_K|JJQZF%7^Z6tZ-e)WUF=?hLMQ zk?zuW%YL|X$dWfHtHI{EWMMo2q-jHB;}AjuL;v@cJ(>-F-YxKVq;XB^rL+%2XDICfPn~34%++m&Wh&MJ6Oa2taUD z@XSb=t1~5_tdyb3U9@hod!kTJM27v?vLbb9H3*`K7)bRzZCFj6wooSp5>5%o*~mva zXC{s$PmUq%5+oo1C@B%5TEV0W^BWpQ_@q7C3_zTBXCAvDAYdIlL%MVO`c;yLe662s z{end=Sr3`T64ug~d2%Zsyj)48lquLf$n9k7B7o5t>X7fyYoFnI7vcbQpXIplx1z2C zpR@P6)-`*MRggWp|l85Gs z*-?p}T!>3Gz6VOViAI5Nd^|%^3^`<0^{g#F0h?IngoF;NUmrsY*LO0SMt|RKe_4Nbb8%Ba;&YspJch0S)wII zf!wHvK^hq@eDg@$d17TmO)&NK6!DGa7Ud&+$^1D(U_`e_MkNUf9L*me(>d!sN-_w! 
zpZ;1r*#Hh8QRF@ym^a$d!E_0e`Q4z?zWgPI{JMVw%Jr0I4{JNL<~hNy(`C7OO>3L% zzEL{!sv?|?=#}0T0>7olJD25*mj;pO7_FtlSV=-C)3<+)KM>w%GR4@I)bcmheg5Ek zPPa>{XS)xcHoO)dr}PeR!ilwH^vnc&OyZ+PDPzd%2+VQy<`g)k6%Jw z*nRIaG)bR!Zb1;zG2}9A{pN1=p~4%4qmG4#!_n=af7ScSHhOZ>NE-&sGaY$IsU%bq z3T3XPA>nlmxqFp8s;(@Pll{m9mXk`05CjeA>6dIk_#6=GMsLz~2rymZ_^68WyVjGcbq+I zO)L?r`48j7J3Ne}ai;!~QIVhH(~5<^F<6%ssUEMY#?Got59YtN)@d2TsCcx|6p}vg zM@D#q@EqgeL%UDv`;Dz!e)}g|Msa;;wwt4Ox6Vtu8zxs!r*memap!*--1aVnosSp9 zsG8vSLRBVOkS3a-LeyR^0A97}1liD;(A(*Ka-A|)z2b)R!G9&&Wf`$osgT&p8V|Gg zfEp*!Jw!{=ct^D2nN{yBfK}l$?C@;*9BRM0t$A>hb=MPpT3bS$5)DGF*|maNF+fCh z{Tb6L_Lh5;J7p1iu}3cxHTc|hBvygnc|0y_$(vC`rMOm~$zyvB6hw&KOl-WoYDuw& z?3f7K%S9ZGs@5z96KZFf^mY3Jf)Kr4axu8O>a~W+hwF=v0_X+*9FIJ(xF*-*3&f?| z99$HuN4Kl0T!J3Pd+GM>^HPGD&wQbWU8?x4#Usn&VM&@$!?;RXPif%6)xZ!%x6BRn zV0_x^-Z2kK*y3J>LV+EoL+xJa&X{Q}>C7Le^rZ0A#GN41!g)EzN4}iE>Yh}nl=2@d zIiw{N%LteqAhj*SQuTy9`&J-2=c~Dz#nxdK$e1NJpVys}6pS4mzPbx)R#wg4U39$UZV;} zT_`y_6StwucZ)OM@DgYlza^8wx**DB7^?N)!(?RAWS^~3dWw4t@PUl*}G6fYuj?e40fP^z|z5a0Vt#JUf$?%sag3-2{f zomsG^X4g19h<{e4)!n6M=ei5DO~435js71>?==I#Zu9I!s|O+?;M24E@qezq7Y#!5 z^fZ2gKJ}y?thrsgy4(KLy>b`!Rj1#%ZS~wVUb)R{`@Hi{r6o~OS}>*zeAsZuT^cy8q5l%g*uRD)Ykf?>o5RHHV!L2M5}F#0H##*UhA)qn9l zW1C9Pb9)R8y&!liaUQ|~ZTBuiyAVaAUfOmRZ@nXGo743()2@5b!-j4v`!D|eF=-4( z$v;N7Nxw5nhEFJ!nX-(%0Y+;sweLXI>>tWMSlj20qQfJtM{&PBII9a7UCzG}5xT~4 zJ(!?)!cjd$31v5!T>I;OGsdFF?+RZlg^ZM{f$tdIeqUI3ulu8aeiYIlX9r<; z?^?GwUVDS#5Mk#z-1+Lp5@SuGr;I(I9;@{GDOmrfdMf>UC8 zKxza|Ut_zvqD>NHbu0DsH=YBZl&Ef$O3LpP5<@W<)6Pj44e!po!%tlV&?x{$(cSg{ zqPwH+xx387-M)6iy1IvKo$^w|`?D(b^h*6Y~A+s#Zs6}MTfE6%GJRkb`w{Jzv-0bdQXo;c5FT%{L^i#lkN-N6T zP=j22q}+CvrlG_&%wX(;nbQ1FHo&t*i4{ZshH1JqSh#r9QX}(fF*Gb!s%5+;jl)_z zD_g3O>Z(!Zhg?BZ{wNL>-GNH8)qGW57;YQljZZrpFf`z;T2eL`FtD{?113ogXjPpj z0e!R3JuoB@wY$r->V9qg-uluK23y2lha>31cl*QcMJKsw-FAn`EpO$&PI4MR=drR$ z*j@8&b+7AEfgu16D>ou4A&er*&qg$K8lxw-mNJp5wO^Vn7|Fdz>H*jU-0pWliscty zD4SBZi5fL(alI&<3rSyqr$bX-vPcu#@T{RFp8vM?CJd1rAwnTT2#Y4yt5uLYIMALv z<~!!RwL82$gg1lR(J(oHBV$oT1~h3HePJQlo)nT@7x($pZsx)HM|W?6L;r*Pd;ssjqk^U8 z+rw#6zQDUhbl+)Ax#mi_yHts46rYY1Y^;Nmx&|zr&GV2i_47wD=UEw-_opg$={Yft ze}Y6!mb;y(MrQc;0^o~`mD{W3#PxTBFiVRLfqO%k8}9{!T)I*+s8Ckt`AJx&fXjNk zSWPl5-k|B`Gw?EYTkqA1Z6~!7dzw+!E$)kr%C|Z`65k(cW!5D!jv4s6^i zIRc0+qzQ3-p?3Syuxq#9Qj|SlnS!LYeo<#Nufp@4-Gxo`lS<;n z7e%L!g*}s$&^$!^JM}F%2#=4C;_wK7!SPtcY;=*hf)JuvrcIKWl_ZOnch2(3o--Tw z%PnSs1##hP=Fg(r%*==)pQ@68UMf&wcAm@bbKU!ZtLd1Sj4YMp$z-v5Oxmx@0^Ve} zBGv>^vC*H?X9=(}h)A!YeA0c(r|?LBLiAk3*ePJ0CJ*ic&-Ub2;@gez7pP2N7JQnR zkQpwPk=yh$U_cnJMP1g+-RPg0?mf@O-)7X;rV!BQbLCAKh{U98Nhz|SI*~q4Y<2ld zEjpN{<;=N+k1#oav!eU@ZgkW7CA>FQQ*`4#_RATO5_;h74IdB_`2QS5ORFcF?B;fG=}s3g>9}JA>ycvuH_aje{G6S% z+zwhjw#JLV-30Y-LaN<9SZD+!Dl6yq!f~iXog}J@mnl4W)@Dl_hZ$O~bR0_cll_XO zPAw9uyRIjnGkBd>MKM{RmMV6+ex=ZHN2GX(+GAy3q^R4pVC{3>n9l~!MH62k;sVP>(UznQ`MGCT_8V6_f za2Cn99vxQ3Roq+2sk;fqil5ls@AlY11VzO`1UYCcCk*e@Q!btjU134zfzU+O(G>*O ziNKyuj#a`%f`SxMZ zl}d}S!ieGlRW#M)ccHNcm3i{=)^Z>Ix|)DCdx+!2{Rpvv^2N$3qqNCelpG@Rf=WiO zg;iZX^Uaeto*I`Dg=JD7DU67Z|$Yv<<~zN)+V$%D;`{tP7c@^RuVm#d#S_TX z??w}m!YAVI)z?ET*Tjxe6~MrKrGTlQ9pkrGt$r;T^sa}aPV1tUeD3u7or|wzQ_#;N zn$EH{Jy53hLQqfY5m?KizJg*Z?p1A1py7Y5UETTEc*fBTZ0wAL{7m-{IsP3w4SsbLyy@fHzTH_wR&7Bfw=qp zLFFk?6D#t{TSmWM|5_Jp(M5MiHLv7+aC2P`tNn)S2|}E3|hD%}U5#pIClVUVN(ffc3W>>~DV^chtdQ{jl;&-{>?r z!ZOY^ze0S3hxV?UkSV3(mmoeH(^8qj_Kwk+)wa?XuOA(G+lmu)p_9F1+}t-J1?07U zG!E*=jrf=TAsRZ^x;$Dp?l}I?cmD7xZ_g?~tY_tyRL+45->ng>s_215B!T9Sf1+q>0oNb^sRo~zl4{~ajKFeP3Jef>z_4*6V=tX*XTR?hS?p8j z)OoC}-#WK$@J$;97vp8?_PTSsH~8Yrp~Trgo!j3!Y&>tX*5d-2KCEWD$beJh=@SYb zabBdbF*Mq{&|T$6pPVLCKhzUR>v{C<(_-M`}5V_&{;fBt69>1S^gpl 
z^zy%cx|}MQpknuNGr&v0@$CQC`u>O?-nuc}`zOf8z^U#2cf)qrA75CuoZ!f4@{hxP z{eJg1{&848KH^{YqLys=-+%P!-@5wg!|-WmQLP=E6-CEL>n#!*GlhG1M zX;11pM(%Z*b+9sM!i$kwU=U>BZir)oQyQYgk!2J491UiEh6Y>nAo>(*?(G%7$v@xU z%Ye#vj@~y8-=8)?WxbfcFLgZRcPH-~?pOZn-GmX9QY-nZ?X#<(amLq!i0Get604FG z^>vS?1S>YbjFNXSMyL1XFMoD2opaP$Z|}en2U_Mv;KPZjwO_S_2`9>t6ri{%lc9U-G7Vit6dED#)-=?| zg>g_?FghI5^xjQAfJ9Op!?a679!wuJ9RseO?~>j8ucz03GyHER`oor~{rFesR=)eh zLU^4Z#zYujE@*+rb5z@$pU7Q=~?+e||vEd&u|LN0Vd7 zs*kuv^|A<<3vbB2>aj;_xPrU_|Gf!FMvzRok`MORGd_X{LvkT`gIs5LI~LOSe6bY5 zWk^N(A=S`~58S!GEU;1sEv93yuZBmkNiq*@@7YnJ*B#S%_ASygTSiY0#^fXM4<~ zjItl-3yY!_#GaSsOcNO_ck@)?GZv*9F4c#GO$Xf2dm+-T8JC|8Wvj^pC@+R3_%0`(?ID_j%YC~eI;M2ILj97PQ|m|VHeNmq@6V(73cA_ zUKqbq&xWk8LEPxW+1Ro^gVWK~8H;O~=xn9DL3pel zVbVyVU_G4v5S79;AJ=HNqDHJ#LS_snf*FKxW5L%^CK5EjP^&|UeCS_Vs_wu$yw~Pr z`i=k$>pd*R}An9&_Ck4ryRnGa?jTFz7TA9dzVw73XtYA_KwLYHkQ3Xsq1O!D? zx_Dj^kohTF0Z7EQdF7pm$?vO21SXf#NI0l|a_V8bK_e1taV(tUMT;Sw=T#Kc$^5)W z)6ONat)iJr`r|u`xXe70t-8`$qTZ}05P)#UXz$4VJRPYnpj5IiBZL=D5*b#PK;_fE z+-`I-Nepo321r4rTB%^oa9X8KltEMzyN9@ySlTRd0$&vl0r_VnqBOp!$I=PQXUG~T zCIQpp3}mWBJ2)gsub?c5__UpPQnU_= zV&nIZ?HLWN5u_KjvHCLE{z&4}GXVLOZdm9=f`h75MT6xUeQ>G}N;ns8rc81oWq#=W zh+oz2MsG!W2~3~dKZMZ{Vk(95X4&oL@LdHQCxXsK-3`oujo#HW`7xsMrg*}}T&&Ow zE2gc7%fB;Kh-_nMu9buENENJU(N+WesCP83LdFN;-CT#~yeFR92^3-tvEE8J`cv!znD|4_EG^c3(y@GWK8k zH$QhOofOA$M9jBeGPfyB8$g@|gc*9yE*aLM)xmQKm5EMl7nNk>u|U}M@>(n9G~Cxe z%a{HXVr?PQq%kaTQxw{(j}}=3w%0#w-j<}e2a&M+bGxR6^+2Qun~8_x)KC2ELiG9%+MFReLiSA5j9975QE1+VT#5ZtOrycIve5b==-jVB5F0X+s;5up10F#k^6E(d2|J$BM| zIr!*=>q5hTXM>Ze^o>ajR)+$jR&3%y)C3)i-aR9tq^`^hYcvU23g6`v^)O&d`_g)# zDCVqpm8b(=vKeb}Kvy|Mb`S&ILxFinWj&*dGBp7!E+n8I^?M1NV)?SVUPk1vgZW17 znPM4|2U+y3VuTFc*%(qb1sIv)5XgY9Y&gNzEspCL4Klu zD6G7*sGc!FQWJSt1Q6nNi#<5)Po@(oOBxR*Hy2Y@LJ~0M>(Du(4CqX0)Om#Q_a5sT z03K@)nClDRvYA}vHqX@`PPV4gt7_l5xxct;+#i44X^t{jY8-ysrHI1J`zBVtDE6j= znu#}xQ7vWiS+28vLd4FB4m-`ZmgPKv4LP%PV5eq$PFk#5QFq<@{N|q!4J@>(cB{>h zIPv$CQMo@hLQ|X|S17K$dPeH~c%B|2$qI7=r<7g^?&rEzH`DOlgH+0 z>m;)Azv4mZeauz06BQM=RQE>0%h$hWN;!{^2pHC@utPm3b+#ZO3ooapjdvl^m3ey| zN{7DY8|xM;0_8w*KSv#}9j{?&)%plV$v6!N7H03cExmPG!}U3QhtcFvP> zen@6s$(U(eW3AdXOD|IH$|*8|G~X{P4-tiVexJ!sq0?i0+KSxV1khahg+#7ejb2$; zt#;VYS-y0$hNc7PC*;nv$f(!gh!&EVT!)ept)iNz`2ww5@W1_-jl># z0oaHOH%p+QGG>2FIsan8N^dEMaJTltt+m8jvhSNh!oSy+4gwTw%L~Na_BOvmLdO}t zAlz{;{Ue>Tt_c>NM&}<=&rcgt1+x`==VB*l={KW2Fh;||WB`|JB`xZ(Z4HBs-MCI& zr>_|b1Gl;YB;c%q!N1Vzg8=sBa0-uPg1pzwlIOanP26xQz2est!@xR3eBE zt`!~X3Q86c!LzMt2ss8Ht6c5dg;BLC)J?^N#2TukoTTvi=7Uv04NtX;J(d@IWXMR% zO<4;?M-!|Psj)P@k>G9p*7(gSJ|sl5AIVodHU1xxGM09(*N-c`8dJ{APLIV`d;eo% z7%0*8%CY?YSizf1|C_8{Zdv%5IO(g=<-uKRHT#yo%vqJ+v)?X(;OYj@t#sw@^=^&T zVtcpbj$(4P(yG;t{=~-~%gxHeMBq)+tDSJGrXTj7M+@g%>wZGYaJ^kFr2uFiOL2s- zfGY#3pD#a%WJnMd-6BfCrEE(1jOf4ib`=%t1~EBg*ppFk!(B&9@snZAt@8&Kdd#gE z4E52h>Q)3!We{fBmjbkDwd|8d#6!n{3S)RfZlk+Y$sk#g60;2iaAZCor$htG=5Fn5wJH{zd}|Dw~Qrxr)b4Lu_T2kb@n=HElp z8!@}wNd=LoZ{_sKZ@r%1fxG(&V0?m)!6HL9I7AZ_>H)jzmp-aKJ^>!bFzSeRd^hJg z7_F)s;x8w-_+~D8JN~2Zma%W*>Qh`j;a$0MuZjaMKEuTW{c|Gtfg&M-k2#-5f~W{E zMn^6ul88~E_^8zBRGi5#1KhiRrg(#!nO~fal=_>N&f{>H#-_64X5_qXx+OsYLylVim|}S zbTdEt-3Y$l^4nxGhJDq@QIsh{Q3Om^1r>Mer~Ctxv1dWhgw^wzAV*c z)z_|98>J6`UTV;gJFrmu8>IY;@;cR9vMsP+%>!3Q%T#ysNhRg9+Hp%>U}q#@m7IEP z&~JmLZyjS9!XhPZr**h+PVtF44{MaPE8PZH#(yYTo-#N&s|2loI{7ay-2S@lE5~@@ znvFUord6xk7O?5sU$*_}FI`(!f^T12)_`wc8!g!1jyGB^rv+r=^;$gMBF$H6wtCQt zZmAm70b*PrQXoY@YfXIP6iVpC6|N##ed%@d`&g(u1vS5 z2S=}SEay|Zrsq`B;`^01S+}F(b{eU6Xj|+k$A;akv#WRytkPS$bbqDXkUMhsJ+z>d_2Y4eMpvuSYg0(e`mtmBv17~Ttc~e(bTt_a&VFti{f{U_ zw>w(;vyuGWPBP=sejeINrVwA%aR(G)R}GWyuC;k^Gqv<*;}wG+j?m5Z1tkA$bUBy+ 
zt#RD{s`IPyk^jcj{;b{IsMKn3FYuvj!rJQeIPg}R6JFj54R4*-4!_;s*$D>~)=ytv zvb$>y4%B^crdspR*h^cX>3pL)t$|u=$5iMzQ-M~8`%vR?7OOA!O`4bfL-Ort7)2=# zgs4vXe}E`Hri*zc2?3K5soV%p7m* z+2sdRTKia?>`tSx>mbt+9?=sg- zn{twR{SR)8x2>>dOCmr?)18;~zdJtATU6Wm@XyrS>*m~}+broFH#xH77Jrr&tY;>3*FMy1y5Ll^xO)xqSSXR;jo7wVGQui*0>9A5I7VoppH2dpW#u z+O3{Us0K=Tw1|Mn&vN+SxGG5>|H0Y5@SR1cIq7)bwfwZpJ5RY zTCGd$pzExKZnOvt9dQ4<4_%2G{pp<1SthiP5-7OOACS~6?U>0mYvc~C)X#Y{Zz;Hg z3)SxGi*_0{`A`?D-hRgG)Ht??O6PU$W{c!~!P@JgI9sEiU!qRU4*O zU*}nZ_)XV4FIXVwy41PQ&UNWobSpW#8Q)y~^xXffx_UEi-O@cbslvTs6LPyR;{ScH@!@2LrViD^H3uQ(c>Qso06j3tdyshDZa6I{a zo<-ziq`{p@=J#Mi5xLs^f(f7TeJjr`1%@GC{Gw%pXZ=z#c(?Sz-`f|s;Q6JbvAj~b zB7>CRQk21(jMo#9|DB-IzV;YYPoqZqy?BG=2IQyUBYs6SQ!?11kXto?*mg;y**P41 zAjPwOqao*(GatQ%Hy_OVV&UyieJ{iRhO|WZ)y-rwPWI>Fl`SH~iFW&A4WD*84GTDw zu0DNqqU>*dY--Iu9Ba%`e=2#o zo2vb<`%!O8Z?k68->i2v09n_#oP2mqe4`kE|Fjcoh~&3kGruc*DqkIo2U6QIhGw(2 zlY&1a9JNvO${^qRDbExgG<V!OiWz+Q7)SJ*@NzaWuwiy=k(ADHP%@2>{XJYf|26r2FrEy) zCBv&f2B2G%mR<-f$Q4JQk}t!n{`gDs!TtQ0^ogACJGjtav{MHpw7;*4?CCN2l+M3r zOE`48T6jL_b6{Cu%4hDD-lu&4qwqHiGj)H z@#y}Bw1Pio^E~^J8Q%=RA)eUF z29pUXjVH+`ctG`Pv!0v`FD}rQLda1}QMM+ki8M)h-rPC4>9qOicD4HM{pNaY7oU^} znsL*4dIujgm9{K4+3?XNNNe;cHh{n$p(h?}N2pg0Vwe)B5{Vywc3= zpX$d$#c+FO|SKGTOs~>yiyI$?VF_PGsrOYgB zkCAt8GsCuIz4NDat_)m<1>n3-LL_WA_RN2{8eRjrWjaYF*Mn1F^Yac{u zz4k_9|dj1DcX_8Ac=}!6=e*iMzY6Y~7wA7I;rQ$_8|AF4% z2*F;teJ{7`+3N1`2k;Xr$-iwgw^b4B&NP5RLRtkXp=5r~*<%NUc3dVcZ%SZOuazMu z(uI4d9#A&8fTBT-fN$S#Np!G@(4pn40oi`L6b|THmGB*P?c3>->J> zfjt()T#>R2A~kuCa`{_81P|b)LXIj*KYy@wQc~=Bwox8|JM7=J+=achiwu~b3JK-z z(`^_MwZ`;#eYynhRhiXAX!rE{hH}9@Yjo5CW-A2cb0RJQ!+?L{tWkqMF`#En4wcec z|C~^p2rD7a$o;B6EsOexNW4G(x8%FRxvTt7OL%`N{T%I%d`|YJVq##hO7d^3S{G96DolBN0lW;{(!z9lE){=W||)N;LAt))?Sx&|J9OfI|& z=jXxgs5^|r?e+zY0h~4CxB`g-mbu-mHcE&mVj{Y^I9aEEX*)G-?nvqRYX*1QbhcU> z{zUPJmbBcEas-A?I_%{uLJXX+2XE=-8W=%RMCZO0r;w;&gfb1i!U_n53xt_nTN3Y_ z{_^ma@tf>Grptf^!XOW2_p~C8{LZ}tv^^hRVy1V0?g^5Ur%KvBDVJG&=fZ9?zA#2( zFTb|JA_(w)9S(8~R(44?xMn>(sBOR{c8O-M6FWNSUp#nk1ET^Gt^rGzO+NX1SW0tmp-`fYUyqkHVl@*AWGqx0b?x$IA; zV|b+>Mwe4X_nHj)7vB&FxE!9I4?iPr|Cq1bjW-(-i=LKf-<-tY&qot?2E2reE=y!h zMo<{e83N;L0+j?GOJ|5cJM6$M?A+Gfh}`PRMenI#hD#z7ZTx{#I*{ci?qXsYC}{Df zP>8IC;LtjO0b!{eZ1JSZw%PzVLUVwG?X{mrZ7n7BZ% z*rM0{FGH}nC7-2k$*HCgk zg>;Rnkqox>X$2Ax1XMNvOU>^{yJV++iH7RY>3`a`_xi0n!8Jt#u|3eeGax8ei~KGo zkCdhHj21L$S8NzZg5KCXbeMUe?3+L+3V=-L*Izgz``L*EcZ97w&?&N|F-G2{Pwq%D z`_xR~@UXmu+DmiCzqTB>A|{V7GiRlx>svDqj2Sh0)vxJBt@3M5$XR&fq`!3zHdtxAOyA1}|Tpz=U=$?R^0Wab)fBI*!z5w5t@=4HM zen)UPv>{mPv6Oul9S7m5`+$PYR;L0`P2NO8dkl0y09w`KNbOp{j~-ea@C(D2xXwEe zn>l5Zs+ffLcZQvle5kP(MU5T?cLh6VX@F`E3Pc6<<|a}oKm2`CGRb?Gs{1W*Nf3Hj zEMRi`0F;0QKM+WdAhqx-($#u~$mmN%#o>OtEK;$23uUZJ-6nB|+ZuX}%v||jio^h5 z$Xxx6m0&(^Sa5TA((}TYf3-*k2aHp8le9m1`t8fi+C%!MHotK=U*_xobm= z)~|kEJ0tyh1%jDL^|b1oVgsHtumaraRWY9q8CtuZ4nHQRBeIXP4wA@$?uc{7?dkM} zzq)oWkCm8acaO`}eD&~B3Q-AN7~{UL(Va2q3=l^^m&FVxSk)2(u)uUXukOfQv>;eu z3?~!u&v{HULiN#>Xq4gq;zNo4?s~&-2Gl^s++J!3k6}I^l)!*Vy1j{jY(iC($YqIQ zG2(g@U&!A8+q&2#GPTnkQet67k2F_0kOlAq-u?aRZp#yMb!g%Yl(~3xZoXxf4x(`H z7=>z)r!!DR08=ETP7P&V<#%x!*X`{916$(%E#11ZgShLdz!jNGQ2B#%<8F?8ei=zP zegor@4klGZwYB>`veoUlQ|G{^I(D$cjr@)*cB*Z5D7>(L4!pH^$&oo_R8kr>AWEbbq-RPUJLnhVZr1n<-ABb9csZIzjucI5kC} zC)XA9fq7fI7eYf5jarFRy~@_?lLrcO3+bFxrxurf2jbjxXN=pZj^Jkm)u1k-JCuU5mO8pF?n{8Nv~EOD9m8Y!H*e2*?U{dgNa&>PFSza zD)9)r2qfC%y{B@3tgcAL{;*En>86>q^?deBn=Uf-39R$*Qa!Z05k+G)rdrm8jUjqk zd5#}#>2l?&=VIgOI$h%WLx68#|F!F4jz=dt7IC=}A}n)8Y_HM~GaVH_3k1cqh&ldO z*x#=Q)A8tHaC1rNb4SzYW(+wgR9lE zS}M2M%S!DSVe1A}5;fE!ozHz;gwe@BBFfWLY}XHl%MVt25(qcDGF^K-g#v-u1qg4d zSd?4_C87-UojMtp?iiY>~1%@+pJ_7E1n&?ScTQ%SN)PLZ5@QL 
zJ6?rqm5D@3#LV1RLBeo0SsK_&Gz2QPArU`>TI0-HTxBI7R?}~&;GAGT0UH#e>&Bm@>ze{ zPx_~)h&4;x=?tYc{W~6OF_9RzjHlQJRxYgK$4rP6Xjj((MoK%Od`P$=70IRFL6 zFVEzlC#ce3i##U2tn?T^(n}zpFB72L-hI#bs~v9ln613@SR_5>l2U`Z5O={I;J1Uu zH3HVO+*Px$@eOZR=^BX~GoppDh%~XQbqQS3mh(pzrr+nu?KX!kxNnmonA1mR{19N?tYrnL@^l1NZh2^JQ-J$bLe))NQ9zDYE!ntTyp>RBJ4|+Uy7ZD_0)pD zr(kh8tvZtobEyHjO}=qrj!f(k!)G}i47x)QpGmgHaQY?B$LU~kcZGc1Q-&a{Qa?WK zsEc8e$U+~9U4ZAM%zSd00kIk$Gx9KP&=!?tksA9{dPt!593URfr%lB zonW+mcBdSO7Bqd>qiBn{Uw%TzZE()H8$8vkR zfz6y?&V5BzNw3M(*Mr_WMji|?g$mE`A*J15?ZVyqm6hRU z((ov)SqAOHE8=xF8cvUGLZIur{e-B-BA-2LfxS^Ho$Ea<94YkeSVJD@cBoOqrL$S~ z>UlM~_y#r`DP*z73DR=z5KBHxuBTU)iMw9SU+$3OjMkPBz<-!+qmAU$m4|It z7RCwkIvMDC;(VK?%}Oz?kg&F|z$2C2WDdF0xaHb?2`lcAX8N}>F(smP2qa3XqI1+U z+Zu9osEW9rKehD>W;vM!V9eGl$Kf9 z^-9z2U0&0x@k>#l4K6T-&Cuk3Zq`M_y3IU#96OIvlu&$?Zo6HnH@iIZByLGHoH4J& zgZKYK;b;tKum*sK0kIHGHQ@fU2ACO%xzPX<;Upu*0jH_0&=KtGh0u8U1{L*N2 zy46PgS9g71yR)3!CY@fh#RkUXM)kPiE^D@X)#DB>6Xc&pr_*xEuE%J0!+ocEe2lvt z*Q@oWyG%henr@cSA>U#1dac{6*4yqfqBXVLWd_CI*4_WN-mdkkU3a&7yI!kyy0~nr zy;|M2{nyGk~#r``G`tfquX`$JTCC>pgeb<#2LJuz=bzKxtdj`(Hao z@b8y;^SD>-)bP;J)M>6}8%x)0$k@WPf+BCyztBfEYHsO{o9^q!$L<3Lb9kFOW#l&S zAA{s#_-Wu&q5H*0_)jrg&h>h=QP=bWoIP>9r%1lK&&gbNEEzLd?k$B#u&h5^V5GM>B{X>LS;;grlAriOdZfW^YBWS zGxO9D;pJeE6|&Tl3`|pwPE~3I9Fa98tCWD&so49ItuvW2FnG8JovaG*F=IZMhE4y< zz;&fL+1I9gFg!g+VmUcIA6yD?<@j`{)8D_E4v%2ijodC}ipkx?9RP420u^_Ohf0)n zI&(6_E%ltP*D0-MO%Ne~NXQ$Z_+0L%!Hj}PJq^#KZmBzobfw{zD1VS@`(O|L1ob(Y zmJbiFD3usKi2@P3(vq_=589TM805=#k3hZ#^S(j8U!pUqaFS|djK?F~yG8n4TLYY3 z;OvA=TsnhuQ!Lf0)&|(jdxwtGtd%Sz4KZp~3bPP#XR_;I4zI*R+k{y7&OJGc(X1nu zyqRa^m&P{PzDVWcGCO*7uIM`3JPNaz&i%a*)gp#Vml^#cYqkG&mm4*ikcgaV1E22l zrb4}8Are{@ex)GSKcujeLoP(A{>d&9#Qb4+Nt92sPt0T50AJ4tMpe;r-$2;Iv1y_S zoBY1w*V0cVW6$|Ys!CZhn!@`nq*zT(iqt9q3rlFZKQWc3Bhwd?GE3IZNp&#|rHszM zNQ!JDOMa4q4%EAP)8|m`c#vszXZ^(k*Ht%e)ht^Czn=yd)1-`4J>?mgSEb{|r-YwY zS`y*oxK{5-L*6b&IUsnZ-FsSAFUT2WtdO1Ii(`G+`CD^UT3J3XXPj!8iE$`kSn4=O zb%$Cm@GPNQ6@X3p=g~dVQl;+H*jW!@Z+FrqQfAb(iU9VFJz?k9YdjbV$&bIb9A`xG z@?|R2MXJl@j9sw`TCEg0t`-9QwUgI;RonmTynpF5{J+lO$zR`$PtW@x85HmR(|B~5 zpqFOw1wAxh+%?&s;J4nhnB3v2O4_*@J>oi;iyoY7AjSH09(r!F@1C2!)sUDLG)~1@ zsBi)9Zh@txp6MVVmaS~$&N?pMskuWsZj#L_IOyosk_r~GXlP7TW_Gn7JEI2@?>F&~ zkX!00jCs09`=5{cRYh#{c5)UXcel=z0t(Bj!@13Qj^#_!{rkrhN4s;h@-3t!`>)L6 zwf@Mo0K`+~O_mkvwDTXL6eUN2B33BITr$|(Cpj+Sy(Xm);%{rCflZIqLptlmslvQ? zw2-E|!$3n}YGsa00VUT?mUVXQh>dG6{R?n_I)6fT{#L~dH*>vS^$jKRO{aoON`aTf zMPSg#TG=&MH}!t24JasJ_AJt`ZKU7F$yNWvIX)w|n|>dEOx!+#ZR*UXDGGEy_lH-L z3RoV;pWXiA_316D1Iy+TYmCa3pT^Qm^=VK+`0IHF zPRpDvCc?RQe_uG&!P;B)4Ha(~Mse}tnq6GhWaU$CXk24kJ7!lF(`s$G1EY5iMFjF- zixvecPvHa=c@mKzUAfoPxW7A2-@3P5BH!d;6VDoZ;wpYdl~&UIh$+e~njwXq?4Kg4 zz|2VpJ4>AYdGbZ@fm~i!jn&my2&AjlRo5q5_g;3uY4<|)P_i5 zadfx))oR9Mc0S9~H9IB=;v)CzmD{R!4?xbBmiTY-b!J-tUqGP07`S5N78NnFS024qNUZl9Cs>if&;NI3WW zdpL_xD2L?^SNYSEIyE3<6peiRxx5cYb;PL32^U%(2rl;r0FXOXcWwNKEWZWORsXrg||M-@Cx*1#~8UdYP zHdjvlW^g>Rt(@gBNuwEu-dw#nrGK{F+&%s_dCVBubW%_$hX+``BVJ4jJ5Pe9c$LR2 z{eepGY;F~m=>B!eLA={r=VoQ|N@Oi16Tt|QDA0^l%8l?!zMJcB#qh)U?4uXM)0-)T zkVhb(5+%I4CtUA~gMt~$WLfT_s!qO7zVRoCm&RMR!rpD2Ww20KUEj;|i1xR&NhcPh>V95 z%q>rn(arSQ&3Au@t^lzl#)HYt#nfGf^eO-bj{eSn^qqj1X=&>1YE^84S)(=s0-}a{v@EuZCp*Ln_)k#($s!~7sT!8~ znp#7(3!#f1rE(r28QO2!^(Y;gKyD!QbiSsKZS}BP^7EWITYhOU7npXHAq@te@rW$6 zj^0ukma>pTHNOY;f;3fC>~*d65-pn#FTQtWUs|x!q&IFOxExTwKCi50HhieJ{ovJ$ z{lcT4Arv)WFd5ovk2XIzrK#A)Qt;-}SB;j0Jd4ePmclpM(pO)?eugtsMSTg3Bwsa$ zb8MEg+zudCHEYQaH|x|}(k!DUnG?~-6azgndoF_znBl^<3fkIh?G}rr&6)=z?1^XA z=E8Uly-)Lb`XrL-SG2HN#zXhwe$?n;qt@{6;$99jDesMQu~pMjH$9D{JiOJib`>ey zg-~>Ad%0sLV;VD$-&+NRO7_`#=<}B&L}tp?8z=B&wRVf(tQ~I97V_6GXehC`Di>nLZTo! 
zgFkk2HD!+4nM6~kFtHCOn=LCTE>==*sO^G%?So?Tj`=+(E0*jvy2Q2R5q<`k%Z0dt zX_|nq{=ejX`)JLaf~^XGd3ZQ8i5F-3Il3T8+DG*k%3=z_;fov*8o%4N&+#=$y?xZ| z)ciMU(Ko_pcTulx-+1+Z@$-6}qvQHde8WHExN%fRBg@}ELu`$t-t8XM-9LW*8TDQ( zde=jT;iP}{WiU)mZieiKONNulja%u#zujPZzxK!DzLNo;F`A^y?^&LlY;$%w!Op<$ zL9Js^Y^BqswB58D(2%cQnwvR&jbQ>6CIv;s~u z54OXKP5>E1WCa1cMoz^dV;Ww7d9(`09Z5cK<<6sdrbG66g8?x(%JR*{Ekl$~ShwbH zEon!SUTi6PW2Z}A$Sm>>a}EDmfZYxCV*6*dxLs}6nzcbS@?49Z?J_Oa@>%}2JV%#I zOoljCPu+*7HZ(q~@JX7#Ys!kg((5+q-<%CqnT+9N^(VEXIkPf z3+H~wX~A!@(f#GnP~Q#QP+x9WGeHyC-hKDad|GXeP`^=mM`s3EXYD4|x3mjn2APwW zK!y8>+-j@BY^sknx8-*}gz3ADDn51@zqu|_@uVnqn0yb;rPHO!CnDGZEERS#O?@{Q z0if#~HGivjDtLI?eJG8l&A)V2BlzLi?nZ(^AXt-Z24_qIvUcNRxpC>Poue*)TbG?y z(JdO#Txm7^=XND&*v$&43h3ZKYURajZ-|T2N`Px~o)2C@_w5c$sa&advqe)WPk|KG zZP#kQ)mm0mKU>ix5krxG?fzCfF1`7|y*B8NC(hIv4Nh)Oza$?f)<-%XO#9BAM1ja8 zxw!@k4ZPIxD!YXwDS`tJabh3LL-83Q3#CMO2H)nhmTG8KJVmOdt~mVqYNt_#Tjg{+ zH*p7Sb{9p-ipiG@iDudNq69hnX$9V8j{CrHWVAY#wvENv*PR1naii^)e{DM{U8wWo zt8EXSpM(BnbmbiQ!N0DZ%0fRmeh$t6lt!0G{xBF%P}|}5wa-X}Cgzyd<~G|r6P`}0 zNA8_Jyk0}&8%+(?N0se98%-riBW4`;+T^c^8FEU@G@g}ayPTtEgM2g=&HY;z*XD0- zc9O5eH8hm(dZpG9GR#^9)k;nB)pr|2+McXlZZm`gh1sM+s8SK35 zfa~w>)i@>wySrCiNZ1=~RLxa8AV-?4)(?-{g&Dann_;L_`8&fe z`<^=kFBGaAviu0fblCL!FPME6{`)(l?_28pth>j2$L#w>;wiuW1N$#sKBxb`XZ*=6 z|Bsu0wL|7#?QQe#ynpriygy8)-^PR66N`bpw$&`X&YIpCJcDWj95uMQ_B2&<+rubuV%j(^}*@JxX=^MvTyV ztRPa=VH{#K+HxBDhNNP40DMBeEg%kULT>@C zHFNsjH)&CJxj0U*I0FoMAvC(aE_z=t zZ>EZ>;r?}?%HJvSh6mQDR~wyxtASeoMqM#H@z<_m+mtH(>m3Dm#1&2LNs3@O$oOT( z2AT<~0peatR_LJ=#~1m0`PTs#=o15+3`U-#V|!87s-9Tc3yYL2H7kC_*20O<_K* zHlgZYL|XU)&S%ee^V(8?0=*XL*~16g`RMD(s6Ta!douc%oDEJd`r|feKU24BJ%HhTEbaa(WzD=fs%Skf4I&}lSIKBFq zG~6i*X%r*9(W6LelcX;a`(eGi>Qpf|zJPz0}USjDRI)CR9 z-j3qL*d3^u^yyrzX8X{iJB_YAnpZU{jb5$%=w?Tu8vsk=x6r|f8eM%u#m8`b*}pBrDQ+y4%@QlKs>R5h-Rn%aGz z8P*iI7OYgz<{7Du)nY&cET=>EzM#zbF2OK@2){Srim~9#U@lrEoQoHy*?(yqJKM?p z5I&&z)q?hwgv_Vd-cm&&k#jeKZnIizlyB6ob@px~BRp{%T<1YeNnj}nBaThNGNZit z`j^dmMWb>_M>qYUCG1Q+NUBDUn%i6}S+sbBO5LWIHmgKpBn5EX>$%&YJ}{uez{f70 zGAZNbH;mL%kVY}T_jdR{WyIA14*P3cZFDm|za(|dlv1J!?ym9Bm;X!e4 z`TFptl~=@EF_Bz_IAZB{VgRCs`h8InOLbD}m7kKX3X3KUT$rdarG<$qr}vIvIe0rh zD5(|QQQ5Fw(r6rf1A-F-Vcrwv&i}j|p9k~dK6^Gaj8?n7yOWW_UirFo*~DqeW9wHWDQXeuA zKrgHMgrMZo!}>N^U{~RkE+KyD_;;nh63PDIIqu&J23)$S?9cZO`kJk+yeBIjxZ!lf zMt^Wvef}|YZqUsYyWiYdd5NfHa_J1O>5!_klzc`1` zJj}H)V$oDkJ`x2PCBfhWvn=Kh6y`TioFCxk{$>gL1u7A zFD0JK-}wGDTLgWC48S=-+cXF{UruIsA<SlB#-hg#+K141s%MO?ryz$@?$rwGIj)) z9>(y5f5YARU?J?z&xjzqKQ`h&-Z`(#4S^F`gj|y# zb&3_5#n&(L0%bDkGD97Xd`&`!+j#-W*%>DPxm=Yva&&s0d1~q!a;}Q4_J(gB$#?{d zKsbB!TK0)QtpyMot?18EDW@$v&W%oAV$-es?RUf!G#v7kheC4OO8Zn;PG#y+zI6m5 zsLH$D_nN@PkfBozxS^%d%p5hEYg=-L1|Vjh3Cfudot=nQek#t zju)4QionfZcBS}oRotzflXMPfvlreaw7t+XXq1FQ?gp)k7zFa2<_-NuEFmAks<%oq z#cTv&abFdu*Hqv&)Es&boPTQ^3#T{ZF_6S3H{kG}y5BVRH-oLa1r zkffTeL0LMgT(B+Y+cKu3hcgnJ2b$?$2{8fuqBHD-)#Ig7ig=6M z#U%$SZ>z04pcVm_JfM5S;95gFFeRz;&UiLxSi&&vHQe^`bt9A==MRdZZB{2r8k#=7g>s=Fp$^1XL# zRqOS=JatL#|9<1&Xww~BeDr?k;?0@2S7uil+FAfMf=^ocw(G; zkAIjexLN3Fy791EzI;}O^2S5O2a@wKFC;3}Jg&bYCmV?+vHrvqa+)xQfYe>RtjZ3L z_5J?KSs9TxHBI6?QnNzPkd|%Eb$Lrb>aAvkhA-*7e>@l=xW(P-y_jF%72&=bPR^6T z<@Lqr8%h(tj>hAm6fK+&Kc71RcV8MzrcvQ1u)Mr<0nlB#6W16Nxqroo1{E~QLGm;? 
z-DW@1LM!9_<)T2HeYDOqgJvru+rFppWgATC6R-gCn4=lBO6zRDB8<0vOczh0$AX6Q z0Y>6xQT(ZUV9^m>k6=0}BH9X5P(%PfLg{8;A{jc@9)hMU4cga8<;(lIl)`SHcr|-l zy4HJ+0++D?`kQq7e3$1GC3=WkAx(b`x7 z{(ipwlhp=K*&UEo@*mN@R@irKkT*i?yX|Ika3*1G9VvmAZ6ixo_j(l`-cyKW}AVp@E&0;p1 zkY;V(ko7*Wq`}a++rDp5 zlMdzGV_7^1W#thiS9U5&3&XgLlvC>3L^f};8RM`!jRBwEaNqZRf;%@zqO(|k2$*kT zn~vetXL#9_=q(L)@;?0mOk5z3g4Lh|NiL5Ujl5nHZHn;a{xx4YQb*Hd>g>Wne=>A` z99^+?>Ux0e<#>QLi}4rK8KBm4BTBr4i9}=!14bg0fmKB5B!Ws19qrUQC@W)`i~G>5 zvG?PNflI=^u{93LpBQ=45hgN@J^ zdqk}Y3((NBC`HmtKSDg5x|4}Z0&8z|R z5J{{AiB`1;{K8ZrU>TFziuy+hu=b8I4^e<}SBo;Eh1oDhsQhjy*mClWuuCfTNv@Dd znXmYim6X?PwDFp{7~q{a0@Sir6Z^Y9$MvAjp!f=<^;wX?rE7jA3D$a}+No*aq}pk6 zStJYeJ)f^aUmyWkgP6*!-DVy~S6OdS_5_uS*df}-45v{;)9FT1R-}%k1mY}no_cHN z!F$Vic=L^iS6>FGY5wMn)&9Sn5&LD}jM|%vGZe!p&?Ncq#j{ zAMOZVN2u>7&f%TA4eN}(@7X-WV#iqu{6zv&HJpjY3bHcV9CLCQ4*8!s(5U zU)(wU)4NpAirN{h%3GqRcfsLO_B8G3WlBbEx_4ogJT~pQ=$S3@mBgcU3MboVg4vnH zXH;W)bC_7Wtj&WLz~}=u&X$A*9--Ec*MhUechD$tbXJn?+fYfKoTd|06VFU4FH_xF z@z}=IWw6ujSu=vS*i+9b9eUaHo86P}wUFbqe|6R$pCyBf!I$ZH_{GX_`!|#6xPLM1 zOB6Dc;UbTSa;3pM`wr8==?r%||0RWrelq+&uQjpt66K4t#YN^fL?Zc|Gm*==< z6}k1+k@iCDRNvNXzLa&R3ioeN^knxAHwUV2Uwftg=MXO^j<#ENmS1t`e_NR=i1iEt z@l$yYiU2clE5);N(#AEjuh;V$hQU9r=GCc-sWta2dY$F|ikh4Dr#F-2bN_N+b&S)` zmmibnSSrUp4Nj1DK{?|{()VljuxRNPG5e)Jkbg#fQvJA9sn?EKkJYPm+x1YZ)u?ov z^-8DLBW!W2Td6l&<#NYnrP--f+Kn#1UTGY+D^22>Cdm~_#uipf=0{Vn+hBY}TWL7W zcBj(r)>)b6WU}3Ahw-#4$Bky?xW=c|E3M;trPprkj;L|msT}tjI-=fjrQ2ylHIWn6 zNTjp>*cMh1SwBdoyLK@38I~TpvvUIh%Yq9i8<*%IWJn2mUn}pAIJg{@r9xM-zB$!Vj{??s5WZ+U3z0 zTny`3A$2+MiVeRo#=g{o7X$N;Zf=NL;%Rt9v@&>&FKm}GF3UjpJ(a| z#DRATo*|_lYJWRx`Td3=F7G=hTccf@hPTjn^S+d=E!f*OT4)f%Na*B9KN*?KV9-av z3X&latSa~1T+qD4UARUsRz$9`6Y__dD0qpyS_8Sn@RuG%3lYr08~%paT$3W$T=%nI_I$-kXKAf(KLq*U(ti$mYvO+6>CI;~cuI3&G zoG*oDt<|yLeKGu=cqOTHed6h=qX2P=dLgfcwMi5091WfpzA8;S3A>6o6Mj#)P%UZL zFWuEbZuS;#soZynl;bKXA9e_3C<I9DEyRhRhI^@wo4<`6nsPN4v4{hiGKy$DsyIG7edm z)!K>S;k7$EAba&zH~dE1Kt*L}qz!$TM_^Mlf(V6aDw$CwD52dE!V--dNmUX;T)#ZT zM*DbYhz9-gkA|qtYp*Tg&DC`5Onhy6mo5NMmG8EsIDc3TrdFAFqFW2iD4kZs@A|5p zQ`*zq&A9=q`|$8jwv1p7rzFR^>Z{)7nZ0i`*>DA5irmiib~S?2gDuypHg*`Xjm8%J znB}x6;K8)PSlKgUIw&;DYYR=I9-3zOwd1NbcZ(r}4w`zmTHn8aT_5n;+I=0IA$fK- zzWFS4v3>Yzve(=h-5;J!61Owad-CkeLg=&gR*B;KC+^*x8Z$qJ?gF zx9in+AAS>r1RdXS+RB}XCCm7OP>ufG%cb>jPTPgHa2Fv1mKA)w@C+CxX541a?T?KK9 zw+tzD3Cf5RJhO>_6xj5p#roV`94pY%2%LW=A0jsrdTJ%pfhN3l^YYySB6N?PF|pEI zq6adkV8R5(+Dbg;Cl)r}#D#O7?N+oYvAj_mKp*CZrAqboHr)UhPbrL22<)D(G@~kg z8q1P^!QGrq&0*ezNReB>pwyyC^P(>6s0d?%FUD}ch79gu_sZ~c`dp08y)zhKhLvCZ zbXU}J^o29Pagea5(AZ7B(}qahQTaU$tAu2qGaf%o1}EI`mB>~&)pLtvayc-b?tC=v zR>_4`3oGWC?RO@_MG>qdR4?WajQiKZ2VaMiNz(s}Y=qNOKvudQkI)~1sKWRL1_PKw zaDqU0<%JT1P5Nd%9vBx;73j#llS~3NkZQ1U^89fT}wnmm6hi{?9Eu$sc*TGjPEs zar5KL+W2(B2$K#j;*H43tFcrwVR2|WE0HP{mKIqNd5l$2$K^(zf;OtMj#o7eu)t2R zTb`p%fX)+^LVL`fXY{W+tB}>Ig_Jqaq|X_lck6oRG~`l_iy0f}|_zDp3zQz zohpqLA`3)EN$E}E;Z+wUf@q*+_wEofB5&fpZgsFfODDi;W0th4ifPKXJ z`#jSrh&QebmtAto&P^3^=e#Tikb-yivsCGZUsAUT$Tv5GlNpPL#aayZ&5O@BXG?b|Al2m$Ldo)Y zO{ZBHtKe`~EIiP<+rJCq=s#_pEeYoG48yqOF)x5u@<6$@5M1NPATAN%3%MJDBH}x9 zx>y{VJ-CHK8fLLw?8u~UYCCA5#w#iv0SaFhmkQgu{a&Jig73~6fGpq_;O=w}ArfHB zqwU2;^h$XZQ=yW%V_}GMBTRhC)}gmUydrwBx-HbY(<*U?6)hqhqZ;{((fy^^0TFbx z1lL&jz%2Fc%9J&ceyvx68L?cs7gkGwp9sW}<=lx{O-$_G+uTV+gKx0dDX-9Yj?j7f zph&jKmZ6JlXXGpHvqYdE^aRtf80;E)707xY0^zp!sYMn4*Y4T)kP%HYOs;Q@vc-b} z_3b!ivqg?&>00(cPc^ zr7~|p+DkJs+xk-0d48W6iHT4bD|o9KnyfewT7$yZmuN$FKQ_ay@SH}P*a_VA0+ufo z@3^^HCnyuw6oWT&yz5i%RYUVcY;?rk4&_?;Ml{Fhi2@>TLd^;(49Yhz z@kH2Id7$8IC29gv5`M^UdR@~|a?!uJIz6Y`H5rVD15)u_j;;pZtTybr5AI9`>-nPM zs?3wmp@xA5%{+zuqJ{i{3v3gx54C<&k2UxmECAV9CmvJUnr^5oWyiinS613l!S439 
zxLfB+Y$%0V#t|x_y_40}KXxMYBr*UW9Qsp=V7@w=0a{QBWb3VNwW*PKu?>Xqj9SR| z4Idv@>n$x(c*uVkEu6N`saknqpB{VO+qHU#Cm&kGX*C&-z77pPN*a|;GjQE= zzJ0lne6{)NSN9{W!$8C=s2xS}j-5Ve3wjqu7ClA_WkP9hl`bpceQ=wUmUg{W;$&Zj zK_V|&uUG4(E`2ZEzyjYL3tWDoyhn)7Q3h#hUj5CdM_rC(9MC6 z)7JA|z6O~#?SGmkC*x874EQ%dX#Oz$bn-Dd9bH~u45mUi#@sLoU#EZk3$e*`Mv868 zUn1M{!TsU$#}a)EY3VboH4bE-M@Ek~3q(>nNp6Pk(Oo~!6sxHykrZ9vnTU~AIF1nJ zP3Ku02`Hf1w}#Ivb)JvFO!ba?xnC9!4eSawCfV({U!zrv`e)J-bE$tBMbHzrf+A*; zC8(O_O4Nz*kBmmJM_JET%m4BRCaNeZuyDwgmwTTMZ4Kr~xS7}|)3I5kh49s__|dhq zIi1}xe|pT*O7hjM{n6L7fDCWoR)0Mm{_E&6ccxG!`7-@i|L6>*QL;(hzt4|Qe7AX( z0At#}_yS3~9u2RizfGot!3E{1e}z+NvdK{U2CF%+Yr`CpUe`yUNj$zd1Aggb%es9l z>MxiBOV?9ijMdlkVbc{0b`e+*nx)uukgst7*0vmZTX1-wL)C|Waz2ck438+GSVe?z zkhx&WMX4{St)xY}5_*`$En{qW|)<(vs`UTPc`O1AuA*gyNIt2X5fAr zJFVCovJ!zTg10a4!j22nt{|r2CQ@F`OQYwhh^AUjQOT}}SUM>PlSO@Vt|=5RZ$;v( zFr46;(=BgU31#y}l12{>16DrVs9cQ2C*L!PF3|zY?j^gkvHJMpDc@vb;b$mgdBhx4 zOd#J!Vrp)=2ltZA!Nu@uBNGW=wnpmhv1i5bRC62G6UF@Ckfovn8x3J>F-x@(!!1LW zZNgTz30IOCge~*AfH4H%OJ5fsG{8JxEX*a_ZX{{w?AIfD-Gr);sCA>ZJUsc`q z!WIlq?g%uw(i7qJod+0k8ewC7@-Ody&_m2DTKt^8I9VvQ@1c+z5zg;X42pYywvyEr z7Rp;?Ftu==82$0hWP@0;)hCYc?+JL$s$R(=RW$wdu#$3onM!wn75S~iT?{F{-rngo z^fSW+mnW+62vgSGu4~*-lBiOAy^0lx8N!y-G#bN-Qw;P1{A@Y8_ztCr-&^a@6LVU14BgLl4H8LGQ6Bd956}ctbr10$1G5T+5#y)A_v^ zE$PPEZKRk2WfxMzB1^XW^BxH~uie3_wHL@g;Idg_jZQ&ug7xUb^UsUMIgC7+=zus`}w&f2|ACBi*P z=PSS_Ia67T3U@G6A}!{|QRV3nMR8`BRe7;rSh6J~k_@2yTG@%I%O-T+dcZj%8drOE z-CW`9FRkvxG-cT74=X*5c{^t)G;55xi=b91dTc<@?To^e-`Mfl=%GZv%9|<|nVhCU zOzOSTUJQ-SnC3>ct2DsmiBM#I-Dw1p(EX-(EV|olek%kk%azec+S7k9)+<^eoX?RR zR@z;D5-cLfjeUix&s)opUT2bpUf*r$!(D?;>^UPs^pKe^H~KBx?^ltP5&D=Oqb3xu z9Ka2|T772^g`MA4M8O&u7Couk9xPK^y-fh3fNPxRZZ1)0E6) zx9+(d15vtUt%L&r78yW@EJwBllfX_gh=CqG!p@0?kT;~HbJQ?7OZBgrhTc{38wl%o27ZsBR#<3e4qj~Ld=kyhnAxvT{I$gZCpUfxgt znz&+#1`g-=)8AyOQDfZbMm}yP6NyLmDS;<@HN?Iy*T$1M(@}54jp@T0ERI2cvpl{C zElJItf%`{>9AUlEI(*TIT=d#x16QBnNn;}U)W5huJ^0tb;0prK{p<1YBIzP^kkdUf zi1wu2sdU&{JJHNnN4p`($W1V#$i z+!qeFdX-woPlz3UjYg%`>%97|w)u8vw_a@=$53Q8zl!JJjk7SR^=h;9ja_=2p^xA> zeIf~TVKzX#B~9+PYZ^w)b~B!vI9_`&)aWGW0(Ir!V;CS$aWT63oRAH0a`U-APEJQx z({caQ$<~qkc{T_yn-{`8DDVe??Ntk;v$>)guqTo&=rYl3XURUN?8K!TL?0SuR>k%K z@mRG4h5ogulsdD;JYif?^6!MQHHeRn>>-;$#{3bd+V2Wyct>hVtx@_%`T)4ubpAc* z&jI(6vOxBh=lpl^Y2w>6mH1qQ*M`0C%e>YsIGMjO^TSJI6)p%{JQ=ucd)*&TkslH+ zU__KkHT(F@(M5irC4V?AFTsz9@0aC~$RqnAt-Jh5aqEE+B<)Rc@r3V}^$zl8wf7VR z_i~ayenoaIT_k@v6Mk{atOZtc3mPT{bZ3$!-rwS(F&zeuc={xAv-`-SuBGZs3PBI5 zyU`gMAcoi9`s1^So4nE4$7C`cgX#*6Qot@c9rfLRr>hxSAM&S{&|JphMD!Ml^Z@~o z0A?r^TA;S-XiMnB>r~2SPgg9sU^6~yW2AY45kB8ro(g&I1f5#QG@u%mr?Ljl$)@4a zg(vx+<++NHM3jUj5M|&zVx&r}*X2ySIjf>G;F;Mh>bfzeSEXYHld>+k9~9s|rH4uf z@0=&r)GU1ih+--GlwP-@MKjT8JwQtMdTFZs46S`}hcEcuYv!;bbii7J)On*d@2QqU z9D1S9di4bRwvb38S0hU|8@Cbs>+H@fz;n}Em|fEE%vocG0s{zpW0%XiEE)EOVzj7d z!xoCW>}iwwx|lQ?p%XW)8g-&tk<|-2>Zks9c%O`&(jFye&Y&f>ukS3k=0f(B53f$g zgFdiw*18v_oq`Om_NN3 zk^Q{h08qZO8zg_zCL*XI%v%I?##iBM(0C!LXieS$9RBd9xxR}$nDkZQr{ln zEKb?lNIwsS|0%s68l0!A`F3%OvhmSlzN!R}hujb{whUz@wu~4oiFjfT zXM7>7lKcNN>7*^KJucD(Lug8BpDd?vCp^^}ZMAU9YsWcu-_7{jd+@=20c`_%$%Q2+ zxf%@4nCCtjUVxl#x+W;xi<`l_wR8*ro&q85gdqce+}+PtQZDX0muYM$-sPj#EZ%B$ zxEy^hB=2~e=S=XE5M+|tYyi9F3}{OPsXTY?uK+ltKb^4xR%c51E~HqT_Ss{m&S;LT z>|=hvu`cfbCX@QyJn8`vo*~X=@{Z}J@41*s)CooLPOzH-4UR=~X)KiP@|ew^+&)8~ z5jB@meTk((D9!LGtt|XIlL(fj^P)Js{FJ=I+(;H~^}~k-7wO%4sJkZJf~#%hah#$tyIGO%}Xw!0<_9BUI?*%JtC2U2x*d0 z!of0bh4O)$Wo^S)Fof-9tEY``MDezq4nl8*Yu%*$$nv>lCi$*9y~6EhJ66VkEbMx> zBHRiI5Te6dZzb;Oe8(73+d5IOg)mk;8=W<2UZ4p^+x%oFy*7E)uOW3kJV1_E%c1nN zOAi2@-ucXNj%pzq)p+F2Oo$iofG$EeO?qd`=*7>51l+kZeQk-LDA~p(%N9>yH~1^t 
zmnrEez$+qp@=m}-S=^^vY5kbF3!MNF?w9QgyE-C!@nx{@R3XXc@|(aQh=u8nfLPKZ z;v6{VM5B+KYCmJdT7m2W!!=${%4AjC?p7LZJUATF++kXzb0d}}gQR0aAa;gktG7== zD;5#LM(Y|T&euBR@xEN78%4<^VuHz&kd$P1TXCuo%09eM+*=UM@E(l8ivS|eA_vtE zYx<#eZ41?HEK27}Fp@HVv4AvTV}unH4j$84=VZ-L#OT0}_dUSq2IMudQ8H(vF#xd+e;FC>o61VsGSs2(!)P z;ffg3f>@cN-aYgfZdO1(wN!g~%2upYL!QM{#68qPsxpg;j$pf>ornybY5C-g3V{p@ z4jV~dPqM#D*0a!4z0)RgB(CraeF{u#Co)MrdZ6Iy4D zS~NGA3gy$4sBx;%k*ZZ-WjZ4wFT|Ng&4{rJJ-*SfUZHEnrjHtAb&%NfovCWp`~ zO8G!g&2p)I&eknK!;2@j+4f_6#VvhndH*2%(b?$h)mLES{oOj-%I*>k?XNCrfy3;n z7p`u&ncs4z8(ciHHp(x6uQ=(K5R+g0pMJm3gXx6#V)63yb8Nz(iHbCZHcQS^)cRFW^eGsl zYM<*pujQVf^q#Rl{>#Pnr~a>%2v{PZRbd$O70e+CAo^f};0Pyzlq@(#S~Thd-Fg6M zuv-Uk>DlB@n9Kg<`tsygDHu2MB}~0Dp$j33bJMO)s#awS)+@{{l<4ZwsUk?hbexJe$>BM->1C2&UaoEcf`tF6uHjcD~)@SN^SftzTW%V;O z0E+Kw&S19v=^6k=)|tTkNNh)t&JnNmt)6yQnuz)`I=oNmdSO;SvZ0TJp%Y})`8<0d z@K*7k+bsJlQ|Q(8QX+{eB|0f^Tt$$iNe?xqtkp<2*5NG$bNNGgb$8-BCT50Z!YGd- zB>&R?baM7<Nc^?OF7s=49SddDBi7tz5Z_4YECfLFJ6^gNEqkZ2R9Xfe|Nt{$7Zjsf1o+pH6hL z^SBMYQJu*gS`mUJ3WPa+`;AEIXlab^Sw*;S}Wi>pJ3zQSjcW^ zPF1urwea4zDG7FunYg-&~C7BC7T<`yYjN;d!mmVm?0zH1r7gQSXhduxtnEoEKMy(6x( zu$S&VN=&Lc22T{mGzs~t&cRmGoc_LYv21hHzWwZ7b)Rd?46ve9A?tMEdi%6FB%0bZ zE&JBHe_S{ALZki*iU4h(ve`YT?45=4yhd^^>J!VVJ@rS7SC5eQBAq|6jQuV%R_2tx zs$~N0mXb8g;W%6AofRylfr@I2shh5VDd@%!K_C<3F7Wxwhvn&yb#5*NHLqDil)eSSx%Q+;oBBNW4_(+&u9 zL@m*)7?7AVkkM?589Z9XVof_}SzcqlY`xknbIbAGn7I2bO0T^jNPj}oskU?se>>Pv z(0uIcvqMX{&)A>rZ674h6|GJX7sFOQZtiSl5f;`o<3#ZN|9he)OJPJ?jh@<6$nXdy zh8C+!g?po2EA^gEAGhm5-C0xXD~;sUtBv*!2>%k9m6@_WOt;bEvkc#hq!L>$#m{8A z$@-T-Jy$DjJ-~g^){yIz0Rc-T2|jj%A~}%ZIFmD4!C!PkXkT#rYWFrg|9lMvvbb^S z_AQ~Dj%ch(0oJl$s7xS>bjTdaAO(!@(yL1dg%v>_!Qc-alBFqM+nEt*bsJweRH0qCxNm_vP&pU6^54z(ODh zRQ#61SUtwd_!VJ{_NOX2_8R@k0C3Q4@m=Zu&NO1?)eH>!H5$}?EM>La)jB#jJt}-@ z<;(sT!<>9t=O)A|&$ldAe`}O121CP#8{}K(|LN%JvvZs$mwhlpPsbw=Y)pVu{n`CS z$^}n`7Z>0?sV2X#9^u`h{?a+aN5lENT8RkZJ+4_@Iy*&lapEe9yA*0I(b1IE{x&IEL(A9?;7WAEZL_Hj6Nfp2+ZfBp#}C zQh;fq8q*&XMgSUyaknZ2(#G>o*LSd54Y9T?DabpO-b%TY|0B?9*~}xD@*${^)4-nuYn?oX)*LVRih&;Wp6<{AA~lY(^?O#)3ILdH zHWo66UR>*Z$FpaWE)*sH!I1FOMsL%YhJlW(i_cOy09`~SywfxKnC zQEf%g7FtaT%A;n-H=hCfbLHM2&r)+LpOZtn0YGpQsZO=m+5du>m&ixlryy5z68o5| zhv#d;*-!6eAb*O``NGVYOGykdqCBM`gC@PM5nlW!W4U zh~H!bI-gUZ6*^;U^H)VT-fFEi4LZvK`jk+B3;g#4fE?~PP~Y>$?Z|~Sl~>~BsP4k% zZ?=ouc_yJ@$`_$O0VjU1`)Y!1zVA?C|=%-2yJyM%>=(F@KLXQiST zWzw7Mv()z@#s-I_rvayK*YT4Qz5|ut(Qlvy(x)c{a?{F!{n1wh&Q5`%b{GOG?zQiv zUDwK7u$M*dX&O5nFG;;n+hS;0M(A2%S9o_xPWFIF#VDAQn^XY!P-z#C1WY=r%n=F# z;RWgm=?Y(Gybxo_q*Hmt4po{45kDPc5L_Edw8y-92^miv zKJf0bG02Uco)udT60}`gYq&;Y5j9+9Yi+&H*#woTsHiSt{!qHgl^=A6u5V%F z#*~K=6;maesJPpD7&Tk#8W@k3NT9~L_?wxY!qT*mFPJJ-q4Dke%zWmBCFoh`zOk~Q z98LFZANQS}x4s*k;ihcH))Fl#sA%hWimyD&k4FeB6F>*Lh3^&SsiawSW1|42kh!^w z4x_urjNnUnHxJ3ReIe|qt%QKo0WMSZ=*Cx|+C=ECLK|pIj;Af11oArj& zb$b{7B^We^Va2TXRwAxk^Dz2id-tg5FL#_dvgIlIz+*Oh$W)nLIsto0F7K{TVo{+G z$%Tqz5vjzGd@|{LG#gb4gxz+R>PY!Vn0}8-Zpk~Az?)#9XmEY2={>cJ3=BAaC%vAi zHewxs`Ic4nfP;`UL2N4A0{llzg|d!Q;XHkGp5?jV>%Q<8K@^<`-GV|)vQ{+a9@F)c8gphP>7ra_YgA?cZ_>;q!=Ie94OF@p)Zp?6 z5Y`edsJPM$uT&TiGbeUp<{(-nV{qftc3VZakpfYS9$F3nK0v|0q&fXvK%rvEv(8jL@G^ZDD; z{>8=MECIq^|LXIAJ!}`FufSqc`Ijya(Rm7=U=e?7MOp@2VDTTdi`Y2*qGXS>wW+1D zMBX80O+^r2qp9ny+g&~@h^U4T-j<%^$6&Iu7Rc%02uMJuS%z z34{2jeWJ~JuUe~{8GP%Ez-(@CABr1<2#uU}tLM^-cr_Rizm|+9aX*;*BSR+dbv3pOIag<(2^A`bJ<-X{c00a~rHoPI@r4ucb$lq4O{CCD@|3O6AnDYkk&E7Cxz?8IfJqVO z{_udpT;=8_ENZ&Y))cm;J2&<1k=HA6lw2c<_iUe#q6t(H5<*pcP{Z(#f z>S;IX;g;m&jU~Q3s%x3j^=d5;1j)7$XxQyq z@?Bb8YA_BKreGPo=uZdWEk~skFI@`~Hs~_o(9XAofvr4h zOE$HUYTuNytm%VYN)9te;0&myP3asFWHi^Tp}Jrb-kYXkvATy1B1j+VyvP&81{HUR 
zdvg9#YFs9ESs4;b_Icn`H;?%Q2lr$H_?KY)U#^zx6+C-qthndpOT##}R|z2FCK(3p zSj6qBGF_?X?q*TCBz$eb_eM}X7e?-H6UI{yl_l=TQvO>~iy-+izv^QltTB-8H zFDUZTghc6e_dCz?08o5j(MXPSn{PZpOUKExQW9wvi{etz(E&{y{`9jz)7`_drys{4 zL5|SM9m~2%7)zxi1&k)oZXMD!sk(_-hVu4wuJ0%D0-GMA^az*l0%@8uve@1uqVB98 z@VM)$p?oC%kqCY;eWY!QVImrmKx>s?r*vkdQL8&1m@OYRp8g7RKwb`m?&DhOri9(R zgmm0RMWYiM(c@KiyVVH9=_>g!`KSuO16kl|kraJrfg!q2TE=mww~u!yXknNFP z^0VhkWTOL;hwcP=Y2 z=HaL*zUay*9tB=}>$`T~;K~w9B0|hLS>WwUxSuYu51Cefuev?_te>t4gNTF#u|W-DXe$k2$SxzY{f6kIxAJ(TXoL|RkoIaDzA zUf_MPBQ(NISnY)hZJh`&k_HF@QV@QPO`#JH-y#Ga5n`>4Eynr;BHTj;i~0!x5yG{^ zg$5J^fo8RxlPO0QX4d~s^HsuI!u(*tB-(cK`0Rwu!ry+z1P2sW1Lycw8biTAbG2ce>_uMoEthv=vr#n;B`I?57FI7I z7<|N7GWbDqJqpEk`tAmazAL9xp>k(r+$256f#+4!=ifO^El2=6o!G6SP=5C|)d3}< z|M2bDhPba#(&Yj@BL{GzmWY)RjM_(m8g=jWkjq^ALt5`~o4LO%qvG)}bzBB-;s=6C z#NNw~w$ODxn+e|OrIn)`@+QC?c-jAbq)vqVZQy)Img-JN-W>-dT<4YH~GbB!e9C5_=57k zzVhucPtw+3L zsC6aq9HmGprs^whFmc9aCcZS`rcoA)Bd*}@ zVY3GyQ$++`-s_y@EEG!aEFP_cyWd@-q}a~fH;6{f{3z=h{n>P#Zs*A#t2ufGU|ig9 zvgMJ}I~mho&z@t%^I^L4MVPu%OA`UMHs@d>`cPo9E}h&e8kXDGd3GQ6K*f?;k)C@WSBtr=Clkl2@pE5AQH0tu6y~Pz z>WE^oqCtmY*wkIVoH$#JITz)@NFlaVzss1!|!7FxRfp0YwziUz|mUy@(Y%)TnR zK&w!d6^U(ND1dF|!S8(yjFzV=FNah>2wL|tq<-9q@mv5qNlWGShIyxz!3zyR#aKxV7SiYBAJjv|C*R|($uKZ+H<-?c z5Tv1sFOR%A>_VqA9x{(rt-GjVV&Lf^{Yb7r|hRRXqacIv{4k zssO`dP10Fwmg9RzwpGNFW`Ku3E+g1A4^XhqsCSKELIqpom?rpYwMk0p zH+k}ddqd2=;ODrPKhkCr#i=j0i>J7T41P<+=~nAXvY0|D-?-h*aIAWl#yk;gZSWLg zl|ZFIx!bO@SSC_McZ&fo;`-o<7I=U%AOMGD%JoHS7lp5nE3`Ri>1)~Ai~IlG07CRW zHew|W1xHBn@0hNa5;tt+XnW|+V2C@k!p}X@m%qJr&aRCYUE7=h>cCtMx-wE0{#62b z?Cpz4o4GIQnm9c`o3NxrEDjY}x=^V9xy|psiwn$Rx}Ec&qA~p7vPn@khi`)HWYeh*q0Si0 zeL%B%Sl{JJdYpyBX`Rnt9>N0@24$@T-mO9qXX3zY1gr{O;9SbNe5X0ASys~984<|g zUWAE`IU&-k`{GE|)w}h*_Vz)K8DIulk{*_!ncEZhO5{@;;+REKAGBpYUtKeO#uNA@Hk*hhy&@gbU0Onk8d5vF0R$SeP?F;$_E{^{XfSsxN8%U)Y&3M_tMkrY@i)` zFvu@9!E*Inu@#FVPsH@s8nyz5KClGlFLu0pM>=vjZUsV8Xnt`Z5qV}!77AKO@2<%Q z1utfe*1rWJYVXPd^eRq{6N%CAA~_on3Fqqc8#r&TlJStp!K4W|9gWYDi@_g*3mRfK zOXeLWsE%?zpfEtuAX+WGlZ}v{kaQO`&B^u5q7YcDDB|&Txz4{oLc7>5Q6w7*&#TFW zv(+}Y^K@H@(CI-3l)+)iwhQxpU$}#5YKBqm)_Ul*=GP0!PPm`#hILjeUmf1ty>Gh$2teSJ7GWc6!4rc7QtE_IfmB0K z^1I#W5DIh(O4pk&-&hWstL_0pv-d8{iBZUfF*gHS7v7lP)?#Xo;a+q!*Mrm0 zKvU9W-sUzzJAZk0b;odmVyHDtbPI&tR)+{zQpaKx~Q zbrNwxp*YLzeq%(Mv&F>-k+^3iJrq-#Xul8gD)RZjMDIb`MyfX43vD|3ERWHd8*T_e z`~t|lfaX#&S)K_bdgA7!WcXn-eoiNTX)0QQ90fi(7p!0wYa16dnqCX(?7oL_k2S$6 zI88cVtqnk=w8R=FGS7tIsu~Sc@WBwa5cfdZPMI)fCG|=QsRXA^!!Q{lfS~gB9CNyw z{kQJPhf5_M3trX=BpE_w0h3Z&QWbZSZ93cy#-XX$i`I*jdfFx6Y~n(OEDR{?M^+B% zGZ>f?(-9jMM0v#QUNR2kx$m)jw4ay$EFNWQYOa0YX~+YWajY=umJ4_n zBBYX$KGgkf&CiOFK>4 z9#G2dGL%@MZ_&-}MtnnU<=D-YD<7EVV|C%eiZfhoWupi4V;Xo-B2!7wn5hy9lR?z5 zl8h4L3xs^tymQ;&hnV~Gz4eD0>J6NhYb-PLC3jmcPcHM9^sn?et;pZp3VqUS^rdE_ zBkyQo)t#ayWGSB0{iwX3Vrz_A3z7;$HdZO9rDHfx@3ZAB;Cks|gSpM4WveX)G)VCU zm_A91kRbR)rO3_GufCNC%W3_SQb%v>8BS=hfnpV!MPrKMcH5$`hzQ>p87G}JnHs-8 zq8tj0BF+`wM1v^LZ1od)ZMLedoyHRZtFW!BE4SV|WRrkDzudgj3L6>54!%Tb*JEnpD3aD-mI-2Hhe8;Eoh#DE=<57s zPg={{aWeTdg|CG(OwhQ|0V9Ea zX%NRhUX9CRLV_^E{@81A@_;EmlLcqCntWw3goPE|glw12;Xx|memk!yC94v(mo=Lr zz!lmfU}_~t{sToS*-R(^iJiNdYw-h@WdsIG?Z5~Io%k#hNCjV(zF$Pd>hWmFGEX=YtvkW&Vxj`i zG9z-}yqmks;IdGy#cI?a?BH_h%y3ZdHJ(N-Nv&)8L|&=97oHy}wRT8Yt{1ieGIB4H zb}>K`Ir)6qld1xf*I@j4!5agUxt?Xix42}P*1>P0G+UYwy49-x(mA68oWTD&60Qn+*q!{n0i~@A0 z1spp`gxK&AmNe3IM9fi8{?Pv3K;@!qALe%L#23uvq2)l{+2>88bb|73BJv@e?gYlU z_52nRuZ{p_BKUd1;zkLt zaSQYlZMF+VEWxd)E5Qb}Q;8xC?~&c?kDuKvn7)kF*hCqUkYaV)37Ruh@a#wtCLy2!zipN$T^uP)#ukwWa)csxhSfybQZr z$R4vkq@uAr-%CUjANL6uCHeXg>5@cg zZxvculbynj{^6AGux+S7^S|bX% zHV_jlJY;0AD2EN`pBHgPCz>f(GJd-nln5)EXVF{*#e+7=(0?VTU=f!m5fkJ`+#6AX 
zoFjl|@>i$@q3^j9UU~&wH8!;89F(N$wqo(5f_FrgW@VM!c_M-381458?tTs)HeC?b zN<@3dJN9ZC#*WAm)|AHOd{PFDQG#3OK>&m^!OQdgzH=DzHc&|zyb70hhs{iklrVAe z_ToZ32V>9S_2}ne>QhGZC+EXY=%^W=4aOHg%Xld_*(r* z{@u7*t$?JE>LA(ozRpALPNg!B4F&RBu1U87Z7B7gc^p zgGV=iS3q;U_=4K#A=o-Yz(hCVyR72_8(LDeQOJgX~Lsjl)sSH5s&V%Y*5@nK6lle=)1r#)F!*0gWvQD*;*iN&2ZU83|vUtv{R_7&Crl?TN=#(hz#fnk< z%~pcyB=ov>Btl5Cr8Fknn-h*Y7gsn8K94zi-{5Y1n^bs*4N{y0&`?Hfg{v7!1$~wy zVz%^P(kS#&qaIO+^iB#=W7cEgf%Q=lI5!gN(Lyvkb)V9gh$J<_E!0O7g;k1r4RXO7 z8Nx4;Wo*p^@Xnp8Rhnv+RBQJQi1k(4>q=m3pTZqhsN4 zJc2e2uuo|;M$Nw>Kw<@ z+jT3{Y1^fTW-S_CAU-F2SYccgQY^_!WRB2dXsi=`}gD_+#@>D5j7nw!dY^C+}jyA88-!)D$Nm-V`JxxRa> zA8PGh>C#gTi6-vdI!#_wub{u1AKIPf?!6faz{pz-4xv$L%GHfZz1Q5GlHuw(mDg+6 zWB_gX(5Uq4Y!7!ns`H$#s%O#k44~82A+>tFa980vqQL5=#~rzCaWVI# zn?{f_Xk&n4$2fZfE{vpybb2YGoME@XUJi{v=-bgyB(A`^Rg=U2JE#gLXMN)253XSF zUL}KnU6007MHUzj_z#N{u9pF*Myje2R0f<|1ljtU0&B>w6f{#-ecY~8_+#|ITe8SV zi6E1vWoS69il_t=;1v|wNQIHnP6~@)J#Tz%!9WfZK}!)mU(wp;(tVQQI{i=05&0U9 zo5^IXg(y7v1a@6eo7F}3$WF|mIvY>j6l`;;x|SjeVn!6sj(*yRngCRx0von#%?g-j zO=*#YWG&@}Qu~a-31Lo}KU=E!>tmK~!fMr>05-#2Y)ob#3O1P3fM=zWa(vfq%>A!I zf_`)2u3zPtxJ>uXJbo;q#dDK?%LtmBArY~KF1nx61IwJpD>+%`Xs7s|`WhIP4k|Lu zGhZ29q85Onc6{1`rmo1ezN*rGeHvsdw44~~g zEq32UwYp!TMynQ&O<+ftnapQ2ojdckG*0g9$k}b{*;~6to~)@(8Dd2h<`-uRrg4IE zSga9?^8zkOkMQybH;SHYSxNG`vG@d2$Jr3yGbQDOSfDP_sdjAqP;Rwn$~1d_ry2m_ zDkKn6U@6N~PGVlj=Oz{ zcZuMx3PC|6fNqkgU!$T-k8!v3^MFFfIC(qEY~YZJO3P249``b}Bv;yhqVF$nM6+)F z>;9XXy$&^~jp$moupe62Q`^XXal3VDmGxE-Z*J}P3XOhrlsK~!_r5Ku!UJljUa!CTqrh4CSAc^XKV^2<2>xakfBHEX&W_%*> z8G7!rX!^jcs{Dng`fcP3;G^^JRcm7~nGP@eQ)gaGMt=+k)8y;K$Rw%1LmCb{()>H>%CpLaD$P*!^XR{Ama+zp4amxrj6^Xj-|uluQvZqz(9;N}|xmPytl!^V4XoaSPNmsu}E zOS8h!Zc0njJMdbaH&){`y5;VY?8qsoqxL6Ef4Rr2MmtbnZuhN%uz!?DbeX|k5Xz3~8(Qeo#b`WqKSyKm7MQ1VI=Y$+&j#bcS#si>;^E}{?-Ws} zRE}s({ab)26J&tNO6nD(&CZ0V{PsV#s2-O}dI8HL=1Saea{F0wy z%z}6I4Oq|231$J9*zoHr$*0rN?;ft7b=5J#ZIt-L^hMev^d`u&RtlB>fVymPA$V&L z+}y7I1iVJEjU_Fjgq1J1y7uMp)TnDD$lmL-`wN|DW}c@R>8^j3t=*`K-g6^CR7l2B zZ?yI#Rlg=DW8gS&H)&Q|;TAh&FT2ww0nVk(GN2X7k-M`j?r{AAfe9{qSlK1v*F9|KNB%S*L&HbGJk$(IFfq zCt|&ONL%ukVWEEVbalG%_zi`*PtaJhcKn$8tWI^d+Z)5~9#GBQLk}8mB4Iyh5*#?A z{qJ^%{OWXC7*nm>ptT#-_TKGxe)q~m?`%{Xw>=5-U2W`7xBad(^jGz$>d)VAhNlDR zi1G&B-wN43QzCz>1x>;H(sBp56AV^T+^+LKlhCj9t2mdcIHGmHYDP>C8Vy#U;8-R3 z$Cz`f4s*TgeAC@)yeAtAO_W532(5jBLGnCaZG;0QatuUh)N#UDLH~{AIBt3k1N}uB zMsY4l>BGHY)xWitatKb!F~I)mbod82G^aliakFYHrTd|z1+3H9b#z{#P-plPTa3f0@Drok36BLlHk%=(YQf@Zyp)Aq^@=>Y6g(wdX*`dkyUmJ! 
[... GIT binary patch data (base85-encoded literal for a gzipped benchmark file) omitted: unreadable binary content ...]
literal 0
HcmV?d00001

diff --git a/debian/NEWS b/debian/NEWS
new file mode 100644 index 0000000..cb020ae --- /dev/null +++ b/debian/NEWS @@ -0,0 +1,8 @@ +python-whoosh (0.1.22-1) unstable; urgency=low + + When upgrading from earlier python-whoosh versions, you will have to + reindex all data, as the index format has changed. How this has to + happen is application-specific. + + -- Daniel Watkins Sat, 06 Jun 2009 13:35:15 +0100 + diff --git a/debian/README.source b/debian/README.source new file mode 100644 index 0000000..5dde0bf --- /dev/null +++ b/debian/README.source @@ -0,0 +1,58 @@ +This package uses quilt to manage all modifications to the upstream +source. Changes are stored in the source package as diffs in +debian/patches and applied during the build. + +To configure quilt to use debian/patches instead of patches, you want +either to export QUILT_PATCHES=debian/patches in your environment +or use this snippet in your ~/.quiltrc: + + for where in ./ ../ ../../ ../../../ ../../../../ ../../../../../; do + if [ -e ${where}debian/rules -a -d ${where}debian/patches ]; then + export QUILT_PATCHES=debian/patches + break + fi + done + +To get the fully patched source after unpacking the source package, cd to +the root level of the source package and run: + + quilt push -a + +The last patch listed in debian/patches/series will become the current +patch. + +To add a new set of changes, first run quilt push -a, and then run: + + quilt new <patch> + +where <patch> is a descriptive name for the patch, used as the filename in +debian/patches. Then, for every file that will be modified by this patch, +run: + + quilt add <file> + +before editing those files. You must tell quilt with quilt add what files +will be part of the patch before making changes or quilt will not work +properly. After editing the files, run: + + quilt refresh + +to save the results as a patch. + +Alternately, if you already have an external patch and you just want to +add it to the build system, run quilt push -a and then: + + quilt import -P <patch> /path/to/patch + quilt push -a + +(add -p 0 to quilt import if needed). <patch> as above is the filename to +use in debian/patches. The last quilt push -a will apply the patch to +make sure it works properly. + +To remove an existing patch from the list of patches that will be applied, +run: + + quilt delete <patch> + +You may need to run quilt pop -a to unapply patches first before running +this command. diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..9b44d70 --- /dev/null +++ b/debian/changelog @@ -0,0 +1,391 @@ +python-whoosh (2.7.0-1) unstable; urgency=medium + + * New upstream release. + * Update watch file. + Thanks to Piotr Ożarowski + * debian/copyright: Update copyright years. + * debian/upstream/metadata: Added upstream metadata. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Thu, 07 May 2015 13:22:20 +0200 + +python-whoosh (2.5.7-3) unstable; urgency=medium + + [ أحمد المحمودي (Ahmed El-Mahmoudy) ] + * Update my email address. + * debian/control: Bumped Standards-Version to 3.9.6 + + [ Jean-Michel Nirgal Vourgère ] + * Change python-whoosh.maintscript into python-whoosh-doc.maintscript + /usr/share/doc/python-whoosh-doc was a link to python-whoosh. Fixed + 'prior-version' as the current one, see dpkg-maintscript-helper(1). Drop + optional 'package' since we are not using maintscript but + python-whoosh-doc.maintscript.
Drop unused Pre-Depends on dpkg with support + of symlink_to_dir, added missing Pre-Depends on misc:Pre-Depends in + python-whoosh-doc (Closes: #768275) + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sun, 16 Nov 2014 12:16:23 +0200 + +python-whoosh (2.5.7-2) unstable; urgency=medium + + [ Zygmunt Krynicki ] + * debian/rules: convert to pybuild, simplify all rules + * debian/control: add support for python3, depend on dh-python + (Closes: #647439) + * debian/python-whoosh.install: remove (not needed anymore) + * debian/control: build-depend on python3-sphinx for documentation + + [ أحمد المحمودي (Ahmed El-Mahmoudy) ] + * Moved packaging to collab-maint. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Tue, 22 Jul 2014 11:53:36 +0200 + +python-whoosh (2.5.7-1) unstable; urgency=low + + * New upstream release. + * debian/copyright: Update copyright years. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 22 Feb 2014 10:06:33 +0200 + +python-whoosh (2.5.6-3) unstable; urgency=low + + * Added debian/python-whoosh.maintscript to switch + /usr/share/doc/python-whoosh symlink to a real directory + (Closes: #736299) + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 25 Jan 2014 11:53:46 +0200 + +python-whoosh (2.5.6-2) unstable; urgency=low + + * Remove override for dh_fixperms, seems to be no more needed. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Mon, 23 Dec 2013 22:51:23 +0200 + +python-whoosh (2.5.6-1) unstable; urgency=low + + * New upstream release. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 21 Dec 2013 16:05:11 +0200 + +python-whoosh (2.5.5-1) unstable; urgency=low + + * New upstream release. + * debian/control: Bumped Standards-Version to 3.9.5 + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Thu, 14 Nov 2013 13:36:18 +0200 + +python-whoosh (2.5.4-1) unstable; urgency=low + + * New upstream release. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Mon, 23 Sep 2013 21:44:53 +0200 + +python-whoosh (2.5.3-1) unstable; urgency=low + + * New upstream release. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 13 Sep 2013 04:39:56 +0200 + +python-whoosh (2.5.2-1) unstable; urgency=low + + * New upstream release. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 17 Aug 2013 18:20:03 +0200 + +python-whoosh (2.5.1-1) unstable; urgency=low + + * New upstream release. + * debian/control: + + Remove Daniel Watkins from Uploaders field, since he seems to be MIA + (Closes: #705280) + + Bumped Standards-Version to 3.9.4 + + Use canonical URIs in Vcs-* fields + + Remove obsolete DMUA fields + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 05 Jul 2013 00:19:56 +0200 + +python-whoosh (2.4.1-1) unstable; urgency=low + + * New upstream release. + * Removed test_final_ranges_thisyear.diff: fixed usptream. + * debian/control: Updated Standards-Version to 3.9.3 + * Bumped compat level to 9 + * debian/copyright: Updated copyright format & years. + * Un-link python-whoosh-doc documentation directory from python-whoosh + documentation directory: + + debian/rules: remove override for dh_installdocs + + Update python-whoosh-doc.doc-base + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Thu, 26 Jul 2012 13:45:44 +0200 + +python-whoosh (2.3.2-2) unstable; urgency=low + + * Added test_final_ranges_thisyear.diff to fix the "oct 2010 to feb" date + range test (Closes: #655641) + * debian/control: Updated upstream URL + * debian/copyright: + + Updated copyright years + + Updated upstream URL + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 13 Jan 2012 11:32:18 +0200 + +python-whoosh (2.3.2-1) unstable; urgency=low + + * New upstream release. 
+ * Build-Dep on python-sphinx (>= 1.0.7+dfsg) + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 16 Dec 2011 15:54:19 +0200 + +python-whoosh (2.3.0-1) unstable; urgency=low + + * New upstream release. + * Updated copyright format & info + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 08 Oct 2011 16:39:48 +0200 + +python-whoosh (2.2.2-1) unstable; urgency=low + + * New upstream release. (Closes: #638765) + * Moved packaging to Git. + + debian/control: Remove DPMT from package maintainership due to the move + to Git. Put myself as maintainer instead + * Split documentation into python-whoosh-doc package + + debian/control: + - Added python-whoosh-doc package + - Added Suggests: python-whoosh-doc for python-whoosh + + debian/rules: Symlink python-whoosh-doc documentation directory to + python-whoosh documentation directory. + + Renamed debian/python-whoosh.{docs,doc-base} to + debian/python-whoosh-doc.{docs,doc-base} + + Added debian/python-whoosh.install + * debian/control: XS-Python-Version -> X-Python-Version + * Use sphinxdoc debhelper instead of managing symlinks myself + + debian/control: Replace libjs-jquery with ${sphinxdoc:Depends} + + debian/rules: Add sphinxdoc debhelper sequence to dh call + + Removed debian/python-whoosh.links + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 26 Aug 2011 10:26:44 +0200 + +python-whoosh (1.8.4-1) unstable; urgency=low + + * New upstream release. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 04 Jun 2011 19:44:07 +0200 + +python-whoosh (1.8.2-1) unstable; urgency=low + + * New upstream release. + * debian/control: Bumped Standards-Version to 3.9.2 + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Tue, 19 Apr 2011 08:59:07 +0200 + +python-whoosh (1.8.1-1) unstable; urgency=low + + * New upstream release + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Mon, 04 Apr 2011 20:21:30 +0200 + +python-whoosh (1.8.0-1) unstable; urgency=low + + * New upstream release + * Remove fix_test_combine.diff patch, as it is applied upstream. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Thu, 24 Mar 2011 10:50:58 +0200 + +python-whoosh (1.7.8-1) unstable; urgency=low + + * New upstream release. + * Removed use_nose.diff & shm_check.diff as they are applied upstream. + * Added fix_test_combine.diff patch from upstream to fix test_combine test. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 19 Mar 2011 10:15:14 +0200 + +python-whoosh (1.7.6-1) unstable; urgency=low + + * New upstream release. + * debian/control: added python-nose to B-D-I + * Added use_nose.diff patch from upstream to switch setuptools "test_suite" + key to use Nose integration. + * Refreshed shm_check.diff patch. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 26 Feb 2011 23:57:15 +0200 + +python-whoosh (1.7.4-1) unstable; urgency=low + + * New upstream release. + * Removed fix_methodcaller_import.diff and fix_test_colonspace.diff patches, + as they are applied upstream. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Tue, 22 Feb 2011 21:30:12 +0200 + +python-whoosh (1.7.2-1) unstable; urgency=low + + * New upstream release. + * Added fix_methodcaller_import.diff patch which fixes import of + methodcaller, which was only added in Python 2.6. + * Added fix_test_colonspace.diff patch which fixes test_colonspace failure. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 18 Feb 2011 10:02:16 +0200 + +python-whoosh (1.4.1-1) unstable; urgency=low + + * New upstream release. 
+ + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 10 Dec 2010 10:54:37 +0200 + +python-whoosh (1.2.6-2) unstable; urgency=low + + * debian/patches/shm_check.diff: try importing multiprocessing.synchronize + to check for ImportError, this is to avoid FTBFS against python 2.7 + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Thu, 11 Nov 2010 12:46:49 +0200 + +python-whoosh (1.2.6-1) unstable; urgency=low + + * New upstream release + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 05 Nov 2010 09:53:55 +0200 + +python-whoosh (1.2.5-1) experimental; urgency=low + + * New upstream release. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sat, 30 Oct 2010 05:54:31 +0200 + +python-whoosh (1.2.3-1) experimental; urgency=low + + * New upstream release. + * Refreshed shm_check.diff patch. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 29 Oct 2010 08:07:24 +0200 + +python-whoosh (1.1.0-1) experimental; urgency=low + + * New upstream release + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Fri, 22 Oct 2010 14:30:44 +0200 + +python-whoosh (1.0.0-1) experimental; urgency=low + + [ Jakub Wilk ] + * Use ‘chmod a-x’ rather than ‘chmod -x’ in debian/rules. + * Don't ignore errors while running tests. + * Respect the ‘nocheck’ build option. + * Remove embedded copies of pyparsing for all Python versions. + + [ أحمد المحمودي (Ahmed El-Mahmoudy) ] + * New upstream release + * Bumped compat level to 8. + * debian/control: + + Updated my email address. + + Bumped Standards-Version to 3.9.1 (no changes needed) + + Dropped python-pyparsing from Depends & Build-Deps, since it is not used + anymore. + + Add XS-Python-Version field. + + Drop python-support from Build-Deps + + Bumped python-all Build-Dep to (>= 2.6.6-2) + + Added Breaks: ${python:Breaks}, to avoid getting + python (<= ) in Depends. + * debian/rules: + + added --with python2 to dh call. + + Removed override for dh_pysupport, not needed anymore. + + Override dh_auto_clean to remove docs/build + * debian/copyright: updated copyrights. + * Dropped 01-remove-pyparsing.diff patch, as it is no more needed. + * Added shm_check.diff patch to check if semaphore locking works, since + /dev/shm is not mounted as tmpfs in build chroots. + * Removed debian/pyversions + + [ Bernd Zeimetz ] + * Adding DM-Upload-Allowed: yes for أحمد المحمودي. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Thu, 07 Oct 2010 20:26:14 +0200 + +python-whoosh (0.3.18-1) unstable; urgency=low + + * New upstream release. + * debian/control: + + Added python-sphinx to Build-Depends-Indep to build documentation. + + Added libjs-jquery to Depends. + + Added python-all to Build-Depends-Indep. + * debian/rules: + + Override dh_auto_test to run test suite. + + Override dh_auto_build to also build documentation. + + Override dh_compress to avoid compressing Whoosh documentation files. + + Override dh_installdocs to avoid installing convenience copy of + jquery.js. Instead, it is symlinked from the libjs-jquery package. + + Logic added for fixing permission of PKG-INFO file such that it would + work for both Debian & Ubuntu. + * Added debian/python-whoosh.docs, debian/python-whoosh.links, + debian/python-whoosh.doc-base + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Thu, 25 Feb 2010 08:49:55 +0200 + +python-whoosh (0.3.16-1) unstable; urgency=low + + * New upstream release. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Mon, 15 Feb 2010 11:51:28 +0200 + +python-whoosh (0.3.15-1) unstable; urgency=low + + [ أحمد المحمودي (Ahmed El-Mahmoudy) ] + * New upstream release. 
+ * debian/rules: Override dh_fixperms to remove executable bit from files in + /usr/share/pyshared/*.egg-info/ + * Refresh 01-remove-pyparsing.diff patch. + * Switched to 3.0 (quilt) source format. + * debian/control: + + Bumped Standards-Version to 3.8.4 + + Added myself to uploaders + + [ Bernd Zeimetz ] + * Switch Uploaders and Maintainers in debian/control. + + -- أحمد المحمودي (Ahmed El-Mahmoudy) Sun, 31 Jan 2010 13:51:02 +0200 + +python-whoosh (0.3.2-1) unstable; urgency=low + + [ Bernd Zeimetz ] + * New upstream release. + * Fix watch file to avoid cluttered versions. + * Bump versions in build-deps to ensure that dh overrides and + --with-quilt works + * Drop tests part from debian/rules completely, there is no test + in the source anymore. + * Add debian/README.source. + * Bump Standards-Version to 3.8.3, no changes needed. + + -- Debian Python Modules Team Thu, 05 Nov 2009 11:09:36 +0100 + +python-whoosh (0.3.~0b24-1) experimental; urgency=low + + [ Bernd Zeimetz ] + * New upstream release. + * Whoosh is not compatible with 2.4 thanks to pickling problems, + drop compat patch and limit versions in debian/pyversions. + * Update watch file. + * Disable test for now as they're broken/not existant. + + -- Debian Python Modules Team Fri, 02 Oct 2009 16:51:08 +0200 + +python-whoosh (0.1.22-1) unstable; urgency=low + + * New upstream release. + * Changed Maintainer to my credativ email address. + * Refreshed debian/patches/02-python2.4-fixes.diff. + + -- Daniel Watkins Sat, 06 Jun 2009 13:38:47 +0100 + +python-whoosh (0.1.19-2) unstable; urgency=low + + * Add debian/patches/01-remove-pyparsing.diff to remove the upstream copy of + the python-pyparsing library. + * Add python-pyparsing to Build-Depends-Indep + + -- Daniel Watkins Thu, 07 May 2009 16:44:18 +0100 + +python-whoosh (0.1.19-1) unstable; urgency=low + + * Initial release. (Closes: #522934) + + -- Daniel Watkins Fri, 01 May 2009 12:27:22 +0100 diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..ec63514 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +9 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..e7b736a --- /dev/null +++ b/debian/control @@ -0,0 +1,60 @@ +Source: python-whoosh +Section: python +Priority: optional +Maintainer: أحمد المحمودي (Ahmed El-Mahmoudy) +Build-Depends: debhelper (>= 9), dh-python, python-setuptools, python3-setuptools +Build-Depends-Indep: python3-sphinx (>= 1.0.7+dfsg), python-all (>= 2.6.6-2), python3-all, python-pytest, python3-pytest +Standards-Version: 3.9.6 +Homepage: http://bitbucket.org/mchaput/whoosh/ +X-Python-Version: >= 2.5 +X-Python3-Version: >= 3.2 +Vcs-Git: git://anonscm.debian.org/collab-maint/python-whoosh.git +Vcs-Browser: http://anonscm.debian.org/gitweb/?p=collab-maint/python-whoosh.git + +Package: python-whoosh +Architecture: all +Depends: ${python:Depends}, ${misc:Depends} +Suggests: python-whoosh-doc +Description: pure-Python full-text indexing, search, and spell checking library (Python 2) + Whoosh is a fast, pure-Python indexing and search library. Programmers + can use it to easily add search functionality to their applications and + websites. As Whoosh is pure Python, you don't have to compile or + install a binary support library and/or make Python work with a JVM, yet + indexing and searching is still very fast. Whoosh is designed to be + modular, so every part can be extended or replaced to meet your needs + exactly. + . 
+ This package contains the python2 library + +Package: python3-whoosh +Architecture: all +Depends: ${python3:Depends}, ${misc:Depends} +Suggests: python-whoosh-doc +Description: pure-Python full-text indexing, search, and spell checking library (Python 3) + Whoosh is a fast, pure-Python indexing and search library. Programmers + can use it to easily add search functionality to their applications and + websites. As Whoosh is pure Python, you don't have to compile or + install a binary support library and/or make Python work with a JVM, yet + indexing and searching is still very fast. Whoosh is designed to be + modular, so every part can be extended or replaced to meet your needs + exactly. + . + This package contains the python3 library + +Package: python-whoosh-doc +Architecture: all +Section: doc +Priority: extra +Pre-Depends: ${misc:Pre-Depends} +Depends: ${misc:Depends}, ${sphinxdoc:Depends} +Replaces: python-whoosh (<< 2.1.0) +Description: full-text indexing, search, and spell checking library (doc) + Whoosh is a fast, pure-Python indexing and search library. Programmers + can use it to easily add search functionality to their applications and + websites. As Whoosh is pure Python, you don't have to compile or + install a binary support library and/or make Python work with a JVM, yet + indexing and searching is still very fast. Whoosh is designed to be + modular, so every part can be extended or replaced to meet your needs + exactly. + . + This package contains the library documentation for python-whoosh. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..09c67de --- /dev/null +++ b/debian/copyright @@ -0,0 +1,144 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0 +Upstream-Name: Whoosh +Upstream-Contact: Matt Chaput +Source: http://bitbucket.org/mchaput/whoosh/ + +Files: * +Copyright: 2007-2012 Matt Chaput +License: BSD-2-clause + +Files: debian/* +Copyright: 2009 Daniel Watkins + 2010-2015 أحمد المحمودي (Ahmed El-Mahmoudy) +License: BSD-2-clause + +License: BSD-2-clause + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + . + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + . + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + . + THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + . + The views and conclusions contained in the software and documentation are + those of the authors and should not be interpreted as representing official + policies, either expressed or implied, of Matt Chaput. 
+ +Files: src/whoosh/lang/porter2.py +Copyright: 2008 Michael Dirolf +License: Expat + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + . + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + +Files: src/whoosh/support/relativedelta.py +Copyright: 2003-2010 Gustavo Niemeyer +License: PSF + 1. This LICENSE AGREEMENT is between the Python Software Foundation + ("PSF"), and the Individual or Organization ("Licensee") accessing and + otherwise using this software ("Python") in source or binary form and + its associated documentation. + . + 2. Subject to the terms and conditions of this License Agreement, PSF + hereby grants Licensee a nonexclusive, royalty-free, world-wide + license to reproduce, analyze, test, perform and/or display publicly, + prepare derivative works, distribute, and otherwise use Python + alone or in any derivative version, provided, however, that PSF's + License Agreement and PSF's notice of copyright, i.e., "Copyright (c) + 2001, 2002, 2003, 2004, 2005, 2006, 2007 Python Software Foundation; + All Rights Reserved" are retained in Python alone or in any derivative + version prepared by Licensee. + . + 3. In the event Licensee prepares a derivative work that is based on + or incorporates Python or any part thereof, and wants to make + the derivative work available to others as provided herein, then + Licensee hereby agrees to include in any such work a brief summary of + the changes made to Python. + . + 4. PSF is making Python available to Licensee on an "AS IS" + basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR + IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND + DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS + FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT + INFRINGE ANY THIRD PARTY RIGHTS. + . + 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON + FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS + A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, + OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + . + 6. This License Agreement will automatically terminate upon a material + breach of its terms and conditions. + . + 7. Nothing in this License Agreement shall be deemed to create any + relationship of agency, partnership, or joint venture between PSF and + Licensee. This License Agreement does not grant permission to use PSF + trademarks or trade name in a trademark sense to endorse or promote + products or services of Licensee, or any third party. + . + 8. 
By copying, installing or otherwise using Python, Licensee + agrees to be bound by the terms and conditions of this License + Agreement. + +Files: src/whoosh/support/unicode.py +Copyright: 1991-2008 Unicode, Inc +License: Other + Permission is hereby granted, free of charge, to any person obtaining a copy + of the Unicode data files and any associated documentation (the "Data Files") + or Unicode software and any associated documentation (the "Software") to deal + in the Data Files or Software without restriction, including without + limitation the rights to use, copy, modify, merge, publish, distribute, and/or + sell copies of the Data Files or Software, and to permit persons to whom the + Data Files or Software are furnished to do so, provided that (a) the above + copyright notice(s) and this permission notice appear with all copies of the + Data Files or Software, (b) both the above copyright notice(s) and this + permission notice appear in associated documentation, and (c) there is clear + notice in each modified Data File or in the Software as well as in the + documentation associated with the Data File(s) or Software that the data or + software has been modified. + . + THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD + PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN + THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL + DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR + PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE + DATA FILES OR SOFTWARE. + . + Except as contained in this notice, the name of a copyright holder shall not + be used in advertising or otherwise to promote the sale, use or other dealings + in these Data Files or Software without prior written authorization of the + copyright holder. diff --git a/debian/python-whoosh-doc.doc-base b/debian/python-whoosh-doc.doc-base new file mode 100644 index 0000000..161408a --- /dev/null +++ b/debian/python-whoosh-doc.doc-base @@ -0,0 +1,10 @@ +Document: python-whoosh +Title: Whoosh documentation +Author: Matt Chaput +Abstract: This documentation describes what Whoosh is and how it can be used to + develop custom search engines for your content. 
+Section: Programming/Python + +Format: HTML +Index: /usr/share/doc/python-whoosh-doc/html/index.html +Files: /usr/share/doc/python-whoosh-doc/html/*.html diff --git a/debian/python-whoosh-doc.docs b/debian/python-whoosh-doc.docs new file mode 100644 index 0000000..ef1c0c7 --- /dev/null +++ b/debian/python-whoosh-doc.docs @@ -0,0 +1 @@ +docs/build/html/ diff --git a/debian/python-whoosh-doc.maintscript b/debian/python-whoosh-doc.maintscript new file mode 100644 index 0000000..3e276e1 --- /dev/null +++ b/debian/python-whoosh-doc.maintscript @@ -0,0 +1 @@ +symlink_to_dir /usr/share/doc/python-whoosh-doc python-whoosh 2.5.7-2.1~ diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..afabdd0 --- /dev/null +++ b/debian/rules @@ -0,0 +1,16 @@ +#!/usr/bin/make -f +export PYBUILD_NAME=whoosh + +%: + dh $@ --with=python2,python3,sphinxdoc --buildsystem=pybuild + +override_dh_auto_build: + dh_auto_build --buildsystem=pybuild + python3 setup.py build_sphinx + +override_dh_auto_clean: + dh_auto_clean + rm -rf docs/build + +override_dh_compress: + dh_compress -Xdoc/python-whoosh/html diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/debian/upstream/metadata b/debian/upstream/metadata new file mode 100644 index 0000000..6d41d69 --- /dev/null +++ b/debian/upstream/metadata @@ -0,0 +1,5 @@ +Bug-Database: https://bitbucket.org/mchaput/whoosh/issues +Contact: matt@whoosh.ca +Homepage: http://bitbucket.org/mchaput/whoosh +Repository: https://bitbucket.org/mchaput/whoosh +Repository-Browse: https://bitbucket.org/mchaput/whoosh/src diff --git a/debian/watch b/debian/watch new file mode 100644 index 0000000..3a4da48 --- /dev/null +++ b/debian/watch @@ -0,0 +1,3 @@ +version=3 +opts=uversionmangle=s/(rc|a|b|c)/~$1/ \ +http://pypi.debian.net/Whoosh/Whoosh-(.+)\.(?:zip|tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))) \ No newline at end of file diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst new file mode 100644 index 0000000..27297f6 --- /dev/null +++ b/docs/source/analysis.rst @@ -0,0 +1,329 @@ +=============== +About analyzers +=============== + +Overview +======== + +An analyzer is a function or callable class (a class with a ``__call__`` method) +that takes a unicode string and returns a generator of tokens. Usually a "token" +is a word, for example the string "Mary had a little lamb" might yield the +tokens "Mary", "had", "a", "little", and "lamb". However, tokens do not +necessarily correspond to words. For example, you might tokenize Chinese text +into individual characters or bi-grams. Tokens are the units of indexing, that +is, they are what you are able to look up in the index. + +An analyzer is basically just a wrapper for a tokenizer and zero or more +filters. The analyzer's ``__call__`` method will pass its parameters to a +tokenizer, and the tokenizer will usually be wrapped in a few filters. + +A tokenizer is a callable that takes a unicode string and yields a series of +``analysis.Token`` objects. + +For example, the provided :class:`whoosh.analysis.RegexTokenizer` class +implements a customizable, regular-expression-based tokenizer that extracts +words and ignores whitespace and punctuation. + +:: + + >>> from whoosh.analysis import RegexTokenizer + >>> tokenizer = RegexTokenizer() + >>> for token in tokenizer(u"Hello there my friend!"): + ... 
print repr(token.text) + u'Hello' + u'there' + u'my' + u'friend' + +A filter is a callable that takes a generator of Tokens (either a tokenizer or +another filter) and in turn yields a series of Tokens. + +For example, the provided :meth:`whoosh.analysis.LowercaseFilter` filters tokens +by converting their text to lowercase. The implementation is very simple:: + + def LowercaseFilter(tokens): + """Uses lower() to lowercase token text. For example, tokens + "This","is","a","TEST" become "this","is","a","test". + """ + + for t in tokens: + t.text = t.text.lower() + yield t + +You can wrap the filter around a tokenizer to see it in operation:: + + >>> from whoosh.analysis import LowercaseFilter + >>> for token in LowercaseFilter(tokenizer(u"These ARE the things I want!")): + ... print repr(token.text) + u'these' + u'are' + u'the' + u'things' + u'i' + u'want' + +An analyzer is just a means of combining a tokenizer and some filters into a +single package. + +You can implement an analyzer as a custom class or function, or compose +tokenizers and filters together using the ``|`` character:: + + my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() + +The first item must be a tokenizer and the rest must be filters (you can't put a +filter first or a tokenizer after the first item). Note that this only works if at +least the tokenizer is a subclass of ``whoosh.analysis.Composable``, as all the +tokenizers and filters that ship with Whoosh are. + +See the :mod:`whoosh.analysis` module for information on the available analyzers, +tokenizers, and filters shipped with Whoosh. + + +Using analyzers +=============== + +When you create a field in a schema, you can specify your analyzer as a keyword +argument to the field object:: + + schema = Schema(content=TEXT(analyzer=StemmingAnalyzer())) + + +Advanced Analysis +================= + +Token objects +------------- + +The ``Token`` class has no methods. It is merely a place to record certain +attributes. A ``Token`` object actually has two kinds of attributes: *settings* +that record what kind of information the ``Token`` object does or should contain, +and *information* about the current token. + + +Token setting attributes +------------------------ + +A ``Token`` object should always have the following attributes. A tokenizer or +filter can check these attributes to see what kind of information is available +and/or what kind of information they should be setting on the ``Token`` object. + +These attributes are set by the tokenizer when it creates the Token(s), based on +the parameters passed to it from the Analyzer. + +Filters **should not** change the values of these attributes. + +====== ================ =================================================== ========= +Type Attribute name Description Default +====== ================ =================================================== ========= +str mode The mode in which the analyzer is being called, '' + e.g. 'index' during indexing or 'query' during + query parsing +bool positions Whether term positions are recorded in the token False +bool chars Whether term start and end character indices are False + recorded in the token +bool boosts Whether per-term boosts are recorded in the token False +bool removestops Whether stop-words should be removed from the True + token stream +====== ================ =================================================== ========= + + +Token information attributes +---------------------------- + +A ``Token`` object may have any of the following attributes. 
The ``text`` attribute +should always be present. The original attribute may be set by a tokenizer. All +other attributes should only be accessed or set based on the values of the +"settings" attributes above. + +======== ========== ================================================================= +Type Name Description +======== ========== ================================================================= +unicode text The text of the token (this should always be present) +unicode original The original (pre-filtered) text of the token. The tokenizer may + record this, and filters are expected not to modify it. +int pos The position of the token in the stream, starting at 0 + (only set if positions is True) +int startchar The character index of the start of the token in the original + string (only set if chars is True) +int endchar The character index of the end of the token in the original + string (only set if chars is True) +float boost The boost for this token (only set if boosts is True) +bool stopped Whether this token is a "stop" word + (only set if removestops is False) +======== ========== ================================================================= + +So why are most of the information attributes optional? Different field formats +require different levels of information about each token. For example, the +``Frequency`` format only needs the token text. The ``Positions`` format records term +positions, so it needs them on the ``Token``. The ``Characters`` format records term +positions and the start and end character indices of each term, so it needs them +on the token, and so on. + +The ``Format`` object that represents the format of each field calls the analyzer +for the field, and passes it parameters corresponding to the types of +information it needs, e.g.:: + + analyzer(unicode_string, positions=True) + +The analyzer can then pass that information to a tokenizer so the tokenizer +initializes the required attributes on the ``Token`` object(s) it produces. + + +Performing different analysis for indexing and query parsing +------------------------------------------------------------ + +Whoosh sets the ``mode`` setting attribute to indicate whether the analyzer is +being called by the indexer (``mode='index'``) or the query parser +(``mode='query'``). This is useful if there's a transformation that you only +want to apply at indexing or query parsing:: + + class MyFilter(Filter): + def __call__(self, tokens): + for t in tokens: + if t.mode == 'query': + ... + else: + ... + +The :class:`whoosh.analysis.MultiFilter` filter class lets you specify different +filters to use based on the mode setting:: + + intraword = MultiFilter(index=IntraWordFilter(mergewords=True, mergenums=True), + query=IntraWordFilter(mergewords=False, mergenums=False)) + + +Stop words +---------- + +"Stop" words are words that are so common it's often counter-productive to index +them, such as "and", "or", "if", etc. The provided ``analysis.StopFilter`` lets you +filter out stop words, and includes a default list of common stop words. + +:: + + >>> from whoosh.analysis import StopFilter + >>> stopper = StopFilter() + >>> for token in stopper(LowercaseFilter(tokenizer(u"These ARE the things I want!"))): + ... print repr(token.text) + u'these' + u'things' + u'want' + +However, this seemingly simple filter idea raises a couple of minor but slightly +thorny issues: renumbering term positions and keeping or removing stopped words. 
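To make the behaviour concrete before looking at each issue in turn, here is a small sketch that reuses the tokenizer and filters shown above and prints the ``pos`` attribute of the tokens that survive stop filtering, both with the default renumbering and with the ``renumber`` option (described below) turned off. The exact tokens that survive depend on the stop list in your version of Whoosh, so treat the output as illustrative::

    from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

    tokenizer = RegexTokenizer()

    def surviving_positions(stopper, text):
        # Ask the tokenizer to record positions, then collect (text, pos)
        # pairs for whatever tokens make it through the stop filter.
        stream = stopper(LowercaseFilter(tokenizer(text, positions=True)))
        return [(t.text, t.pos) for t in stream]

    # Default behaviour: positions are renumbered after stop words are removed.
    print surviving_positions(StopFilter(), u"These ARE the things I want!")

    # With renumber=False, surviving tokens keep their original positions.
    print surviving_positions(StopFilter(renumber=False), u"These ARE the things I want!")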
+ + +Renumbering term positions +-------------------------- + +Remember that analyzers are sometimes asked to record the position of each token +in the token stream: + +============= ========== ========== ========== ========== +Token.text u'Mary' u'had' u'a' u'lamb' +Token.pos 0 1 2 3 +============= ========== ========== ========== ========== + +So what happens to the ``pos`` attribute of the tokens if ``StopFilter`` removes +the words ``had`` and ``a`` from the stream? Should it renumber the positions to +pretend the "stopped" words never existed? I.e.: + +============= ========== ========== +Token.text u'Mary' u'lamb' +Token.pos 0 1 +============= ========== ========== + +or should it preserve the original positions of the words? I.e: + +============= ========== ========== +Token.text u'Mary' u'lamb' +Token.pos 0 3 +============= ========== ========== + +It turns out that different situations call for different solutions, so the +provided ``StopFilter`` class supports both of the above behaviors. Renumbering +is the default, since that is usually the most useful and is necessary to +support phrase searching. However, you can set a parameter in StopFilter's +constructor to tell it not to renumber positions:: + + stopper = StopFilter(renumber=False) + + +Removing or leaving stop words +------------------------------ + +The point of using ``StopFilter`` is to remove stop words, right? Well, there +are actually some situations where you might want to mark tokens as "stopped" +but not remove them from the token stream. + +For example, if you were writing your own query parser, you could run the user's +query through a field's analyzer to break it into tokens. In that case, you +might want to know which words were "stopped" so you can provide helpful +feedback to the end user (e.g. "The following words are too common to search +for:"). + +In other cases, you might want to leave stopped words in the stream for certain +filtering steps (for example, you might have a step that looks at previous +tokens, and want the stopped tokens to be part of the process), but then remove +them later. + +The ``analysis`` module provides a couple of tools for keeping and removing +stop-words in the stream. + +The ``removestops`` parameter passed to the analyzer's ``__call__`` method (and +copied to the ``Token`` object as an attribute) specifies whether stop words should +be removed from the stream or left in. + +:: + + >>> from whoosh.analysis import StandardAnalyzer + >>> analyzer = StandardAnalyzer() + >>> [(t.text, t.stopped) for t in analyzer(u"This is a test")] + [(u'test', False)] + >>> [(t.text, t.stopped) for t in analyzer(u"This is a test", removestops=False)] + [(u'this', True), (u'is', True), (u'a', True), (u'test', False)] + +The ``analysis.unstopped()`` filter function takes a token generator and yields +only the tokens whose ``stopped`` attribute is ``False``. + +.. note:: + Even if you leave stopped words in the stream in an analyzer you use for + indexing, the indexer will ignore any tokens where the ``stopped`` + attribute is ``True``. + + +Implementation notes +-------------------- + +Because object creation is slow in Python, the stock tokenizers do not create a +new ``analysis.Token`` object for each token. Instead, they create one ``Token`` object +and yield it over and over. This is a nice performance shortcut but can lead to +strange behavior if your code tries to remember tokens between loops of the +generator. 
+ +Because the analyzer only has one ``Token`` object, of which it keeps changing the +attributes, if you keep a copy of the Token you get from a loop of the +generator, it will be changed from under you. For example:: + + >>> list(tokenizer(u"Hello there my friend")) + [Token(u"friend"), Token(u"friend"), Token(u"friend"), Token(u"friend")] + +Instead, do this:: + + >>> [t.text for t in tokenizer(u"Hello there my friend")] + +That is, save the attributes, not the token object itself. + +If you implement your own tokenizer, filter, or analyzer as a class, you should +implement an ``__eq__`` method. This is important to allow comparison of ``Schema`` +objects. + +The mixing of persistent "setting" and transient "information" attributes on the +``Token`` object is not especially elegant. If I ever have a better idea I might +change it. ;) Nothing requires that an Analyzer be implemented by calling a +tokenizer and filters. Tokenizers and filters are simply a convenient way to +structure the code. You're free to write an analyzer any way you want, as long +as it implements ``__call__``. + + + diff --git a/docs/source/api/analysis.rst b/docs/source/api/analysis.rst new file mode 100644 index 0000000..d31e96a --- /dev/null +++ b/docs/source/api/analysis.rst @@ -0,0 +1,62 @@ +=================== +``analysis`` module +=================== + +.. automodule:: whoosh.analysis + +Analyzers +========= + +.. autoclass:: IDAnalyzer +.. autoclass:: KeywordAnalyzer +.. autoclass:: RegexAnalyzer +.. autoclass:: SimpleAnalyzer +.. autoclass:: StandardAnalyzer +.. autoclass:: StemmingAnalyzer +.. autoclass:: FancyAnalyzer +.. autoclass:: NgramAnalyzer +.. autoclass:: NgramWordAnalyzer +.. autoclass:: LanguageAnalyzer + + +Tokenizers +========== + +.. autoclass:: IDTokenizer +.. autoclass:: RegexTokenizer +.. autoclass:: CharsetTokenizer +.. autoclass:: SpaceSeparatedTokenizer +.. autoclass:: CommaSeparatedTokenizer +.. autoclass:: NgramTokenizer +.. autoclass:: PathTokenizer + + +Filters +======= + +.. autoclass:: PassFilter +.. autoclass:: LoggingFilter +.. autoclass:: MultiFilter +.. autoclass:: TeeFilter +.. autoclass:: ReverseTextFilter +.. autoclass:: LowercaseFilter +.. autoclass:: StripFilter +.. autoclass:: StopFilter +.. autoclass:: StemFilter +.. autoclass:: CharsetFilter +.. autoclass:: NgramFilter +.. autoclass:: IntraWordFilter +.. autoclass:: CompoundWordFilter +.. autoclass:: BiWordFilter +.. autoclass:: ShingleFilter +.. autoclass:: DelimitedAttributeFilter +.. autoclass:: DoubleMetaphoneFilter +.. autoclass:: SubstitutionFilter + + +Token classes and functions +=========================== + +.. autoclass:: Token +.. autofunction:: unstopped + diff --git a/docs/source/api/api.rst b/docs/source/api/api.rst new file mode 100644 index 0000000..f74a3c3 --- /dev/null +++ b/docs/source/api/api.rst @@ -0,0 +1,9 @@ +========== +Whoosh API +========== + +.. toctree:: + :glob: + :maxdepth: 1 + + ** diff --git a/docs/source/api/codec/base.rst b/docs/source/api/codec/base.rst new file mode 100644 index 0000000..28f707c --- /dev/null +++ b/docs/source/api/codec/base.rst @@ -0,0 +1,32 @@ +===================== +``codec.base`` module +===================== + +.. automodule:: whoosh.codec.base + + +Classes +======= + +.. autoclass:: Codec + :members: + +.. autoclass:: PerDocumentWriter + :members: + +.. autoclass:: FieldWriter + :members: + +.. autoclass:: PostingsWriter + :members: + +.. autoclass:: TermsReader + :members: + +.. autoclass:: PerDocumentReader + :members: + +.. 
autoclass:: Segment + :members: + + diff --git a/docs/source/api/collectors.rst b/docs/source/api/collectors.rst new file mode 100644 index 0000000..b27b8c1 --- /dev/null +++ b/docs/source/api/collectors.rst @@ -0,0 +1,47 @@ +===================== +``collectors`` module +===================== + +.. automodule:: whoosh.collectors + + +Base classes +============ + +.. autoclass:: Collector + :members: + +.. autoclass:: ScoredCollector + :members: + +.. autoclass:: WrappingCollector + :members: + + +Basic collectors +================ + +.. autoclass:: TopCollector + +.. autoclass:: UnlimitedCollector + +.. autoclass:: SortingCollector + + +Wrappers +======== + +.. autoclass:: FilterCollector + +.. autoclass:: FacetCollector + +.. autoclass:: CollapseCollector + +.. autoclass:: TimeLimitCollector + +.. autoclass:: TermsCollector + + + + + diff --git a/docs/source/api/columns.rst b/docs/source/api/columns.rst new file mode 100644 index 0000000..26fa791 --- /dev/null +++ b/docs/source/api/columns.rst @@ -0,0 +1,49 @@ +===================== +``columns`` module +===================== + +.. automodule:: whoosh.columns + + +Base classes +============ + +.. autoclass:: Column + :members: + +.. autoclass:: ColumnWriter + :members: + +.. autoclass:: ColumnReader + :members: + + +Basic columns +============= + +.. autoclass:: VarBytesColumn + +.. autoclass:: FixedBytesColumn + +.. autoclass:: RefBytesColumn + +.. autoclass:: NumericColumn + + +Technical columns +================= + +.. autoclass:: BitColumn + +.. autoclass:: CompressedBytesColumn + +.. autoclass:: StructColumn + +.. autoclass:: PickleColumn + + +Experimental columns +==================== + +.. autoclass:: ClampedNumericColumn + diff --git a/docs/source/api/fields.rst b/docs/source/api/fields.rst new file mode 100644 index 0000000..290feb3 --- /dev/null +++ b/docs/source/api/fields.rst @@ -0,0 +1,41 @@ +================= +``fields`` module +================= + +.. automodule:: whoosh.fields + +Schema class +============ + +.. autoclass:: Schema + :members: + +.. autoclass:: SchemaClass + +FieldType base class +==================== + +.. autoclass:: FieldType + :members: + + +Pre-made field types +==================== + +.. autoclass:: ID +.. autoclass:: IDLIST +.. autoclass:: STORED +.. autoclass:: KEYWORD +.. autoclass:: TEXT +.. autoclass:: NUMERIC +.. autoclass:: DATETIME +.. autoclass:: BOOLEAN +.. autoclass:: NGRAM +.. autoclass:: NGRAMWORDS + + +Exceptions +========== + +.. autoexception:: FieldConfigurationError +.. autoexception:: UnknownFieldError diff --git a/docs/source/api/filedb/filestore.rst b/docs/source/api/filedb/filestore.rst new file mode 100644 index 0000000..2dfc2ec --- /dev/null +++ b/docs/source/api/filedb/filestore.rst @@ -0,0 +1,31 @@ +=========================== +``filedb.filestore`` module +=========================== + +.. automodule:: whoosh.filedb.filestore + +Base class +========== + +.. autoclass:: Storage + :members: + + +Implementation classes +====================== + +.. autoclass:: FileStorage +.. autoclass:: RamStorage + + +Helper functions +================ + +.. autofunction:: copy_storage +.. autofunction:: copy_to_ram + + +Exceptions +========== + +.. autoexception:: ReadOnlyError diff --git a/docs/source/api/filedb/filetables.rst b/docs/source/api/filedb/filetables.rst new file mode 100644 index 0000000..3fbf70f --- /dev/null +++ b/docs/source/api/filedb/filetables.rst @@ -0,0 +1,22 @@ +============================ +``filedb.filetables`` module +============================ + +.. 
automodule:: whoosh.filedb.filetables + + +Hash file +========= + +.. autoclass:: HashWriter + :members: + +.. autoclass:: HashReader + :members: + + +Ordered Hash file +================= + +.. autoclass:: OrderedHashWriter +.. autoclass:: OrderedHashReader diff --git a/docs/source/api/filedb/structfile.rst b/docs/source/api/filedb/structfile.rst new file mode 100644 index 0000000..7d45c66 --- /dev/null +++ b/docs/source/api/filedb/structfile.rst @@ -0,0 +1,14 @@ +============================ +``filedb.structfile`` module +============================ + +.. automodule:: whoosh.filedb.structfile + +Classes +======= + +.. autoclass:: StructFile + :members: + +.. autoclass:: BufferFile +.. autoclass:: ChecksumFile diff --git a/docs/source/api/formats.rst b/docs/source/api/formats.rst new file mode 100644 index 0000000..9cd9dd1 --- /dev/null +++ b/docs/source/api/formats.rst @@ -0,0 +1,24 @@ +================== +``formats`` module +================== + +.. automodule:: whoosh.formats + +Base class +========== + +.. autoclass:: Format + :members: + + +Formats +======= + +.. autoclass:: Existence +.. autoclass:: Frequency +.. autoclass:: Positions +.. autoclass:: Characters +.. autoclass:: PositionBoosts +.. autoclass:: CharacterBoosts + + diff --git a/docs/source/api/highlight.rst b/docs/source/api/highlight.rst new file mode 100644 index 0000000..74d2ab9 --- /dev/null +++ b/docs/source/api/highlight.rst @@ -0,0 +1,50 @@ +==================== +``highlight`` module +==================== + +.. automodule:: whoosh.highlight + +See :doc:`how to highlight terms in search results `. + + +Manual highlighting +=================== + +.. autoclass:: Highlighter + :members: + +.. autofunction:: highlight + + +Fragmenters +=========== + +.. autoclass:: Fragmenter + :members: + +.. autoclass:: WholeFragmenter +.. autoclass:: SentenceFragmenter +.. autoclass:: ContextFragmenter +.. autoclass:: PinpointFragmenter + + +Scorers +======= + +.. autoclass:: FragmentScorer +.. autoclass:: BasicFragmentScorer + + +Formatters +========== + +.. autoclass:: UppercaseFormatter +.. autoclass:: HtmlFormatter +.. autoclass:: GenshiFormatter + + +Utility classes +=============== + +.. autoclass:: Fragment + :members: diff --git a/docs/source/api/idsets.rst b/docs/source/api/idsets.rst new file mode 100644 index 0000000..0f55306 --- /dev/null +++ b/docs/source/api/idsets.rst @@ -0,0 +1,23 @@ +============================ +``support.bitvector`` module +============================ + +.. automodule:: whoosh.idsets + + +Base classes +============ + +.. autoclass:: DocIdSet + :members: + +.. autoclass:: BaseBitSet + + +Implementation classes +====================== + +.. autoclass:: BitSet +.. autoclass:: OnDiskBitSet +.. autoclass:: SortedIntSet +.. autoclass:: MultiIdSet diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst new file mode 100644 index 0000000..ee38645 --- /dev/null +++ b/docs/source/api/index.rst @@ -0,0 +1,39 @@ +================ +``index`` module +================ + +.. automodule:: whoosh.index + + +Functions +========= + +.. autofunction:: create_in +.. autofunction:: open_dir +.. autofunction:: exists_in +.. autofunction:: exists +.. autofunction:: version_in +.. autofunction:: version + + +Base class +========== + +.. autoclass:: Index + :members: + + +Implementation +============== + +.. autoclass:: FileIndex + + +Exceptions +========== + +.. autoexception:: LockError +.. autoexception:: IndexError +.. autoexception:: IndexVersionError +.. autoexception:: OutOfDateError +.. 
autoexception:: EmptyIndexError diff --git a/docs/source/api/lang/morph_en.rst b/docs/source/api/lang/morph_en.rst new file mode 100644 index 0000000..2a3dfe0 --- /dev/null +++ b/docs/source/api/lang/morph_en.rst @@ -0,0 +1,7 @@ +======================== +``lang.morph_en`` module +======================== + +.. automodule:: whoosh.lang.morph_en + +.. autofunction:: variations diff --git a/docs/source/api/lang/porter.rst b/docs/source/api/lang/porter.rst new file mode 100644 index 0000000..4a0220f --- /dev/null +++ b/docs/source/api/lang/porter.rst @@ -0,0 +1,7 @@ +====================== +``lang.porter`` module +====================== + +.. automodule:: whoosh.lang.porter + +.. autofunction:: stem diff --git a/docs/source/api/lang/wordnet.rst b/docs/source/api/lang/wordnet.rst new file mode 100644 index 0000000..8adcdb0 --- /dev/null +++ b/docs/source/api/lang/wordnet.rst @@ -0,0 +1,20 @@ +======================== +``lang.wordnet`` module +======================== + +.. automodule:: whoosh.lang.wordnet + +Thesaurus +========= + +.. autoclass:: Thesaurus + :members: + + +Low-level functions +=================== + +.. autofunction:: parse_file +.. autofunction:: synonyms +.. autofunction:: make_index + diff --git a/docs/source/api/matching.rst b/docs/source/api/matching.rst new file mode 100644 index 0000000..12f24c6 --- /dev/null +++ b/docs/source/api/matching.rst @@ -0,0 +1,34 @@ +=================== +``matching`` module +=================== + +.. automodule:: whoosh.matching + +Matchers +======== + +.. autoclass:: Matcher + :members: + +.. autoclass:: NullMatcher +.. autoclass:: ListMatcher +.. autoclass:: WrappingMatcher +.. autoclass:: MultiMatcher +.. autoclass:: FilterMatcher +.. autoclass:: BiMatcher +.. autoclass:: AdditiveBiMatcher +.. autoclass:: UnionMatcher +.. autoclass:: DisjunctionMaxMatcher +.. autoclass:: IntersectionMatcher +.. autoclass:: AndNotMatcher +.. autoclass:: InverseMatcher +.. autoclass:: RequireMatcher +.. autoclass:: AndMaybeMatcher +.. autoclass:: ConstantScoreMatcher + + +Exceptions +========== + +.. autoexception:: ReadTooFar +.. autoexception:: NoQualityAvailable diff --git a/docs/source/api/qparser.rst b/docs/source/api/qparser.rst new file mode 100644 index 0000000..d3c5ecd --- /dev/null +++ b/docs/source/api/qparser.rst @@ -0,0 +1,97 @@ +================== +``qparser`` module +================== + +.. automodule:: whoosh.qparser + +Parser object +============= + +.. autoclass:: QueryParser + :members: + +Pre-made configurations +----------------------- + +The following functions return pre-configured QueryParser objects. + +.. autofunction:: MultifieldParser + +.. autofunction:: SimpleParser + +.. autofunction:: DisMaxParser + + +Plug-ins +======== + +.. autoclass:: Plugin + :members: + +.. autoclass:: SingleQuotePlugin +.. autoclass:: PrefixPlugin +.. autoclass:: WildcardPlugin +.. autoclass:: RegexPlugin +.. autoclass:: BoostPlugin +.. autoclass:: GroupPlugin +.. autoclass:: EveryPlugin +.. autoclass:: FieldsPlugin +.. autoclass:: PhrasePlugin +.. autoclass:: RangePlugin +.. autoclass:: OperatorsPlugin +.. autoclass:: PlusMinusPlugin +.. autoclass:: GtLtPlugin +.. autoclass:: MultifieldPlugin +.. autoclass:: FieldAliasPlugin +.. autoclass:: CopyFieldPlugin + + +Syntax node objects +=================== + +Base nodes +---------- + +.. autoclass:: SyntaxNode + :members: + + +Nodes +----- + +.. autoclass:: FieldnameNode +.. autoclass:: TextNode +.. autoclass:: WordNode +.. autoclass:: RangeNode +.. autoclass:: MarkerNode + + +Group nodes +----------- + +.. 
autoclass:: GroupNode +.. autoclass:: BinaryGroup +.. autoclass:: ErrorNode +.. autoclass:: AndGroup +.. autoclass:: OrGroup +.. autoclass:: AndNotGroup +.. autoclass:: AndMaybeGroup +.. autoclass:: DisMaxGroup +.. autoclass:: RequireGroup +.. autoclass:: NotGroup + + +Operators +--------- + +.. autoclass:: Operator +.. autoclass:: PrefixOperator +.. autoclass:: PostfixOperator +.. autoclass:: InfixOperator + + + + + + + diff --git a/docs/source/api/query.rst b/docs/source/api/query.rst new file mode 100644 index 0000000..9a7e9ff --- /dev/null +++ b/docs/source/api/query.rst @@ -0,0 +1,83 @@ +================ +``query`` module +================ + +.. automodule:: whoosh.query + +See also :mod:`whoosh.qparser` which contains code for parsing user queries +into query objects. + +Base classes +============ + +The following abstract base classes are subclassed to create the "real" +query operations. + +.. autoclass:: Query + :members: + +.. autoclass:: CompoundQuery +.. autoclass:: MultiTerm +.. autoclass:: ExpandingTerm +.. autoclass:: WrappingQuery + + +Query classes +============= + +.. autoclass:: Term +.. autoclass:: Variations +.. autoclass:: FuzzyTerm +.. autoclass:: Phrase +.. autoclass:: And +.. autoclass:: Or +.. autoclass:: DisjunctionMax +.. autoclass:: Not +.. autoclass:: Prefix +.. autoclass:: Wildcard +.. autoclass:: Regex +.. autoclass:: TermRange +.. autoclass:: NumericRange +.. autoclass:: DateRange +.. autoclass:: Every +.. autoclass:: NullQuery + + +Binary queries +============== + +.. autoclass:: Require +.. autoclass:: AndMaybe +.. autoclass:: AndNot +.. autoclass:: Otherwise + + +Span queries +============ + +.. autoclass:: Span + :members: + +.. autoclass:: SpanQuery +.. autoclass:: SpanFirst +.. autoclass:: SpanNear +.. autoclass:: SpanNear2 +.. autoclass:: SpanNot +.. autoclass:: SpanOr +.. autoclass:: SpanContains +.. autoclass:: SpanBefore +.. autoclass:: SpanCondition + + +Special queries +=============== + +.. autoclass:: NestedParent +.. autoclass:: NestedChildren +.. autoclass:: ConstantScoreQuery + + +Exceptions +========== + +.. autoexception:: QueryError diff --git a/docs/source/api/reading.rst b/docs/source/api/reading.rst new file mode 100644 index 0000000..e0fd2a1 --- /dev/null +++ b/docs/source/api/reading.rst @@ -0,0 +1,22 @@ +================== +``reading`` module +================== + +.. automodule:: whoosh.reading + +Classes +======= + +.. autoclass:: IndexReader + :members: + +.. autoclass:: MultiReader + +.. autoclass:: TermInfo + :members: + +Exceptions +========== + +.. autoexception:: TermNotFound + diff --git a/docs/source/api/scoring.rst b/docs/source/api/scoring.rst new file mode 100644 index 0000000..73ea1e7 --- /dev/null +++ b/docs/source/api/scoring.rst @@ -0,0 +1,42 @@ +================== +``scoring`` module +================== + +.. automodule:: whoosh.scoring + + +Base classes +============ + +.. autoclass:: WeightingModel + :members: + +.. autoclass:: BaseScorer + :members: + +.. autoclass:: WeightScorer +.. autoclass:: WeightLengthScorer + + +Scoring algorithm classes +========================= + +.. autoclass:: BM25F + +.. autoclass:: TF_IDF + +.. autoclass:: Frequency + + +Scoring utility classes +======================= + +.. autoclass:: FunctionWeighting + +.. autoclass:: MultiWeighting + +.. 
autoclass:: ReverseWeighting + + + + diff --git a/docs/source/api/searching.rst b/docs/source/api/searching.rst new file mode 100644 index 0000000..8acfe49 --- /dev/null +++ b/docs/source/api/searching.rst @@ -0,0 +1,33 @@ +==================== +``searching`` module +==================== + +.. automodule:: whoosh.searching + + +Searching classes +================= + +.. autoclass:: Searcher + :members: + + +Results classes +=============== + +.. autoclass:: Results + :members: + +.. autoclass:: Hit + :members: + +.. autoclass:: ResultsPage + :members: + + +Exceptions +========== + +.. autoexception:: NoTermsException +.. autoexception:: TimeLimit + diff --git a/docs/source/api/sorting.rst b/docs/source/api/sorting.rst new file mode 100644 index 0000000..faf78d0 --- /dev/null +++ b/docs/source/api/sorting.rst @@ -0,0 +1,48 @@ +================== +``sorting`` module +================== + +.. automodule:: whoosh.sorting + + +Base types +========== + +.. autoclass:: FacetType + :members: + +.. autoclass:: Categorizer + :members: + + +Facet types +=========== + +.. autoclass:: FieldFacet +.. autoclass:: QueryFacet +.. autoclass:: RangeFacet +.. autoclass:: DateRangeFacet +.. autoclass:: ScoreFacet +.. autoclass:: FunctionFacet +.. autoclass:: MultiFacet +.. autoclass:: StoredFieldFacet + + +Facets object +============= + +.. autoclass:: Facets + :members: + + +FacetType objects +================= + +.. autoclass:: FacetMap + :members: +.. autoclass:: OrderedList +.. autoclass:: UnorderedList +.. autoclass:: Count +.. autoclass:: Best + + diff --git a/docs/source/api/spelling.rst b/docs/source/api/spelling.rst new file mode 100644 index 0000000..e89bb79 --- /dev/null +++ b/docs/source/api/spelling.rst @@ -0,0 +1,34 @@ +=================== +``spelling`` module +=================== + +See :doc:`correcting errors in user queries <../spelling>`. + +.. automodule:: whoosh.spelling + + +Corrector objects +================= + +.. autoclass:: Corrector + :members: + +.. autoclass:: ReaderCorrector + +.. autoclass:: GraphCorrector + :members: + +.. autoclass:: MultiCorrector + + +QueryCorrector objects +====================== + +.. autoclass:: QueryCorrector + :members: + +.. autoclass:: SimpleQueryCorrector + +.. autoclass:: Correction + + diff --git a/docs/source/api/support/charset.rst b/docs/source/api/support/charset.rst new file mode 100644 index 0000000..b0a687e --- /dev/null +++ b/docs/source/api/support/charset.rst @@ -0,0 +1,13 @@ +========================== +``support.charset`` module +========================== + +.. automodule:: whoosh.support.charset + +.. data:: default_charset + + An extensive case- and accent folding charset table. + Taken from http://speeple.com/unicode-maps.txt + +.. autofunction:: charset_table_to_dict + diff --git a/docs/source/api/support/levenshtein.rst b/docs/source/api/support/levenshtein.rst new file mode 100644 index 0000000..cb64027 --- /dev/null +++ b/docs/source/api/support/levenshtein.rst @@ -0,0 +1,10 @@ +============================== +``support.levenshtein`` module +============================== + +.. automodule:: whoosh.support.levenshtein + +.. autofunction:: relative + +.. autofunction:: distance + diff --git a/docs/source/api/util.rst b/docs/source/api/util.rst new file mode 100644 index 0000000..9359f74 --- /dev/null +++ b/docs/source/api/util.rst @@ -0,0 +1,7 @@ +=============== +``util`` module +=============== + +.. 
automodule:: whoosh.util + :members: + diff --git a/docs/source/api/writing.rst b/docs/source/api/writing.rst new file mode 100644 index 0000000..0bebc86 --- /dev/null +++ b/docs/source/api/writing.rst @@ -0,0 +1,30 @@ +================== +``writing`` module +================== + +.. automodule:: whoosh.writing + + +Writer +====== + +.. autoclass:: IndexWriter + :members: + + +Utility writers +=============== + +.. autoclass:: BufferedWriter + :members: + +.. autoclass:: AsyncWriter + :members: + + +Exceptions +========== + +.. autoexception:: IndexingError + + diff --git a/docs/source/batch.rst b/docs/source/batch.rst new file mode 100644 index 0000000..5caf256 --- /dev/null +++ b/docs/source/batch.rst @@ -0,0 +1,114 @@ +=================================== +Tips for speeding up batch indexing +=================================== + + +Overview +======== + +Indexing documents tends to fall into two general patterns: adding documents +one at a time as they are created (as in a web application), and adding a bunch +of documents at once (batch indexing). + +The following settings and alternate workflows can make batch indexing faster. + + +StemmingAnalyzer cache +====================== + +The stemming analyzer by default uses a least-recently-used (LRU) cache to limit +the amount of memory it uses, to prevent the cache from growing very large if +the analyzer is reused for a long period of time. However, the LRU cache can +slow down indexing by almost 200% compared to a stemming analyzer with an +"unbounded" cache. + +When you're indexing in large batches with a one-shot instance of the +analyzer, consider using an unbounded cache:: + + w = myindex.writer() + # Get the analyzer object from a text field + stem_ana = w.schema["content"].format.analyzer + # Set the cachesize to -1 to indicate unbounded caching + stem_ana.cachesize = -1 + # Reset the analyzer to pick up the changed attribute + stem_ana.clear() + + # Use the writer to index documents... + + +The ``limitmb`` parameter +========================= + +The ``limitmb`` parameter to :meth:`whoosh.index.Index.writer` controls the +*maximum* memory (in megabytes) the writer will use for the indexing pool. The +higher the number, the faster indexing will be. + +The default value of ``128`` is actually somewhat low, considering many people +have multiple gigabytes of RAM these days. Setting it higher can speed up +indexing considerably:: + + from whoosh import index + + ix = index.open_dir("indexdir") + writer = ix.writer(limitmb=256) + +.. note:: + The actual memory used will be higher than this value because of interpreter + overhead (up to twice as much!). It is very useful as a tuning parameter, + but not for trying to exactly control the memory usage of Whoosh. 
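When trying different values for these settings, it helps to time a whole indexing run rather than guess. The following is only a minimal sketch of such a timing harness: it assumes an existing index directory, an iterable of unicode strings to index, and a schema with a ``content`` field as in the example above::

    import time

    from whoosh import index

    def timed_batch_index(dirname, documents, limitmb=256):
        """Index ``documents`` (an iterable of unicode strings) into the
        existing index in ``dirname`` and report how long the run took."""
        ix = index.open_dir(dirname)
        writer = ix.writer(limitmb=limitmb)

        start = time.time()
        for text in documents:
            # Assumes the schema has a "content" TEXT field, as above
            writer.add_document(content=text)
        writer.commit()
        print "Indexed in %.1f seconds" % (time.time() - start)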
+ + +The ``procs`` parameter +======================= + +The ``procs`` parameter to :meth:`whoosh.index.Index.writer` controls the +number of processors the writer will use for indexing (via the +``multiprocessing`` module):: + + from whoosh import index + + ix = index.open_dir("indexdir") + writer = ix.writer(procs=4) + +Note that when you use multiprocessing, the ``limitmb`` parameter controls the +amount of memory used by *each process*, so the actual memory used will be +``limitmb * procs``:: + + # Each process will use a limit of 128, for a total of 512 + writer = ix.writer(procs=4, limitmb=128) + + +The ``multisegment`` parameter +============================== + +The ``procs`` parameter causes the default writer to use multiple processors to +do much of the indexing, but then still uses a single process to merge the pool +of each sub-writer into a single segment. + +You can get much better indexing speed by also using the ``multisegment=True`` +keyword argument, which instead of merging the results of each sub-writer, +simply has them each just write out a new segment:: + + from whoosh import index + + ix = index.open_dir("indexdir") + writer = ix.writer(procs=4, multisegment=True) + +The drawback is that instead +of creating a single new segment, this option creates a number of new segments +**at least** equal to the number of processes you use. + +For example, if you use ``procs=4``, the writer will create four new segments. +(If you merge old segments or call ``add_reader`` on the parent writer, the +parent writer will also write a segment, meaning you'll get five new segments.) + +So, while ``multisegment=True`` is much faster than a normal writer, you should +only use it for large batch indexing jobs (or perhaps only for indexing from +scratch). It should not be the only method you use for indexing, because +otherwise the number of segments will tend to increase forever! + + + + + + diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..e106a33 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,198 @@ + +import sys, os, os.path + +sys.path.append(os.path.abspath("../../src")) +import whoosh + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.append(os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.ifconfig'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Whoosh' +copyright = u'2007-2012 Matt Chaput' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = whoosh.versionstring(build=False) +# The full version, including alpha/beta/rc tags. 
+release = whoosh.versionstring() + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be searched +# for source files. +exclude_trees = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { + "codebgcolor": "#CCC", + } + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +#html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_use_modindex = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. 
+#html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Whooshdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'Whoosh.tex', u'Whoosh Documentation', + u'Matt Chaput', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_use_modindex = True + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'http://docs.python.org/': None} + +# Autodoc config +autoclass_content = "both" diff --git a/docs/source/dates.rst b/docs/source/dates.rst new file mode 100644 index 0000000..ab1aadd --- /dev/null +++ b/docs/source/dates.rst @@ -0,0 +1,202 @@ +================================ +Indexing and parsing dates/times +================================ + +Indexing dates +============== + +Whoosh lets you index and search dates/times using the +:class:`whoosh.fields.DATETIME` field type. 
Instead of passing text for the +field in ``add_document()``, you use a Python ``datetime.datetime`` object:: + + from datetime import datetime, timedelta + from whoosh import fields, index + + schema = fields.Schema(title=fields.TEXT, content=fields.TEXT, + date=fields.DATETIME) + ix = index.create_in("indexdir", schema) + + w = ix.writer() + w.add_document(title="Document 1", content="Rendering images from the command line", + date=datetime.utcnow()) + w.add_document(title="Document 2", content="Creating shaders using a node network", + date=datetime.utcnow() + timedelta(days=1)) + w.commit() + + +Parsing date queries +==================== + +Once you have an indexed ``DATETIME`` field, you can search it using a rich +date parser contained in the :class:`whoosh.qparser.dateparse.DateParserPlugin`:: + + from whoosh import index + from whoosh.qparser import QueryParser + from whoosh.qparser.dateparse import DateParserPlugin + + ix = index.open_dir("indexdir") + + # Instantiate a query parser + qp = QueryParser("content", ix.schema) + + # Add the DateParserPlugin to the parser + qp.add_plugin(DateParserPlugin()) + +With the ``DateParserPlugin``, users can use date queries such as:: + + 20050912 + 2005 sept 12th + june 23 1978 + 23 mar 2005 + july 1985 + sep 12 + today + yesterday + tomorrow + now + next friday + last tuesday + 5am + 10:25:54 + 23:12 + 8 PM + 4:46 am oct 31 2010 + last tuesday to today + today to next friday + jan 2005 to feb 2008 + -1 week to now + now to +2h + -1y6mo to +2 yrs 23d + +Normally, as with other types of queries, users need +to quote date queries that contain spaces using single quotes:: + + render date:'last tuesday' command + date:['last tuesday' to 'next friday'] + +If you use the ``free`` argument to the ``DateParserPlugin``, the plugin will +try to parse dates from unquoted text following a date field prefix:: + + qp.add_plugin(DateParserPlugin(free=True)) + +This allows the user to type a date query with spaces and special characters +following the name of a date field and a colon. The date query can be mixed +with other types of queries without quotes:: + + date:last tuesday + render date:oct 15th 2001 5:20am command + +If you don't use the ``DateParserPlugin``, users can still search DATETIME +fields using a simple numeric form ``YYYY[MM[DD[hh[mm[ss]]]]]`` that is built +into the ``DATETIME`` field:: + + from whoosh import index + from whoosh.qparser import QueryParser + + ix = index.open_dir("indexdir") + qp = QueryParser("content", schema=ix.schema) + + # Find all datetimes in 2005 + q = qp.parse(u"date:2005") + + # Find all datetimes on June 24, 2005 + q = qp.parse(u"date:20050624") + + # Find all datetimes from 1am-2am on June 24, 2005 + q = qp.parse(u"date:2005062401") + + # Find all datetimes from Jan 1, 2005 to June 2, 2010 + q = qp.parse(u"date:[20050101 to 20100602]") + + +About time zones and basetime +============================= + +The best way to deal with time zones is to always index ``datetime``\ s in naive +UTC form. Any ``tzinfo`` attribute on the ``datetime`` object is *ignored* +by the indexer. If you are working with timezone-aware local datetimes, you should convert them +to naive UTC datetimes before indexing. + + +Date parser notes +================= + +Please note that the date parser is still somewhat experimental.
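Returning briefly to the time zone advice above, here is a minimal sketch of converting a timezone-aware local ``datetime`` to the naive UTC form the indexer expects. It assumes Python 3's ``datetime.timezone`` class; ``w`` is the writer from the indexing example at the top of this page::

    from datetime import datetime, timedelta, timezone

    # A timezone-aware local datetime, e.g. UTC+2
    local_dt = datetime(2015, 5, 7, 13, 22, tzinfo=timezone(timedelta(hours=2)))

    # Convert to UTC, then drop the tzinfo so the indexed value is naive UTC
    naive_utc = local_dt.astimezone(timezone.utc).replace(tzinfo=None)

    w.add_document(title=u"Document 3", date=naive_utc)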
+ + +Setting the base datetime +------------------------- + +When you create the ``DateParserPlugin`` you can pass a ``datetime`` object to +the ``basedate`` argument to set the datetime against which relative queries +(such as ``last tuesday`` and ``-2 hours``) are measured. By default, the +basedate is ``datetime.utcnow()`` at the moment the plugin is instantiated:: + + qp.add_plugin(DateParserPlugin(basedate=my_datetime)) + + +Registering an error callback +----------------------------- + +To avoid user queries causing exceptions in your application, the date parser +attempts to fail silently when it can't parse a date query. However, you can +register a callback function to be notified of parsing failures so you can +display feedback to the user. The argument to the callback function is the +date text that could not be parsed (this is an experimental feature and may +change in future versions):: + + errors = [] + def add_error(msg): + errors.append(msg) + qp.add_plugin(DateParserPlugin(callback=add_error)) + + q = qp.parse(u"date:blarg") + # errors == [u"blarg"] + + +Using free parsing +------------------ + +While the ``free`` option is easier for users, it may result in ambiguities. +As one example, if you want to find documents containing reference to a march +and the number 2 in documents from the year 2005, you might type:: + + date:2005 march 2 + +This query would be interpreted correctly as a date query and two term queries +when ``free=False``, but as a single date query when ``free=True``. In this +case the user could limit the scope of the date parser with single quotes:: + + date:'2005' march 2 + + +Parsable formats +---------------- + +The date parser supports a wide array of date and time formats; however, it is +not my intention to try to support *all* types of human-readable dates (for +example ``ten to five the friday after next``). The best idea might be to pick +a date format that works and try to train users on it, and if they use one of +the other formats that also works consider it a happy accident. + + +Limitations +=========== + +* Since it's based on Python's ``datetime.datetime`` object, the ``DATETIME`` + field shares all the limitations of that class, such as no support for + dates before year 1 on the proleptic Gregorian calendar. The ``DATETIME`` + field supports practically unlimited dates, so if the ``datetime`` object + is ever improved, the field could support a wider range. An alternative possibility might + be to add support for ``mxDateTime`` objects someday. + +* The ``DateParserPlugin`` currently only has support for English dates. + The architecture supports creation of parsers for other languages, and I + hope to add examples for other languages soon. + +* ``DATETIME`` fields do not currently support open-ended ranges. You can + simulate an open-ended range by using an endpoint far in the past or future. + + + + diff --git a/docs/source/facets.rst b/docs/source/facets.rst new file mode 100644 index 0000000..277d69a --- /dev/null +++ b/docs/source/facets.rst @@ -0,0 +1,771 @@ +==================== +Sorting and faceting +==================== + +.. note:: + The API for sorting and faceting changed in Whoosh 3.0. + +Overview +======== + +Sorting and faceting search results in Whoosh are based on **facets**. Each +facet associates a value with each document in the search results, allowing you +to sort by the keys or use them to group the documents. Whoosh includes a variety +of **facet types** you can use for sorting and grouping (see below).
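To make the two roles concrete before the details below, here is a minimal sketch; the ``price`` and ``category`` field names and the ``myquery`` object are placeholders for illustration::

    from whoosh import sorting

    with ix.searcher() as searcher:
        # Use a facet's keys to order the results...
        results = searcher.search(myquery, sortedby=sorting.FieldFacet("price"))

        # ...or to group the results under each key
        results = searcher.search(myquery, groupedby="category")
        print(results.groups("category"))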
+ + +Sorting +======= + +By default, the results of a search are sorted with the highest-scoring +documents first. You can use the ``sortedby`` keyword argument to order the +results by some other criteria instead, such as the value of a field. + + +Making fields sortable +---------------------- + +In order to sort on a field, you should create the field using the +``sortable=True`` keyword argument:: + + schema = fields.Schema(title=fields.TEXT(sortable=True), + content=fields.TEXT, + modified=fields.DATETIME(sortable=True) + ) + +It's possible to sort on a field that doesn't have ``sortable=True``, but this +requires Whoosh to load the unique terms in the field into memory. Using +``sortable`` is much more efficient. + + +About column types +------------------ + +When you create a field using ``sortable=True``, you are telling Whoosh to store +per-document values for that field in a *column*. A column object specifies the +format to use to store the per-document values on disk. + +The :mod:`whoosh.columns` module contains several different column object +implementations. Each field type specifies a reasonable default column type (for +example, the default for text fields is :class:`whoosh.columns.VarBytesColumn`, +the default for numeric fields is :class:`whoosh.columns.NumericColumn`). +However, if you want maximum efficiency you may want to use a different column +type for a field. + +For example, if all document values in a field are a fixed length, you can use a +:class:`whoosh.columns.FixedBytesColumn`. If you have a field where many +documents share a relatively small number of possible values (an example might +be a "category" field, or "month" or other enumeration type fields), you might +want to use :class:`whoosh.columns.RefBytesColumn` (which can handle both +variable and fixed-length values). There are column types for storing +per-document bit values, structs, pickled objects, and compressed byte values. + +To specify a custom column object for a field, pass it as the ``sortable`` +keyword argument instead of ``True``:: + + from whoosh import columns, fields + + category_col = columns.RefBytesColumn() + schema = fields.Schema(title=fields.TEXT(sortable=True), + category=fields.KEYWORD(sortable=category_col)) + + +Using a COLUMN field for custom sort keys +----------------------------------------- + +When you add a document with a sortable field, Whoosh uses the value you pass +for the field as the sortable value. For example, if "title" is a sortable +field, and you add this document:: + + writer.add_document(title="Mr. Palomar") + +...then ``Mr. Palomar`` is stored in the field column as the sorting key for the +document. + +This is usually good, but sometimes you need to "massage" the sortable key so +it's different from the value the user searches and/or sees in the interface. +For example, if you allow the user to sort by title, you might want to use +different values for the visible title and the value used for sorting:: + + # Visible title + title = "The Unbearable Lightness of Being" + + # Sortable title: converted to lowercase (to prevent different ordering + # depending on uppercase/lowercase), with initial article moved to the end + sort_title = "unbearable lightness of being, the" + +The best way to do this is to use an additional field just for sorting.
You can +use the :class:`whoosh.fields.COLUMN` field type to create a field that is not +indexed or stored; it only holds per-document column values:: + + schema = fields.Schema(title=fields.TEXT(stored=True), + sort_title=fields.COLUMN(columns.VarBytesColumn()) + ) + +The single argument to the :class:`whoosh.fields.COLUMN` initializer is a +:class:`whoosh.columns.ColumnType` object. You can use any of the various +column types in the :mod:`whoosh.columns` module. + +As another example, say you are indexing documents that have a custom sorting +order associated with each document, such as a "priority" number:: + + name=Big Wheel + price=100 + priority=1 + + name=Toss Across + price=40 + priority=3 + + name=Slinky + price=25 + priority=2 + ... + +You can use a column field with a numeric column object to hold the "priority" +and use it for sorting:: + + schema = fields.Schema(name=fields.TEXT(stored=True), + price=fields.NUMERIC(stored=True), + priority=fields.COLUMN(columns.NumericColumn("i")), + ) + +(Note that :class:`columns.NumericColumn` takes a type code character like the +codes used by Python's ``struct`` and ``array`` modules.) + + +Making existing fields sortable +------------------------------- + +If you have an existing index from before the ``sortable`` argument was added +in Whoosh 3.0, or you didn't think you needed a field to be sortable but now +you find that you need to sort it, you can add "sortability" to an existing +index using the :func:`whoosh.sorting.add_sortable` utility function:: + + from whoosh import columns, fields, index, sorting + + # Say we have an existing index with this schema + schema = fields.Schema(title=fields.TEXT, + price=fields.NUMERIC) + + # To use add_sortable, first open a writer for the index + ix = index.open_dir("indexdir") + with ix.writer() as w: + # Add sortable=True to the "price" field using field terms as the + # sortable values + sorting.add_sortable(w, "price", sorting.FieldFacet("price")) + + # Add sortable=True to the "title" field using the + # stored field values as the sortable value + sorting.add_sortable(w, "title", sorting.StoredFieldFacet("title")) + +You can specify a custom column type when you call ``add_sortable`` using the +``column`` keyword argument:: + + add_sortable(w, "chapter", sorting.FieldFacet("chapter"), + column=columns.RefBytesColumn()) + +See the documentation for :func:`~whoosh.sorting.add_sortable` for more +information. + + +Sorting search results +---------------------- + +When you tell Whoosh to sort by a field (or fields), it uses the per-document +values in the field's column as sorting keys for the documents. + +Normally search results are sorted by descending relevance score. You can tell +Whoosh to use a different ordering by passing the ``sortedby`` keyword argument +to the :meth:`~whoosh.searching.Searcher.search` method:: + + from whoosh import fields, index, qparser + + schema = fields.Schema(title=fields.TEXT(stored=True), + price=fields.NUMERIC(sortable=True)) + ix = index.create_in("indexdir", schema) + + with ix.writer() as w: + w.add_document(title="Big Deal", price=20) + w.add_document(title="Mr.
Big", price=10) + w.add_document(title="Big Top", price=15) + + with ix.searcher() as s: + qp = qparser.QueryParser("big", ix.schema) + q = qp.parse(user_query_string) + + # Sort search results from lowest to highest price + results = s.search(q, sortedby="price") + for hit in results: + print(hit["title"]) + +You can use any of the following objects as ``sortedby`` values: + +A ``FacetType`` object + Uses this object to sort the documents. See below for the available facet + types. + +A field name string + Converts the field name into a ``FieldFacet`` (see below) and uses it to + sort the documents. + +A list of ``FacetType`` objects and/or field name strings + Bundles the facets together into a ``MultiFacet`` so you can sort by + multiple keys. Note that this shortcut does not allow you to reverse + the sort direction of individual facets. To do that, you need to construct + the ``MultiFacet`` object yourself. + +.. note:: + You can use the ``reverse=True`` keyword argument to the + ``Searcher.search()`` method to reverse the overall sort direction. This + is more efficient than reversing each individual facet. + + +Examples +-------- + +Sort by the value of the size field:: + + results = searcher.search(myquery, sortedby="size") + +Sort by the reverse (highest-to-lowest) order of the "price" field:: + + facet = sorting.FieldFacet("price", reverse=True) + results = searcher.search(myquery, sortedby=facet) + +Sort by ascending size and then descending price:: + + mf = sorting.MultiFacet() + mf.add_field("size") + mf.add_field("price", reverse=True) + results = searcher.search(myquery, sortedby=mf) + + # or... + sizes = sorting.FieldFacet("size") + prices = sorting.FieldFacet("price", reverse=True) + results = searcher.search(myquery, sortedby=[sizes, prices]) + +Sort by the "category" field, then by the document's score:: + + cats = sorting.FieldFacet("category") + scores = sorting.ScoreFacet() + results = searcher.search(myquery, sortedby=[cats, scores]) + + +Accessing column values +----------------------- + +Per-document column values are available in :class:`~whoosh.searching.Hit` +objects just like stored field values:: + + schema = fields.Schema(title=fields.TEXT(stored=True), + price=fields.NUMERIC(sortable=True)) + + ... + + results = searcher.search(myquery) + for hit in results: + print(hit["title"], hit["price"]) + +ADVANCED: if you want to access abitrary per-document values quickly you can get +a column reader object:: + + with ix.searcher() as s: + reader = s.reader() + + colreader = s.reader().column_reader("price") + for docnum in reader.all_doc_ids(): + print(colreader[docnum]) + + +Grouping +======== + +It is often very useful to present "faceted" search results to the user. +Faceting is dynamic grouping of search results into categories. The +categories let users view a slice of the total results based on the categories +they're interested in. + +For example, if you are programming a shopping website, you might want to +display categories with the search results such as the manufacturers and price +ranges. + +==================== ================= +Manufacturer Price +-------------------- ----------------- +Apple (5) $0 - $100 (2) +Sanyo (1) $101 - $500 (10) +Sony (2) $501 - $1000 (1) +Toshiba (5) +==================== ================= + +You can let your users click the different facet values to only show results +in the given categories. 
+ +Another useful UI pattern is to show, say, the top 5 results for different +types of found documents, and let the user click to see more results from a +category they're interested in, similarly to how the Spotlight quick results +work on Mac OS X. + + +The ``groupedby`` keyword argument +---------------------------------- + +You can use the following objects as ``groupedby`` values: + +A ``FacetType`` object + Uses this object to group the documents. See below for the available facet + types. + +A field name string + Converts the field name into a ``FieldFacet`` (see below) and uses it to + sort the documents. The name of the field is used as the facet name. + +A list or tuple of field name strings + Sets up multiple field grouping criteria. + +A dictionary mapping facet names to ``FacetType`` objects + Sets up multiple grouping criteria. + +A ``Facets`` object + This object is a lot like using a dictionary, but has some convenience + methods to make setting up multiple groupings a little easier. + + +Examples +-------- + +Group by the value of the "category" field:: + + results = searcher.search(myquery, groupedby="category") + +Group by the value of the "category" field and also by the value of the "tags" +field and a date range:: + + cats = sorting.FieldFacet("category") + tags = sorting.FieldFacet("tags", allow_overlap=True) + results = searcher.search(myquery, groupedby={"category": cats, "tags": tags}) + + # ...or, using a Facets object has a little less duplication + facets = sorting.Facets() + facets.add_field("category") + facets.add_field("tags", allow_overlap=True) + results = searcher.search(myquery, groupedby=facets) + +To group results by the *intersected values of multiple fields*, use a +``MultiFacet`` object (see below). For example, if you have two fields named +``tag`` and ``size``, you could group the results by all combinations of the +``tag`` and ``size`` field, such as ``('tag1', 'small')``, +``('tag2', 'small')``, ``('tag1', 'medium')``, and so on:: + + # Generate a grouping from the combination of the "tag" and "size" fields + mf = MultiFacet("tag", "size") + results = searcher.search(myquery, groupedby={"tag/size": mf}) + + +Getting the faceted groups +-------------------------- + +The ``Results.groups("facetname")`` method returns a dictionary mapping +category names to lists of **document IDs**:: + + myfacets = sorting.Facets().add_field("size").add_field("tag") + results = mysearcher.search(myquery, groupedby=myfacets) + results.groups("size") + # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]} + +If there is only one facet, you can just use ``Results.groups()`` with no +argument to access its groups:: + + results = mysearcher.search(myquery, groupedby=myfunctionfacet) + results.groups() + +By default, the values in the dictionary returned by ``groups()`` are lists of +document numbers in the same relative order as in the results. 
You can use the +``Searcher`` object's ``stored_fields()`` method to take a document number and +return the document's stored fields as a dictionary:: + + for category_name in categories: + print("Top 5 documents in the %s category" % category_name) + doclist = categories[category_name] + for docnum in doclist[:5]: + print("  ", searcher.stored_fields(docnum)) + if len(doclist) > 5: + print("  (%s more)" % (len(doclist) - 5)) + +If you want different information about the groups, for example just the count +of documents in each group, or you don't need the groups to be ordered, you can +specify a :class:`whoosh.sorting.FacetMap` type or instance with the +``maptype`` keyword argument when creating the ``FacetType``:: + + # This is the same as the default + myfacet = FieldFacet("size", maptype=sorting.OrderedList) + results = mysearcher.search(myquery, groupedby=myfacet) + results.groups() + # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]} + + # Don't sort the groups to match the order of documents in the results + # (faster) + myfacet = FieldFacet("size", maptype=sorting.UnorderedList) + results = mysearcher.search(myquery, groupedby=myfacet) + results.groups() + # {"small": [1, 2, 4, 5, 8], "medium": [0, 3, 6], "large": [7, 9]} + + # Only count the documents in each group + myfacet = FieldFacet("size", maptype=sorting.Count) + results = mysearcher.search(myquery, groupedby=myfacet) + results.groups() + # {"small": 5, "medium": 3, "large": 2} + + # Only remember the "best" document in each group + myfacet = FieldFacet("size", maptype=sorting.Best) + results = mysearcher.search(myquery, groupedby=myfacet) + results.groups() + # {"small": 8, "medium": 3, "large": 7} + +Alternatively you can specify a ``maptype`` argument in the +``Searcher.search()`` method call which applies to all facets:: + + results = mysearcher.search(myquery, groupedby=["size", "tag"], + maptype=sorting.Count) + +(You can override this overall ``maptype`` argument on individual facets by +specifying the ``maptype`` argument for them as well.) + + +Facet types +=========== + +FieldFacet +---------- + +This is the most common facet type. It sorts or groups based on the +value in a certain field in each document. This generally works best +(or at all) if each document has only one term in the field (e.g. an ID +field):: + + # Sort search results by the value of the "path" field + facet = sorting.FieldFacet("path") + results = searcher.search(myquery, sortedby=facet) + + # Group search results by the value of the "parent" field + facet = sorting.FieldFacet("parent") + results = searcher.search(myquery, groupedby=facet) + parent_groups = results.groups("parent") + +By default, ``FieldFacet`` only supports **non-overlapping** grouping, where a +document cannot belong to multiple facets at the same time (each document will +be sorted into one category arbitrarily). To get overlapping groups with +multi-valued fields, use the ``allow_overlap=True`` keyword argument:: + + facet = sorting.FieldFacet(fieldname, allow_overlap=True) + +This supports overlapping group membership where documents have more than one +term in a field (e.g. KEYWORD fields). If you don't need overlapping, don't +use ``allow_overlap`` because it's *much* slower and uses more memory (see +the section on ``allow_overlap`` below). + + +QueryFacet +---------- + +You can set up categories defined by arbitrary queries.
For example, you can +group names using prefix queries:: + + # Use queries to define each category + # (Here I'll assume "price" is a NUMERIC field, so I'll use + # NumericRange) + qdict = {} + qdict["A-D"] = query.TermRange("name", "a", "d") + qdict["E-H"] = query.TermRange("name", "e", "h") + qdict["I-L"] = query.TermRange("name", "i", "l") + # ... + + qfacet = sorting.QueryFacet(qdict) + r = searcher.search(myquery, groupedby={"firstltr": qfacet}) + +By default, ``QueryFacet`` only supports **non-overlapping** grouping, where a +document cannot belong to multiple facets at the same time (each document will +be sorted into one category arbitrarily). To get overlapping groups with +multi-valued fields, use the ``allow_overlap=True`` keyword argument:: + + facet = sorting.QueryFacet(querydict, allow_overlap=True) + + +RangeFacet +---------- + +The ``RangeFacet`` is for NUMERIC field types. It divides a range of possible +values into groups. For example, to group documents based on price into +buckets $100 "wide":: + + pricefacet = sorting.RangeFacet("price", 0, 1000, 100) + +The first argument is the name of the field. The next two arguments are the +full range to be divided. Value outside this range (in this example, values +below 0 and above 1000) will be sorted into the "missing" (None) group. The +fourth argument is the "gap size", the size of the divisions in the range. + +The "gap" can be a list instead of a single value. In that case, the values in +the list will be used to set the size of the initial divisions, with the last +value in the list being the size for all subsequent divisions. For example:: + + pricefacet = sorting.RangeFacet("price", 0, 1000, [5, 10, 35, 50]) + +...will set up divisions of 0-5, 5-15, 15-50, 50-100, and then use 50 as the +size for all subsequent divisions (i.e. 100-150, 150-200, and so on). + +The ``hardend`` keyword argument controls whether the last division is clamped +to the end of the range or allowed to go past the end of the range. For +example, this:: + + facet = sorting.RangeFacet("num", 0, 10, 4, hardend=False) + +...gives divisions 0-4, 4-8, and 8-12, while this:: + + facet = sorting.RangeFacet("num", 0, 10, 4, hardend=True) + +...gives divisions 0-4, 4-8, and 8-10. (The default is ``hardend=False``.) + +.. note:: + The ranges/buckets are always **inclusive** at the start and **exclusive** + at the end. + + +DateRangeFacet +-------------- + +This is like ``RangeFacet`` but for DATETIME fields. The start and end values +must be ``datetime.datetime`` objects, and the gap(s) is/are +``datetime.timedelta`` objects. + +For example:: + + from datetime import datetime, timedelta + + start = datetime(2000, 1, 1) + end = datetime.now() + gap = timedelta(days=365) + bdayfacet = sorting.DateRangeFacet("birthday", start, end, gap) + +As with ``RangeFacet``, you can use a list of gaps and the ``hardend`` keyword +argument. + + +ScoreFacet +---------- + +This facet is sometimes useful for sorting. + +For example, to sort by the "category" field, then for documents with the same +category, sort by the document's score:: + + cats = sorting.FieldFacet("category") + scores = sorting.ScoreFacet() + results = searcher.search(myquery, sortedby=[cats, scores]) + +The ``ScoreFacet`` always sorts higher scores before lower scores. + +.. 
note:: + While using ``sortedby=ScoreFacet()`` should give the same results as using + the default scored ordering (``sortedby=None``), using the facet will be + slower because Whoosh automatically turns off many optimizations when + sorting. + + +FunctionFacet +------------- + +This facet lets you pass a custom function to compute the sorting/grouping key +for documents. (Using this facet type may be easier than subclassing FacetType +and Categorizer to set up some custom behavior.) + +The function will be called with the index searcher and index document ID as +arguments. For example, if you have an index with term vectors:: + + schema = fields.Schema(id=fields.STORED, + text=fields.TEXT(stored=True, vector=True)) + ix = RamStorage().create_index(schema) + +...you could use a function to sort documents higher the closer they are to +having equal occurances of two terms:: + + def fn(searcher, docnum): + v = dict(searcher.vector_as("frequency", docnum, "text")) + # Sort documents that have equal number of "alfa" and "bravo" first + return 0 - (1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0)) + + facet = sorting.FunctionFacet(fn) + results = searcher.search(myquery, sortedby=facet) + + +StoredFieldFacet +---------------- + +This facet lets you use stored field values as the sorting/grouping key for +documents. This is usually slower than using an indexed field, but when using +``allow_overlap`` it can actually be faster for large indexes just because it +avoids the overhead of reading posting lists. + +:class:`~whoosh.sorting.StoredFieldFacet` supports ``allow_overlap`` by +splitting the stored value into separate keys. By default it calls the value's +``split()`` method (since most stored values are strings), but you can supply +a custom split function. See the section on ``allow_overlap`` below. + + +MultiFacet +========== + +This facet type returns a composite of the keys returned by two or more +sub-facets, allowing you to sort/group by the intersected values of multiple +facets. + +``MultiFacet`` has methods for adding facets:: + + myfacet = sorting.RangeFacet(0, 1000, 10) + + mf = sorting.MultiFacet() + mf.add_field("category") + mf.add_field("price", reverse=True) + mf.add_facet(myfacet) + mf.add_score() + +You can also pass a list of field names and/or ``FacetType`` objects to the +initializer:: + + prices = sorting.FieldFacet("price", reverse=True) + scores = sorting.ScoreFacet() + mf = sorting.MultiFacet("category", prices, myfacet, scores) + + +Missing values +============== + +* When sorting, documents without any terms in a given field, or whatever else + constitutes "missing" for different facet types, will always sort to the end. + +* When grouping, "missing" documents will appear in a group with the + key ``None``. + + +Using overlapping groups +======================== + +The common supported workflow for grouping and sorting is where the given field +has *one value for document*, for example a ``path`` field containing the file +path of the original document. By default, facets are set up to support this +single-value approach. + +Of course, there are situations where you want documents to be sorted into +multiple groups based on a field with multiple terms per document. The most +common example would be a ``tags`` field. The ``allow_overlap`` keyword +argument to the :class:`~whoosh.sorting.FieldFacet`, +:class:`~whoosh.sorting.QueryFacet`, and +:class:`~whoosh.sorting.StoredFieldFacet` allows this multi-value approach. 
+ +However, there is an important caveat: using ``allow_overlap=True`` is slower +than the default, potentially *much* slower for very large result sets. This is +because Whoosh must read every posting of every term in the field to +create a temporary "forward index" mapping documents to terms. + +If a field is indexed with *term vectors*, ``FieldFacet`` will use them to +speed up ``allow_overlap`` faceting for small result sets, but for large result +sets, where Whoosh has to open the vector list for every matched document, this +can still be very slow. + +For very large indexes and result sets, if a field is stored, you can get +faster overlapped faceting using :class:`~whoosh.sorting.StoredFieldFacet` +instead of ``FieldFacet``. While reading stored values is usually slower than +using the index, in this case avoiding the overhead of opening large numbers of +posting readers can make it worthwhile. + +``StoredFieldFacet`` supports ``allow_overlap`` by loading the stored value for +the given field and splitting it into multiple values. The default is to call +the value's ``split()`` method. + +For example, if you've stored the ``tags`` field as a string like +``"tag1 tag2 tag3"``:: + + schema = fields.Schema(name=fields.TEXT(stored=True), + tags=fields.KEYWORD(stored=True)) + ix = index.create_in("indexdir", schema) + with ix.writer() as w: + w.add_document(name="A Midsummer Night's Dream", tags="comedy fairies") + w.add_document(name="Hamlet", tags="tragedy denmark") + # etc. + +...Then you can use a ``StoredFieldFacet`` like this:: + + ix = index.open_dir("indexdir") + with ix.searcher() as s: + sff = sorting.StoredFieldFacet("tags", allow_overlap=True) + results = s.search(myquery, groupedby={"tags": sff}) + +For stored Python objects other than strings, you can supply a split function +(using the ``split_fn`` keyword argument to ``StoredFieldFacet``). The function +should accept a single argument (the stored value) and return a list or tuple +of grouping keys. + + +Using a custom sort order +========================= + +It is sometimes useful to have a custom sort order per search. For example, +different languages use different sort orders. If you have a function to return +the sorting order you want for a given field value, such as an implementation of +the Unicode Collation Algorithm (UCA), you can customize the sort order +for the user's language. + +The :class:`whoosh.sorting.TranslateFacet` lets you apply a function to the +value of another facet. This lets you "translate" a field value into an +arbitrary sort key, such as with UCA:: + + from pyuca import Collator + + # The Collator object has a sort_key() method which takes a unicode + # string and returns a sort key + c = Collator("allkeys.txt") + + # Make a facet object for the field you want to sort on + nf = sorting.FieldFacet("name") + + # Wrap the facet in a TranslateFacet with the translation function + # (the Collator object's sort_key method) + tf = sorting.TranslateFacet(c.sort_key, nf) + + # Use the facet to sort the search results + results = searcher.search(myquery, sortedby=tf) + +(You can pass multiple "wrapped" facets to the ``TranslateFacet``, and it will +call the function with the values of the facets as multiple arguments.)
+ +The ``TranslateFacet`` can also be very useful with numeric fields to sort on the +output of some formula:: + + # Sort based on the average of two numeric fields + def average(a, b): + return (a + b) / 2.0 + + # Create two facets for the fields and pass them with the function to + # TranslateFacet + af = sorting.FieldFacet("age") + wf = sorting.FieldFacet("weight") + facet = sorting.TranslateFacet(average, af, wf) + + results = searcher.search(myquery, sortedby=facet) + +Remember that you can still sort by multiple facets. For example, you could sort +by a numeric value transformed by a quantizing function first, and then, if that +is equal, sort by the value of another field:: + + # Sort by a quantized size first, then by name + tf = sorting.TranslateFacet(quantize, sorting.FieldFacet("size")) + results = searcher.search(myquery, sortedby=[tf, "name"]) + + +Expert: writing your own facet +============================== + +TBD. + + diff --git a/docs/source/fieldcaches.rst b/docs/source/fieldcaches.rst new file mode 100644 index 0000000..49091dc --- /dev/null +++ b/docs/source/fieldcaches.rst @@ -0,0 +1,52 @@ +============ +Field caches +============ + +The default (``filedb``) backend uses *field caches* in certain circumstances. +The field cache basically pre-computes the order of documents in the index to +speed up sorting and faceting. + +Generating field caches can take time the first time you sort/facet on a large +index. The field cache is kept in memory (and by default written to disk when it +is generated) so subsequent sorted/faceted searches should be faster. + +The default caching policy never expires field caches, so reused searchers and/or +sorting a lot of different fields could use up quite a bit of memory with large +indexes. + + +Customizing cache behaviour +=========================== + +(The following API examples refer to the default ``filedb`` backend.) + +*By default*, Whoosh saves field caches to disk. To prevent a reader or searcher +from writing out field caches, do this before you start using it:: + + searcher.set_caching_policy(save=False) + +By default, if caches are written to disk they are saved in the index directory. +To tell a reader or searcher to save cache files to a different location, create +a storage object and pass it to the ``storage`` keyword argument:: + + from whoosh.filedb.filestore import FileStorage + + mystorage = FileStorage("path/to/cachedir") + reader.set_caching_policy(storage=mystorage) + + +Creating a custom caching policy +================================ + +Expert users who want to implement a custom caching policy (for example, to add +cache expiration) should subclass :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`. +Then you can pass an instance of your policy object to the ``set_caching_policy`` +method:: + + searcher.set_caching_policy(MyPolicy()) + + + + + + diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst new file mode 100644 index 0000000..e9dd52d --- /dev/null +++ b/docs/source/glossary.rst @@ -0,0 +1,65 @@ +.. _glossary: + +======== +Glossary +======== + +.. glossary:: + + Analysis + The process of breaking the text of a field into individual *terms* + to be indexed. This consists of tokenizing the text into terms, and then optionally + filtering the tokenized terms (for example, lowercasing and removing *stop words*). + Whoosh includes several different analyzers. + + Corpus + The set of documents you are indexing. + + Documents + The individual pieces of content you want to make searchable.
+ The word "documents" might imply files, but the data source could really be + anything -- articles in a content management system, blog posts in a blogging + system, chunks of a very large file, rows returned from an SQL query, individual + email messages from a mailbox file, or whatever. When you get search results + from Whoosh, the results are a list of documents, whatever "documents" means in + your search engine. + + Fields + Each document contains a set of fields. Typical fields might be "title", "content", + "url", "keywords", "status", "date", etc. Fields can be indexed (so they're + searchable) and/or stored with the document. Storing the field makes it available + in search results. For example, you typically want to store the "title" field so + your search results can display it. + + Forward index + A table listing every document and the words that appear in the document. + Whoosh lets you store *term vectors* that are a kind of forward index. + + Indexing + The process of examining documents in the corpus and adding them to the + *reverse index*. + + Postings + The *reverse index* lists every word in the corpus, and for each word, a list + of documents in which that word appears, along with some optional information + (such as the number of times the word appears in that document). These items + in the list, containing a document number and any extra information, are + called *postings*. In Whoosh the information stored in postings is customizable + for each *field*. + + Reverse index + Basically a table listing every word in the corpus, and for each word, the + list of documents in which it appears. It can be more complicated (the index can + also list how many times the word appears in each document, the positions at which + it appears, etc.) but that's how it basically works. + + Schema + Whoosh requires that you specify the *fields* of the index before you begin + indexing. The Schema associates field names with metadata about the field, such + as the format of the *postings* and whether the contents of the field are stored + in the index. + + Term vector + A *forward index* for a certain field in a certain document. You can specify + in the Schema that a given field should store term vectors. + diff --git a/docs/source/highlight.rst b/docs/source/highlight.rst new file mode 100644 index 0000000..5244c5f --- /dev/null +++ b/docs/source/highlight.rst @@ -0,0 +1,419 @@ +================================================ +How to create highlighted search result excerpts +================================================ + +Overview +======== + +The highlighting system works as a pipeline, with four component types. + +* **Fragmenters** chop up the original text into __fragments__, based on the + locations of matched terms in the text. + +* **Scorers** assign a score to each fragment, allowing the system to rank the + best fragments by whatever criterion. + +* **Order functions** control in what order the top-scoring fragments are + presented to the user. For example, you can show the fragments in the order + they appear in the document (FIRST) or show higher-scoring fragments first + (SCORE) + +* **Formatters** turn the fragment objects into human-readable output, such as + an HTML string. + + +Requirements +============ + +Highlighting requires that you have the text of the indexed document available. +You can keep the text in a stored field, or if the original text is available +in a file, database column, etc, just reload it on the fly. 
Note that you might +need to process the text to remove e.g. HTML tags, wiki markup, etc. + + +How to +====== + +Get search results:: + + results = mysearcher.search(myquery) + for hit in results: + print(hit["title"]) + +You can use the :meth:`~whoosh.searching.Hit.highlights` method on the +:class:`whoosh.searching.Hit` object to get highlighted snippets from the +document containing the search terms. + +The first argument is the name of the field to highlight. If the field is +stored, this is the only argument you need to supply:: + + results = mysearcher.search(myquery) + for hit in results: + print(hit["title"]) + # Assume "content" field is stored + print(hit.highlights("content")) + +If the field is not stored, you need to retrieve the text of the field some +other way. For example, reading it from the original file or a database. Then +you can supply the text to highlight with the ``text`` argument:: + + results = mysearcher.search(myquery) + for hit in results: + print(hit["title"]) + + # Assume the "path" stored field contains a path to the original file + with open(hit["path"]) as fileobj: + filecontents = fileobj.read() + + print(hit.highlights("content", text=filecontents)) + + +The character limit +=================== + +By default, Whoosh only pulls fragments from the first 32K characters of the +text. This prevents very long texts from bogging down the highlighting process +too much, and is usually justified since important/summary information is +usually at the start of a document. However, if you find the highlights are +missing information (for example, very long encyclopedia articles where the +terms appear in a later section), you can increase the fragmenter's character +limit. + +You can change the character limit on the results object like this:: + + results = mysearcher.search(myquery) + results.fragmenter.charlimit = 100000 + +To turn off the character limit:: + + results.fragmenter.charlimit = None + +If you instantiate a custom fragmenter, you can set the character limit on it +directly:: + + sf = highlight.SentenceFragmenter(charlimit=100000) + results.fragmenter = sf + +See below for information on customizing the highlights. + +If you increase or disable the character limit to highlight long documents, you +may need to use the tips in the "speeding up highlighting" section below to +make highlighting faster. + + +Customizing the highlights +========================== + +Number of fragments +------------------- + +You can use the ``top`` keyword argument to control the number of fragments +returned in each snippet:: + + # Show a maximum of 5 fragments from the document + print hit.highlights("content", top=5) + + +Fragment size +------------- + +The default fragmenter has a ``maxchars`` attribute (default 200) controlling +the maximum length of a fragment, and a ``surround`` attribute (default 20) +controlling the maximum number of characters of context to add at the beginning +and end of a fragment:: + + # Allow larger fragments + results.fragmenter.maxchars = 300 + + # Show more context before and after + results.fragmenter.surround = 50 + + +Fragmenter +---------- + +A fragmenter controls how to extract excerpts from the original text. + +The ``highlight`` module has the following pre-made fragmenters: + +:class:`whoosh.highlight.ContextFragmenter` (the default) + This is a "smart" fragmenter that finds matched terms and then pulls + in surround text to form fragments. This fragmenter only yields + fragments that contain matched terms. 
+ +:class:`whoosh.highlight.SentenceFragmenter` + Tries to break the text into fragments based on sentence punctuation + (".", "!", and "?"). This object works by looking in the original + text for a sentence end as the next character after each token's + 'endchar'. Can be fooled by e.g. source code, decimals, etc. + +:class:`whoosh.highlight.WholeFragmenter` + Returns the entire text as one "fragment". This can be useful if you + are highlighting a short bit of text and don't need to fragment it. + +The different fragmenters have different options. For example, the default +:class:`~whoosh.highlight.ContextFragmenter` lets you set the maximum +fragment size and the size of the context to add on either side:: + + my_cf = highlight.ContextFragmenter(maxchars=100, surround=30) + +See the :mod:`whoosh.highlight` docs for more information. + +To use a different fragmenter:: + + results.fragmenter = my_cf + + +Scorer +------ + +A scorer is a callable that takes a :class:`whoosh.highlight.Fragment` object and +returns a sortable value (where higher values represent better fragments). +The default scorer adds up the number of matched terms in the fragment, and +adds a "bonus" for the number of __different__ matched terms. The highlighting +system uses this score to select the best fragments to show to the user. + +As an example of a custom scorer, to rank fragments by lowest standard +deviation of the positions of matched terms in the fragment:: + + def StandardDeviationScorer(fragment): + """Gives higher scores to fragments where the matched terms are close + together. + """ + + # Since lower values are better in this case, we need to negate the + # value + return 0 - stddev([t.pos for t in fragment.matched]) + +To use a different scorer:: + + results.scorer = StandardDeviationScorer + + +Order +----- + +The order is a function that takes a fragment and returns a sortable value used +to sort the highest-scoring fragments before presenting them to the user (where +fragments with lower values appear before fragments with higher values). + +The ``highlight`` module has the following order functions. + +``FIRST`` (the default) + Show fragments in the order they appear in the document. + +``SCORE`` + Show highest scoring fragments first. + +The ``highlight`` module also includes ``LONGER`` (longer fragments first) and +``SHORTER`` (shorter fragments first), but they probably aren't as generally +useful. + +To use a different order:: + + results.order = highlight.SCORE + + +Formatter +--------- + +A formatter contols how the highest scoring fragments are turned into a +formatted bit of text for display to the user. It can return anything +(e.g. plain text, HTML, a Genshi event stream, a SAX event generator, +or anything else useful to the calling system). + +The ``highlight`` module contains the following pre-made formatters. + +:class:`whoosh.highlight.HtmlFormatter` + Outputs a string containing HTML tags (with a class attribute) + around the matched terms. + +:class:`whoosh.highlight.UppercaseFormatter` + Converts the matched terms to UPPERCASE. + +:class:`whoosh.highlight.GenshiFormatter` + Outputs a Genshi event stream, with the matched terms wrapped in a + configurable element. + +The easiest way to create a custom formatter is to subclass +``highlight.Formatter`` and override the ``format_token`` method:: + + class BracketFormatter(highlight.Formatter): + """Puts square brackets around the matched terms. 
+ """ + + def format_token(self, text, token, replace=False): + # Use the get_text function to get the text corresponding to the + # token + tokentext = highlight.get_text(text, token) + + # Return the text as you want it to appear in the highlighted + # string + return "[%s]" % tokentext + +To use a different formatter:: + + brf = BracketFormatter() + results.formatter = brf + +If you need more control over the formatting (or want to output something other +than strings), you will need to override other methods. See the documentation +for the :class:`whoosh.highlight.Formatter` class. + + +Highlighter object +================== + +Rather than setting attributes on the results object, you can create a +reusable :class:`whoosh.highlight.Highlighter` object. Keyword arguments let +you change the ``fragmenter``, ``scorer``, ``order``, and/or ``formatter``:: + + hi = highlight.Highlighter(fragmenter=my_cf, scorer=sds) + +You can then use the :meth:`whoosh.highlight.Highlighter.highlight_hit` method +to get highlights for a ``Hit`` object:: + + for hit in results: + print(hit["title"]) + print(hi.highlight_hit(hit)) + +(When you assign to a ``Results`` object's ``fragmenter``, ``scorer``, ``order``, +or ``formatter`` attributes, you're actually changing the values on the +results object's default ``Highlighter`` object.) + + +Speeding up highlighting +======================== + +Recording which terms matched in which documents during the search may make +highlighting faster, since it will skip documents it knows don't contain any +matching terms in the given field:: + + # Record per-document term matches + results = searcher.search(myquery, terms=True) + + +PinpointFragmenter +------------------ + +Usually the highlighting system uses the field's analyzer to re-tokenize the +document's text to find the matching terms in context. If you have long +documents and have increased/disabled the character limit, and/or if the field +has a very complex analyzer, re-tokenizing may be slow. + +Instead of retokenizing, Whoosh can look up the character positions of the +matched terms in the index. Looking up the character positions is not +instantaneous, but is usually faster than analyzing large amounts of text. + +To use :class:`whoosh.highlight.PinpointFragmenter` and avoid re-tokenizing the +document text, you must do all of the following: + +Index the field with character information (this will require re-indexing an +existing index):: + + # Index the start and end chars of each term + schema = fields.Schema(content=fields.TEXT(stored=True, chars=True)) + +Record per-document term matches in the results:: + + # Record per-document term matches + results = searcher.search(myquery, terms=True) + +Set a :class:`whoosh.highlight.PinpointFragmenter` as the fragmenter:: + + results.fragmenter = highlight.PinpointFragmenter() + + +PinpointFragmenter limitations +------------------------------ + +When the highlighting system does not re-tokenize the text, it doesn't know +where any other words are in the text except the matched terms it looked up in +the index. Therefore when the fragmenter adds surrounding context, it just adds +or a certain number of characters blindly, and so doesn't distinguish between +content and whitespace, or break on word boundaries, for example:: + + >>> hit.highlights("content") + 're when the fragmenter\n ad' + +(This can be embarassing when the word fragments form dirty words!) 
+ +One way to avoid this is to not show any surrounding context, but then +fragments containing one matched term will contain ONLY that matched term:: + + >>> hit.highlights("content") + 'fragmenter' + +Alternatively, you can normalize whitespace in the text before passing it to +the highlighting system:: + + >>> text = searcher.stored_ + >>> re.sub("[\t\r\n ]+", " ", text) + >>> hit.highlights("content", text=text) + +...and use the ``autotrim`` option of ``PinpointFragmenter`` to automatically +strip text before the first space and after the last space in the fragments:: + + >>> results.fragmenter = highlight.PinpointFragmenter(autotrim=True) + >>> hit.highlights("content") + 'when the fragmenter' + + +Using the low-level API +======================= + +Usage +----- + +The following function lets you retokenize and highlight a piece of text using +an analyzer:: + + from whoosh.highlight import highlight + + excerpts = highlight(text, terms, analyzer, fragmenter, formatter, top=3, + scorer=BasicFragmentScorer, minscore=1, order=FIRST) + +``text`` + The original text of the document. + +``terms`` + A sequence or set containing the query words to match, e.g. ("render", + "shader"). + +``analyzer`` + The analyzer to use to break the document text into tokens for matching + against the query terms. This is usually the analyzer for the field the + query terms are in. + +``fragmenter`` + A :class:`whoosh.highlight.Fragmenter` object, see below. + +``formatter`` + A :class:`whoosh.highlight.Formatter` object, see below. + +``top`` + The number of fragments to include in the output. + +``scorer`` + A :class:`whoosh.highlight.FragmentScorer` object. The only scorer currently + included with Whoosh is :class:`~whoosh.highlight.BasicFragmentScorer`, the + default. + +``minscore`` + The minimum score a fragment must have to be considered for inclusion. + +``order`` + An ordering function that determines the order of the "top" fragments in the + output text. + + + + + + + + + + + + diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..213f8cc --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,50 @@ +============================== +Whoosh |release| documentation +============================== + +Whoosh was created by `Matt Chaput `_. +You can view outstanding issues on the +`Whoosh Bitbucket page `_ +and get help on the `Whoosh mailing list `_. + + +Contents +======== + +.. 
toctree:: + :maxdepth: 2 + + releases/index + quickstart + intro + glossary + schema + indexing + searching + parsing + querylang + dates + query + analysis + stemming + ngrams + facets + highlight + keywords + spelling + fieldcaches + batch + threads + nested + recipes + api/api + tech/index + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/docs/source/indexing.rst b/docs/source/indexing.rst new file mode 100644 index 0000000..e8278df --- /dev/null +++ b/docs/source/indexing.rst @@ -0,0 +1,440 @@ +====================== +How to index documents +====================== + +Creating an Index object +======================== + +To create an index in a directory, use ``index.create_in``:: + + import os, os.path + from whoosh import index + + if not os.path.exists("indexdir"): + os.mkdir("indexdir") + + ix = index.create_in("indexdir", schema) + +To open an existing index in a directory, use ``index.open_dir``:: + + import whoosh.index as index + + ix = index.open_dir("indexdir") + +These are convenience methods for:: + + from whoosh.filedb.filestore import FileStorage + storage = FileStorage("indexdir") + + # Create an index + ix = storage.create_index(schema) + + # Open an existing index + storage.open_index() + +The schema you created the index with is pickled and stored with the index. + +You can keep multiple indexes in the same directory using the indexname keyword +argument:: + + # Using the convenience functions + ix = index.create_in("indexdir", schema=schema, indexname="usages") + ix = index.open_dir("indexdir", indexname="usages") + + # Using the Storage object + ix = storage.create_index(schema, indexname="usages") + ix = storage.open_index(indexname="usages") + + +Clearing the index +================== + +Calling ``index.create_in`` on a directory with an existing index will clear the +current contents of the index. + +To test whether a directory currently contains a valid index, use +``index.exists_in``:: + + exists = index.exists_in("indexdir") + usages_exists = index.exists_in("indexdir", indexname="usages") + +(Alternatively you can simply delete the index's files from the directory, e.g. +if you only have one index in the directory, use ``shutil.rmtree`` to remove the +directory and then recreate it.) + + +Indexing documents +================== + +Once you've created an ``Index`` object, you can add documents to the index with an +``IndexWriter`` object. The easiest way to get the ``IndexWriter`` is to call +``Index.writer()``:: + + ix = index.open_dir("index") + writer = ix.writer() + +Creating a writer locks the index for writing, so only one thread/process at +a time can have a writer open. + +.. note:: + + Because opening a writer locks the index for writing, in a multi-threaded + or multi-process environment your code needs to be aware that opening a + writer may raise an exception (``whoosh.store.LockError``) if a writer is + already open. Whoosh includes a couple of example implementations + (:class:`whoosh.writing.AsyncWriter` and + :class:`whoosh.writing.BufferedWriter`) of ways to work around the write + lock. + +.. note:: + + While the writer is open and during the commit, the index is still + available for reading. Existing readers are unaffected and new readers can + open the current index normally. Once the commit is finished, existing + readers continue to see the previous version of the index (that is, they + do not automatically see the newly committed changes). 
New readers will see + the updated index. + +The IndexWriter's ``add_document(**kwargs)`` method accepts keyword arguments +where the field name is mapped to a value:: + + writer = ix.writer() + writer.add_document(title=u"My document", content=u"This is my document!", + path=u"/a", tags=u"first short", icon=u"/icons/star.png") + writer.add_document(title=u"Second try", content=u"This is the second example.", + path=u"/b", tags=u"second short", icon=u"/icons/sheep.png") + writer.add_document(title=u"Third time's the charm", content=u"Examples are many.", + path=u"/c", tags=u"short", icon=u"/icons/book.png") + writer.commit() + +You don't have to fill in a value for every field. Whoosh doesn't care if you +leave out a field from a document. + +Indexed fields must be passed a unicode value. Fields that are stored but not +indexed (i.e. the ``STORED`` field type) can be passed any pickle-able object. + +Whoosh will happily allow you to add documents with identical values, which can +be useful or annoying depending on what you're using the library for:: + + writer.add_document(path=u"/a", title=u"A", content=u"Hello there") + writer.add_document(path=u"/a", title=u"A", content=u"Deja vu!") + +This adds two documents to the index with identical path and title fields. See +"updating documents" below for information on the ``update_document`` method, which +uses "unique" fields to replace old documents instead of appending. + + +Indexing and storing different values for the same field +-------------------------------------------------------- + +If you have a field that is both indexed and stored, you can index a unicode +value but store a different object if necessary (it's usually not, but sometimes +this is really useful) using a "special" keyword argument ``_stored_``. +The normal value will be analyzed and indexed, but the "stored" value will show +up in the results:: + + writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title") + + +Finishing adding documents +-------------------------- + +An ``IndexWriter`` object is kind of like a database transaction. You specify a +bunch of changes to the index, and then "commit" them all at once. + +Calling ``commit()`` on the ``IndexWriter`` saves the added documents to the +index:: + + writer.commit() + +Once your documents are in the index, you can search for them. + +If you want to close the writer without committing the changes, call +``cancel()`` instead of ``commit()``:: + + writer.cancel() + +Keep in mind that while you have a writer open (including a writer you opened +and is still in scope), no other thread or process can get a writer or modify +the index. A writer also keeps several open files. So you should always remember +to call either ``commit()`` or ``cancel()`` when you're done with a writer object. + + +Merging segments +================ + +A Whoosh ``filedb`` index is really a container for one or more "sub-indexes" +called segments. When you add documents to an index, instead of integrating the +new documents with the existing documents (which could potentially be very +expensive, since it involves resorting all the indexed terms on disk), Whoosh +creates a new segment next to the existing segment. Then when you search the +index, Whoosh searches both segments individually and merges the results so the +segments appear to be one unified index. (This smart design is copied from +Lucene.) + +So, having a few segments is more efficient than rewriting the entire index +every time you add some documents. 
But searching multiple segments does slow +down searching somewhat, and the more segments you have, the slower it gets. So +Whoosh has an algorithm that runs when you call ``commit()`` that looks for small +segments it can merge together to make fewer, bigger segments. + +To prevent Whoosh from merging segments during a commit, use the ``merge`` +keyword argument:: + + writer.commit(merge=False) + +To merge all segments together, optimizing the index into a single segment, +use the ``optimize`` keyword argument:: + + writer.commit(optimize=True) + +Since optimizing rewrites all the information in the index, it can be slow on +a large index. It's generally better to rely on Whoosh's merging algorithm than +to optimize all the time. + +(The ``Index`` object also has an ``optimize()`` method that lets you optimize the +index (merge all the segments together). It simply creates a writer and calls +``commit(optimize=True)`` on it.) + +For more control over segment merging, you can write your own merge policy +function and use it as an argument to the ``commit()`` method. See the +implementation of the ``NO_MERGE``, ``MERGE_SMALL``, and ``OPTIMIZE`` functions +in the ``whoosh.writing`` module. + + +Deleting documents +================== + +You can delete documents using the following methods on an ``IndexWriter`` +object. You then need to call ``commit()`` on the writer to save the deletions +to disk. + +``delete_document(docnum)`` + + Low-level method to delete a document by its internal document number. + +``is_deleted(docnum)`` + + Low-level method, returns ``True`` if the document with the given internal + number is deleted. + +``delete_by_term(fieldname, termtext)`` + + Deletes any documents where the given (indexed) field contains the given + term. This is mostly useful for ``ID`` or ``KEYWORD`` fields. + +``delete_by_query(query)`` + + Deletes any documents that match the given query. + +:: + + # Delete document by its path -- this field must be indexed + ix.delete_by_term('path', u'/a/b/c') + # Save the deletion to disk + ix.commit() + +In the ``filedb`` backend, "deleting" a document simply adds the document number +to a list of deleted documents stored with the index. When you search the index, +it knows not to return deleted documents in the results. However, the document's +contents are still stored in the index, and certain statistics (such as term +document frequencies) are not updated, until you merge the segments containing +deleted documents (see merging above). (This is because removing the information +immediately from the index would essentially involving rewriting the entire +index on disk, which would be very inefficient.) + + +Updating documents +================== + +If you want to "replace" (re-index) a document, you can delete the old document +using one of the ``delete_*`` methods on ``Index`` or ``IndexWriter``, then use +``IndexWriter.add_document`` to add the new version. Or, you can use +``IndexWriter.update_document`` to do this in one step. + +For ``update_document`` to work, you must have marked at least one of the fields +in the schema as "unique". 
Whoosh will then use the contents of the "unique"
+field(s) to search for documents to delete::
+
+    from whoosh import index
+    from whoosh.fields import Schema, ID, TEXT
+
+    schema = Schema(path=ID(unique=True), content=TEXT)
+
+    ix = index.create_in("index", schema)
+    writer = ix.writer()
+    writer.add_document(path=u"/a", content=u"The first document")
+    writer.add_document(path=u"/b", content=u"The second document")
+    writer.commit()
+
+    writer = ix.writer()
+    # Because "path" is marked as unique, calling update_document with path="/a"
+    # will delete any existing documents where the "path" field contains "/a".
+    writer.update_document(path=u"/a", content=u"Replacement for the first document")
+    writer.commit()
+
+The "unique" field(s) must be indexed.
+
+If no existing document matches the unique fields of the document you're
+updating, ``update_document`` acts just like ``add_document``.
+
+"Unique" fields and ``update_document`` are simply convenient shortcuts for deleting
+and adding. Whoosh has no inherent concept of a unique identifier, and in no way
+enforces uniqueness when you use ``add_document``.
+
+
+Incremental indexing
+====================
+
+When you're indexing a collection of documents, you'll often want two code
+paths: one to index all the documents from scratch, and one to only update the
+documents that have changed (leaving aside web applications where you need to
+add/update documents according to user actions).
+
+Indexing everything from scratch is pretty easy. Here's a simple example::
+
+    import os.path
+    from whoosh import index
+    from whoosh.fields import Schema, ID, TEXT
+
+    def clean_index(dirname):
+        # Always create the index from scratch
+        ix = index.create_in(dirname, schema=get_schema())
+        writer = ix.writer()
+
+        # Assume we have a function that gathers the filenames of the
+        # documents to be indexed
+        for path in my_docs():
+            add_doc(writer, path)
+
+        writer.commit()
+
+
+    def get_schema():
+        return Schema(path=ID(unique=True, stored=True), content=TEXT)
+
+
+    def add_doc(writer, path):
+        fileobj = open(path, "rb")
+        content = fileobj.read()
+        fileobj.close()
+        writer.add_document(path=path, content=content)
+
+Now, for a small collection of documents, indexing from scratch every time might
+actually be fast enough. But for large collections, you'll want to have the
+script only re-index the documents that have changed.
+
+To start we'll need to store each document's last-modified time, so we can check
+if the file has changed. In this example, we'll just use the mtime for
+simplicity::
+
+    from whoosh.fields import Schema, ID, STORED, TEXT
+
+    def get_schema():
+        return Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT)
+
+    def add_doc(writer, path):
+        fileobj = open(path, "rb")
+        content = fileobj.read()
+        fileobj.close()
+        modtime = os.path.getmtime(path)
+        writer.add_document(path=path, content=content, time=modtime)
+
+Now we can modify the script to allow either "clean" (from scratch) or
+incremental indexing::
+
+    def index_my_docs(dirname, clean=False):
+        if clean:
+            clean_index(dirname)
+        else:
+            incremental_index(dirname)
+
+
+    def incremental_index(dirname):
+        ix = index.open_dir(dirname)
+
+        # The set of all paths in the index
+        indexed_paths = set()
+        # The set of all paths we need to re-index
+        to_index = set()
+
+        with ix.searcher() as searcher:
+            writer = ix.writer()
+
+            # Loop over the stored fields in the index
+            for fields in searcher.all_stored_fields():
+                indexed_path = fields['path']
+                indexed_paths.add(indexed_path)
+
+                if not os.path.exists(indexed_path):
+                    # This file was deleted since it was indexed
+                    writer.delete_by_term('path', indexed_path)
+
+                else:
+                    # Check if this file was changed since it
+                    # was indexed
+                    indexed_time = fields['time']
+                    mtime = os.path.getmtime(indexed_path)
+                    if mtime > indexed_time:
+                        # The file has changed, delete it and add it to the list of
+                        # files to reindex
+                        writer.delete_by_term('path', indexed_path)
+                        to_index.add(indexed_path)
+
+            # Loop over the files in the filesystem
+            # Assume we have a function that gathers the filenames of the
+            # documents to be indexed
+            for path in my_docs():
+                if path in to_index or path not in indexed_paths:
+                    # This is either a file that's changed, or a new file
+                    # that wasn't indexed before. So index it!
+                    add_doc(writer, path)
+
+            writer.commit()
+
+The ``incremental_index`` function:
+
+* Loops through all the paths that are currently indexed.
+
+  * If any of the files no longer exist, delete the corresponding document from
+    the index.
+
+  * If the file still exists, but has been modified, add it to the list of paths
+    to be re-indexed.
+
+  * If the file exists, whether it's been modified or not, add it to the list of
+    all indexed paths.
+
+* Loops through all the paths of the files on disk.
+
+  * If a path is not in the set of all indexed paths, the file is new and we
+    need to index it.
+
+  * If a path is in the set of paths to re-index, we need to index it.
+
+  * Otherwise, we can skip indexing the file.
+
+
+Clearing the index
+==================
+
+In some cases you may want to re-index from scratch. To clear the index without
+disrupting any existing readers::
+
+    from whoosh import writing
+
+    with myindex.writer() as mywriter:
+        # You can optionally add documents to the writer here
+        # e.g. mywriter.add_document(...)
+
+        # Using mergetype=CLEAR clears all existing segments so the index will
+        # only have any documents you've added to this writer
+        mywriter.mergetype = writing.CLEAR
+
+Or, if you don't use the writer as a context manager and call ``commit()``
+directly, do it like this::
+
+    mywriter = myindex.writer()
+    # ...
+    mywriter.commit(mergetype=writing.CLEAR)
+
+.. note::
+    If you don't need to worry about existing readers, a more efficient method
+    is to simply delete the contents of the index directory and start over.
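+
+A minimal sketch of that approach, re-using the ``get_schema()`` helper from
+the incremental-indexing example above (the ``indexdir`` directory name is
+just an example)::
+
+    import os, shutil
+    from whoosh import index
+
+    if os.path.exists("indexdir"):
+        # Throw away the old index files entirely
+        shutil.rmtree("indexdir")
+    os.mkdir("indexdir")
+    ix = index.create_in("indexdir", schema=get_schema())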
diff --git a/docs/source/intro.rst b/docs/source/intro.rst new file mode 100644 index 0000000..95c70d0 --- /dev/null +++ b/docs/source/intro.rst @@ -0,0 +1,60 @@ +====================== +Introduction to Whoosh +====================== + +About Whoosh +------------ + +Whoosh was created by `Matt Chaput `_. It started as a quick and dirty +search server for the online documentation of the `Houdini `_ +3D animation software package. Side Effects Software generously allowed Matt to open source +the code in case it might be useful to anyone else who needs a very flexible or pure-Python +search engine (or both!). + +* Whoosh is fast, but uses only pure Python, so it will run anywhere Python runs, + without requiring a compiler. + +* By default, Whoosh uses the `Okapi BM25F `_ ranking + function, but like most things the ranking function can be easily customized. + +* Whoosh creates fairly small indexes compared to many other search libraries. + +* All indexed text in Whoosh must be *unicode*. + +* Whoosh lets you store arbitrary Python objects with indexed documents. + + +What is Whoosh? +--------------- + +Whoosh is a fast, pure Python search engine library. + +The primary design impetus of Whoosh is that it is pure Python. You should be able to +use Whoosh anywhere you can use Python, no compiler or Java required. + +Like one of its ancestors, Lucene, Whoosh is not really a search engine, it's a programmer +library for creating a search engine [1]_. + +Practically no important behavior of Whoosh is hard-coded. Indexing +of text, the level of information stored for each term in each field, parsing of search queries, +the types of queries allowed, scoring algorithms, etc. are all customizable, replaceable, and +extensible. + + +.. [1] It would of course be possible to build a turnkey search engine on top of Whoosh, + like Nutch and Solr use Lucene. + + +What can Whoosh do for you? +--------------------------- + +Whoosh lets you index free-form or structured text and then quickly find matching +documents based on simple or complex search criteria. + + +Getting help with Whoosh +------------------------ + +You can view outstanding issues on the +`Whoosh Bitbucket page `_ +and get help on the `Whoosh mailing list `_. diff --git a/docs/source/keywords.rst b/docs/source/keywords.rst new file mode 100644 index 0000000..82bb6cd --- /dev/null +++ b/docs/source/keywords.rst @@ -0,0 +1,94 @@ +======================================= +Query expansion and Key word extraction +======================================= + +Overview +======== + +Whoosh provides methods for computing the "key terms" of a set of documents. For +these methods, "key terms" basically means terms that are frequent in the given +documents, but relatively infrequent in the indexed collection as a whole. + +Because this is a purely statistical operation, not a natural language +processing or AI function, the quality of the results will vary based on the +content, the size of the document collection, and the number of documents for +which you extract keywords. + +These methods can be useful for providing the following features to users: + +* Search term expansion. You can extract key terms for the top N results from a + query and suggest them to the user as additional/alternate query terms to try. + +* Tag suggestion. Extracting the key terms for a single document may yield + useful suggestions for tagging the document. + +* "More like this". 
You can extract key terms for the top ten or so results from
+  a query (after removing the original query terms), and use those key words as
+  the basis for another query that may find more documents using terms the user
+  didn't think of.
+
+Usage
+=====
+
+* Get more documents like a certain search hit. *This requires that the field
+  you want to match on is vectored or stored, or that you have access to the
+  original text (such as from a database)*.
+
+  Use :meth:`~whoosh.searching.Hit.more_like_this`::
+
+    results = mysearcher.search(myquery)
+    first_hit = results[0]
+    more_results = first_hit.more_like_this("content")
+
+* Extract keywords for the top N documents in a
+  :class:`whoosh.searching.Results` object. *This requires that the field is
+  either vectored or stored*.
+
+  Use the :meth:`~whoosh.searching.Results.key_terms` method of the
+  :class:`whoosh.searching.Results` object to extract keywords from the top N
+  documents of the result set.
+
+  For example, to extract *five* key terms from the ``content`` field of the top
+  *ten* documents of a results object::
+
+    keywords = [keyword for keyword, score
+                in results.key_terms("content", docs=10, numterms=5)]
+
+* Extract keywords for an arbitrary set of documents. *This requires that the
+  field is either vectored or stored*.
+
+  Use the :meth:`~whoosh.searching.Searcher.document_number` or
+  :meth:`~whoosh.searching.Searcher.document_numbers` methods of the
+  :class:`whoosh.searching.Searcher` object to get the document numbers for the
+  document(s) you want to extract keywords from.
+
+  Use the :meth:`~whoosh.searching.Searcher.key_terms` method of a
+  :class:`whoosh.searching.Searcher` to extract the keywords, given the list of
+  document numbers.
+
+  For example, let's say you have an index of emails. To extract key terms from
+  the ``body`` field of emails whose ``emailto`` field contains
+  ``matt@whoosh.ca``::
+
+    with email_index.searcher() as s:
+        docnums = s.document_numbers(emailto=u"matt@whoosh.ca")
+        keywords = [keyword for keyword, score
+                    in s.key_terms(docnums, "body")]
+
+* Extract keywords from arbitrary text not in the index.
+
+  Use the :meth:`~whoosh.searching.Searcher.key_terms_from_text` method of a
+  :class:`whoosh.searching.Searcher` to extract the keywords, given the text::
+
+    with email_index.searcher() as s:
+        keywords = [keyword for keyword, score
+                    in s.key_terms_from_text("body", mytext)]
+
+
+Expansion models
+================
+
+The ``ExpansionModel`` subclasses in the :mod:`whoosh.classify` module implement
+different weighting functions for key words. These models are translated into
+Python from original Java implementations in Terrier.
+
diff --git a/docs/source/nested.rst b/docs/source/nested.rst
new file mode 100644
index 0000000..106a350
--- /dev/null
+++ b/docs/source/nested.rst
@@ -0,0 +1,238 @@
+===========================================
+Indexing and searching document hierarchies
+===========================================
+
+Overview
+========
+
+Whoosh's full-text index is essentially a flat database of documents. However,
+Whoosh supports two techniques for simulating the indexing and querying of
+hierarchical documents, that is, sets of documents that form a parent-child
+hierarchy, such as "Chapter - Section - Paragraph" or
+"Module - Class - Method".
+ +You can specify parent-child relationships *at indexing time*, by grouping +documents in the same hierarchy, and then use the +:class:`whoosh.query.NestedParent` and/or :class:`whoosh.query.NestedChildren` +to find parents based on their children or vice-versa. + +Alternatively, you can use *query time joins*, essentially like external key +joins in a database, where you perform one search to find a relevant document, +then use a stored value on that document (for example, a ``parent`` field) to +look up another document. + +Both methods have pros and cons. + + +Using nested document indexing +============================== + +Indexing +-------- + +This method works by indexing a "parent" document and all its "child" documents +*as a "group"* so they are guaranteed to end up in the same segment. You can +use the context manager returned by ``IndexWriter.group()`` to group +documents:: + + with ix.writer() as w: + with w.group(): + w.add_document(kind="class", name="Index") + w.add_document(kind="method", name="add document") + w.add_document(kind="method", name="add reader") + w.add_document(kind="method", name="close") + with w.group(): + w.add_document(kind="class", name="Accumulator") + w.add_document(kind="method", name="add") + w.add_document(kind="method", name="get result") + with w.group(): + w.add_document(kind="class", name="Calculator") + w.add_document(kind="method", name="add") + w.add_document(kind="method", name="add all") + w.add_document(kind="method", name="add some") + w.add_document(kind="method", name="multiply") + w.add_document(kind="method", name="close") + with w.group(): + w.add_document(kind="class", name="Deleter") + w.add_document(kind="method", name="add") + w.add_document(kind="method", name="delete") + +Alternatively you can use the ``start_group()`` and ``end_group()`` methods:: + + with ix.writer() as w: + w.start_group() + w.add_document(kind="class", name="Index") + w.add_document(kind="method", name="add document") + w.add_document(kind="method", name="add reader") + w.add_document(kind="method", name="close") + w.end_group() + +Each level of the hierarchy should have a query that distinguishes it from +other levels (for example, in the above index, you can use ``kind:class`` or +``kind:method`` to match different levels of the hierarchy). + +Once you've indexed the hierarchy of documents, you can use two query types to +find parents based on children or vice-versa. + +(There is currently no support in the default query parser for nested queries.) + + +NestedParent query +------------------ + +The :class:`whoosh.query.NestedParent` query type lets you specify a query for +child documents, but have the query return an "ancestor" document from higher +in the hierarchy:: + + # First, we need a query that matches all the documents in the "parent" + # level we want of the hierarchy + all_parents = query.Term("kind", "class") + + # Then, we need a query that matches the children we want to find + wanted_kids = query.Term("name", "close") + + # Now we can make a query that will match documents where "name" is + # "close", but the query will return the "parent" documents of the matching + # children + q = query.NestedParent(all_parents, wanted_kids) + # results = Index, Calculator + +Note that in a hierarchy with more than two levels, you can specify a "parents" +query that matches any level of the hierarchy, so you can return the top-level +ancestors of the matching children, or the second level, third level, etc. 
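+
+Running a ``NestedParent`` query is no different from running any other query.
+A small sketch using the example index above, assuming the ``name`` field is
+stored::
+
+    from whoosh import query
+
+    all_parents = query.Term("kind", "class")
+    wanted_kids = query.Term("name", "close")
+    q = query.NestedParent(all_parents, wanted_kids)
+
+    with ix.searcher() as s:
+        for hit in s.search(q):
+            # Prints the matching parent documents, e.g. "Index" and "Calculator"
+            print(hit["name"])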
+ +The query works by first building a bit vector representing which documents are +"parents":: + + Index + | Calculator + | | + 1000100100000100 + | | + | Deleter + Accumulator + +Then for each match of the "child" query, it calculates the previous parent +from the bit vector and returns it as a match (it only returns each parent once +no matter how many children match). This parent lookup is very efficient:: + + 1000100100000100 + | + |<-+ close + + +NestedChildren query +-------------------- + +The opposite of ``NestedParent`` is :class:`whoosh.query.NestedChildren`. This +query lets you match parents but return their children. This is useful, for +example, to search for an album title and return the songs in the album:: + + # Query that matches all documents in the "parent" level we want to match + # at + all_parents = query.Term("kind", "album") + + # Parent documents we want to match + wanted_parents = query.Term("album_title", "heaven") + + # Now we can make a query that will match parent documents where "album_title" + # contains "heaven", but the query will return the "child" documents of the + # matching parents + q1 = query.NestedChildren(all_parents, wanted_parents) + +You can then combine that query with an ``AND`` clause, for example to find +songs with "hell" in the song title that occur on albums with "heaven" in the +album title:: + + q2 = query.And([q1, query.Term("song_title", "hell")]) + + +Deleting and updating hierarchical documents +-------------------------------------------- + +The drawback of the index-time method is *updating and deleting*. Because the +implementation of the queries depends on the parent and child documents being +contiguous in the segment, you can't update/delete just one child document. +You can only update/delete an entire top-level document at once (for example, +if your hierarchy is "Chapter - Section - Paragraph", you can only update or +delete entire chapters, not a section or paragraph). If the top-level of the +hierarchy represents very large blocks of text, this can involve a lot of +deleting and reindexing. + +Currently ``Writer.update_document()`` does not automatically work with nested +documents. You must manually delete and re-add document groups to update them. + +To delete nested document groups, use the ``Writer.delete_by_query()`` +method with a ``NestedParent`` query:: + + # Delete the "Accumulator" class + all_parents = query.Term("kind", "class") + to_delete = query.Term("name", "Accumulator") + q = query.NestedParent(all_parents, to_delete) + with myindex.writer() as w: + w.delete_by_query(q) + + +Using query-time joins +====================== + +A second technique for simulating hierarchical documents in Whoosh involves +using a stored field on each document to point to its parent, and then using +the value of that field at query time to find parents and children. 
+ +For example, if we index a hierarchy of classes and methods using pointers +to parents instead of nesting:: + + # Store a pointer to the parent on each "method" document + with ix.writer() as w: + w.add_document(kind="class", c_name="Index", docstring="...") + w.add_document(kind="method", m_name="add document", parent="Index") + w.add_document(kind="method", m_name="add reader", parent="Index") + w.add_document(kind="method", m_name="close", parent="Index") + + w.add_document(kind="class", c_name="Accumulator", docstring="...") + w.add_document(kind="method", m_name="add", parent="Accumulator") + w.add_document(kind="method", m_name="get result", parent="Accumulator") + + w.add_document(kind="class", c_name="Calculator", docstring="...") + w.add_document(kind="method", m_name="add", parent="Calculator") + w.add_document(kind="method", m_name="add all", parent="Calculator") + w.add_document(kind="method", m_name="add some", parent="Calculator") + w.add_document(kind="method", m_name="multiply", parent="Calculator") + w.add_document(kind="method", m_name="close", parent="Calculator") + + w.add_document(kind="class", c_name="Deleter", docstring="...") + w.add_document(kind="method", m_name="add", parent="Deleter") + w.add_document(kind="method", m_name="delete", parent="Deleter") + + # Now do manual joins at query time + with ix.searcher() as s: + # Tip: Searcher.document() and Searcher.documents() let you look up + # documents by field values more easily than using Searcher.search() + + # Children to parents: + # Print the docstrings of classes on which "close" methods occur + for child_doc in s.documents(m_name="close"): + # Use the stored value of the "parent" field to look up the parent + # document + parent_doc = s.document(c_name=child_doc["parent"]) + # Print the parent document's stored docstring field + print(parent_doc["docstring"]) + + # Parents to children: + # Find classes with "big" in the docstring and print their methods + q = query.Term("kind", "class") & query.Term("docstring", "big") + for hit in s.search(q, limit=None): + print("Class name=", hit["c_name"], "methods:") + for child_doc in s.documents(parent=hit["c_name"]): + print(" Method name=", child_doc["m_name"]) + +This technique is more flexible than index-time nesting in that you can +delete/update individual documents in the hierarchy piece by piece, although it +doesn't support finding different parent levels as easily. It is also slower +than index-time nesting (potentially much slower), since you must perform +additional searches for each found document. + +Future versions of Whoosh may include "join" queries to make this process more +efficient (or at least more automatic). + diff --git a/docs/source/ngrams.rst b/docs/source/ngrams.rst new file mode 100644 index 0000000..484a271 --- /dev/null +++ b/docs/source/ngrams.rst @@ -0,0 +1,51 @@ +============================== +Indexing and searching N-grams +============================== + +Overview +======== + +N-gram indexing is a powerful method for getting fast, "search as you type" +functionality like iTunes. It is also useful for quick and effective indexing +of languages such as Chinese and Japanese without word breaks. + +N-grams refers to groups of N characters... bigrams are groups of two +characters, trigrams are groups of three characters, and so on. + +Whoosh includes two methods for analyzing N-gram fields: an N-gram tokenizer, +and a filter that breaks tokens into N-grams. 
+ +:class:`whoosh.analysis.NgramTokenizer` tokenizes the entire field into N-grams. +This is more useful for Chinese/Japanese/Korean languages, where it's useful +to index bigrams of characters rather than individual characters. Using this +tokenizer with roman languages leads to spaces in the tokens. + +:: + + >>> ngt = NgramTokenizer(minsize=2, maxsize=4) + >>> [token.text for token in ngt(u"hi there")] + [u'hi', u'hi ', u'hi t',u'i ', u'i t', u'i th', u' t', u' th', u' the', u'th', + u'the', u'ther', u'he', u'her', u'here', u'er', u'ere', u're'] + +:class:`whoosh.analysis.NgramFilter` breaks individual tokens into N-grams as +part of an analysis pipeline. This is more useful for languages with word +separation. + +:: + + >>> my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4) + >>> [token.text for token in my_analyzer(u"rendering shaders")] + [u'ren', u'rend', u'end', u'ende', u'nde', u'nder', u'der', u'deri', u'eri', + u'erin', u'rin', u'ring', u'ing', u'sha', u'shad', u'had', u'hade', u'ade', + u'ader', u'der', u'ders', u'ers'] + +Whoosh includes two pre-configured field types for N-grams: +:class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS`. The only +difference is that ``NGRAM`` runs all text through the N-gram filter, including +whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text +using a tokenizer, then runs each word through the N-gram filter. + +TBD. + + + diff --git a/docs/source/parsing.rst b/docs/source/parsing.rst new file mode 100644 index 0000000..35327c7 --- /dev/null +++ b/docs/source/parsing.rst @@ -0,0 +1,437 @@ +==================== +Parsing user queries +==================== + +Overview +======== + +The job of a query parser is to convert a *query string* submitted by a user +into *query objects* (objects from the :mod:`whoosh.query` module). + +For example, the user query: + +.. code-block:: none + + rendering shading + +might be parsed into query objects like this:: + + And([Term("content", u"rendering"), Term("content", u"shading")]) + +Whoosh includes a powerful, modular parser for user queries in the +:mod:`whoosh.qparser` module. The default parser implements a query language +similar to the one that ships with Lucene. However, by changing plugins or using +functions such as :func:`whoosh.qparser.MultifieldParser`, +:func:`whoosh.qparser.SimpleParser` or :func:`whoosh.qparser.DisMaxParser`, you +can change how the parser works, get a simpler parser or change the query +language syntax. + +(In previous versions of Whoosh, the query parser was based on ``pyparsing``. +The new hand-written parser is less brittle and more flexible.) + +.. note:: + + Remember that you can directly create query objects programmatically using + the objects in the :mod:`whoosh.query` module. If you are not processing + actual user queries, this is preferable to building a query string just to + parse it. + + +Using the default parser +======================== + +To create a :class:`whoosh.qparser.QueryParser` object, pass it the name of the +*default field* to search and the schema of the index you'll be searching. + +:: + + from whoosh.qparser import QueryParser + + parser = QueryParser("content", schema=myindex.schema) + +.. tip:: + + You can instantiate a ``QueryParser`` object without specifying a schema, + however the parser will not process the text of the user query. This is + useful for debugging, when you want to see how QueryParser will build a + query, but don't want to make up a schema just for testing. 
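+
+For example, a quick interactive sketch of this kind of schema-less debugging
+(the query string is arbitrary, and the exact ``repr()`` of the parsed query
+may differ slightly between Whoosh versions)::
+
+    >>> from whoosh.qparser import QueryParser
+    >>> qp = QueryParser("content", schema=None)
+    >>> qp.parse(u"title:hello world")
+    And([Term('title', u'hello'), Term('content', u'world')])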
+ +Once you have a ``QueryParser`` object, you can call ``parse()`` on it to parse a +query string into a query object:: + + >>> parser.parse(u"alpha OR beta gamma") + And([Or([Term('content', u'alpha'), Term('content', u'beta')]), Term('content', u'gamma')]) + +See the :doc:`query language reference ` for the features and syntax +of the default parser's query language. + + +Common customizations +===================== + +Searching for any terms instead of all terms by default +------------------------------------------------------- + +If the user doesn't explicitly specify ``AND`` or ``OR`` clauses:: + + physically based rendering + +...by default, the parser treats the words as if they were connected by ``AND``, +meaning all the terms must be present for a document to match:: + + physically AND based AND rendering + +To change the parser to use ``OR`` instead, so that any of the terms may be +present for a document to match, i.e.:: + + physically OR based OR rendering + +...configure the QueryParser using the ``group`` keyword argument like this:: + + from whoosh import qparser + + parser = qparser.QueryParser(fieldname, schema=myindex.schema, + group=qparser.OrGroup) + +The Or query lets you specify that documents that contain more of the query +terms score higher. For example, if the user searches for ``foo bar``, a +document with four occurances of ``foo`` would normally outscore a document +that contained one occurance each of ``foo`` and ``bar``. However, users +usually expect documents that contain more of the words they searched for +to score higher. To configure the parser to produce Or groups with this +behavior, use the ``factory()`` class method of ``OrGroup``:: + + og = qparser.OrGroup.factory(0.9) + parser = qparser.QueryParser(fieldname, schema, group=og) + +where the argument to ``factory()`` is a scaling factor on the bonus +(between 0 and 1). + + +Letting the user search multiple fields by default +-------------------------------------------------- + +The default QueryParser configuration takes terms without explicit fields and +assigns them to the default field you specified when you created the object, so +for example if you created the object with:: + + parser = QueryParser("content", schema=myschema) + +And the user entered the query: + +.. code-block:: none + + three blind mice + +The parser would treat it as: + +.. code-block:: none + + content:three content:blind content:mice + +However, you might want to let the user search *multiple* fields by default. For +example, you might want "unfielded" terms to search both the ``title`` and +``content`` fields. + +In that case, you can use a :class:`whoosh.qparser.MultifieldParser`. This is +just like the normal QueryParser, but instead of a default field name string, it +takes a *sequence* of field names:: + + from whoosh.qparser import MultifieldParser + + mparser = MultifieldParser(["title", "content"], schema=myschema) + +When this MultifieldParser instance parses ``three blind mice``, it treats it +as: + +.. code-block:: none + + (title:three OR content:three) (title:blind OR content:blind) (title:mice OR content:mice) + + +Simplifying the query language +------------------------------ + +Once you have a parser:: + + parser = qparser.QueryParser("content", schema=myschema) + +you can remove features from it using the +:meth:`~whoosh.qparser.QueryParser.remove_plugin_class` method. 
+ +For example, to remove the ability of the user to specify fields to search:: + + parser.remove_plugin_class(qparser.FieldsPlugin) + +To remove the ability to search for wildcards, which can be harmful to query +performance:: + + parser.remove_plugin_class(qparser.WildcardPlugin) + +See :doc:`/api/qparser` for information about the plugins included with +Whoosh's query parser. + + +Changing the AND, OR, ANDNOT, ANDMAYBE, and NOT syntax +------------------------------------------------------ + +The default parser uses English keywords for the AND, OR, ANDNOT, ANDMAYBE, +and NOT functions:: + + parser = qparser.QueryParser("content", schema=myschema) + +You can replace the default ``OperatorsPlugin`` object to +replace the default English tokens with your own regular expressions. + +The :class:`whoosh.qparser.OperatorsPlugin` implements the ability to use AND, +OR, NOT, ANDNOT, and ANDMAYBE clauses in queries. You can instantiate a new +``OperatorsPlugin`` and use the ``And``, ``Or``, ``Not``, ``AndNot``, and +``AndMaybe`` keyword arguments to change the token patterns:: + + # Use Spanish equivalents instead of AND and OR + op = qparser.OperatorsPlugin(And=" Y ", Or=" O ") + parser.replace_plugin(op) + +Further, you may change the syntax of the ``NOT`` operator:: + + np = qparser.OperatorsPlugin(Not=' NO ') + parser.replace_plugin(np) + +The arguments can be pattern strings or precompiled regular expression objects. + +For example, to change the default parser to use typographic symbols instead of +words for the AND, OR, ANDNOT, ANDMAYBE, and NOT functions:: + + parser = qparser.QueryParser("content", schema=myschema) + # These are regular expressions, so we have to escape the vertical bar + op = qparser.OperatorsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~", Not="\\-") + parser.replace_plugin(op) + + +Adding less-than, greater-than, etc. +------------------------------------ + +Normally, the way you match all terms in a field greater than "apple" is with +an open ended range:: + + field:{apple to] + +The :class:`whoosh.qparser.GtLtPlugin` lets you specify the same search like +this:: + + field:>apple + +The plugin lets you use ``>``, ``<``, ``>=``, ``<=``, ``=>``, or ``=<`` after +a field specifier, and translates the expression into the equivalent range:: + + date:>='31 march 2001' + + date:[31 march 2001 to] + + +Adding fuzzy term queries +------------------------- + +Fuzzy queries are good for catching misspellings and similar words. +The :class:`whoosh.qparser.FuzzyTermPlugin` lets you search for "fuzzy" terms, +that is, terms that don't have to match exactly. The fuzzy term will match any +similar term within a certain number of "edits" (character insertions, +deletions, and/or transpositions -- this is called the "Damerau-Levenshtein +edit distance"). + +To add the fuzzy plugin:: + + parser = qparser.QueryParser("fieldname", my_index.schema) + parser.add_plugin(qparser.FuzzyTermPlugin()) + +Once you add the fuzzy plugin to the parser, you can specify a fuzzy term by +adding a ``~`` followed by an optional maximum edit distance. If you don't +specify an edit distance, the default is ``1``. + +For example, the following "fuzzy" term query:: + + cat~ + +would match ``cat`` and all terms in the index within one "edit" of cat, +for example ``cast`` (insert ``s``), ``at`` (delete ``c``), and ``act`` +(transpose ``c`` and ``a``). 
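+
+In terms of query objects, parsing such a term produces a fuzzy term query.
+A brief sketch (``my_index`` follows the naming of the example above)::
+
+    from whoosh import qparser
+
+    parser = qparser.QueryParser("fieldname", my_index.schema)
+    parser.add_plugin(qparser.FuzzyTermPlugin())
+    q = parser.parse(u"cat~")
+    # q is roughly query.FuzzyTerm("fieldname", u"cat") with a maximum
+    # edit distance of 1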
+ +If you wanted ``cat`` to match ``bat``, it requires two edits (delete ``c`` and +insert ``b``) so you would need to set the maximum edit distance to ``2``:: + + cat~2 + +Because each additional edit you allow increases the number of possibilities +that must be checked, edit distances greater than ``2`` can be very slow. + +It is often useful to require that the first few characters of a fuzzy term +match exactly. This is called a prefix. You can set the length of the prefix +by adding a slash and a number after the edit distance. For example, to use +a maximum edit distance of ``2`` and a prefix length of ``3``:: + + johannson~2/3 + +You can specify a prefix without specifying an edit distance:: + + johannson~/3 + +The default prefix distance is ``0``. + + +Allowing complex phrase queries +------------------------------- + +The default parser setup allows phrase (proximity) queries such as:: + + "whoosh search library" + +The default phrase query tokenizes the text between the quotes and creates a +search for those terms in proximity. + +If you want to do more complex proximity searches, you can replace the phrase +plugin with the :class:`whoosh.qparser.SequencePlugin`, which allows any query +between the quotes. For example:: + + "(john OR jon OR jonathan~) peters*" + +The sequence syntax lets you add a "slop" factor just like the regular phrase:: + + "(john OR jon OR jonathan~) peters*"~2 + +To replace the default phrase plugin with the sequence plugin:: + + parser = qparser.QueryParser("fieldname", my_index.schema) + parser.remove_plugin_class(qparser.PhrasePlugin) + parser.add_plugin(qparser.SequencePlugin()) + +Alternatively, you could keep the default phrase plugin and give the sequence +plugin different syntax by specifying a regular expression for the start/end +marker when you create the sequence plugin. The regular expression should have +a named group ``slop`` for the slop factor. For example:: + + parser = qparser.QueryParser("fieldname", my_index.schema) + parser.add_plugin(qparser.SequencePlugin("!(~(?P[1-9][0-9]*))?")) + +This would allow you to use regular phrase queries and sequence queries at the +same time:: + + "regular phrase" AND !sequence query~2! + + +Advanced customization +====================== + +QueryParser arguments +--------------------- + +QueryParser supports two extra keyword arguments: + +``group`` + The query class to use to join sub-queries when the user doesn't explicitly + specify a boolean operator, such as ``AND`` or ``OR``. This lets you change + the default operator from ``AND`` to ``OR``. + + This will be the :class:`whoosh.qparser.AndGroup` or + :class:`whoosh.qparser.OrGroup` class (*not* an instantiated object) unless + you've written your own custom grouping syntax you want to use. + +``termclass`` + The query class to use to wrap single terms. + + This must be a :class:`whoosh.query.Query` subclass (*not* an instantiated + object) that accepts a fieldname string and term text unicode string in its + ``__init__`` method. The default is :class:`whoosh.query.Term`. + + This is useful if you want to change the default term class to + :class:`whoosh.query.Variations`, or if you've written a custom term class + you want the parser to use instead of the ones shipped with Whoosh. + +:: + + >>> from whoosh.qparser import QueryParser, OrGroup + >>> orparser = QueryParser("content", schema=myschema, group=OrGroup) + + +Configuring plugins +------------------- + +The query parser's functionality is provided by a set of plugins. 
You can +remove plugins to remove functionality, add plugins to add functionality, or +replace default plugins with re-configured or rewritten versions. + +The :meth:`whoosh.qparser.QueryParser.add_plugin`, +:meth:`whoosh.qparser.QueryParser.remove_plugin_class`, and +:meth:`whoosh.qparser.QueryParser.replace_plugin` methods let you manipulate +the plugins in a ``QueryParser`` object. + +See :doc:`/api/qparser` for information about the available plugins. + + +.. _custom-op: + +Creating custom operators +------------------------- + +* Decide whether you want a ``PrefixOperator``, ``PostfixOperator``, or ``InfixOperator``. + +* Create a new :class:`whoosh.qparser.syntax.GroupNode` subclass to hold + nodes affected by your operator. This object is responsible for generating + a :class:`whoosh.query.Query` object corresponding to the syntax. + +* Create a regular expression pattern for the operator's query syntax. + +* Create an ``OperatorsPlugin.OpTagger`` object from the above information. + +* Create a new ``OperatorsPlugin`` instance configured with your custom + operator(s). + +* Replace the default ``OperatorsPlugin`` in your parser with your new instance. + +For example, if you were creating a ``BEFORE`` operator:: + + from whoosh import qparser, query + + optype = qparser.InfixOperator + pattern = " BEFORE " + + class BeforeGroup(qparser.GroupNode): + merging = True + qclass = query.Ordered + +Create an OpTagger for your operator:: + + btagger = qparser.OperatorPlugin.OpTagger(pattern, BeforeGroup, + qparser.InfixOperator) + +By default, infix operators are left-associative. To make a right-associative +infix operator, do this:: + + btagger = qparser.OperatorPlugin.OpTagger(pattern, BeforeGroup, + qparser.InfixOperator, + leftassoc=False) + +Create an :class:`~whoosh.qparser.plugins.OperatorsPlugin` instance with your +new operator, and replace the default operators plugin in your query parser:: + + qp = qparser.QueryParser("text", myschema) + my_op_plugin = qparser.OperatorsPlugin([(btagger, 0)]) + qp.replace_plugin(my_op_plugin) + +Note that the list of operators you specify with the first argument is IN +ADDITION TO the default operators (AND, OR, etc.). To turn off one of the +default operators, you can pass None to the corresponding keyword argument:: + + cp = qparser.OperatorsPlugin([(optagger, 0)], And=None) + +If you want ONLY your list of operators and none of the default operators, +use the ``clean`` keyword argument:: + + cp = qparser.OperatorsPlugin([(optagger, 0)], clean=True) + +Operators earlier in the list bind more closely than operators later in the +list. + + + + + diff --git a/docs/source/query.rst b/docs/source/query.rst new file mode 100644 index 0000000..c62f555 --- /dev/null +++ b/docs/source/query.rst @@ -0,0 +1,10 @@ +============= +Query objects +============= + +The classes in the :mod:`whoosh.query` module implement *queries* you can run against the index. + +TBD. + +See :doc:`searching` for how to search the index using query objects. + diff --git a/docs/source/querylang.rst b/docs/source/querylang.rst new file mode 100644 index 0000000..d2a214a --- /dev/null +++ b/docs/source/querylang.rst @@ -0,0 +1,191 @@ +========================== +The default query language +========================== + +.. highlight:: none + +Overview +======== + +A query consists of *terms* and *operators*. There are two types of terms: single +terms and *phrases*. Multiple terms can be combined with operators such as +*AND* and *OR*. 
+ +Whoosh supports indexing text in different *fields*. You must specify the +*default field* when you create the :class:`whoosh.qparser.QueryParser` object. +This is the field in which any terms the user does not explicitly specify a field +for will be searched. + +Whoosh's query parser is capable of parsing different and/or additional syntax +through the use of plug-ins. See :doc:`parsing`. + + +Individual terms and phrases +============================ + +Find documents containing the term ``render``:: + + render + +Find documents containing the phrase ``all was well``:: + + "all was well" + +Note that a field must store Position information for phrase searching to work in +that field. + +Normally when you specify a phrase, the maximum difference in position between +each word in the phrase is 1 (that is, the words must be right next to each +other in the document). For example, the following matches if a document has +``library`` within 5 words after ``whoosh``:: + + "whoosh library"~5 + + +Boolean operators +================= + +Find documents containing ``render`` *and* ``shading``:: + + render AND shading + +Note that AND is the default relation between terms, so this is the same as:: + + render shading + +Find documents containing ``render``, *and* also either ``shading`` *or* +``modeling``:: + + render AND shading OR modeling + +Find documents containing ``render`` but *not* modeling:: + + render NOT modeling + +Find documents containing ``alpha`` but not either ``beta`` or ``gamma``:: + + alpha NOT (beta OR gamma) + +Note that when no boolean operator is specified between terms, the parser will +insert one, by default AND. So this query:: + + render shading modeling + +is equivalent (by default) to:: + + render AND shading AND modeling + +See :doc:`customizing the default parser ` for information on how to +change the default operator to OR. + +Group operators together with parentheses. For example to find documents that +contain both ``render`` and ``shading``, or contain ``modeling``:: + + (render AND shading) OR modeling + + +Fields +====== + +Find the term ``ivan`` in the ``name`` field:: + + name:ivan + +The ``field:`` prefix only sets the field for the term it directly precedes, so +the query:: + + title:open sesame + +Will search for ``open`` in the ``title`` field and ``sesame`` in the *default* +field. + +To apply a field prefix to multiple terms, group them with parentheses:: + + title:(open sesame) + +This is the same as:: + + title:open title:sesame + +Of course you can specify a field for phrases too:: + + title:"open sesame" + + +Inexact terms +============= + +Use "globs" (wildcard expressions using ``?`` to represent a single character +and ``*`` to represent any number of characters) to match terms:: + + te?t test* *b?g* + +Note that a wildcard starting with ``?`` or ``*`` is very slow. Note also that +these wildcards only match *individual terms*. For example, the query:: + + my*life + +will **not** match an indexed phrase like:: + + my so called life + +because those are four separate terms. + + +Ranges +====== + +You can match a range of terms. For example, the following query will match +documents containing terms in the lexical range from ``apple`` to ``bear`` +*inclusive*. For example, it will match documents containing ``azores`` and +``be`` but not ``blur``:: + + [apple TO bear] + +This is very useful when you've stored, for example, dates in a lexically sorted +format (i.e. 
YYYYMMDD):: + + date:[20050101 TO 20090715] + +The range is normally *inclusive* (that is, the range will match all terms +between the start and end term, *as well as* the start and end terms +themselves). You can specify that one or both ends of the range are *exclusive* +by using the ``{`` and/or ``}`` characters:: + + [0000 TO 0025} + {prefix TO suffix} + +You can also specify *open-ended* ranges by leaving out the start or end term:: + + [0025 TO] + {TO suffix} + + +Boosting query elements +======================= + +You can specify that certain parts of a query are more important for calculating +the score of a matched document than others. For example, to specify that +``ninja`` is twice as important as other words, and ``bear`` is half as +important:: + + ninja^2 cowboy bear^0.5 + +You can apply a boost to several terms using grouping parentheses:: + + (open sesame)^2.5 roc + + +Making a term from literal text +=============================== + +If you need to include characters in a term that are normally treated specially +by the parser, such as spaces, colons, or brackets, you can enclose the term +in single quotes:: + + path:'MacHD:My Documents' + 'term with spaces' + title:'function()' + + + diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst new file mode 100644 index 0000000..a0ffe51 --- /dev/null +++ b/docs/source/quickstart.rst @@ -0,0 +1,244 @@ +=========== +Quick start +=========== + +Whoosh is a library of classes and functions for indexing text and then searching the index. +It allows you to develop custom search engines for your content. For example, if you were +creating blogging software, you could use Whoosh to add a search function to allow users to +search blog entries. + + +A quick introduction +==================== + +:: + + >>> from whoosh.index import create_in + >>> from whoosh.fields import * + >>> schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) + >>> ix = create_in("indexdir", schema) + >>> writer = ix.writer() + >>> writer.add_document(title=u"First document", path=u"/a", + ... content=u"This is the first document we've added!") + >>> writer.add_document(title=u"Second document", path=u"/b", + ... content=u"The second one is even more interesting!") + >>> writer.commit() + >>> from whoosh.qparser import QueryParser + >>> with ix.searcher() as searcher: + ... query = QueryParser("content", ix.schema).parse("first") + ... results = searcher.search(query) + ... results[0] + ... + {"title": u"First document", "path": u"/a"} + + +The ``Index`` and ``Schema`` objects +==================================== + +To begin using Whoosh, you need an *index object*. The first time you create +an index, you must define the index's *schema*. The schema lists the *fields* +in the index. A field is a piece of information for each document in the index, +such as its title or text content. A field can be *indexed* (meaning it can +be searched) and/or *stored* (meaning the value that gets indexed is returned +with the results; this is useful for fields such as the title). + +This schema has two fields, "title" and "content":: + + from whoosh.fields import Schema, TEXT + + schema = Schema(title=TEXT, content=TEXT) + +You only need to do create the schema once, when you create the index. The +schema is pickled and stored with the index. + +When you create the ``Schema`` object, you use keyword arguments to map field names +to field types. The list of fields and their types defines what you are indexing +and what's searchable. 
Whoosh comes with some very useful predefined field +types, and you can easily create your own. + +:class:`whoosh.fields.ID` + This type simply indexes (and optionally stores) the entire value of the + field as a single unit (that is, it doesn't break it up into individual + words). This is useful for fields such as a file path, URL, date, category, + etc. + +:class:`whoosh.fields.STORED` + This field is stored with the document, but not indexed. This field type is + not indexed and not searchable. This is useful for document information you + want to display to the user in the search results. + +:class:`whoosh.fields.KEYWORD` + This type is designed for space- or comma-separated keywords. This type is + indexed and searchable (and optionally stored). To save space, it does not + support phrase searching. + +:class:`whoosh.fields.TEXT` + This type is for body text. It indexes (and optionally stores) the text and + stores term positions to allow phrase searching. + +:class:`whoosh.fields.NUMERIC` + This type is for numbers. You can store integers or floating point numbers. + +:class:`whoosh.fields.BOOLEAN` + This type is for boolean (true/false) values. + +:class:`whoosh.fields.DATETIME` + This type is for ``datetime`` objects. See :doc:`dates` for more + information. + +:class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS` + These types break the field text or individual terms into N-grams. + See :doc:`ngrams` for more information. + +(As a shortcut, if you don't need to pass any arguments to the field type, you +can just give the class name and Whoosh will instantiate the object for you.) :: + + from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT + + schema = Schema(title=TEXT(stored=True), content=TEXT, + path=ID(stored=True), tags=KEYWORD, icon=STORED) + +See :doc:`schema` for more information. + +Once you have the schema, you can create an index using the ``create_in`` +function:: + + import os.path + from whoosh.index import create_in + + if not os.path.exists("index"): + os.mkdir("index") + ix = create_in("index", schema) + +(At a low level, this creates a *Storage* object to contain the index. A +``Storage`` object represents that medium in which the index will be stored. +Usually this will be ``FileStorage``, which stores the index as a set of files +in a directory.) + +After you've created an index, you can open it using the ``open_dir`` +convenience function:: + + from whoosh.index import open_dir + + ix = open_dir("index") + + +The ``IndexWriter`` object +========================== + +OK, so we've got an ``Index`` object, now we can start adding documents. The +``writer()`` method of the ``Index`` object returns an ``IndexWriter`` object that lets +you add documents to the index. The IndexWriter's ``add_document(**kwargs)`` +method accepts keyword arguments where the field name is mapped to a value:: + + writer = ix.writer() + writer.add_document(title=u"My document", content=u"This is my document!", + path=u"/a", tags=u"first short", icon=u"/icons/star.png") + writer.add_document(title=u"Second try", content=u"This is the second example.", + path=u"/b", tags=u"second short", icon=u"/icons/sheep.png") + writer.add_document(title=u"Third time's the charm", content=u"Examples are many.", + path=u"/c", tags=u"short", icon=u"/icons/book.png") + writer.commit() + +Two important notes: + +* You don't have to fill in a value for every field. Whoosh doesn't care if you + leave out a field from a document. + +* Indexed text fields must be passed a unicode value. 
Fields that are stored + but not indexed (``STORED`` field type) can be passed any pickle-able object. + +If you have a text field that is both indexed and stored, you can index a +unicode value but store a different object if necessary (it's usually not, but +sometimes this is really useful) using this trick:: + + writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title") + +Calling commit() on the ``IndexWriter`` saves the added documents to the index:: + + writer.commit() + +See :doc:`indexing` for more information. + +Once your documents are committed to the index, you can search for them. + + +The ``Searcher`` object +======================= + +To begin searching the index, we'll need a ``Searcher`` object:: + + searcher = ix.searcher() + +You'll usually want to open the searcher using a ``with`` statement so the +searcher is automatically closed when you're done with it (searcher objects +represent a number of open files, so if you don't explicitly close them and the +system is slow to collect them, you can run out of file handles):: + + with ix.searcher() as searcher: + ... + +This is of course equivalent to:: + + try: + searcher = ix.searcher() + ... + finally: + searcher.close() + +The Searcher's ``search()`` method takes a *Query object*. You can construct +query objects directly or use a query parser to parse a query string. + +For example, this query would match documents that contain both "apple" and +"bear" in the "content" field:: + + # Construct query objects directly + + from whoosh.query import * + myquery = And([Term("content", u"apple"), Term("content", "bear")]) + +To parse a query string, you can use the default query parser in the ``qparser`` +module. The first argument to the ``QueryParser`` constructor is the default +field to search. This is usually the "body text" field. The second optional +argument is a schema to use to understand how to parse the fields:: + + # Parse a query string + + from whoosh.qparser import QueryParser + parser = QueryParser("content", ix.schema) + myquery = parser.parse(querystring) + +Once you have a ``Searcher`` and a query object, you can use the ``Searcher``'s +``search()`` method to run the query and get a ``Results`` object:: + + >>> results = searcher.search(myquery) + >>> print(len(results)) + 1 + >>> print(results[0]) + {"title": "Second try", "path": "/b", "icon": "/icons/sheep.png"} + +The default ``QueryParser`` implements a query language very similar to +Lucene's. It lets you connect terms with ``AND`` or ``OR``, eliminate terms with +``NOT``, group terms together into clauses with parentheses, do range, prefix, +and wildcard queries, and specify different fields to search. By default it joins +clauses together with ``AND`` (so by default, all terms you specify must be in +the document for the document to match):: + + >>> print(parser.parse(u"render shade animate")) + And([Term("content", "render"), Term("content", "shade"), Term("content", "animate")]) + + >>> print(parser.parse(u"render OR (title:shade keyword:animate)")) + Or([Term("content", "render"), And([Term("title", "shade"), Term("keyword", "animate")])]) + + >>> print(parser.parse(u"rend*")) + Prefix("content", "rend") + +Whoosh includes extra features for dealing with search results, such as + +* Sorting results by the value of an indexed field, instead of by relevance. +* Highlighting the search terms in excerpts from the original documents. +* Expanding the query terms based on the top few documents found. +* Paginating the results (e.g. 
"Showing results 1-20, page 1 of 4"). + +See :doc:`searching` for more information. + diff --git a/docs/source/recipes.rst b/docs/source/recipes.rst new file mode 100644 index 0000000..94983bf --- /dev/null +++ b/docs/source/recipes.rst @@ -0,0 +1,229 @@ +============== +Whoosh recipes +============== + +General +======= + +Get the stored fields for a document from the document number +------------------------------------------------------------- +:: + + stored_fields = searcher.stored_fields(docnum) + + +Analysis +======== + +Eliminate words shorter/longer than N +------------------------------------- + +Use a :class:`~whoosh.analysis.StopFilter` and the ``minsize`` and ``maxsize`` +keyword arguments. If you just want to filter based on size and not common +words, set the ``stoplist`` to ``None``:: + + sf = analysis.StopFilter(stoplist=None, minsize=2, maxsize=40) + + +Allow optional case-sensitive searches +-------------------------------------- + +A quick and easy way to do this is to index both the original and lowercased +versions of each word. If the user searches for an all-lowercase word, it acts +as a case-insensitive search, but if they search for a word with any uppercase +characters, it acts as a case-sensitive search:: + + class CaseSensitivizer(analysis.Filter): + def __call__(self, tokens): + for t in tokens: + yield t + if t.mode == "index": + low = t.text.lower() + if low != t.text: + t.text = low + yield t + + ana = analysis.RegexTokenizer() | CaseSensitivizer() + [t.text for t in ana("The new SuperTurbo 5000", mode="index")] + # ["The", "the", "new", "SuperTurbo", "superturbo", "5000"] + + +Searching +========= + +Find every document +------------------- +:: + + myquery = query.Every() + + +iTunes-style search-as-you-type +------------------------------- + +Use the :class:`whoosh.analysis.NgramWordAnalyzer` as the analyzer for the +field you want to search as the user types. You can save space in the index by +turning off positions in the field using ``phrase=False``, since phrase +searching on N-gram fields usually doesn't make much sense:: + + # For example, to search the "title" field as the user types + analyzer = analysis.NgramWordAnalyzer() + title_field = fields.TEXT(analyzer=analyzer, phrase=False) + schema = fields.Schema(title=title_field) + +See the documentation for the :class:`~whoosh.analysis.NgramWordAnalyzer` class +for information on the available options. + + +Shortcuts +========= + +Look up documents by a field value +---------------------------------- +:: + + # Single document (unique field value) + stored_fields = searcher.document(id="bacon") + + # Multiple documents + for stored_fields in searcher.documents(tag="cake"): + ... + + +Sorting and scoring +=================== + +See :doc:`facets`. + + +Score results based on the position of the matched term +------------------------------------------------------- + +The following scoring function uses the position of the first occurance of a +term in each document to calculate the score, so documents with the given term +earlier in the document will score higher:: + + from whoosh import scoring + + def pos_score_fn(searcher, fieldname, text, matcher): + poses = matcher.value_as("positions") + return 1.0 / (poses[0] + 1) + + pos_weighting = scoring.FunctionWeighting(pos_score_fn) + with myindex.searcher(weighting=pos_weighting) as s: + ... + + +Results +======= + +How many hits were there? 
+------------------------- + +The number of *scored* hits:: + + found = results.scored_length() + +Depending on the arguments to the search, the exact total number of hits may be +known:: + + if results.has_exact_length(): + print("Scored", found, "of exactly", len(results), "documents") + +Usually, however, the exact number of documents that match the query is not +known, because the searcher can skip over blocks of documents it knows won't +show up in the "top N" list. If you call ``len(results)`` on a query where the +exact length is unknown, Whoosh will run an unscored version of the original +query to get the exact number. This is faster than the scored search, but may +still be noticeably slow on very large indexes or complex queries. + +As an alternative, you might display the *estimated* total hits:: + + found = results.scored_length() + if results.has_exact_length(): + print("Scored", found, "of exactly", len(results), "documents") + else: + low = results.estimated_min_length() + high = results.estimated_length() + + print("Scored", found, "of between", low, "and", high, "documents") + + +Which terms matched in each hit? +-------------------------------- +:: + + # Use terms=True to record term matches for each hit + results = searcher.search(myquery, terms=True) + + for hit in results: + # Which terms matched in this hit? + print("Matched:", hit.matched_terms()) + + # Which terms from the query didn't match in this hit? + print("Didn't match:", myquery.all_terms() - hit.matched_terms()) + + +Global information +================== + +How many documents are in the index? +------------------------------------ +:: + + # Including documents that are deleted but not yet optimized away + numdocs = searcher.doc_count_all() + + # Not including deleted documents + numdocs = searcher.doc_count() + + +What fields are in the index? +----------------------------- +:: + + return myindex.schema.names() + + +Is term X in the index? +----------------------- +:: + + return ("content", "wobble") in searcher + + +How many times does term X occur in the index? +---------------------------------------------- +:: + + # Number of times content:wobble appears in all documents + freq = searcher.frequency("content", "wobble") + + # Number of documents containing content:wobble + docfreq = searcher.doc_frequency("content", "wobble") + + +Is term X in document Y? +------------------------ +:: + + # Check if the "content" field of document 500 contains the term "wobble" + + # Without term vectors, skipping through list... + postings = searcher.postings("content", "wobble") + postings.skip_to(500) + return postings.id() == 500 + + # ...or the slower but easier way + docset = set(searcher.postings("content", "wobble").all_ids()) + return 500 in docset + + # If field has term vectors, skipping through list... + vector = searcher.vector(500, "content") + vector.skip_to("wobble") + return vector.id() == "wobble" + + # ...or the slower but easier way + wordset = set(searcher.vector(500, "content").all_ids()) + return "wobble" in wordset + diff --git a/docs/source/releases/0_3.rst b/docs/source/releases/0_3.rst new file mode 100644 index 0000000..780d82e --- /dev/null +++ b/docs/source/releases/0_3.rst @@ -0,0 +1,61 @@ +======================== +Whoosh 0.3 release notes +======================== + +* Major improvements to reading/writing of postings and query performance. + +* Changed default post limit (run size) from 4 MB to 32 MB. + +* Finished migrating backend-specific code into ``whoosh.filedb`` package. 
+ +* Moved formats from whoosh.fields module into new whoosh.formats module. + +* DocReader and TermReader classes combined into new IndexReader interface. + You can get an IndexReader implementation by calling Index.reader(). + Searcher is now a wrapper around an IndexReader. + +* Range query object changed, with new signature and new syntax in the default + query parser. Now you can use ``[start TO end]`` in the query parser for an + inclusive range, and ``{start TO end}`` for an exclusive range. You can also + mix the delimiters, for example ``[start TO end}`` for a range with an + inclusive start but exclusive end term. + +* Added experimental DATETIME field type lets you pass a + ``datetime.datetime`` object as a field value to ``add_document``:: + + from whoosh.fields import Schema, ID, DATETIME + from whoosh.filedb.filestore import RamStorage + from datetime import datetime + + schema = Schema(id=ID, date=DATETIME) + storage = RamStorage() + ix = storage.create_index(schema) + w = ix.writer() + w.add_document(id=u"A", date=datetime.now()) + w.close() + + Internally, the DATETIME field indexes the datetime object as text using + the format (4 digit year + 2 digit month + 2 digit day + 'T' + 2 digit hour + + 2 digit minute + 2 digit second + 6 digit microsecond), for example + ``20090817T160203109000``. + +* The default query parser now lets you use quoted strings in prefix and range + queries, e.g. ``["2009-05" TO "2009-12"]``, ``"alfa/bravo"*``, making it + easier to work with terms containing special characters. + +* ``DocReader.vector_as(docnum, fieldid, astype)`` is now + ``IndexReader.vector_as(astype, docnum, fieldid)`` (i.e. the astype argument + has moved from the last to the first argument), e.g. + ``v = ixreader.vector_as("frequency", 102, "content")``. + +* Added whoosh.support.charset for translating Sphinx charset table files. + +* Added whoosh.analysis.CharsetTokenizer and CharsetFilter to enable case and + accent folding. + +* Added experimental ``whoosh.ramdb`` in-memory backend. + +* Added experimental ``whoosh.query.FuzzyTerm`` query type. + +* Added ``whoosh.lang.wordnet`` module containing ``Thesaurus`` object for using + WordNet synonym database. diff --git a/docs/source/releases/1_0.rst b/docs/source/releases/1_0.rst new file mode 100644 index 0000000..524d1fc --- /dev/null +++ b/docs/source/releases/1_0.rst @@ -0,0 +1,482 @@ +======================== +Whoosh 1.x release notes +======================== + +Whoosh 1.8.3 +============ + +Whoosh 1.8.3 contains important bugfixes and new functionality. Thanks to all +the mailing list and BitBucket users who helped with the fixes! + +Fixed a bad ``Collector`` bug where the docset of a Results object did not match +the actual results. + +You can now pass a sequence of objects to a keyword argument in ``add_document`` +and ``update_document`` (currently this will not work for unique fields in +``update_document``). This is useful for non-text fields such as ``DATETIME`` +and ``NUMERIC``, allowing you to index multiple dates/numbers for a document:: + + writer.add_document(shoe=u"Saucony Kinvara", sizes=[10.0, 9.5, 12]) + +This version reverts to using the CDB hash function for hash files instead of +Python's ``hash()`` because the latter is not meant to be stored externally. +This change maintains backwards compatibility with old files. + +The ``Searcher.search`` method now takes a ``mask`` keyword argument. This is +the opposite of the ``filter`` argument. 
Where the ``filter`` specifies the +set of documents that can appear in the results, the ``mask`` specifies a +set of documents that must not appear in the results. + +Fixed performance problems in ``Searcher.more_like``. This method now also +takes a ``filter`` keyword argument like ``Searcher.search``. + +Improved documentation. + + +Whoosh 1.8.2 +============ + +Whoosh 1.8.2 fixes some bugs, including a mistyped signature in +Searcher.more_like and a bad bug in Collector that could screw up the +ordering of results given certain parameters. + + +Whoosh 1.8.1 +============ + +Whoosh 1.8.1 includes a few recent bugfixes/improvements: + +- ListMatcher.skip_to_quality() wasn't returning an integer, resulting + in a "None + int" error. + +- Fixed locking and memcache sync bugs in the Google App Engine storage + object. + +- MultifieldPlugin wasn't working correctly with groups. + + - The binary matcher trees of Or and And are now generated using a + Huffman-like algorithm instead of perfectly balanced. This gives a + noticeable speed improvement because less information has to be passed + up/down the tree. + + +Whoosh 1.8 +========== + +This release relicensed the Whoosh source code under the Simplified BSD (A.K.A. +"two-clause" or "FreeBSD") license. See LICENSE.txt for more information. + + +Whoosh 1.7.7 +============ + +Setting a TEXT field to store term vectors is now much easier. Instead of +having to pass an instantiated whoosh.formats.Format object to the vector= +keyword argument, you can pass True to automatically use the same format and +analyzer as the inverted index. Alternatively, you can pass a Format subclass +and Whoosh will instantiate it for you. + +For example, to store term vectors using the same settings as the inverted +index (Positions format and StandardAnalyzer):: + + from whoosh.fields import Schema, TEXT + + schema = Schema(content=TEXT(vector=True)) + +To store term vectors that use the same analyzer as the inverted index +(StandardAnalyzer by default) but only store term frequency:: + + from whoosh.formats import Frequency + + schema = Schema(content=TEXT(vector=Frequency)) + +Note that currently the only place term vectors are used in Whoosh is keyword +extraction/more like this, but they can be useful for expert users with custom +code. + +Added :meth:`whoosh.searching.Searcher.more_like` and +:meth:`whoosh.searching.Hit.more_like_this` methods, as shortcuts for doing +keyword extraction yourself. Both return a Results object. + +"python setup.py test" works again, as long as you have nose installed. + +The :meth:`whoosh.searching.Searcher.sort_query_using` method lets you sort documents matching a given query using an arbitrary function. Note that like "complex" searching with the Sorter object, this can be slow on large multi-segment indexes. + + +Whoosh 1.7 +========== + +You can once again perform complex sorting of search results (that is, a sort +with some fields ascending and some fields descending). 
+ +You can still use the ``sortedby`` keyword argument to +:meth:`whoosh.searching.Searcher.search` to do a simple sort (where all fields +are sorted in the same direction), or you can use the new +:class:`~whoosh.sorting.Sorter` class to do a simple or complex sort:: + + searcher = myindex.searcher() + sorter = searcher.sorter() + # Sort first by the group field, ascending + sorter.add_field("group") + # Then by the price field, descending + sorter.add_field("price", reverse=True) + # Get the Results + results = sorter.sort_query(myquery) + +See the documentation for the :class:`~whoosh.sorting.Sorter` class for more +information. Bear in mind that complex sorts will be much slower on large +indexes because they can't use the per-segment field caches. + +You can now get highlighted snippets for a hit automatically using +:meth:`whoosh.searching.Hit.highlights`:: + + results = searcher.search(myquery, limit=20) + for hit in results: + print hit["title"] + print hit.highlights("content") + +See :meth:`whoosh.searching.Hit.highlights` for more information. + +Added the ability to filter search results so that only hits in a Results +set, a set of docnums, or matching a query are returned. The filter is +cached on the searcher:: + + # Search within previous results + newresults = searcher.search(newquery, filter=oldresults) + + # Search within the "basics" chapter + results = searcher.search(userquery, filter=query.Term("chapter", "basics")) + +You can now specify a time limit for a search. If the search does not finish +in the given time, a :class:`whoosh.searching.TimeLimit` exception is raised, +but you can still retrieve the partial results from the collector. See the +``timelimit`` and ``greedy`` arguments in the +:class:`whoosh.searching.Collector` documentation. + +Added back the ability to set :class:`whoosh.analysis.StemFilter` to use an +unlimited cache. This is useful for one-shot batch indexing (see +:doc:`../batch`). + +The ``normalize()`` method of the ``And`` and ``Or`` queries now merges +overlapping range queries for more efficient queries. + +Query objects now have ``__hash__`` methods allowing them to be used as +dictionary keys. + +The API of the highlight module has changed slightly. Most of the functions +in the module have been converted to classes. However, most old code should +still work. The ``NullFragmenter`` is now called ``WholeFragmenter``, but the +old name is still available as an alias. + +Fixed MultiPool so it won't fill up the temp directory with job files. + +Fixed a bug where Phrase query objects did not use their boost factor. + +Fixed a bug where a fieldname after an open parenthesis wasn't parsed +correctly. The change alters the semantics of certain parsing "corner cases" +(such as ``a:b:c:d``). + + +Whoosh 1.6 +========== + +The ``whoosh.writing.BatchWriter`` class is now called +:class:`whoosh.writing.BufferedWriter`. It is similar to the old ``BatchWriter`` +class but allows you to search and update the buffered documents as well as the +documents that have been flushed to disk:: + + writer = writing.BufferedWriter(myindex) + + # You can update (replace) documents in RAM without having to commit them + # to disk + writer.add_document(path="/a", text="Hi there") + writer.update_document(path="/a", text="Hello there") + + # Search committed and uncommitted documents by getting a searcher from the + # writer instead of the index + searcher = writer.searcher() + +(BatchWriter is still available as an alias for backwards compatibility.) 
+ +The :class:`whoosh.qparser.QueryParser` initialization method now requires a +schema as the second argument. Previously the default was to create a +``QueryParser`` without a schema, which was confusing:: + + qp = qparser.QueryParser("content", myindex.schema) + +The :meth:`whoosh.searching.Searcher.search` method now takes a ``scored`` +keyword. If you search with ``scored=False``, the results will be in "natural" +order (the order the documents were added to the index). This is useful when +you don't need scored results but want the convenience of the Results object. + +Added the :class:`whoosh.qparser.GtLtPlugin` parser plugin to allow greater +than/less than as an alternative syntax for ranges:: + + count:>100 tag:<=zebra date:>='29 march 2001' + +Added the ability to define schemas declaratively, similar to Django models:: + + from whoosh import index + from whoosh.fields import SchemaClass, ID, KEYWORD, STORED, TEXT + + class MySchema(SchemaClass): + uuid = ID(stored=True, unique=True) + path = STORED + tags = KEYWORD(stored=True) + content = TEXT + + index.create_in("indexdir", MySchema) + +Whoosh 1.6.2: Added :class:`whoosh.searching.TermTrackingCollector` which tracks +which part of the query matched which documents in the final results. + +Replaced the unbounded cache in :class:`whoosh.analysis.StemFilter` with a +bounded LRU (least recently used) cache. This will make stemming analysis +slightly slower but prevent it from eating up too much memory over time. + +Added a simple :class:`whoosh.analysis.PyStemmerFilter` that works when the +py-stemmer library is installed:: + + ana = RegexTokenizer() | PyStemmerFilter("spanish") + +The estimation of memory usage for the ``limitmb`` keyword argument to +``FileIndex.writer()`` is more accurate, which should help keep memory usage +by the sorting pool closer to the limit. + +The ``whoosh.ramdb`` package was removed and replaced with a single +``whoosh.ramindex`` module. + +Miscellaneous bug fixes. + + +Whoosh 1.5 +========== + +.. note:: + Whoosh 1.5 is incompatible with previous indexes. You must recreate + existing indexes with Whoosh 1.5. + +Fixed a bug where postings were not portable across different endian platforms. + +New generalized field cache system, using per-reader caches, for much faster +sorting and faceting of search results, as well as much faster multi-term (e.g. +prefix and wildcard) and range queries, especially for large indexes and/or +indexes with multiple segments. + +Changed the faceting API. See :doc:`../facets`. + +Faster storage and retrieval of posting values. + +Added per-field ``multitoken_query`` attribute to control how the query parser +deals with a "term" that when analyzed generates multiple tokens. The default +value is `"first"` which throws away all but the first token (the previous +behavior). Other possible values are `"and"`, `"or"`, or `"phrase"`. + +Added :class:`whoosh.analysis.DoubleMetaphoneFilter`, +:class:`whoosh.analysis.SubstitutionFilter`, and +:class:`whoosh.analysis.ShingleFilter`. + +Added :class:`whoosh.qparser.CopyFieldPlugin`. + +Added :class:`whoosh.query.Otherwise`. + +Generalized parsing of operators (such as OR, AND, NOT, etc.) in the query +parser to make it easier to add new operators. I intend to add a better API +for this in a future release. + +Switched NUMERIC and DATETIME fields to use more compact on-disk +representations of numbers. + +Fixed a bug in the porter2 stemmer when stemming the string `"y"`. 
+ +Added methods to :class:`whoosh.searching.Hit` to make it more like a `dict`. + +Short posting lists (by default, single postings) are inline in the term file +instead of written to the posting file for faster retrieval and a small saving +in disk space. + + +Whoosh 1.3 +========== + +Whoosh 1.3 adds a more efficient DATETIME field based on the new tiered NUMERIC +field, and the DateParserPlugin. See :doc:`../dates`. + + +Whoosh 1.2 +========== + +Whoosh 1.2 adds tiered indexing for NUMERIC fields, resulting in much faster +range queries on numeric fields. + + +Whoosh 1.0 +========== + +Whoosh 1.0 is a major milestone release with vastly improved performance and +several useful new features. + +*The index format of this version is not compatible with indexes created by +previous versions of Whoosh*. You will need to reindex your data to use this +version. + +Orders of magnitude faster searches for common terms. Whoosh now uses +optimizations similar to those in Xapian to skip reading low-scoring postings. + +Faster indexing and ability to use multiple processors (via ``multiprocessing`` +module) to speed up indexing. + +Flexible Schema: you can now add and remove fields in an index with the +:meth:`whoosh.writing.IndexWriter.add_field` and +:meth:`whoosh.writing.IndexWriter.remove_field` methods. + +New hand-written query parser based on plug-ins. Less brittle, more robust, +more flexible, and easier to fix/improve than the old pyparsing-based parser. + +On-disk formats now use 64-bit disk pointers allowing files larger than 4 GB. + +New :class:`whoosh.searching.Facets` class efficiently sorts results into +facets based on any criteria that can be expressed as queries, for example +tags or price ranges. + +New :class:`whoosh.writing.BatchWriter` class automatically batches up +individual ``add_document`` and/or ``delete_document`` calls until a certain +number of calls or a certain amount of time passes, then commits them all at +once. + +New :class:`whoosh.analysis.BiWordFilter` lets you create bi-word indexed +fields, a possible alternative to phrase searching. + +Fixed bug where files could be deleted before a reader could open them in +threaded situations. + +New :class:`whoosh.analysis.NgramFilter` filter, +:class:`whoosh.analysis.NgramWordAnalyzer` analyzer, and +:class:`whoosh.fields.NGRAMWORDS` field type allow producing n-grams from +tokenized text. + +Errors in query parsing now raise a specific ``whoosh.qparser.QueryParserError`` +exception instead of a generic exception. + +Previously, the query string ``*`` was optimized to a +:class:`whoosh.query.Every` query which matched every document. Now the +``Every`` query only matches documents that actually have an indexed term from +the given field, to better match the intuitive sense of what a query string like +``tag:*`` should do. + +New :meth:`whoosh.searching.Searcher.key_terms_from_text` method lets you +extract key words from arbitrary text instead of documents in the index. + +Previously the :meth:`whoosh.searching.Searcher.key_terms` and +:meth:`whoosh.searching.Results.key_terms` methods required that the given +field store term vectors. They now also work if the given field is stored +instead. They will analyze the stored string into a term vector on-the-fly. +The field must still be indexed. + + +User API changes +================ + +The default for the ``limit`` keyword argument to +:meth:`whoosh.searching.Searcher.search` is now ``10``. To return all results +in a single ``Results`` object, use ``limit=None``. 
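+
+For example (a quick sketch; ``searcher`` and ``myquery`` stand in for your own
+objects)::
+
+    # New default: return at most the top 10 scored hits
+    results = searcher.search(myquery)
+
+    # Pass limit=None to get every matching document in one Results object
+    results = searcher.search(myquery, limit=None)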
+ +The ``Index`` object no longer represents a snapshot of the index at the time +the object was instantiated. Instead it always represents the index in the +abstract. ``Searcher`` and ``IndexReader`` objects obtained from the +``Index`` object still represent the index as it was at the time they were +created. + +Because the ``Index`` object no longer represents the index at a specific +version, several methods such as ``up_to_date`` and ``refresh`` were removed +from its interface. The Searcher object now has +:meth:`~whoosh.searching.Searcher.last_modified`, +:meth:`~whoosh.searching.Searcher.up_to_date`, and +:meth:`~whoosh.searching.Searcher.refresh` methods similar to those that used to +be on ``Index``. + +The document deletion and field add/remove methods on the ``Index`` object now +create a writer behind the scenes to accomplish each call. This means they write +to the index immediately, so you don't need to call ``commit`` on the ``Index``. +Also, it will be much faster if you need to call them multiple times to create +your own writer instead:: + + # Don't do this + for id in my_list_of_ids_to_delete: + myindex.delete_by_term("id", id) + myindex.commit() + + # Instead do this + writer = myindex.writer() + for id in my_list_of_ids_to_delete: + writer.delete_by_term("id", id) + writer.commit() + +The ``postlimit`` argument to ``Index.writer()`` has been changed to +``postlimitmb`` and is now expressed in megabytes instead of bytes:: + + writer = myindex.writer(postlimitmb=128) + +Instead of having to import ``whoosh.filedb.filewriting.NO_MERGE`` or +``whoosh.filedb.filewriting.OPTIMIZE`` to use as arguments to ``commit()``, you +can now simply do the following:: + + # Do not merge segments + writer.commit(merge=False) + + # or + + # Merge all segments + writer.commit(optimize=True) + +The ``whoosh.postings`` module is gone. The ``whoosh.matching`` module contains +classes for posting list readers. + +Whoosh no longer maps field names to numbers for internal use or writing to +disk. Any low-level method that accepted field numbers now accept field names +instead. + +Custom Weighting implementations that use the ``final()`` method must now +set the ``use_final`` attribute to ``True``:: + + from whoosh.scoring import BM25F + + class MyWeighting(BM25F): + use_final = True + + def final(searcher, docnum, score): + return score + docnum * 10 + +This disables the new optimizations, forcing Whoosh to score every matching +document. + +:class:`whoosh.writing.AsyncWriter` now takes an :class:`whoosh.index.Index` +object as its first argument, not a callable. Also, the keyword arguments to +pass to the index's ``writer()`` method should now be passed as a dictionary +using the ``writerargs`` keyword argument. + +Whoosh now stores per-document field length using an approximation rather than +exactly. For low numbers the approximation is perfectly accurate, while high +numbers will be approximated less accurately. + +The ``doc_field_length`` method on searchers and readers now takes a second +argument representing the default to return if the given document and field +do not have a length (i.e. the field is not scored or the field was not +provided for the given document). + +The :class:`whoosh.analysis.StopFilter` now has a ``maxsize`` argument as well +as a ``minsize`` argument to its initializer. Analyzers that use the +``StopFilter`` have the ``maxsize`` argument in their initializers now also. + +The interface of :class:`whoosh.writing.AsyncWriter` has changed. 
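+
+For example, a sketch of the new calling convention described above (the
+``writerargs`` values here are only illustrative)::
+
+    from whoosh.writing import AsyncWriter
+
+    # Pass the Index object itself, and writer() keyword arguments as a dict
+    writer = AsyncWriter(myindex, writerargs={"limitmb": 128})
+    writer.add_document(title=u"New document", content=u"...")
+    writer.commit()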
+ + +Misc +==== + +* Because the file backend now writes 64-bit disk pointers and field names + instead of numbers, the size of an index on disk will grow compared to + previous versions. + +* Unit tests should no longer leave directories and files behind. + diff --git a/docs/source/releases/2_0.rst b/docs/source/releases/2_0.rst new file mode 100644 index 0000000..3978460 --- /dev/null +++ b/docs/source/releases/2_0.rst @@ -0,0 +1,333 @@ +======================== +Whoosh 2.x release notes +======================== + +Whoosh 2.7 +========== + +* Removed on-disk word graph implementation of spell checking in favor of much + simpler and faster FSA implementation over the term file. + +* Many bug fixes. + +* Removed backwards compatibility with indexes created by versions prior to + 2.5. You may need to re-index if you are using an old index that hasn't been + updated. + +* This is the last 2.x release before a major overhaul that will break backwards + compatibility. + + +Whoosh 2.5 +========== + +* Whoosh 2.5 will read existing indexes, but segments created by 2.5 will not + be readable by older versions of Whoosh. + +* As a replacement for field caches to speed up sorting, Whoosh now supports + adding a ``sortable=True`` keyword argument to fields. This makes Whoosh store + a sortable representation of the field's values in a "column" format + (which associates a "key" value with each document). This is more robust, + efficient, and customizable than the old behavior. + You should now specify ``sortable=True`` on fields that you plan on using to + sort or group search results. + + (You can still sort/group on fields that don't have ``sortable=True``, + however it will use more RAM and be slower as Whoosh caches the field values + in memory.) + + Fields that use ``sortable=True`` can avoid specifying ``stored=True``. The + field's value will still be available on ``Hit`` objects (the value will be + retrieved from the column instead of from the stored fields). This may + actually be faster for certain types of values. + +* Whoosh will now detect common types of OR queries and use optimized read-ahead + matchers to speed them up by several times. + +* Whoosh now includes pure-Python implementations of the Snowball stemmers and + stop word lists for various languages adapted from NLTK. These are available + through the :class:`whoosh.analysis.LanguageAnalyzer` analyzer or through the + ``lang=`` keyword argument to the + :class:`~whoosh.fields.TEXT` field. + +* You can now use the + :meth:`whoosh.filedb.filestore.Storage.create()` and + :meth:`whoosh.filedb.filestore.Storage.destory()` + methods as a consistent API to set up and tear down different types of + storage. + +* Many bug fixes and speed improvements. + +* Switched unit tests to use ``py.test`` instead of ``nose``. + +* Removed obsolete ``SpellChecker`` class. + + +Whoosh 2.4 +========== + +* By default, Whoosh now assembles the individual files of a segment into a + single file when committing. This has a small performance penalty but solves + a problem where Whoosh can keep too many files open. Whoosh is also now + smarter about using mmap. + +* Added functionality to index and search hierarchical documents. See + :doc:`/nested`. + +* Rewrote the Directed Acyclic Word Graph implementation (used in spell + checking) to be faster and more space-efficient. 
Word graph files created by + previous versions will be ignored, meaning that spell checking may become + slower unless/until you replace the old segments (for example, by + optimizing). + +* Rewrote multiprocessing indexing to be faster and simpler. You can now + do ``myindex.writer(procs=n)`` to get a multiprocessing writer, or + ``myindex.writer(procs=n, multisegment=True)`` to get a multiprocessing + writer that leaves behind multiple segments, like the old MultiSegmentWriter. + (``MultiSegmentWriter`` is still available as a function that returns the + new class.) + +* When creating ``Term`` query objects for special fields (e.g. NUMERIC or + BOOLEAN), you can now use the field's literal type instead of a string as the + second argument, for example ``Term("num", 20)`` or ``Term("bool", True)``. + (This change may cause problems interacting with functions that expect + query objects to be pure textual, such as spell checking.) + +* All writing to and reading from on-disk indexes is now done through "codec" + objects. This architecture should make it easier to add optional or + experimental features, and maintain backwards compatibility. + +* Fixes issues #75, #137, #206, #213, #215, #219, #223, #226, #230, #233, #238, + #239, #240, #241, #243, #244, #245, #252, #253, and other bugs. Thanks to + Thomas Waldmann and Alexei Gousev for the help! + + +Whoosh 2.3.2 +============ + +* Fixes bug in BM25F scoring function, leading to increased precision in search + results. + +* Fixes issues #203, #205, #206, #208, #209, #212. + + +Whoosh 2.3.1 +============ + +* Fixes issue #200. + + +Whoosh 2.3 +========== + +* Added a :class:`whoosh.query.Regex` term query type, similar to + :class:`whoosh.query.Wildcard`. The parser does not allow regex term queries + by default. You need to add the :class:`whoosh.qparser.RegexPlugin` plugin. + After you add the plugin, you can use ``r"expression"`` query syntax for + regular expression term queries. For example, ``r"foo.*bar"``. + +* Added the :class:`whoosh.qparser.PseudoFieldPlugin` parser plugin. This + plugin lets you create "pseudo-fields" that run a transform function on + whatever query syntax the user applies the field to. This is fairly advanced + functionality right now; I'm trying to think of ways to make its power easier + to access. + +* The documents in the lists in the dictionary returned by ``Results.groups()`` + by default are now in the same relative order as in the results. This makes + it much easier to display the "top N" results in each category, for example. + +* The ``groupids`` keyword argument to ``Searcher.search`` has been removed. + Instead you can now pass a :class:`whoosh.sorting.FacetMap` object to the + ``Searcher.search`` method's ``maptype`` argument to control how faceted + documents are grouped, and/or set the ``maptype`` argument on individual + :class:`whoosh.sorting.FacetType`` objects to set custom grouping per facet. + See :doc:`../facets` for more information. + +* Calling ``Searcher.documents()`` or ``Searcher.document_numbers()`` with no + arguments now yields all documents/numbers. + +* Calling ``Writer.update_document()`` with no unique fields is now equivalent + to calling ``Writer.add_document()`` with the same arguments. + +* Fixed a problem with keyword expansion where the code was building a cache + that was fast on small indexes, but unacceptably slow on large indexes. + +* Added the hyphen (``-``) to the list of characters that match a "wildcard" + token, to make parsing slightly more predictable. 
A true fix will have to + wait for another parser rewrite. + +* Fixed an unused ``__future__`` import and use of ``float("nan")`` which were + breaking under Python 2.5. + +* Fixed a bug where vectored fields with only one term stored an empty term + vector. + +* Various other bug fixes. + +Whoosh 2.2 +========== + +* Fixes several bugs, including a bad bug in BM25F scoring. + +* Added ``allow_overlap`` option to :class:`whoosh.sorting.StoredFieldFacet`. + +* In :meth:`~whoosh.writing.IndexWriter.add_document`, you can now pass + query-like strings for BOOLEAN and DATETIME fields (e.g. ``boolfield="true"`` + and ``dtfield="20101131-16:01"``) as an alternative to actual ``bool`` or + ``datetime`` objects. The implementation of this is incomplete: it only works + in the default ``filedb`` backend, and if the field is stored, the stored + value will be the string, not the parsed object. + +* Added :class:`whoosh.analysis.CompoundWordFilter` and + :class:`whoosh.analysis.TeeFilter`. + + +Whoosh 2.1 +========== + +This release fixes several bugs, and contains speed improvements to highlighting. +See :doc:`/highlight` for more information. + + +Whoosh 2.0 +========== + +Improvements +------------ + +* Whoosh is now compatible with Python 3 (tested with Python 3.2). Special + thanks to Vinay Sajip who did the work, and also Jordan Sherer who helped + fix later issues. + +* Sorting and grouping (faceting) now use a new system of "facet" objects which + are much more flexible than the previous field-based system. + + For example, to sort by first name and then score:: + + from whoosh import sorting + + mf = sorting.MultiFacet([sorting.FieldFacet("firstname"), + sorting.ScoreFacet()]) + results = searcher.search(myquery, sortedby=mf) + + In addition to the previously supported sorting/grouping by field contents + and/or query results, you can now use numeric ranges, date ranges, score, and + more. The new faceting system also supports overlapping groups. + + (The old "Sorter" API still works but is deprecated and may be removed in a + future version.) + + See :doc:`/facets` for more information. + +* Completely revamped spell-checking to make it much faster, easier, and more + flexible. You can enable generation of the graph files used by spell checking + using the ``spelling=True`` argument to a field type:: + + schema = fields.Schema(text=fields.TEXT(spelling=True)) + + (Spelling suggestion methods will work on fields without ``spelling=True`` + but will be slower.) The spelling graph will be updated automatically as new + documents are added -- it is no longer necessary to maintain a separate + "spelling index". 
+ + You can get suggestions for individual words using + :meth:`whoosh.searching.Searcher.suggest`:: + + suglist = searcher.suggest("content", "werd", limit=3) + + Whoosh now includes convenience methods to spell-check and correct user + queries, with optional highlighting of corrections using the + ``whoosh.highlight`` module:: + + from whoosh import highlight, qparser + + # User query string + qstring = request.get("q") + + # Parse into query object + parser = qparser.QueryParser("content", myindex.schema) + qobject = parser.parse(qstring) + + results = searcher.search(qobject) + + if not results: + correction = searcher.correct_query(qobject, qstring) + # correction.query = corrected query object + # correction.string = corrected query string + + # Format the corrected query string with HTML highlighting + cstring = correction.format_string(highlight.HtmlFormatter()) + + Spelling suggestions can come from field contents and/or lists of words. + For stemmed fields the spelling suggestions automatically use the unstemmed + forms of the words. + + There are APIs for spelling suggestions and query correction, so highly + motivated users could conceivably replace the defaults with more + sophisticated behaviors (for example, to take context into account). + + See :doc:`/spelling` for more information. + +* :class:`whoosh.query.FuzzyTerm` now uses the new word graph feature as well + and so is much faster. + +* You can now set a boost factor for individual documents as you index them, + to increase the score of terms in those documents in searches. See the + documentation for the :meth:`~whoosh.writing.IndexWriter.add_document` for + more information. + +* Added built-in recording of which terms matched in which documents. Use the + ``terms=True`` argument to :meth:`whoosh.searching.Searcher.search` and use + :meth:`whoosh.searching.Hit.matched_terms` and + :meth:`whoosh.searching.Hit.contains_term` to check matched terms. + +* Whoosh now supports whole-term quality optimizations, so for example if the + system knows that a UnionMatcher cannot possibly contribute to the "top N" + results unless both sub-matchers match, it will replace the UnionMatcher with + an IntersectionMatcher which is faster to compute. The performance improvement + is not as dramatic as from block quality optimizations, but it can be + noticeable. + +* Fixed a bug that prevented block quality optimizations in queries with words + not in the index, which could severely degrade performance. + +* Block quality optimizations now use the actual scoring algorithm to calculate + block quality instead of an approximation, which fixes issues where ordering + of results could be different for searches with and without the optimizations. + +* The BOOLEAN field type now supports field boosts. + +* Re-architected the query parser to make the code easier to understand. Custom + parser plugins from previous versions will probably break in Whoosh 2.0. + +* Various bug-fixes and performance improvements. + +* Removed the "read lock", which caused more problems than it solved. Now when + opening a reader, if segments are deleted out from under the reader as it + is opened, the code simply retries. + + +Compatibility +------------- + +* The term quality optimizations required changes to the on-disk formats. + Whoosh 2.0 is backwards-compatible with the old format. As you rewrite an + index using Whoosh 2.0, by default it will use the new formats for new + segments, making the index incompatible with older versions. 
+ + To upgrade an existing index to use the new formats immediately, use + ``Index.optimize()``. + +* Removed the experimental ``TermTrackingCollector`` since it is replaced by + the new built-in term recording functionality. + +* Removed the experimental ``Searcher.define_facets`` feature until a future + release when it will be replaced by a more robust and useful feature. + +* Reader iteration methods (``__iter__``, ``iter_from``, ``iter_field``, etc.) + now yield :class:`whoosh.reading.TermInfo` objects. + +* The arguments to :class:`whoosh.query.FuzzyTerm` changed. + + + diff --git a/docs/source/releases/index.rst b/docs/source/releases/index.rst new file mode 100644 index 0000000..cf63ae8 --- /dev/null +++ b/docs/source/releases/index.rst @@ -0,0 +1,11 @@ +============= +Release notes +============= + +.. toctree:: + :maxdepth: 2 + + 2_0 + 1_0 + 0_3 + diff --git a/docs/source/schema.rst b/docs/source/schema.rst new file mode 100644 index 0000000..a7d9fab --- /dev/null +++ b/docs/source/schema.rst @@ -0,0 +1,377 @@ +================== +Designing a schema +================== + +About schemas and fields +======================== + +The schema specifies the fields of documents in an index. + +Each document can have multiple fields, such as title, content, url, date, etc. + +Some fields can be indexed, and some fields can be stored with the document so +the field value is available in search results. +Some fields will be both indexed and stored. + +The schema is the set of all possible fields in a document. Each individual +document might only use a subset of the available fields in the schema. + +For example, a simple schema for indexing emails might have fields like +``from_addr``, ``to_addr``, ``subject``, ``body``, and ``attachments``, where +the ``attachments`` field lists the names of attachments to the email. For +emails without attachments, you would omit the attachments field. + + +Built-in field types +==================== + +Whoosh provides some useful predefined field types: + +:class:`whoosh.fields.TEXT` + This type is for body text. It indexes (and optionally stores) the text and + stores term positions to allow phrase searching. + + ``TEXT`` fields use :class:`~whoosh.analysis.StandardAnalyzer` by default. To specify a different + analyzer, use the ``analyzer`` keyword argument to the constructor, e.g. + ``TEXT(analyzer=analysis.StemmingAnalyzer())``. See :doc:`analysis`. + + By default, ``TEXT`` fields store position information for each indexed term, to + allow you to search for phrases. If you don't need to be able to search for + phrases in a text field, you can turn off storing term positions to save + space. Use ``TEXT(phrase=False)``. + + By default, ``TEXT`` fields are not stored. Usually you will not want to store + the body text in the search index. Usually you have the indexed documents + themselves available to read or link to based on the search results, so you + don't need to store their text in the search index. However, in some + circumstances it can be useful (see :doc:`highlight`). Use + ``TEXT(stored=True)`` to specify that the text should be stored in the index. + +:class:`whoosh.fields.KEYWORD` + This field type is designed for space- or comma-separated keywords. This + type is indexed and searchable (and optionally stored). To save space, it + does not support phrase searching. + + To store the value of the field in the index, use ``stored=True`` in the + constructor. 
To automatically lowercase the keywords before indexing them, + use ``lowercase=True``. + + By default, the keywords are space separated. To separate the keywords by + commas instead (to allow keywords containing spaces), use ``commas=True``. + + If your users will use the keyword field for searching, use ``scorable=True``. + +:class:`whoosh.fields.ID` + The ``ID`` field type simply indexes (and optionally stores) the entire value of + the field as a single unit (that is, it doesn't break it up into individual + terms). This type of field does not store frequency information, so it's + quite compact, but not very useful for scoring. + + Use ``ID`` for fields like url or path (the URL or file path of a document), + date, category -- fields where the value must be treated as a whole, and + each document only has one value for the field. + + By default, ``ID`` fields are not stored. Use ``ID(stored=True)`` to specify that + the value of the field should be stored with the document for use in the + search results. For example, you would want to store the value of a url + field so you could provide links to the original in your search results. + +:class:`whoosh.fields.STORED` + This field is stored with the document, but not indexed and not searchable. + This is useful for document information you want to display to the user in + the search results, but don't need to be able to search for. + +:class:`whoosh.fields.NUMERIC` + This field stores int, long, or floating point numbers in a compact, + sortable format. + +:class:`whoosh.fields.DATETIME` + This field stores datetime objects in a compact, sortable format. + +:class:`whoosh.fields.BOOLEAN` + This simple filed indexes boolean values and allows users to search for + ``yes``, ``no``, ``true``, ``false``, ``1``, ``0``, ``t`` or ``f``. + +:class:`whoosh.fields.NGRAM` + TBD. + +Expert users can create their own field types. + + +Creating a Schema +================= + +To create a schema:: + + from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED + from whoosh.analysis import StemmingAnalyzer + + schema = Schema(from_addr=ID(stored=True), + to_addr=ID(stored=True), + subject=TEXT(stored=True), + body=TEXT(analyzer=StemmingAnalyzer()), + tags=KEYWORD) + +If you aren't specifying any constructor keyword arguments to one of the +predefined fields, you can leave off the brackets (e.g. ``fieldname=TEXT`` instead +of ``fieldname=TEXT()``). Whoosh will instantiate the class for you. + +Alternatively you can create a schema declaratively using the ``SchemaClass`` +base class:: + + from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, STORED + + class MySchema(SchemaClass): + path = ID(stored=True) + title = TEXT(stored=True) + content = TEXT + tags = KEYWORD + +You can pass a declarative class to :func:`~whoosh.index.create_in` or +:meth:`~whoosh.store.Storage.create_index()` instead of a +:class:`~whoosh.fields.Schema` instance. + + +Modifying the schema after indexing +=================================== + +After you have created an index, you can add or remove fields to the schema +using the ``add_field()`` and ``remove_field()`` methods. These methods are +on the ``Writer`` object:: + + writer = ix.writer() + writer.add_field("fieldname", fields.TEXT(stored=True)) + writer.remove_field("content") + writer.commit() + +(If you're going to modify the schema *and* add documents using the same +writer, you must call ``add_field()`` and/or ``remove_field`` *before* you +add any documents.) 
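+
+A short sketch of that ordering (the ``summary`` field here is only an example
+name)::
+
+    writer = ix.writer()
+    # Schema changes must come before any add_document() calls
+    writer.add_field("summary", fields.TEXT(stored=True))
+    writer.remove_field("content")
+    writer.add_document(path=u"/a", summary=u"A short summary")
+    writer.commit()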
+ +These methods are also on the ``Index`` object as a convenience, but when you +call them on an ``Index``, the Index object simply creates the writer, calls +the corresponding method on it, and commits, so if you want to add or remove +more than one field, it's much more efficient to create the writer yourself:: + + ix.add_field("fieldname", fields.KEYWORD) + +In the ``filedb`` backend, removing a field simply removes that field from the +*schema* -- the index will not get smaller, data about that field will remain +in the index until you optimize. Optimizing will compact the index, removing +references to the deleted field as it goes:: + + writer = ix.writer() + writer.add_field("uuid", fields.ID(stored=True)) + writer.remove_field("path") + writer.commit(optimize=True) + +Because data is stored on disk with the field name, *do not* add a new field with +the same name as a deleted field without optimizing the index in between:: + + writer = ix.writer() + writer.delete_field("path") + # Don't do this!!! + writer.add_field("path", fields.KEYWORD) + +(A future version of Whoosh may automatically prevent this error.) + + +Dynamic fields +============== + +Dynamic fields let you associate a field type with any field name that matches +a given "glob" (a name pattern containing ``*``, ``?``, and/or ``[abc]`` +wildcards). + +You can add dynamic fields to a new schema using the ``add()`` method with the +``glob`` keyword set to True:: + + schema = fields.Schema(...) + # Any name ending in "_d" will be treated as a stored + # DATETIME field + schema.add("*_d", fields.DATETIME(stored=True), glob=True) + +To set up a dynamic field on an existing index, use the same +``IndexWriter.add_field`` method as if you were adding a regular field, but +with the ``glob`` keyword argument set to ``True``:: + + writer = ix.writer() + writer.add_field("*_d", fields.DATETIME(stored=True), glob=True) + writer.commit() + +To remove a dynamic field, use the ``IndexWriter.remove_field()`` method with +the glob as the name:: + + writer = ix.writer() + writer.remove_field("*_d") + writer.commit() + +For example, to allow documents to contain any field name that ends in ``_id`` +and associate it with the ``ID`` field type:: + + schema = fields.Schema(path=fields.ID) + schema.add("*_id", fields.ID, glob=True) + + ix = index.create_in("myindex", schema) + + w = ix.writer() + w.add_document(path=u"/a", test_id=u"alfa") + w.add_document(path=u"/b", class_id=u"MyClass") + # ... + w.commit() + + qp = qparser.QueryParser("path", schema=schema) + q = qp.parse(u"test_id:alfa") + with ix.searcher() as s: + results = s.search(q) + + +Advanced schema setup +===================== + +Field boosts +------------ + +You can specify a field boost for a field. This is a multiplier applied to the +score of any term found in the field. For example, to make terms found in the +title field score twice as high as terms in the body field:: + + schema = Schema(title=TEXT(field_boost=2.0), body=TEXT) + + +Field types +----------- + +The predefined field types listed above are subclasses of ``fields.FieldType``. +``FieldType`` is a pretty simple class. Its attributes contain information that +define the behavior of a field. 
+ +============ =============== ====================================================== +Attribute Type Description +============ =============== ====================================================== +format fields.Format Defines what kind of information a field records + about each term, and how the information is stored + on disk. +vector fields.Format Optional: if defined, the format in which to store + per-document forward-index information for this field. +scorable bool If True, the length of (number of terms in) the field in + each document is stored in the index. Slightly misnamed, + since field lengths are not required for all scoring. + However, field lengths are required to get proper + results from BM25F. +stored bool If True, the value of this field is stored + in the index. +unique bool If True, the value of this field may be used to + replace documents with the same value when the user + calls + :meth:`~whoosh.writing.IndexWriter.document_update` + on an ``IndexWriter``. +============ =============== ====================================================== + +The constructors for most of the predefined field types have parameters that let +you customize these parts. For example: + +* Most of the predefined field types take a stored keyword argument that sets + FieldType.stored. + +* The ``TEXT()`` constructor takes an ``analyzer`` keyword argument that is + passed on to the format object. + +Formats +------- + +A ``Format`` object defines what kind of information a field records about each +term, and how the information is stored on disk. + +For example, the ``Existence`` format would store postings like this: + +==== ==== +Doc +==== ==== +10 +20 +30 +==== ==== + +Whereas the ``Positions`` format would store postings like this: + +===== ============= +Doc Positions +===== ============= +10 ``[1,5,23]`` +20 ``[45]`` +30 ``[7,12]`` +===== ============= + +The indexing code passes the unicode string for a field to the field's ``Format`` +object. The ``Format`` object calls its analyzer (see text analysis) to break the +string into tokens, then encodes information about each token. + +Whoosh ships with the following pre-defined formats. + +=============== ================================================================ +Class name Description +=============== ================================================================ +Stored A "null" format for fields that are stored but not indexed. +Existence Records only whether a term is in a document or not, i.e. it + does not store term frequency. Useful for identifier fields + (e.g. path or id) and "tag"-type fields, where the frequency + is expected to always be 0 or 1. +Frequency Stores the number of times each term appears in each document. +Positions Stores the number of times each term appears in each document, + and at what positions. +=============== ================================================================ + +The ``STORED`` field type uses the ``Stored`` format (which does nothing, so ``STORED`` +fields are not indexed). The ``ID`` type uses the ``Existence`` format. The ``KEYWORD`` type +uses the ``Frequency`` format. The ``TEXT`` type uses the ``Positions`` format if it is +instantiated with ``phrase=True`` (the default), or ``Frequency`` if ``phrase=False``. 
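+
+For example, if you never need phrase searching on a particular field, you can
+trade that capability for a smaller index by turning off positions (a sketch;
+the field names are only illustrative)::
+
+    from whoosh.fields import Schema, ID, TEXT
+
+    schema = Schema(path=ID(stored=True),
+                    # Positions format: phrase queries work on this field
+                    body=TEXT,
+                    # Frequency format only: smaller, but no phrase queries
+                    comments=TEXT(phrase=False))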
+ +In addition, the following formats are implemented for the possible convenience +of expert users, but are not currently used in Whoosh: + +================= ================================================================ +Class name Description +================= ================================================================ +DocBoosts Like Existence, but also stores per-document boosts +Characters Like Positions, but also stores the start and end character + indices of each term +PositionBoosts Like Positions, but also stores per-position boosts +CharacterBoosts Like Positions, but also stores the start and end character + indices of each term and per-position boosts +================= ================================================================ + +Vectors +------- + +The main index is an inverted index. It maps terms to the documents they appear +in. It is also sometimes useful to store a forward index, also known as a term +vector, that maps documents to the terms that appear in them. + +For example, imagine an inverted index like this for a field: + +========== ========================================================= +Term Postings +========== ========================================================= +apple ``[(doc=1, freq=2), (doc=2, freq=5), (doc=3, freq=1)]`` +bear ``[(doc=2, freq=7)]`` +========== ========================================================= + +The corresponding forward index, or term vector, would be: + +========== ====================================================== +Doc Postings +========== ====================================================== +1 ``[(text=apple, freq=2)]`` +2 ``[(text=apple, freq=5), (text='bear', freq=7)]`` +3 ``[(text=apple, freq=1)]`` +========== ====================================================== + +If you set ``FieldType.vector`` to a ``Format`` object, the indexing code will use the +``Format`` object to store information about the terms in each document. Currently +by default Whoosh does not make use of term vectors at all, but they are +available to expert users who want to implement their own field types. + + + + diff --git a/docs/source/searching.rst b/docs/source/searching.rst new file mode 100644 index 0000000..ab4f2a9 --- /dev/null +++ b/docs/source/searching.rst @@ -0,0 +1,400 @@ +============= +How to search +============= + +Once you've created an index and added documents to it, you can search for those +documents. + +The ``Searcher`` object +======================= + +To get a :class:`whoosh.searching.Searcher` object, call ``searcher()`` on your +``Index`` object:: + + searcher = myindex.searcher() + +You'll usually want to open the searcher using a ``with`` statement so the +searcher is automatically closed when you're done with it (searcher objects +represent a number of open files, so if you don't explicitly close them and the +system is slow to collect them, you can run out of file handles):: + + with ix.searcher() as searcher: + ... + +This is of course equivalent to:: + + try: + searcher = ix.searcher() + ... + finally: + searcher.close() + +The ``Searcher`` object is the main high-level interface for reading the index. It +has lots of useful methods for getting information about the index, such as +``lexicon(fieldname)``. 
+
+::
+
+    >>> list(searcher.lexicon("content"))
+    [u"document", u"index", u"whoosh"]
+
+However, the most important method on the ``Searcher`` object is
+:meth:`~whoosh.searching.Searcher.search`, which takes a
+:class:`whoosh.query.Query` object and returns a
+:class:`~whoosh.searching.Results` object::
+
+    from whoosh.qparser import QueryParser
+
+    qp = QueryParser("content", schema=myindex.schema)
+    q = qp.parse(u"hello world")
+
+    with myindex.searcher() as s:
+        results = s.search(q)
+
+By default, the ``Results`` object contains at most the first 10 matching
+documents. To get more results, use the ``limit`` keyword::
+
+    results = s.search(q, limit=20)
+
+If you want all results, use ``limit=None``. However, setting the limit whenever
+possible makes searches faster because Whoosh doesn't need to examine and score
+every document.
+
+Since displaying a page of results at a time is a common pattern, the
+``search_page`` method lets you conveniently retrieve only the results on a
+given page::
+
+    results = s.search_page(q, 1)
+
+The default page length is 10 hits. You can use the ``pagelen`` keyword argument
+to set a different page length::
+
+    results = s.search_page(q, 5, pagelen=20)
+
+
+Results object
+==============
+
+The :class:`~whoosh.searching.Results` object acts like a list of the matched
+documents. You can use it to access the stored fields of each hit document,
+for example to display them to the user.
+
+::
+
+    >>> # Show the best hit's stored fields
+    >>> results[0]
+    {"title": u"Hello World in Python", "path": u"/a/b/c"}
+    >>> results[0:2]
+    [{"title": u"Hello World in Python", "path": u"/a/b/c"},
+    {"title": u"Foo", "path": u"/bar"}]
+
+By default, ``Searcher.search(myquery)`` limits the number of hits to 10, so the
+number of scored hits in the ``Results`` object may be less than the number of
+matching documents in the index.
+
+::
+
+    >>> # How many documents in the entire index would have matched?
+    >>> len(results)
+    27
+    >>> # How many scored and sorted documents in this Results object?
+    >>> # This will often be less than len() if the number of hits was limited
+    >>> # (the default).
+    >>> results.scored_length()
+    10
+
+Calling ``len(Results)`` runs a fast (unscored) version of the query again to
+figure out the total number of matching documents. This is usually very fast
+but for large indexes it can cause a noticeable delay. If you want to avoid
+this delay on very large indexes, you can use the
+:meth:`~whoosh.searching.Results.has_exact_length`,
+:meth:`~whoosh.searching.Results.estimated_length`, and
+:meth:`~whoosh.searching.Results.estimated_min_length` methods to estimate the
+number of matching documents without calling ``len()``::
+
+    found = results.scored_length()
+    if results.has_exact_length():
+        print("Scored", found, "of exactly", len(results), "documents")
+    else:
+        low = results.estimated_min_length()
+        high = results.estimated_length()
+
+        print("Scored", found, "of between", low, "and", high, "documents")
+
+
+Scoring and sorting
+===================
+
+Scoring
+-------
+
+Normally the list of result documents is sorted by *score*. The
+:mod:`whoosh.scoring` module contains implementations of various scoring
+algorithms. The default is :class:`~whoosh.scoring.BM25F`.
+
+You can set the scoring object to use when you create the searcher using the
+``weighting`` keyword argument::
+
+    from whoosh import scoring
+
+    with myindex.searcher(weighting=scoring.TF_IDF()) as s:
+        ...
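+
+If you only need a simple custom formula, one option (a sketch, not a
+recommended ranking function) is :class:`whoosh.scoring.FunctionWeighting`,
+which wraps a plain scoring function::
+
+    from whoosh import scoring
+
+    def simple_weight(searcher, fieldname, text, matcher):
+        # Score each matching document by the term's weight in that document
+        return matcher.weight()
+
+    with myindex.searcher(weighting=scoring.FunctionWeighting(simple_weight)) as s:
+        ...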
+
+A weighting model is a :class:`~whoosh.scoring.WeightingModel` subclass with a
+``scorer()`` method that produces a "scorer" instance. This instance has a
+method that takes the current matcher and returns a floating point score.
+
+Sorting
+-------
+
+See :doc:`facets`.
+
+
+Highlighting snippets and More Like This
+========================================
+
+See :doc:`highlight` and :doc:`keywords` for information on these topics.
+
+
+Filtering results
+=================
+
+You can use the ``filter`` keyword argument to ``search()`` to specify a set of
+documents to permit in the results. The argument can be a
+:class:`whoosh.query.Query` object, a :class:`whoosh.searching.Results` object,
+or a set-like object containing document numbers. The searcher caches filters,
+so if you use the same query filter with a searcher multiple times, the
+additional searches will be faster because the searcher caches the results of
+running the filter query.
+
+You can also specify a ``mask`` keyword argument to specify a set of documents
+that are not permitted in the results.
+
+::
+
+    with myindex.searcher() as s:
+        qp = qparser.QueryParser("content", myindex.schema)
+        user_q = qp.parse(query_string)
+
+        # Only show documents in the "rendering" chapter
+        allow_q = query.Term("chapter", "rendering")
+        # Don't show any documents where the "tag" field contains "todo"
+        restrict_q = query.Term("tag", "todo")
+
+        results = s.search(user_q, filter=allow_q, mask=restrict_q)
+
+(If you specify both a ``filter`` and a ``mask``, and a matching document
+appears in both, the ``mask`` "wins" and the document is not permitted.)
+
+To find out how many documents were filtered out of the results, use
+``results.filtered_count`` (or ``resultspage.results.filtered_count``)::
+
+    with myindex.searcher() as s:
+        qp = qparser.QueryParser("content", myindex.schema)
+        user_q = qp.parse(query_string)
+
+        # Filter documents older than 7 days
+        old_q = query.DateRange("created", None, datetime.now() - timedelta(days=7))
+        results = s.search(user_q, mask=old_q)
+
+        print("Filtered out %d older documents" % results.filtered_count)
+
+
+Which terms from my query matched?
+==================================
+
+You can use the ``terms=True`` keyword argument to ``search()`` to have the
+search record which terms in the query matched which documents::
+
+    with myindex.searcher() as s:
+        results = s.search(myquery, terms=True)
+
+You can then get information about which terms matched from the
+:class:`whoosh.searching.Results` and :class:`whoosh.searching.Hit` objects::
+
+    # Was this results object created with terms=True?
+    if results.has_matched_terms():
+        # What terms matched in the results?
+        print(results.matched_terms())
+
+    # What terms matched in each hit?
+    for hit in results:
+        print(hit.matched_terms())
+
+
+.. _collapsing:
+
+Collapsing results
+==================
+
+Whoosh lets you eliminate all but the top N documents with the same facet key
+from the results. This can be useful in a few situations:
+
+* Eliminating duplicates at search time.
+
+* Restricting the number of matches per source. For example, in a web search
+  application, you might want to show at most three matches from any website.
+
+Whether a document should be collapsed is determined by the value of a "collapse
+facet". If a document has an empty collapse key, it will never be collapsed,
+but otherwise only the top N documents with the same collapse key will appear
+in the results.
+
+See :doc:`/facets` for information on facets.
+
+::
+
+    with myindex.searcher() as s:
+        # Set the facet to collapse on and the maximum number of documents per
+        # facet value (default is 1)
+        results = s.search(myquery, collapse="hostname", collapse_limit=3)
+
+        # Dictionary mapping collapse keys to the number of documents that
+        # were filtered out by collapsing on that key
+        print(results.collapsed_counts)
+
+Collapsing works with both scored and sorted results. You can use any of the
+facet types available in the :mod:`whoosh.sorting` module.
+
+By default, Whoosh uses the results order (score or sort key) to determine the
+documents to collapse. For example, in scored results, the best scoring
+documents would be kept. You can optionally specify a ``collapse_order`` facet
+to control which documents to keep when collapsing.
+
+For example, in a product search you could display results sorted by decreasing
+price, and eliminate all but the highest rated item of each product type::
+
+    from whoosh import sorting
+
+    with myindex.searcher() as s:
+        price_facet = sorting.FieldFacet("price", reverse=True)
+        type_facet = sorting.FieldFacet("type")
+        rating_facet = sorting.FieldFacet("rating", reverse=True)
+
+        results = s.search(myquery,
+                           sortedby=price_facet,        # Sort by reverse price
+                           collapse=type_facet,         # Collapse on product type
+                           collapse_order=rating_facet  # Collapse to highest rated
+                           )
+
+The collapsing happens during the search, so it is usually more efficient than
+finding everything and post-processing the results. However, if the collapsing
+eliminates a large number of documents, collapsed search can take longer
+because the search has to consider more documents and remove many
+already-collected documents.
+
+Since this collector must sometimes go back and remove already-collected
+documents, if you use it in combination with
+:class:`~whoosh.collectors.TermsCollector` and/or
+:class:`~whoosh.collectors.FacetCollector`, those collectors may contain
+information about documents that were filtered out of the final results by
+collapsing.
+
+
+Time limited searches
+=====================
+
+To limit the amount of time a search can take::
+
+    from whoosh.collectors import TimeLimitCollector, TimeLimit
+
+    with myindex.searcher() as s:
+        # Get a collector object
+        c = s.collector(limit=None, sortedby="title_exact")
+        # Wrap it in a TimeLimitCollector and set the time limit to 10 seconds
+        tlc = TimeLimitCollector(c, timelimit=10.0)
+
+        # Try searching
+        try:
+            s.search_with_collector(myquery, tlc)
+        except TimeLimit:
+            print("Search took too long, aborting!")
+
+        # You can still get partial results from the collector
+        results = tlc.results()
+
+
+Convenience methods
+===================
+
+The :meth:`~whoosh.searching.Searcher.document` and
+:meth:`~whoosh.searching.Searcher.documents` methods on the ``Searcher`` object let
+you retrieve the stored fields of documents matching terms you pass in keyword
+arguments.
+
+This is especially useful for fields such as dates/times, identifiers, paths,
+and so on.
+
+::
+
+    >>> list(searcher.documents(indexeddate=u"20051225"))
+    [{"title": u"Christmas presents"}, {"title": u"Turkey dinner report"}]
+    >>> print(searcher.document(path=u"/a/b/c"))
+    {"title": "Document C"}
+
+These methods have some limitations:
+
+* The results are not scored.
+* Multiple keywords are always AND-ed together.
+* The entire value of each keyword argument is considered a single term; you
+  can't search for multiple terms in the same field.
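+
+If you need scoring, OR logic, or several terms in the same field, build a
+regular query instead; for example, a rough equivalent of the ``path`` lookup
+above::
+
+    from whoosh import query
+
+    with myindex.searcher() as s:
+        results = s.search(query.Term("path", u"/a/b/c"), limit=1)
+        if results:
+            print(results[0])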
+ + +Combining Results objects +========================= + +It is sometimes useful to use the results of another query to influence the +order of a :class:`whoosh.searching.Results` object. + +For example, you might have a "best bet" field. This field contains hand-picked +keywords for documents. When the user searches for those keywords, you want +those documents to be placed at the top of the results list. You could try to +do this by boosting the "bestbet" field tremendously, but that can have +unpredictable effects on scoring. It's much easier to simply run the query +twice and combine the results:: + + # Parse the user query + userquery = queryparser.parse(querystring) + + # Get the terms searched for + termset = set() + userquery.existing_terms(termset) + + # Formulate a "best bet" query for the terms the user + # searched for in the "content" field + bbq = Or([Term("bestbet", text) for fieldname, text + in termset if fieldname == "content"]) + + # Find documents matching the searched for terms + results = s.search(bbq, limit=5) + + # Find documents that match the original query + allresults = s.search(userquery, limit=10) + + # Add the user query results on to the end of the "best bet" + # results. If documents appear in both result sets, push them + # to the top of the combined results. + results.upgrade_and_extend(allresults) + +The ``Results`` object supports the following methods: + +``Results.extend(results)`` + Adds the documents in 'results' on to the end of the list of result + documents. + +``Results.filter(results)`` + Removes the documents in 'results' from the list of result documents. + +``Results.upgrade(results)`` + Any result documents that also appear in 'results' are moved to the top + of the list of result documents. + +``Results.upgrade_and_extend(results)`` + Any result documents that also appear in 'results' are moved to the top + of the list of result documents. Then any other documents in 'results' are + added on to the list of result documents. + + + + + + diff --git a/docs/source/spelling.rst b/docs/source/spelling.rst new file mode 100644 index 0000000..cc1abc8 --- /dev/null +++ b/docs/source/spelling.rst @@ -0,0 +1,130 @@ +===================================================== +"Did you mean... ?" Correcting errors in user queries +===================================================== + +Overview +======== + +Whoosh can quickly suggest replacements for mis-typed words by returning +a list of words from the index (or a dictionary) that are close to the +mis-typed word:: + + with ix.searcher() as s: + corrector = s.corrector("text") + for mistyped_word in mistyped_words: + print corrector.suggest(mistyped_word, limit=3) + +See the :meth:`whoosh.spelling.Corrector.suggest` method documentation +for information on the arguments. + +Currently the suggestion engine is more like a "typo corrector" than a +real "spell checker" since it doesn't do the kind of sophisticated +phonetic matching or semantic/contextual analysis a good spell checker +might. However, it is still very useful. + +There are two main strategies for correcting words: + +* Use the terms from an index field. + +* Use words from a word list. + + +Pulling suggestions from an indexed field +========================================= + +In Whoosh 2.7 and later, spelling suggestions are available on all fields. 
+However, if you have an analyzer that modifies the indexed words (such as +stemming), you can add ``spelling=True`` to a field to have it store separate +unmodified versions of the terms for spelling suggestions:: + + ana = analysis.StemmingAnalyzer() + schema = fields.Schema(text=TEXT(analyzer=ana, spelling=True)) + +You can then use the :meth:`whoosh.searching.Searcher.corrector` method +to get a corrector for a field:: + + corrector = searcher.corrector("content") + +The advantage of using the contents of an index field is that when you +are spell checking queries on that index, the suggestions are tailored +to the contents of the index. The disadvantage is that if the indexed +documents contain spelling errors, then the spelling suggestions will +also be erroneous. + + +Pulling suggestions from a word list +==================================== + +There are plenty of word lists available on the internet you can use to +populate the spelling dictionary. + +(In the following examples, ``word_list`` can be a list of unicode +strings, or a file object with one word on each line.) + +To create a :class:`whoosh.spelling.Corrector` object from a sorted word list:: + + from whoosh.spelling import ListCorrector + + # word_list must be a sorted list of unicocde strings + corrector = ListCorrector(word_list) + + +Merging two or more correctors +============================== + +You can combine suggestions from two sources (for example, the contents +of an index field and a word list) using a +:class:`whoosh.spelling.MultiCorrector`:: + + c1 = searcher.corrector("content") + c2 = spelling.ListCorrector(word_list) + corrector = MultiCorrector([c1, c2]) + + +Correcting user queries +======================= + +You can spell-check a user query using the +:meth:`whoosh.searching.Searcher.correct_query` method:: + + from whoosh import qparser + + # Parse the user query string + qp = qparser.QueryParser("content", myindex.schema) + q = qp.parse(qstring) + + # Try correcting the query + with myindex.searcher() as s: + corrected = s.correct_query(q, qstring) + if corrected.query != q: + print("Did you mean:", corrected.string) + +The ``correct_query`` method returns an object with the following +attributes: + +``query`` + A corrected :class:`whoosh.query.Query` tree. You can test + whether this is equal (``==``) to the original parsed query to + check if the corrector actually changed anything. + +``string`` + A corrected version of the user's query string. + +``tokens`` + A list of corrected token objects representing the corrected + terms. You can use this to reformat the user query (see below). + + +You can use a :class:`whoosh.highlight.Formatter` object to format the +corrected query string. For example, use the +:class:`~whoosh.highlight.HtmlFormatter` to format the corrected string +as HTML:: + + from whoosh import highlight + + hf = highlight.HtmlFormatter() + corrected = s.correct_query(q, qstring, formatter=hf) + +See the documentation for +:meth:`whoosh.searching.Searcher.correct_query` for information on the +defaults and arguments. diff --git a/docs/source/stemming.rst b/docs/source/stemming.rst new file mode 100644 index 0000000..9f1d738 --- /dev/null +++ b/docs/source/stemming.rst @@ -0,0 +1,217 @@ +======================================== +Stemming, variations, and accent folding +======================================== + +The problem +=========== + +The indexed text will often contain words in different form than the one +the user searches for. 
For example, if the user searches for ``render``, we +would like the search to match not only documents that contain the ``render``, +but also ``renders``, ``rendering``, ``rendered``, etc. + +A related problem is one of accents. Names and loan words may contain accents in +the original text but not in the user's query, or vice versa. For example, we +want the user to be able to search for ``cafe`` and find documents containing +``café``. + +The default analyzer for the :class:`whoosh.fields.TEXT` field does not do +stemming or accent folding. + + +Stemming +======== + +Stemming is a heuristic process of removing suffixes (and sometimes prefixes) +from words to arrive (hopefully, most of the time) at the base word. Whoosh +includes several stemming algorithms such as Porter and Porter2, Paice Husk, +and Lovins. + +:: + + >>> from whoosh.lang.porter import stem + >>> stem("rendering") + 'render' + +The stemming filter applies the stemming function to the terms it indexes, and +to words in user queries. So in theory all variations of a root word ("render", +"rendered", "renders", "rendering", etc.) are reduced to a single term in the +index, saving space. And all possible variations users might use in a query +are reduced to the root, so stemming enhances "recall". + +The :class:`whoosh.analysis.StemFilter` lets you add a stemming filter to an +analyzer chain. + +:: + + >>> rext = RegexTokenizer() + >>> stream = rext(u"fundamentally willows") + >>> stemmer = StemFilter() + >>> [token.text for token in stemmer(stream)] + [u"fundament", u"willow"] + +The :func:`whoosh.analysis.StemmingAnalyzer` is a pre-packaged analyzer that +combines a tokenizer, lower-case filter, optional stop filter, and stem filter:: + + from whoosh import fields + from whoosh.analysis import StemmingAnalyzer + + stem_ana = StemmingAnalyzer() + schema = fields.Schema(title=TEXT(analyzer=stem_ana, stored=True), + content=TEXT(analyzer=stem_ana)) + +Stemming has pros and cons. + +* It allows the user to find documents without worrying about word forms. + +* It reduces the size of the index, since it reduces the number of separate + terms indexed by "collapsing" multiple word forms into a single base word. + +* It's faster than using variations (see below) + +* The stemming algorithm can sometimes incorrectly conflate words or change + the meaning of a word by removing suffixes. + +* The stemmed forms are often not proper words, so the terms in the field + are not useful for things like creating a spelling dictionary. + + +Variations +========== + +Whereas stemming encodes the words in the index in a base form, when you use +variations you instead index words "as is" and *at query time* expand words +in the user query using a heuristic algorithm to generate morphological +variations of the word. + +:: + + >>> from whoosh.lang.morph_en import variations + >>> variations("rendered") + set(['rendered', 'rendernesses', 'render', 'renderless', 'rendering', + 'renderness', 'renderes', 'renderer', 'renderements', 'rendereless', + 'renderenesses', 'rendere', 'renderment', 'renderest', 'renderement', + 'rendereful', 'renderers', 'renderful', 'renderings', 'renders', 'renderly', + 'renderely', 'rendereness', 'renderments']) + +Many of the generated variations for a given word will not be valid words, but +it's fairly fast for Whoosh to check which variations are actually in the +index and only search for those. + +The :class:`whoosh.query.Variations` query object lets you search for variations +of a word. 
Whereas the normal :class:`whoosh.query.Term` object only searches +for the given term, the ``Variations`` query acts like an ``Or`` query for the +variations of the given word in the index. For example, the query:: + + query.Variations("content", "rendered") + +...might act like this (depending on what words are in the index):: + + query.Or([query.Term("content", "render"), query.Term("content", "rendered"), + query.Term("content", "renders"), query.Term("content", "rendering")]) + +To have the query parser use :class:`whoosh.query.Variations` instead of +:class:`whoosh.query.Term` for individual terms, use the ``termclass`` +keyword argument to the parser initialization method:: + + from whoosh import qparser, query + + qp = qparser.QueryParser("content", termclass=query.Variations) + +Variations has pros and cons. + +* It allows the user to find documents without worrying about word forms. + +* The terms in the field are actual words, not stems, so you can use the + field's contents for other purposes such as spell checking queries. + +* It increases the size of the index relative to stemming, because different + word forms are indexed separately. + +* It acts like an ``Or`` search for all the variations, which is slower than + searching for a single term. + + +Lemmatization +============= + +Whereas stemming is a somewhat "brute force", mechanical attempt at reducing +words to their base form using simple rules, lemmatization usually refers to +more sophisticated methods of finding the base form ("lemma") of a word using +language models, often involving analysis of the surrounding context and +part-of-speech tagging. + +Whoosh does not include any lemmatization functions, but if you have separate +lemmatizing code you could write a custom :class:`whoosh.analysis.Filter` +to integrate it into a Whoosh analyzer. + + +Character folding +================= + +You can set up an analyzer to treat, for example, ``á``, ``a``, ``å``, and ``â`` +as equivalent to improve recall. This is often very useful, allowing the user +to, for example, type ``cafe`` or ``resume`` and find documents containing +``café`` and ``resumé``. + +Character folding is especially useful for unicode characters that may appear +in Asian language texts that should be treated as equivalent to their ASCII +equivalent, such as "half-width" characters. + +Character folding is not always a panacea. See this article for caveats on where +accent folding can break down. + +http://www.alistapart.com/articles/accent-folding-for-auto-complete/ + +Whoosh includes several mechanisms for adding character folding to an analyzer. + +The :class:`whoosh.analysis.CharsetFilter` applies a character map to token +text. For example, it will filter the tokens ``u'café', u'resumé', ...`` to +``u'cafe', u'resume', ...``. This is usually the method you'll want to use +unless you need to use a charset to tokenize terms:: + + from whoosh.analysis import CharsetFilter, StemmingAnalyzer + from whoosh import fields + from whoosh.support.charset import accent_map + + # For example, to add an accent-folding filter to a stemming analyzer: + my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) + + # To use this analyzer in your schema: + my_schema = fields.Schema(content=fields.TEXT(analyzer=my_analyzer)) + +The :class:`whoosh.analysis.CharsetTokenizer` uses a Sphinx charset table to +both separate terms and perform character folding. 
This tokenizer is slower +than the :class:`whoosh.analysis.RegexTokenizer` because it loops over each +character in Python. If the language(s) you're indexing can be tokenized using +regular expressions, it will be much faster to use ``RegexTokenizer`` and +``CharsetFilter`` in combination instead of using ``CharsetTokenizer``. + +The :mod:`whoosh.support.charset` module contains an accent folding map useful +for most Western languages, as well as a much more extensive Sphinx charset +table and a function to convert Sphinx charset tables into the character maps +required by ``CharsetTokenizer`` and ``CharsetFilter``:: + + # To create a filter using an enourmous character map for most languages + # generated from a Sphinx charset table + from whoosh.analysis import CharsetFilter + from whoosh.support.charset import default_charset, charset_table_to_dict + charmap = charset_table_to_dict(default_charset) + my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) + +(The Sphinx charset table format is described at +http://www.sphinxsearch.com/docs/current.html#conf-charset-table ) + + + + + + + + + + + + + + diff --git a/docs/source/tech/backend.rst b/docs/source/tech/backend.rst new file mode 100644 index 0000000..a68cbdd --- /dev/null +++ b/docs/source/tech/backend.rst @@ -0,0 +1,175 @@ +============================== +How to implement a new backend +============================== + +Index +===== + +* Subclass :class:`whoosh.index.Index`. + +* Indexes must implement the following methods. + + * :meth:`whoosh.index.Index.is_empty` + + * :meth:`whoosh.index.Index.doc_count` + + * :meth:`whoosh.index.Index.reader` + + * :meth:`whoosh.index.Index.writer` + +* Indexes that require/support locking must implement the following methods. + + * :meth:`whoosh.index.Index.lock` + + * :meth:`whoosh.index.Index.unlock` + +* Indexes that support deletion must implement the following methods. + + * :meth:`whoosh.index.Index.delete_document` + + * :meth:`whoosh.index.Index.doc_count_all` -- if the backend has delayed + deletion. + +* Indexes that require/support versioning/transactions *may* implement the following methods. + + * :meth:`whoosh.index.Index.latest_generation` + + * :meth:`whoosh.index.Index.up_to_date` + + * :meth:`whoosh.index.Index.last_modified` + +* Index *may* implement the following methods (the base class's versions are no-ops). + + * :meth:`whoosh.index.Index.optimize` + + * :meth:`whoosh.index.Index.close` + + +IndexWriter +=========== + +* Subclass :class:`whoosh.writing.IndexWriter`. + +* IndexWriters must implement the following methods. + + * :meth:`whoosh.writing.IndexWriter.add_document` + + * :meth:`whoosh.writing.IndexWriter.add_reader` + +* Backends that support deletion must implement the following methods. + + * :meth:`whoosh.writing.IndexWriter.delete_document` + +* IndexWriters that work as transactions must implement the following methods. + + * :meth:`whoosh.reading.IndexWriter.commit` -- Save the additions/deletions done with + this IndexWriter to the main index, and release any resources used by the IndexWriter. + + * :meth:`whoosh.reading.IndexWriter.cancel` -- Throw away any additions/deletions done + with this IndexWriter, and release any resources used by the IndexWriter. + + +IndexReader +=========== + +* Subclass :class:`whoosh.reading.IndexReader`. + +* IndexReaders must implement the following methods. 
+ + * :meth:`whoosh.reading.IndexReader.__contains__` + + * :meth:`whoosh.reading.IndexReader.__iter__` + + * :meth:`whoosh.reading.IndexReader.iter_from` + + * :meth:`whoosh.reading.IndexReader.stored_fields` + + * :meth:`whoosh.reading.IndexReader.doc_count_all` + + * :meth:`whoosh.reading.IndexReader.doc_count` + + * :meth:`whoosh.reading.IndexReader.doc_field_length` + + * :meth:`whoosh.reading.IndexReader.field_length` + + * :meth:`whoosh.reading.IndexReader.max_field_length` + + * :meth:`whoosh.reading.IndexReader.postings` + + * :meth:`whoosh.reading.IndexReader.has_vector` + + * :meth:`whoosh.reading.IndexReader.vector` + + * :meth:`whoosh.reading.IndexReader.doc_frequency` + + * :meth:`whoosh.reading.IndexReader.frequency` + +* Backends that support deleting documents should implement the following + methods. + + * :meth:`whoosh.reading.IndexReader.has_deletions` + * :meth:`whoosh.reading.IndexReader.is_deleted` + +* Backends that support versioning should implement the following methods. + + * :meth:`whoosh.reading.IndexReader.generation` + +* If the IndexReader object does not keep the schema in the ``self.schema`` + attribute, it needs to override the following methods. + + * :meth:`whoosh.reading.IndexReader.field` + + * :meth:`whoosh.reading.IndexReader.field_names` + + * :meth:`whoosh.reading.IndexReader.scorable_names` + + * :meth:`whoosh.reading.IndexReader.vector_names` + +* IndexReaders *may* implement the following methods. + + * :meth:`whoosh.reading.DocReader.close` -- closes any open resources associated with the + reader. + + +Matcher +======= + +The :meth:`whoosh.reading.IndexReader.postings` method returns a +:class:`whoosh.matching.Matcher` object. You will probably need to implement +a custom Matcher class for reading from your posting lists. + +* Subclass :class:`whoosh.matching.Matcher`. + +* Implement the following methods at minimum. + + * :meth:`whoosh.matching.Matcher.is_active` + + * :meth:`whoosh.matching.Matcher.copy` + + * :meth:`whoosh.matching.Matcher.id` + + * :meth:`whoosh.matching.Matcher.next` + + * :meth:`whoosh.matching.Matcher.value` + + * :meth:`whoosh.matching.Matcher.value_as` + + * :meth:`whoosh.matching.Matcher.score` + +* Depending on the implementation, you *may* implement the following methods + more efficiently. + + * :meth:`whoosh.matching.Matcher.skip_to` + + * :meth:`whoosh.matching.Matcher.weight` + +* If the implementation supports quality, you should implement the following + methods. + + * :meth:`whoosh.matching.Matcher.supports_quality` + + * :meth:`whoosh.matching.Matcher.quality` + + * :meth:`whoosh.matching.Matcher.block_quality` + + * :meth:`whoosh.matching.Matcher.skip_to_quality` diff --git a/docs/source/tech/filedb.rst b/docs/source/tech/filedb.rst new file mode 100644 index 0000000..439c30f --- /dev/null +++ b/docs/source/tech/filedb.rst @@ -0,0 +1,29 @@ +============ +filedb notes +============ + +TBD. + +Files created +============= + +.toc + The "master" file containing information about the index and its segments. + +The index directory will contain a set of files for each segment. A segment is like a mini-index -- when you add documents to the index, whoosh creates a new segment and then searches the old segment(s) and the new segment to avoid having to do a big merge every time you add a document. When you get enough small segments whoosh will merge them into larger segments or a single segment. + +.dci + Contains per-document information (e.g. field lengths). This will grow linearly with the number of documents. 
+ +.dcz + Contains the stored fields for each document. + +.tiz + Contains per-term information. The size of file will vary based on the number of unique terms. + +.pst + Contains per-term postings. The size of this file depends on the size of the collection and the formats used for each field (e.g. storing term positions takes more space than storing frequency only). + +.fvz + contains term vectors (forward indexes) for each document. This file is only created if at least one field in the schema stores term vectors. The size will vary based on the number of documents, field length, the formats used for each vector (e.g. storing term positions takes more space than storing frequency only), etc. + diff --git a/docs/source/tech/index.rst b/docs/source/tech/index.rst new file mode 100644 index 0000000..196d18f --- /dev/null +++ b/docs/source/tech/index.rst @@ -0,0 +1,9 @@ +=============== +Technical notes +=============== + +.. toctree:: + :glob: + :maxdepth: 2 + + * diff --git a/docs/source/threads.rst b/docs/source/threads.rst new file mode 100644 index 0000000..981a967 --- /dev/null +++ b/docs/source/threads.rst @@ -0,0 +1,74 @@ +==================================== +Concurrency, locking, and versioning +==================================== + +Concurrency +=========== + +The ``FileIndex`` object is "stateless" and should be share-able between +threads. + +A ``Reader`` object (which underlies the ``Searcher`` object) wraps open files and often +individual methods rely on consistent file cursor positions (e.g. they do two +``file.read()``\ s in a row, so if another thread moves the cursor between the two +read calls Bad Things would happen). You should use one Reader/Searcher per +thread in your code. + +Readers/Searchers tend to cache information (such as field caches for sorting), +so if you can share one across multiple search requests, it's a big performance +win. + + +Locking +======= + +Only one thread/process can write to an index at a time. When you open a writer, +it locks the index. If you try to open a writer on the same index in another +thread/process, it will raise ``whoosh.store.LockError``. + +In a multi-threaded or multi-process environment your code needs to be aware +that opening a writer may raise this exception if a writer is already open. +Whoosh includes a couple of example implementations +(:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`) +of ways to work around the write lock. + +While the writer is open and during the commit, **the index is still available +for reading**. Existing readers are unaffected and new readers can open the +current index normally. + + +Lock files +---------- + +Locking the index is accomplished by acquiring an exclusive file lock on the +``_WRITELOCK`` file in the index directory. The file is not deleted +after the file lock is released, so the fact that the file exists **does not** +mean the index is locked. + + +Versioning +========== + +When you open a reader/searcher, the reader represents a view of the **current +version** of the index. If someone writes changes to the index, any readers +that are already open **will not** pick up the changes automatically. A reader +always sees the index as it existed when the reader was opened. + +If you are re-using a Searcher across multiple search requests, you can check +whether the Searcher is a view of the latest version of the index using +:meth:`whoosh.searching.Searcher.up_to_date`. 
If the searcher is not up to date, +you can get an up-to-date copy of the searcher using +:meth:`whoosh.searching.Searcher.refresh`:: + + # If 'searcher' is not up-to-date, replace it + searcher = searcher.refresh() + +(If the searcher has the latest version of the index, ``refresh()`` simply +returns it.) + +Calling ``Searcher.refresh()`` is more efficient that closing the searcher and +opening a new one, since it will re-use any underlying readers and caches that +haven't changed. + + + diff --git a/files/whoosh.svg b/files/whoosh.svg new file mode 100644 index 0000000..45b3db9 --- /dev/null +++ b/files/whoosh.svg @@ -0,0 +1,434 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/files/whoosh_16.png b/files/whoosh_16.png new file mode 100644 index 0000000000000000000000000000000000000000..b0db497f83f36a2641dc82c5386da16044126a15 GIT binary patch literal 909 zcmV;819JR{P)9Tj5+;JGyjO8BJR&GwtJ| zphB`B`_7W0FQK5TvZ#=#3n?P13&Wez63e0@B<*Tsi%QFa3bjDi*>S7Qedsy=^Y5l0 zEW#JwyZ858y!;-9Qi_R8VX=n?P(xI8vyS760I&d9Zi28!Bcyp`Bx=J?0`M}8kWPb+ow0P$LUXR@Kv0q(2m&Yt|KEVZV$XCPu@QzoZFFz6wN!q~ZU2cppt+-hkdNkwIK99P=o z@Am)x9kZ~stdbD*hN`LuL2)rNGgHXU%0%zmcc`h^17~h7j-NaUc{oId1_wj39JEtP zmEQrZ?^)-aF-2KA$CZD+Zr3JZ5<|7m{R7y&yBf2cPCR=27>RfSRU0aGldaaVKVkgR z5gnH=ti0s1q^;|?iiV6R%jtAlj+leH_uA3d_a3ze55Vm%!c<#Uc)Tkt0JHVu%j#6$ z0}f5ysF(Gqtv$@%YH3mLKYY|$Qc?n^(}_ql3O&!i7$3l)zpol?*F-oA@uR8;?*F51)EODi{RmfYUOUP>v(FGG)SjUj7h2=hz*$akMYYWiyc@RpXX;|+Wx&-1Uo jeUpy#_`6^Kt1mwR5!PPkT!x_D00000NkvXXu0mjf0Y#$j literal 0 HcmV?d00001 diff --git a/files/whoosh_35.png b/files/whoosh_35.png new file mode 100644 index 0000000000000000000000000000000000000000..7fc767528922a74ea3693f8fe41020e5842dbd47 GIT binary patch literal 3231 zcmV;Q3}Ew#P)500004b3#c}2nYxW zd-;BK~z|U#nyRrRCSiX@$ajBPpVQ>B8dS53E&dM5E8HvMK*00 z6i*KdZ8IneZiu4TA}Wf|ZsfykS--j5)sjU8xw7Nh*0ALJbJT-T^onh^4J$`6YkW4HDJUHT zBzNl@&$(uOU1iXD^T;deNvYLJM@I)89Wfdk{lsE1R8{55 zE3d?8u?CMr&Ws5M-{K$dT+PCV?Tney$+&ylFNF+ickw8%t6h$G`*@+;Xr+)qEBGPszTi0F7TTj|q@Krm#E?kV; zHxp?4XZG_iyy!nYZujV(-3zUoHf=&t6#Dh+M?4;9{rdI9Vlj$~i!qzchzN;z91%&K zQry$5TSTPg2_CsAn^%{I=~F%fx9^TKLN4n6WI;jK>s)rLbNTY+j2}Oa4?p}buD^aX zfk1%P)>b+@JK48yAE{IdolZwI7Q^H5G@KdJ()b$l?r`&$869*je48xy>;DV^a6wtw zcyEs99)qGd)~;R4^y&9=!wqAouHHZ@m1geTIVg$(Kr)#m5D4IQyJ_(^@W5@i^f(iy zKmJ2y)bCRmJ;{$JZ^oG*=a-jX?65l)xNKI}?p?bmE-q&D=xbTIaurLLe#YX(izzJZ zMoCEtnx>&?8nIXmuh&bkp`OdURxdF0_^(7@S+jvm-|~u!^m_du zr^8XC*Xz4!nwF($TBoY2CbPkKahL2Y%icYE=-$0M6%`fK*B>Dki=wJ3KA#Vh$w)&( z1Dd9h$z%Y?Xd2FVJGMo0Je=SEe!OQb|0PDjU>M4V6*S-VeCo!luN$e?>E@3cJJvjC zV1>)+a3~gw1<{0LGQsqR9wysvW&i&Dbm`KCAwz~xQ&YpXZChEm@MS71E9u_78>*_( zyLT_vuKfv3($d&fS1ge$1hsjOowzD*ZD1IlTM$Q(4bH{kQSOJ$op6 zy`EGmMQdv-!C;VJFv#a${SQcn*4B2+W;25a4<;B4vUcrSR8?ijkjof3as*XXRZN*O zg*V@PgWKHM05U}dxo+*C%-No)zeXrn^mlhb#%F9&} zi3B1-G#Vumi4cuOsoj5oJ$v_|rPD;CQ3ebcKub#twY9Z$>(&i`SS-epB_DC{U>!4N zJj6TiEW%>3(63)V_SMyer%#^rgYMGNN00t(>#~8Kr!?@i`|i7|v#^(b#>`p7<8f3~ z#b7XyN~K69lO&T#+9MH`E?b5kjX)qkpFVv^Bofrs)nPOm$*mYy(Q+8<-faW85+FO3clwUmf-yB(4<+n|qLLd+zlgVH(7%&(N zCm_jWl28Bf-zb?3?cp#vIXM9M{eFx_BZ{JsPN%Wi?39$0ke{EA*Xt!OFOSN~fn-o( zZQ=GWc5L1-57Ab>8D(UimW^;$te0S2RysS_tK=G%qJd(S?VeNIvLyEkvys(!fSBMQ29#cVc{ zm6e6d<-%sO5s5}gsVc85SU_7ij9#aZmzRgl=@Rq?nj7j5Z`rW!*AM`cDUa4>s2Cmw zY5;W6XmrSfGiUAHxofvJea1uD6X9ykO$2Cp=@Cz$sd-(GbGm!Z@7)WV)r!;U#OZV% zgT%=3c479Tv(W1eeDcXBMy-zy7$(H2JpMYzGe2(XCsz6A-)IPTk>p`kZ$z_un@S 
zpU=lDufK-bVv&qa$9cVbVbUqsY&Jy1bXrQJbt+3<&ZcsNpzzi4X#+q_byZv6@`?+_ z-EwOljl=)&%F61T-ovk=H54G*=}Pb1y*D+Wte-x12la6 z?aI~F1sCDS&L+HL3!jdz)E9QmGaNW{D4m_1O=o8(S6}rIQlEeQO`>mUU#rXIqQ1VKM`u65RU=0L;PH6K$;rXv@!<7(i6`RN zoX+}Fx2J6mHES;6$Inf?SKWl$JN@L_r(#mcVFSCJ^%8fFg8XyfkvGASjcH#E_wM}6 z{O5EJnLFo6nwp#C_B-xs_T~DlW5$fh+OlN}9*+l?%Z15g!t3?2b^DH#s%E}FwsIHR zLkn2eZ`d{rdG%Q&S_MP)KgR<<`))?b}C6OVbz9 zRNo?hyE7qwu#2q!J}m8_dro@qiSct~qTva=Ki`Sj3S*}por}jvFPg_hdnJ#({03ir zwv-(oyvAqB0lj_RtwRR;%2Nh|0gKs;PN&0UGGVvdQPm7OO>>^#UjG3v-;=|J6_9&2 zyuY!HY~RDU+^_w*uTQQ_sPTXd_lbP`>d}fMIz%3yAcz0hQ!c%1s7(CJ&Dx@}T#+(F z%8<4)gG`+?QT%?t?Ao>e!pK5iG)m@GFdiXT5$B)3ZuyL(^3*@YL;~RlG9b> z_&9!Xo^1LdB%^zZq&r2He=2fq50OffNbR=Px8MGco0om`_eIiFACPD6?u?8olyR3` zEVZ?@Qdd_eM~)nkU@$1DR7yTs@^^V|a37K3K9RY%cZ&Z|P}+jeiHPG&ke@LvO)tpI z8#Q@+g2?!?qhB!gG(RjYn@=7v9wO>Nu0T|%Ld1OfpGhr^Oir)A}e z6*B3jvFYT_O+g9zcStlmQ$%uq3vn{WB%#Izx!5MsUlEydxyTpqwMwYzh$Q2qf01xri7}mr&Y&fMf9G!?x6H-KwgpDz*C$$hZj;TQ9kM_?*)I1C0L);`mS` zok|jqMS)jwc@_Yn({}nv(YL(9k)4&bT5m9Tb&B#`Fcg|oQ(aa2TY0|~{|k~qvv+pN RW+DIp002ovPDHLkV1j*ENJszx literal 0 HcmV?d00001 diff --git a/files/whoosh_64.png b/files/whoosh_64.png new file mode 100644 index 0000000000000000000000000000000000000000..a026b6d00890cc095664d3e9b357dfa2cfb76711 GIT binary patch literal 7708 zcmWkz1ymGm6s1{8Ku|hXBvx<<2|+-*ySqbr=@98or8^gpr36H}Ttd1*8l+k2lK$tP zb7tnuoH;Y!``&%`-TO_nnu;tDJ~ciX8XA$joRkJ|7W;n(;Q&X3o$Cs4!g808*8%~X zKgcQyxWAM5Z&;7rnOR#1805_>Tr1d;BU2HtO&E2fgyuH1lcFqp&mgX>PsEeCz z_MtE}8X8@uyp*_>PtI1hx4*XZ&Eu-==mE1p`NTj>v}bK) zOUMfjD5~zV!MryFrBI9UA}UIYZxW-hgksYDR?xO1aA`A1Oij!?Fz_yK&cWm=&)s%A zXUsOwciT5FvqBDX{iuG}gu2{I-AbLyd@$|&^=PpDa53?3f1fFq`0smIfalDYPPRu$ zBDJ7HF;+%8IfrW{T#o`;HU~|G;Ly%(0%4w=b?(&EnMderNOVo**Bh^^LlLHjMQSsg zK8$pW-E{Euv{$De4Ie^*Swc*srE6Vh@!ok?8gzkaMs>_bTBM>+Z^|`BP8jiEpswy~ zXmV(F+ebu$x;j`CC~ux;e>9}#qR)saG!0YB#q!6Y5hZo}hox{Ssci=j!#FUU&;2}& zm-J3tN0rhOOh{*I_5(!NPG1PcHo@V~`5Pyo@K=-#wq+HJ?t|WQnZ-l~q5) z^^#=}{_vT}k*61>!II&@Z1Sq->S&H#;El+^At_Qei&ZIuE#NVdrj6n5_I(XMFYoXN zrz;22aiR7O=g2i_MBPHj;jr_`q#Y_?*p0-E5z8y2XX%P z>!_+I6nxwveEzT@pT?nYF`U6|>7uErnUKOWZLl-PxW*4Rl4C14oh1tqcplVvl78u% z*?nS=E8_olT%iB$p^`to+f`vM#i0~^c6wmYPl$A~&PGQUbedeVtAt4O|w?#2@GCEpQbxAvxS76uCprp?3nSX0c zU!8>sIKCHd#`cPvULg0)n_q*!x-4o7?5I6GJ+*XonZm^8JhQU0zSh*RlH-HJ%uY84 z^oDI{p%f~O?y9nhB6Jvj7Q5^d4N4C@R5~5r=jqO$*3$%&CMbgKHMq`&!dh~EJ=J-l zg_bHN@)-^OBZb!}FYKiD0r8fZIkD||ov<{u{d(~Up7%k{X*tAN6C+2dp>SisYmc>d%Y5CX!LDYbMo?QcviU-S9LN)&}a7dGJh4fx;y!_3I<< zZ?7!*v-f7JXliR~|J?@wF-4dI^W1GlqOGlNAiK!*O^<^zB6#%(Q!HbXb2cda>%N|i zDe*i0VEd5Uyl{B|j62+>&?thBgGtHnJid~jl2!E5PVv2wj82TTzB*iXoUg-Ho|02i zf;-Gr_w@H$*3Xag#%E6n^TfTpR7mA>0@Peh*|%@Nc=u*1Ch~GIQKmT(GbA-7WPv$r;QOy{!nU(k9QlU;B zAv%PzPlDh7h3^9x8@$Fa#EdDk-$m4wU4}_(HeMhe9q~f6anX_6*2)@Rc}nkoMpsKm zXR<_*A}`=Z9_I<|ZwwjRx$4~EY=L~^RxflZBKodE$;jQsMozSrYRd+L2~WDtvB;vR zcEvP~J+1^2)|0$?ZV#HkR*|}C<&S8i)K!c*{vSKdqoY5p(?5)VcM_AJyDIN#9iIDMeOcC25%PI;%g5=Ri4wnc0 za9WfxHl{-$PVdgf{=nY8g{?6Tr3dyHy0njkUCF_+svdPfj5po{tXQfX^zJ1#Ym*389{6!qdE#6aReY{gxR&-2G* z4GgHWO4UttR8;qhQKgAk8Y;Mc^tYn#t+C2Tj`!Nyx&HK-@FBanE?@KvlJWf-ps`>n zvfmRUB(c{Fk;7q+Ic;0q`;14QsG*}Hqpwe;Q;`aXzZCM^59_tSVdS4h_Qlhj+}^8u zCMG7b3JT%{?eFhD!2_nz%F0U3*jPDqtp6yL+Q&)M=u>1!kr9!w9}gBA<;~04K=_|4 ziPi3LhXS-bM@)XYCd#*_pb2C36c5G(xqTGYi&}%qmwkR;tE#rTf-&_tp7XqUBQGxx zhCrgl80Fs5_HDUw`~QQ@R_ccqu>nNJqLLFu62c^#pk`#Wq;g^S-qO-CIy!nilEJ$G zjY18x0sQMxW-{>K%doc)j8hVOe1~+IYH0S8h!v0PP1KJ=_NB)Hl0Q6a9AbNu&Gq~R zfj=O8eBnJ|QhzKgU0aPoV4%d5r^Y>VJE8yH@iln9?3Wg9{&ah3&Yvx>pirb+WAc{v zi6cJ^Ha52KNsq)~=c9>p+X-bL>SWi=Q>wb>B-HX#_~!8%K%wu3Jx$H5E$0FY-MYk zoRdSFyMdamLUIu%B|XjLcY39-pJ`=njZt7{uEavN-0HniUY57Ho$Vq%ckz`4D2b$` 
zq@OE4@56pldk-cRN6o$tZG9x|evY>hOvWt5>qNXbcey!xmQwX1EmA{9Q(b>xxb@KL z=k>KQ9tTpa(Y_I(vk_Na{!@ZI~y={?%4FU#3>tQXx~V8D6XBebDwi# za(1?&j0_GmJr)^BT<8pMdzA^RiH>Qs(gHz#PW$=({7g z)lXNli%MC%c-hONvkU=N7DuDQ^C!+9jTF81Jr2&E#hhm6PofdCXmZ{!YIA7S`6!mU zMNADO^|~Zweh~*igu0NS&Q=0i!q^US=ZqADRb*ddM-A{zqUbGc{eV>5lsrHoBO@aK z0R1lZr4wiaH)4Rw{>CUOuybHIV0oOzY{=PWL4;Y-_`ay zSm*`7>)N{f7s7!BiC?7uNLy{n{NV!`RzRypiQ*CI_Y(#o1Dq)$ML5bB3P? zr^qA6o}JzF+}zaoxY%3T1bDF)-@ecH5a-)Z2XizZ?CpO#H5Q?$|8L+UHb7Yn3=aOZ z;!!s+PypIG0^!z2Wy=`$fntg#V!}BO)iN4K<$c^s;lReuUi|%g%J8t#%>_zUTANSS z9yKsL><{H*QiNTCBL?}2>Y(%S4Ciuhkiit@)&#m+0)*+C>ge>OJuCW>Xr?|`fVA_^FuFg-EQLYye3 zysh>ygV8k0R4jjeHV0-if`~>&(vsO zAn>s+QS?t<>(QT(la1rsdo3-k0+b)QkHg`CRI#b4>6mSWjwLt1#+B1EGeGvf%{!j& z&k$&9Lx5P}a4c=@VnGowR%T`|50nEt*Y&>< z>{g4SY~`bF%^tC7@j9T+J!@!a$Qar7S_3~FTRiZ(xjaymmc~|AR{nSY@HN(cv zuO$Qo{_WkQM~p409D^`wvg}VEc@3LooSrA7#9sCHlOT$E3Vaz}R>W4%Nf}6=3~k4% znfw1eHucT93yo=(!HsFkhCk{WzmY`FhpCajVrN&^23Q%MhE|dk5fS0FwrJyq(ajhv zh6dzCNr}*_H5c}l*4RL@kOWYFEA1c2o(`ZMf` zUL~Nu*I|@pQves@&&T0F~}H_0|N z=}9rKm@4fe3GIXe`%7IClzk(EmOb9No1G^NC=Fp2zvC}Wo5;tH%e!C0Y%+qz#JotD ztSzY0VYhMb(RCSV`SH4<(};!&+)4ytj*llze9g=p-qIp;7x0MJJtx5H_Vp`hnd{r4 zqk3n@v$j$@D;t}nj0`zrNv2f3hRN}G7sA-RB)V1qDf1Qr@II_5(S+68{{Ff0a;Dp9 z1e-*ZuWzVZe7A3?6u1ZK3g(g6iHG1a%U+iUAc$Ay+>jmr^>TkyKN zr?I^u{Ttm!S7i9%DxeO;4XEVVv!32wbzR-a#XZLz<^2|wM(1T%vys-!th(VU+Ho5F z@q>B)EFD;dBfd#Qhy-s}gU9eFy>TXp7RfaU?fk2OmOv}`6x+F0uV^%84z;z)04;$% z24tpWz;pN*BiqEAfx!M;-RfUXu#&Edg7eha$oaG+Kq1|XEXn1|2}DvFx+PoBrl)6I z2c#+foL%d;I@8%CQ!|T!x~Xh)f_p7W+7JsLPOq`KZX;i3(_iDQ$>PC!={-F)c@pN( z*8GDW?r+vF{3dvb>gVH@ompSMCX!b7aX(qSLzSd|ku#o)chLq?kAP-|O%L*#(TOQWkRQ z=#V+j+2@XGYicCVj7zR^QQ1x7g=ju8 zkf3?T1V7g*^w^2ID+CC@j z;Ve{(I!#In)Y47M=kVc*3;4V0>QO$nFFqs~`XON)i*{hOR?@$Hokk{sQl(5K7wETP zNb0~G!DPDk55F*qKf7lKoqJHg$i9h*5AYHqm(^B5ayM32@zipy%PJ0GzZJwP=6+8v zDaq>#=IId=P;eBu?~G1QPk(c~#;4_H=g*D@${Q~0M7JXLc{q;nw9#ctPFuP)7oJaw ze!ku{`YW|a>Jk!uqu;8aF<9|TTK{Ax(ze#_?0^1^DdPHgXykS9yIWHReMf`fodEJj z!&Aw}sRGhs8->jw+snfextID>dp(VgO1~ZX|9X8qpR#YTsMQ1967R=P~&Ok9h_GMm+8EO;lO_j^@B ztP@O<4#!YjBt1r^DT5(fTQNnI8P_|w^DR#AINR>a??3ri* zzw00=08ZSb)gM0X41a`TWC&TiZTS}CzN~#1`MULL<&LjDAV*roi(e9V=j0z5ifX)u z6PixYxth4_oQUv4m0@`uKng00LROVChmXpXEdh(z)6?@gD{GKQU?ps5uV>FgZR)p_ z-qfDA4a0Cu$m&k4Uu>CDQ1z>fgC9WW-4XOo!+)+ko52YS9iRK#}ACJ2CVBa=FA4v#tAg>@L zKRW1v*DW6O0XS(ez{3jlZ+>vN3r6DoR2Vli^raXyga0a!|$1QTx0=s zFDX@e44OM3A#a>I+au%F2L~o{ zeDlUdK!o~y5*5AnXZFgD_g?W&K0o&u92x@T7+_8c*zl)HRr_rkNXa(L&Ai?(d9mBn z<0bY>`uhu?oSeLT`SNo{Miii5?d#1O=7rvdy5|r|4wiESYOgtkb?^Kfcf|+Toz8Lb zacTB<8zwF>uc*OJflI!qk+3<&o1@lzK=sonMfxyG@05qDo~Mu)wFZQ)>`TJSO4eb& z1yPm3K=kUSrinvaFGNJ>Dn|OA&8d`@muH$IBPYsay|u8HoW0+zRWXZ-(&vxODTrN< zwbkHC1}5%n_#Afsn!M^~B_wPI`TQuxdEfYM33l~~nEV!_yJKVQT{?04>y01TE*4TY zJSr8~j$XoupFn9Y9Od|O=2sgy#nB2$0sV9dN&)0xLhl6_j9xZ1@J!cK@{=(gSzo?v z*HOQDV$PwE`u_$>O`TLHo2W1n8`@(%p| z&M7~hl1K0Dva{7aX-aN3co)|=a++h5UL>o3XLP+=zh1x2BXfn1N%A!mVhuB)M<4)q z5i+%Bb$X#$US4j#;He2nR3*9E`{|gh$hQPm3RHKzE-=w~pMCCJW9r-?Ra-sY_KizI)-dmsu zyD>CdRgFK622qBm5X}E;-mH$sZ%Bo$UmX8T9}oOJVGx0iG7wf&#kiv2t@9i+dER-! z0@%HO{-V__UUeM5TLs{I3vaVW0;1oOOzZmJ;Q_tZ<6xmda;>U&<3!C+x4&C5GQk*m z(DjM(22A*gkl2KvM)>*lR+=E}k{LhdRMzZ5HTXR@dc+CwtJwA6if4qFeWsZ8rm^sp zg~2q=0t6vpD8-M1EvL@poxFgUdYmWluV16iSXHb{%?nQt4PErvV`3k_d+#Kcdqr*_ zUoUn}0jog(2NbZOf3=aamu-W!MZk8?y!4M9v#$ z;UYxJv)r~gj(-}3&E`*!doPk}?wvF>ljT9d^NxokZ8P@FEG#%+tfBW0zklsc@4+5Q zCDW`b&mzzr}2%? 
z6`Mb1LrpW7w^EjhKW)F*>RpBlHyI{J6v{(f4tIyUR8uX}yUEF|F%oOS>yE2ZuvV6#@@>k2#!CvjO8PWJ`|1dRO+cUnSTb zEu7Qv??}*>FYjM{Pt{gjLj=I!Ih19|%r%|%O~z2c!w@{$CDUJ#l9LIz#M71*dAkZL zUFheiMS&Lig)jSt9M46KIzo;Df7vD+$>#9&=X46+KJ^JqAI;ClZFRtu7!jd6?sC;M z`{GzRwaLg_WO7ZDi16PNhuE6Tn%tXYbC#G>x^IuPT5M+2q8tzvPZS{~cQ=-jCg~Ja zb14;r>}NLvt8p}(ibI1|l9Hhu92_aQ#Plh>y}fm9ZALoU+5>>pEVs+zcm+)p48<9` zZq}F0=ac*5b~CT>@J&2jL!msa=BKWhw9ES#ICX^ehu(P1h{sjJhg__XCJ=sV0M=+h(D?7b?wcj0I4>RQVI zg{E0sTh}2F3bV7b3i)HKe9FlU4W4dyVbP~WZRk8)!_cib0{GpUsm839J?WZ3W{Mjk zMdME6xc@1ai~?k|P6Q8x>C5?_+MvN6Mo$o=SKQ<^KTNj3nPLF@u z@F=L6`tq2r$p5AWbeaMIO9#(QPyYf|?WsUm@M=_DX{k)!#|Lb;;oXV4tQGK-7C=P-3CpJ2k zkK&;;;Uq{6O^vv#6@O!8U=_W&Sukt&1D}sh#WOG#UqgeD>jAlz_P5;`T}=n{0GW}vn1TOLm2tDhJ( zIMyUKSi-3|uSTOTE=jBa(UpVLh|D*q=Vr6kxt~3^-q{`?PJ^bd!}KB~z*J8nOGsjG-kSc`7?j z@_G*wvg&J|7Z;~fWAD=?Wpb;wHi=GbcD9!@bF<4HYPuC6>hS6s*{CnP0+da{7|7ot zBxh0Z#K04V#~Wg5j;OGTDQE}m#Uil6$O(Zs-IL8|tR>u4Wyc2YxvS`sL#$uw@W)kw zPPqCvM*@lLn8iV^Kn<`5s|Kl_H9AA@8}dC7r-8TnC$9;?HF3b7Tr_!U6{%_ovylG) Dr1snx literal 0 HcmV?d00001 diff --git a/files/whoosh_small.svg b/files/whoosh_small.svg new file mode 100644 index 0000000..0d967b9 --- /dev/null +++ b/files/whoosh_small.svg @@ -0,0 +1,604 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..22cc32e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,40 @@ +[wheel] +universal = 1 + +[build_sphinx] +build-dir = docs/build +source-dir = docs/source + +[upload_sphinx] +upload-dir = docs/build/html + +[sdist] +formats = zip,gztar + +[aliases] +push = sdist bdist_wheel upload +pushdocs = build_sphinx upload_sphinx + +[pytest] +addopts = -rs --tb=native +norecursedirs = .hg .tox _build tmp* env* benchmark stress +minversion = 2.0 +python_files = test_*.py +pep8ignore = + *.py E121 E122 E123 E124 E125 E126 E127 E128 # continuation line indentation + *.py E401 # imports on separate lines + *.py W391 # blank line at end of file + test_*.py E501 # Ignore long lines in tests + upload.py ALL # 3rd party (and not in the repo): rietveld upload tool + docs/source/conf.py ALL # sphinx stuff, automatically generated, don't check this + src/whoosh/lang/*.py ALL # 3rd party / crashing py.test with non-ascii stuff + src/whoosh/lang/snowball/*.py ALL # 3rd party + src/whoosh/support/relativedelta.py ALL # 3rd party + src/whoosh/support/charset.py ALL # non-ascii py.test crash + src/whoosh/support/unicode.py ALL # non-ascii py.test crash + +[egg_info] +tag_build = +tag_date = 0 +tag_svn_revision = 0 + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..1229018 --- /dev/null +++ b/setup.py @@ -0,0 +1,60 @@ +#!python + +import os.path, sys +from setuptools import setup, find_packages +from setuptools.command.test import test as TestCommand + +try: + import pytest +except ImportError: + pytest = None + +sys.path.insert(0, os.path.abspath("src")) +from whoosh import __version__, versionstring + + +class PyTest(TestCommand): + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_args = [] + self.test_suite = True + + def run_tests(self): + #import here, cause outside the eggs aren't loaded + import pytest + pytest.main(self.test_args) + + +if __name__ == "__main__": + setup( + name="Whoosh", + version=versionstring(), + package_dir={'': 'src'}, + packages=find_packages("src"), + + 
author="Matt Chaput", + author_email="matt@whoosh.ca", + + description="Fast, pure-Python full text indexing, search, and spell checking library.", + long_description=open("README.txt").read(), + + license="Two-clause BSD license", + keywords="index search text spell", + url="http://bitbucket.org/mchaput/whoosh", + + zip_safe=True, + tests_require=['pytest'], + cmdclass={'test': PyTest}, + + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 2.5", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Indexing", + ], + ) diff --git a/src/Whoosh.egg-info/PKG-INFO b/src/Whoosh.egg-info/PKG-INFO new file mode 100644 index 0000000..84d0d80 --- /dev/null +++ b/src/Whoosh.egg-info/PKG-INFO @@ -0,0 +1,88 @@ +Metadata-Version: 1.1 +Name: Whoosh +Version: 2.7.0 +Summary: Fast, pure-Python full text indexing, search, and spell checking library. +Home-page: http://bitbucket.org/mchaput/whoosh +Author: Matt Chaput +Author-email: matt@whoosh.ca +License: Two-clause BSD license +Description: About Whoosh + ============ + + Whoosh is a fast, featureful full-text indexing and searching library + implemented in pure Python. Programmers can use it to easily add search + functionality to their applications and websites. Every part of how Whoosh + works can be extended or replaced to meet your needs exactly. + + Some of Whoosh's features include: + + * Pythonic API. + * Pure-Python. No compilation or binary packages needed, no mysterious crashes. + * Fielded indexing and search. + * Fast indexing and retrieval -- faster than any other pure-Python, scoring, + full-text search solution I know of. + * Pluggable scoring algorithm (including BM25F), text analysis, storage, + posting format, etc. + * Powerful query language. + * Pure Python spell-checker (as far as I know, the only one). + + Whoosh might be useful in the following circumstances: + + * Anywhere a pure-Python solution is desirable to avoid having to build/compile + native libraries (or force users to build/compile them). + * As a research platform (at least for programmers that find Python easier to + read and work with than Java ;) + * When an easy-to-use Pythonic interface is more important to you than raw + speed. + + Whoosh was created and is maintained by Matt Chaput. It was originally created + for use in the online help system of Side Effects Software's 3D animation + software Houdini. Side Effects Software Inc. graciously agreed to open-source + the code. + + This software is licensed under the terms of the simplified BSD (A.K.A. "two + clause" or "FreeBSD") license. See LICENSE.txt for information. 
+ + Installing Whoosh + ================= + + If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install`` + or ``pip`` to download and install Whoosh automatically:: + + $ easy_install Whoosh + + or + + $ pip install Whoosh + + Learning more + ============= + + * Read the online documentation at http://packages.python.org/Whoosh/ + + * Join the Whoosh mailing list at http://groups.google.com/group/whoosh + + * File bug reports and view the Whoosh wiki at + http://bitbucket.org/mchaput/whoosh/ + + Getting the source + ================== + + Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/ + + You can check out the latest version of the source code using Mercurial:: + + hg clone http://bitbucket.org/mchaput/whoosh + + +Keywords: index search text spell +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Natural Language :: English +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python :: 2.5 +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Text Processing :: Indexing diff --git a/src/Whoosh.egg-info/SOURCES.txt b/src/Whoosh.egg-info/SOURCES.txt new file mode 100644 index 0000000..ad9c0d2 --- /dev/null +++ b/src/Whoosh.egg-info/SOURCES.txt @@ -0,0 +1,224 @@ +LICENSE.txt +MANIFEST.in +README.txt +setup.cfg +setup.py +benchmark/dcvgr10.txt.gz +benchmark/dictionary.py +benchmark/enron.py +benchmark/marc21.py +benchmark/reuters.py +benchmark/reuters21578.txt.gz +docs/source/analysis.rst +docs/source/batch.rst +docs/source/conf.py +docs/source/dates.rst +docs/source/facets.rst +docs/source/fieldcaches.rst +docs/source/glossary.rst +docs/source/highlight.rst +docs/source/index.rst +docs/source/indexing.rst +docs/source/intro.rst +docs/source/keywords.rst +docs/source/nested.rst +docs/source/ngrams.rst +docs/source/parsing.rst +docs/source/query.rst +docs/source/querylang.rst +docs/source/quickstart.rst +docs/source/recipes.rst +docs/source/schema.rst +docs/source/searching.rst +docs/source/spelling.rst +docs/source/stemming.rst +docs/source/threads.rst +docs/source/api/analysis.rst +docs/source/api/api.rst +docs/source/api/collectors.rst +docs/source/api/columns.rst +docs/source/api/fields.rst +docs/source/api/formats.rst +docs/source/api/highlight.rst +docs/source/api/idsets.rst +docs/source/api/index.rst +docs/source/api/matching.rst +docs/source/api/qparser.rst +docs/source/api/query.rst +docs/source/api/reading.rst +docs/source/api/scoring.rst +docs/source/api/searching.rst +docs/source/api/sorting.rst +docs/source/api/spelling.rst +docs/source/api/util.rst +docs/source/api/writing.rst +docs/source/api/codec/base.rst +docs/source/api/filedb/filestore.rst +docs/source/api/filedb/filetables.rst +docs/source/api/filedb/structfile.rst +docs/source/api/lang/morph_en.rst +docs/source/api/lang/porter.rst +docs/source/api/lang/wordnet.rst +docs/source/api/support/charset.rst +docs/source/api/support/levenshtein.rst +docs/source/releases/0_3.rst +docs/source/releases/1_0.rst +docs/source/releases/2_0.rst +docs/source/releases/index.rst +docs/source/tech/backend.rst +docs/source/tech/filedb.rst +docs/source/tech/index.rst +files/whoosh.svg +files/whoosh_16.png +files/whoosh_35.png +files/whoosh_64.png +files/whoosh_small.svg +src/Whoosh.egg-info/PKG-INFO +src/Whoosh.egg-info/SOURCES.txt 
+src/Whoosh.egg-info/dependency_links.txt +src/Whoosh.egg-info/top_level.txt +src/Whoosh.egg-info/zip-safe +src/whoosh/__init__.py +src/whoosh/classify.py +src/whoosh/collectors.py +src/whoosh/columns.py +src/whoosh/compat.py +src/whoosh/externalsort.py +src/whoosh/fields.py +src/whoosh/formats.py +src/whoosh/highlight.py +src/whoosh/idsets.py +src/whoosh/index.py +src/whoosh/legacy.py +src/whoosh/multiproc.py +src/whoosh/reading.py +src/whoosh/scoring.py +src/whoosh/searching.py +src/whoosh/sorting.py +src/whoosh/spelling.py +src/whoosh/system.py +src/whoosh/writing.py +src/whoosh/analysis/__init__.py +src/whoosh/analysis/acore.py +src/whoosh/analysis/analyzers.py +src/whoosh/analysis/filters.py +src/whoosh/analysis/intraword.py +src/whoosh/analysis/morph.py +src/whoosh/analysis/ngrams.py +src/whoosh/analysis/tokenizers.py +src/whoosh/automata/__init__.py +src/whoosh/automata/fsa.py +src/whoosh/automata/glob.py +src/whoosh/automata/lev.py +src/whoosh/automata/nfa.py +src/whoosh/automata/reg.py +src/whoosh/codec/__init__.py +src/whoosh/codec/base.py +src/whoosh/codec/memory.py +src/whoosh/codec/plaintext.py +src/whoosh/codec/whoosh3.py +src/whoosh/filedb/__init__.py +src/whoosh/filedb/compound.py +src/whoosh/filedb/filestore.py +src/whoosh/filedb/filetables.py +src/whoosh/filedb/gae.py +src/whoosh/filedb/structfile.py +src/whoosh/lang/__init__.py +src/whoosh/lang/dmetaphone.py +src/whoosh/lang/isri.py +src/whoosh/lang/lovins.py +src/whoosh/lang/morph_en.py +src/whoosh/lang/paicehusk.py +src/whoosh/lang/phonetic.py +src/whoosh/lang/porter.py +src/whoosh/lang/porter2.py +src/whoosh/lang/stopwords.py +src/whoosh/lang/wordnet.py +src/whoosh/lang/snowball/__init__.py +src/whoosh/lang/snowball/bases.py +src/whoosh/lang/snowball/danish.py +src/whoosh/lang/snowball/dutch.py +src/whoosh/lang/snowball/english.py +src/whoosh/lang/snowball/finnish.py +src/whoosh/lang/snowball/french.py +src/whoosh/lang/snowball/german.py +src/whoosh/lang/snowball/hungarian.py +src/whoosh/lang/snowball/italian.py +src/whoosh/lang/snowball/norwegian.py +src/whoosh/lang/snowball/portugese.py +src/whoosh/lang/snowball/romanian.py +src/whoosh/lang/snowball/russian.py +src/whoosh/lang/snowball/spanish.py +src/whoosh/lang/snowball/swedish.py +src/whoosh/matching/__init__.py +src/whoosh/matching/binary.py +src/whoosh/matching/combo.py +src/whoosh/matching/mcore.py +src/whoosh/matching/wrappers.py +src/whoosh/qparser/__init__.py +src/whoosh/qparser/common.py +src/whoosh/qparser/dateparse.py +src/whoosh/qparser/default.py +src/whoosh/qparser/plugins.py +src/whoosh/qparser/syntax.py +src/whoosh/qparser/taggers.py +src/whoosh/query/__init__.py +src/whoosh/query/compound.py +src/whoosh/query/nested.py +src/whoosh/query/positional.py +src/whoosh/query/qcolumns.py +src/whoosh/query/qcore.py +src/whoosh/query/ranges.py +src/whoosh/query/spans.py +src/whoosh/query/terms.py +src/whoosh/query/wrappers.py +src/whoosh/support/__init__.py +src/whoosh/support/base85.py +src/whoosh/support/bench.py +src/whoosh/support/charset.py +src/whoosh/support/levenshtein.py +src/whoosh/support/relativedelta.py +src/whoosh/support/unicode.py +src/whoosh/util/__init__.py +src/whoosh/util/cache.py +src/whoosh/util/filelock.py +src/whoosh/util/loading.py +src/whoosh/util/numeric.py +src/whoosh/util/numlists.py +src/whoosh/util/testing.py +src/whoosh/util/text.py +src/whoosh/util/times.py +src/whoosh/util/varints.py +src/whoosh/util/versions.py +tests/test_analysis.py +tests/test_automata.py +tests/test_bits.py +tests/test_classify.py +tests/test_codecs.py 
+tests/test_collector.py +tests/test_columns.py +tests/test_compound.py +tests/test_dateparse.py +tests/test_fields.py +tests/test_flexible.py +tests/test_highlighting.py +tests/test_indexing.py +tests/test_matching.py +tests/test_misc.py +tests/test_mpwriter.py +tests/test_nested.py +tests/test_parse_plugins.py +tests/test_parsing.py +tests/test_postings.py +tests/test_quality.py +tests/test_queries.py +tests/test_reading.py +tests/test_results.py +tests/test_searching.py +tests/test_sorting.py +tests/test_spans.py +tests/test_spelling.py +tests/test_tables.py +tests/test_vectors.py +tests/test_weightings.py +tests/test_writing.py \ No newline at end of file diff --git a/src/Whoosh.egg-info/dependency_links.txt b/src/Whoosh.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/Whoosh.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/Whoosh.egg-info/top_level.txt b/src/Whoosh.egg-info/top_level.txt new file mode 100644 index 0000000..d752255 --- /dev/null +++ b/src/Whoosh.egg-info/top_level.txt @@ -0,0 +1 @@ +whoosh diff --git a/src/Whoosh.egg-info/zip-safe b/src/Whoosh.egg-info/zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/Whoosh.egg-info/zip-safe @@ -0,0 +1 @@ + diff --git a/src/whoosh/__init__.py b/src/whoosh/__init__.py new file mode 100644 index 0000000..414f8bb --- /dev/null +++ b/src/whoosh/__init__.py @@ -0,0 +1,49 @@ +# Copyright 2008 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +__version__ = (2, 7, 0) + + +def versionstring(build=True, extra=True): + """Returns the version number of Whoosh as a string. + + :param build: Whether to include the build number in the string. + :param extra: Whether to include alpha/beta/rc etc. tags. Only + checked if build is True. 
+ :rtype: str + """ + + if build: + first = 3 + else: + first = 2 + + s = ".".join(str(n) for n in __version__[:first]) + if build and extra: + s += "".join(str(n) for n in __version__[3:]) + + return s diff --git a/src/whoosh/analysis/__init__.py b/src/whoosh/analysis/__init__.py new file mode 100644 index 0000000..66293bc --- /dev/null +++ b/src/whoosh/analysis/__init__.py @@ -0,0 +1,69 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +"""Classes and functions for turning a piece of text into an indexable stream +of "tokens" (usually equivalent to words). There are three general classes +involved in analysis: + +* Tokenizers are always at the start of the text processing pipeline. They take + a string and yield Token objects (actually, the same token object over and + over, for performance reasons) corresponding to the tokens (words) in the + text. + + Every tokenizer is a callable that takes a string and returns an iterator of + tokens. + +* Filters take the tokens from the tokenizer and perform various + transformations on them. For example, the LowercaseFilter converts all tokens + to lowercase, which is usually necessary when indexing regular English text. + + Every filter is a callable that takes a token generator and returns a token + generator. + +* Analyzers are convenience functions/classes that "package up" a tokenizer and + zero or more filters into a single unit. For example, the StandardAnalyzer + combines a RegexTokenizer, LowercaseFilter, and StopFilter. + + Every analyzer is a callable that takes a string and returns a token + iterator. (So Tokenizers can be used as Analyzers if you don't need any + filtering). + +You can compose tokenizers and filters together using the ``|`` character:: + + my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() + +The first item must be a tokenizer and the rest must be filters (you can't put +a filter first or a tokenizer after the first item). 
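+
+For example, a quick sketch of such a pipeline (the output shown assumes the
+default English stop word list, so "this", "is" and "a" are dropped)::
+
+    >>> my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
+    >>> [t.text for t in my_analyzer("Hello there, this is a TEST")]
+    ["hello", "there", "test"]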
+""" + +from whoosh.analysis.acore import * +from whoosh.analysis.tokenizers import * +from whoosh.analysis.filters import * +from whoosh.analysis.morph import * +from whoosh.analysis.intraword import * +from whoosh.analysis.ngrams import * +from whoosh.analysis.analyzers import * diff --git a/src/whoosh/analysis/acore.py b/src/whoosh/analysis/acore.py new file mode 100644 index 0000000..adb53b7 --- /dev/null +++ b/src/whoosh/analysis/acore.py @@ -0,0 +1,156 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.compat import iteritems + + +# Exceptions + +class CompositionError(Exception): + pass + + +# Utility functions + +def unstopped(tokenstream): + """Removes tokens from a token stream where token.stopped = True. + """ + return (t for t in tokenstream if not t.stopped) + + +def entoken(textstream, positions=False, chars=False, start_pos=0, + start_char=0, **kwargs): + """Takes a sequence of unicode strings and yields a series of Token objects + (actually the same Token object over and over, for performance reasons), + with the attributes filled in with reasonable values (for example, if + ``positions`` or ``chars`` is True, the function assumes each token was + separated by one space). + """ + + pos = start_pos + char = start_char + t = Token(positions=positions, chars=chars, **kwargs) + + for text in textstream: + t.text = text + + if positions: + t.pos = pos + pos += 1 + + if chars: + t.startchar = char + char = char + len(text) + t.endchar = char + + yield t + + +# Token object + +class Token(object): + """ + Represents a "token" (usually a word) extracted from the source text being + indexed. + + See "Advanced analysis" in the user guide for more information. + + Because object instantiation in Python is slow, tokenizers should create + ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes + each time. + + This trick means that consumers of tokens (i.e. filters) must never try to + hold onto the token object between loop iterations, or convert the token + generator into a list. 
Instead, save the attributes between iterations, + not the object:: + + def RemoveDuplicatesFilter(self, stream): + # Removes duplicate words. + lasttext = None + for token in stream: + # Only yield the token if its text doesn't + # match the previous token. + if lasttext != token.text: + yield token + lasttext = token.text + + ...or, call token.copy() to get a copy of the token object. + """ + + def __init__(self, positions=False, chars=False, removestops=True, mode='', + **kwargs): + """ + :param positions: Whether tokens should have the token position in the + 'pos' attribute. + :param chars: Whether tokens should have character offsets in the + 'startchar' and 'endchar' attributes. + :param removestops: whether to remove stop words from the stream (if + the tokens pass through a stop filter). + :param mode: contains a string describing the purpose for which the + analyzer is being called, i.e. 'index' or 'query'. + """ + + self.positions = positions + self.chars = chars + self.stopped = False + self.boost = 1.0 + self.removestops = removestops + self.mode = mode + self.__dict__.update(kwargs) + + def __repr__(self): + parms = ", ".join("%s=%r" % (name, value) + for name, value in iteritems(self.__dict__)) + return "%s(%s)" % (self.__class__.__name__, parms) + + def copy(self): + # This is faster than using the copy module + return Token(**self.__dict__) + + +# Composition support + +class Composable(object): + is_morph = False + + def __or__(self, other): + from whoosh.analysis.analyzers import CompositeAnalyzer + + if not isinstance(other, Composable): + raise TypeError("%r is not composable with %r" % (self, other)) + return CompositeAnalyzer(self, other) + + def __repr__(self): + attrs = "" + if self.__dict__: + attrs = ", ".join("%s=%r" % (key, value) + for key, value + in iteritems(self.__dict__)) + return self.__class__.__name__ + "(%s)" % attrs + + def has_morph(self): + return self.is_morph diff --git a/src/whoosh/analysis/analyzers.py b/src/whoosh/analysis/analyzers.py new file mode 100644 index 0000000..f7d6e3c --- /dev/null +++ b/src/whoosh/analysis/analyzers.py @@ -0,0 +1,296 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.analysis.acore import Composable, CompositionError +from whoosh.analysis.tokenizers import Tokenizer +from whoosh.analysis.filters import LowercaseFilter +from whoosh.analysis.filters import StopFilter, STOP_WORDS +from whoosh.analysis.morph import StemFilter +from whoosh.analysis.intraword import IntraWordFilter +from whoosh.analysis.tokenizers import default_pattern +from whoosh.analysis.tokenizers import CommaSeparatedTokenizer +from whoosh.analysis.tokenizers import IDTokenizer +from whoosh.analysis.tokenizers import RegexTokenizer +from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer +from whoosh.lang.porter import stem + + +# Analyzers + +class Analyzer(Composable): + """ Abstract base class for analyzers. + """ + + def __repr__(self): + return "%s()" % self.__class__.__name__ + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.__dict__ == other.__dict__) + + def __call__(self, value, **kwargs): + raise NotImplementedError + + def clean(self): + pass + + +class CompositeAnalyzer(Analyzer): + def __init__(self, *composables): + self.items = [] + + for comp in composables: + if isinstance(comp, CompositeAnalyzer): + self.items.extend(comp.items) + else: + self.items.append(comp) + + # Tokenizers must start a chain, and then only filters after that + # (because analyzers take a string and return a generator of tokens, + # and filters take and return generators of tokens) + for item in self.items[1:]: + if isinstance(item, Tokenizer): + raise CompositionError("Only one tokenizer allowed at the start" + " of the analyzer: %r" % self.items) + + def __repr__(self): + return "%s(%s)" % (self.__class__.__name__, + ", ".join(repr(item) for item in self.items)) + + def __call__(self, value, no_morph=False, **kwargs): + items = self.items + # Start with tokenizer + gen = items[0](value, **kwargs) + # Run filters + for item in items[1:]: + if not (no_morph and hasattr(item, "is_morph") and item.is_morph): + gen = item(gen) + return gen + + def __getitem__(self, item): + return self.items.__getitem__(item) + + def __len__(self): + return len(self.items) + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.items == other.items) + + def clean(self): + for item in self.items: + if hasattr(item, "clean"): + item.clean() + + def has_morph(self): + return any(item.is_morph for item in self.items) + + +# Functions that return composed analyzers + +def IDAnalyzer(lowercase=False): + """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if + desired. + """ + + tokenizer = IDTokenizer() + if lowercase: + tokenizer = tokenizer | LowercaseFilter() + return tokenizer + + +def KeywordAnalyzer(lowercase=False, commas=False): + """Parses whitespace- or comma-separated tokens. + + >>> ana = KeywordAnalyzer() + >>> [token.text for token in ana("Hello there, this is a TEST")] + ["Hello", "there,", "this", "is", "a", "TEST"] + + :param lowercase: whether to lowercase the tokens. + :param commas: if True, items are separated by commas rather than + whitespace. 
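+
+    With ``commas=True``, items are instead split on commas and stripped of
+    surrounding whitespace (a sketch of the behaviour described above):
+
+    >>> ana = KeywordAnalyzer(commas=True)
+    >>> [token.text for token in ana("red, light green, sky blue")]
+    ["red", "light green", "sky blue"]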
+ """ + + if commas: + tokenizer = CommaSeparatedTokenizer() + else: + tokenizer = SpaceSeparatedTokenizer() + if lowercase: + tokenizer = tokenizer | LowercaseFilter() + return tokenizer + + +def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False): + """Deprecated, just use a RegexTokenizer directly. + """ + + return RegexTokenizer(expression=expression, gaps=gaps) + + +def SimpleAnalyzer(expression=default_pattern, gaps=False): + """Composes a RegexTokenizer with a LowercaseFilter. + + >>> ana = SimpleAnalyzer() + >>> [token.text for token in ana("Hello there, this is a TEST")] + ["hello", "there", "this", "is", "a", "test"] + + :param expression: The regular expression pattern to use to extract tokens. + :param gaps: If True, the tokenizer *splits* on the expression, rather + than matching on the expression. + """ + + return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter() + + +def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS, + minsize=2, maxsize=None, gaps=False): + """Composes a RegexTokenizer with a LowercaseFilter and optional + StopFilter. + + >>> ana = StandardAnalyzer() + >>> [token.text for token in ana("Testing is testing and testing")] + ["testing", "testing", "testing"] + + :param expression: The regular expression pattern to use to extract tokens. + :param stoplist: A list of stop words. Set this to None to disable + the stop word filter. + :param minsize: Words smaller than this are removed from the stream. + :param maxsize: Words longer that this are removed from the stream. + :param gaps: If True, the tokenizer *splits* on the expression, rather + than matching on the expression. + """ + + ret = RegexTokenizer(expression=expression, gaps=gaps) + chain = ret | LowercaseFilter() + if stoplist is not None: + chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, + maxsize=maxsize) + return chain + + +def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS, + minsize=2, maxsize=None, gaps=False, stemfn=stem, + ignore=None, cachesize=50000): + """Composes a RegexTokenizer with a lower case filter, an optional stop + filter, and a stemming filter. + + >>> ana = StemmingAnalyzer() + >>> [token.text for token in ana("Testing is testing and testing")] + ["test", "test", "test"] + + :param expression: The regular expression pattern to use to extract tokens. + :param stoplist: A list of stop words. Set this to None to disable + the stop word filter. + :param minsize: Words smaller than this are removed from the stream. + :param maxsize: Words longer that this are removed from the stream. + :param gaps: If True, the tokenizer *splits* on the expression, rather + than matching on the expression. + :param ignore: a set of words to not stem. + :param cachesize: the maximum number of stemmed words to cache. The larger + this number, the faster stemming will be but the more memory it will + use. Use None for no cache, or -1 for an unbounded cache. + """ + + ret = RegexTokenizer(expression=expression, gaps=gaps) + chain = ret | LowercaseFilter() + if stoplist is not None: + chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, + maxsize=maxsize) + return chain | StemFilter(stemfn=stemfn, ignore=ignore, + cachesize=cachesize) + + +def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2, + maxsize=None, gaps=True, splitwords=True, splitnums=True, + mergewords=False, mergenums=False): + """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and + StopFilter. 
+ + >>> ana = FancyAnalyzer() + >>> [token.text for token in ana("Should I call getInt or get_real?")] + ["should", "call", "getInt", "get", "int", "get_real", "get", "real"] + + :param expression: The regular expression pattern to use to extract tokens. + :param stoplist: A list of stop words. Set this to None to disable + the stop word filter. + :param minsize: Words smaller than this are removed from the stream. + :param maxsize: Words longer that this are removed from the stream. + :param gaps: If True, the tokenizer *splits* on the expression, rather + than matching on the expression. + """ + + return (RegexTokenizer(expression=expression, gaps=gaps) + | IntraWordFilter(splitwords=splitwords, splitnums=splitnums, + mergewords=mergewords, mergenums=mergenums) + | LowercaseFilter() + | StopFilter(stoplist=stoplist, minsize=minsize) + ) + + +def LanguageAnalyzer(lang, expression=default_pattern, gaps=False, + cachesize=50000): + """Configures a simple analyzer for the given language, with a + LowercaseFilter, StopFilter, and StemFilter. + + >>> ana = LanguageAnalyzer("es") + >>> [token.text for token in ana("Por el mar corren las liebres")] + ['mar', 'corr', 'liebr'] + + The list of available languages is in `whoosh.lang.languages`. + You can use :func:`whoosh.lang.has_stemmer` and + :func:`whoosh.lang.has_stopwords` to check if a given language has a + stemming function and/or stop word list available. + + :param expression: The regular expression pattern to use to extract tokens. + :param gaps: If True, the tokenizer *splits* on the expression, rather + than matching on the expression. + :param cachesize: the maximum number of stemmed words to cache. The larger + this number, the faster stemming will be but the more memory it will + use. + """ + + from whoosh.lang import NoStemmer, NoStopWords + + # Make the start of the chain + chain = (RegexTokenizer(expression=expression, gaps=gaps) + | LowercaseFilter()) + + # Add a stop word filter + try: + chain = chain | StopFilter(lang=lang) + except NoStopWords: + pass + + # Add a stemming filter + try: + chain = chain | StemFilter(lang=lang, cachesize=cachesize) + except NoStemmer: + pass + + return chain diff --git a/src/whoosh/analysis/filters.py b/src/whoosh/analysis/filters.py new file mode 100644 index 0000000..add9c98 --- /dev/null +++ b/src/whoosh/analysis/filters.py @@ -0,0 +1,479 @@ +# coding=utf-8 + +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from itertools import chain + +from whoosh.compat import next, xrange +from whoosh.analysis.acore import Composable +from whoosh.util.text import rcompile + + +# Default list of stop words (words so common it's usually wasteful to index +# them). This list is used by the StopFilter class, which allows you to supply +# an optional list to override this one. + +STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', + 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may', + 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this', + 'to', 'us', 'we', 'when', 'will', 'with', 'yet', + 'you', 'your')) + + +# Simple pattern for filtering URLs, may be useful + +url_pattern = rcompile(""" +( + [A-Za-z+]+:// # URL protocol + \\S+? # URL body + (?=\\s|[.]\\s|$|[.]$) # Stop at space/end, or a dot followed by space/end +) | ( # or... + \w+([:.]?\w+)* # word characters, with opt. internal colons/dots +) +""", verbose=True) + + +# Filters + +class Filter(Composable): + """Base class for Filter objects. A Filter subclass must implement a + filter() method that takes a single argument, which is an iterator of Token + objects, and yield a series of Token objects in return. + + Filters that do morphological transformation of tokens (e.g. stemming) + should set their ``is_morph`` attribute to True. + """ + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.__dict__ == other.__dict__) + + def __ne__(self, other): + return not self == other + + def __call__(self, tokens): + raise NotImplementedError + + +class PassFilter(Filter): + """An identity filter: passes the tokens through untouched. + """ + + def __call__(self, tokens): + return tokens + + +class LoggingFilter(Filter): + """Prints the contents of every filter that passes through as a debug + log entry. + """ + + def __init__(self, logger=None): + """ + :param target: the logger to use. If omitted, the "whoosh.analysis" + logger is used. + """ + + if logger is None: + import logging + logger = logging.getLogger("whoosh.analysis") + self.logger = logger + + def __call__(self, tokens): + logger = self.logger + for t in tokens: + logger.debug(repr(t)) + yield t + + +class MultiFilter(Filter): + """Chooses one of two or more sub-filters based on the 'mode' attribute + of the token stream. + """ + + default_filter = PassFilter() + + def __init__(self, **kwargs): + """Use keyword arguments to associate mode attribute values with + instantiated filters. 
+ + >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False) + >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False) + >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query) + + This class expects that the value of the mode attribute is consistent + among all tokens in a token stream. + """ + self.filters = kwargs + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.filters == other.filters) + + def __call__(self, tokens): + # Only selects on the first token + t = next(tokens) + filter = self.filters.get(t.mode, self.default_filter) + return filter(chain([t], tokens)) + + +class TeeFilter(Filter): + """Interleaves the results of two or more filters (or filter chains). + + NOTE: because it needs to create copies of each token for each sub-filter, + this filter is quite slow. + + >>> target = "ALFA BRAVO CHARLIE" + >>> # In one branch, we'll lower-case the tokens + >>> f1 = LowercaseFilter() + >>> # In the other branch, we'll reverse the tokens + >>> f2 = ReverseTextFilter() + >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) + >>> [token.text for token in ana(target)] + ["alfa", "AFLA", "bravo", "OVARB", "charlie", "EILRAHC"] + + To combine the incoming token stream with the output of a filter chain, use + ``TeeFilter`` and make one of the filters a :class:`PassFilter`. + + >>> f1 = PassFilter() + >>> f2 = BiWordFilter() + >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) | LowercaseFilter() + >>> [token.text for token in ana(target)] + ["alfa", "alfa-bravo", "bravo", "bravo-charlie", "charlie"] + """ + + def __init__(self, *filters): + if len(filters) < 2: + raise Exception("TeeFilter requires two or more filters") + self.filters = filters + + def __eq__(self, other): + return (self.__class__ is other.__class__ + and self.filters == other.fitlers) + + def __call__(self, tokens): + from itertools import tee + + count = len(self.filters) + # Tee the token iterator and wrap each teed iterator with the + # corresponding filter + gens = [filter(t.copy() for t in gen) for filter, gen + in zip(self.filters, tee(tokens, count))] + # Keep a count of the number of running iterators + running = count + while running: + for i, gen in enumerate(gens): + if gen is not None: + try: + yield next(gen) + except StopIteration: + gens[i] = None + running -= 1 + + +class ReverseTextFilter(Filter): + """Reverses the text of each token. + + >>> ana = RegexTokenizer() | ReverseTextFilter() + >>> [token.text for token in ana("hello there")] + ["olleh", "ereht"] + """ + + def __call__(self, tokens): + for t in tokens: + t.text = t.text[::-1] + yield t + + +class LowercaseFilter(Filter): + """Uses unicode.lower() to lowercase token text. + + >>> rext = RegexTokenizer() + >>> stream = rext("This is a TEST") + >>> [token.text for token in LowercaseFilter(stream)] + ["this", "is", "a", "test"] + """ + + def __call__(self, tokens): + for t in tokens: + t.text = t.text.lower() + yield t + + +class StripFilter(Filter): + """Calls unicode.strip() on the token text. + """ + + def __call__(self, tokens): + for t in tokens: + t.text = t.text.strip() + yield t + + +class StopFilter(Filter): + """Marks "stop" words (words too common to index) in the stream (and by + default removes them). + + Make sure you precede this filter with a :class:`LowercaseFilter`. 
+ + >>> stopper = RegexTokenizer() | StopFilter() + >>> [token.text for token in stopper(u"this is a test")] + ["test"] + >>> es_stopper = RegexTokenizer() | StopFilter(lang="es") + >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")] + ["lapiz", "mesa"] + + The list of available languages is in `whoosh.lang.languages`. + You can use :func:`whoosh.lang.has_stopwords` to check if a given language + has a stop word list available. + """ + + def __init__(self, stoplist=STOP_WORDS, minsize=2, maxsize=None, + renumber=True, lang=None): + """ + :param stoplist: A collection of words to remove from the stream. + This is converted to a frozenset. The default is a list of + common English stop words. + :param minsize: The minimum length of token texts. Tokens with + text smaller than this will be stopped. The default is 2. + :param maxsize: The maximum length of token texts. Tokens with text + larger than this will be stopped. Use None to allow any length. + :param renumber: Change the 'pos' attribute of unstopped tokens + to reflect their position with the stopped words removed. + :param lang: Automatically get a list of stop words for the given + language + """ + + stops = set() + if stoplist: + stops.update(stoplist) + if lang: + from whoosh.lang import stopwords_for_language + + stops.update(stopwords_for_language(lang)) + + self.stops = frozenset(stops) + self.min = minsize + self.max = maxsize + self.renumber = renumber + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.stops == other.stops + and self.min == other.min + and self.renumber == other.renumber) + + def __call__(self, tokens): + stoplist = self.stops + minsize = self.min + maxsize = self.max + renumber = self.renumber + + pos = None + for t in tokens: + text = t.text + if (len(text) >= minsize + and (maxsize is None or len(text) <= maxsize) + and text not in stoplist): + # This is not a stop word + if renumber and t.positions: + if pos is None: + pos = t.pos + else: + pos += 1 + t.pos = pos + t.stopped = False + yield t + else: + # This is a stop word + if not t.removestops: + # This IS a stop word, but we're not removing them + t.stopped = True + yield t + + +class CharsetFilter(Filter): + """Translates the text of tokens by calling unicode.translate() using the + supplied character mapping object. This is useful for case and accent + folding. + + The ``whoosh.support.charset`` module has a useful map for accent folding. + + >>> from whoosh.support.charset import accent_map + >>> retokenizer = RegexTokenizer() + >>> chfilter = CharsetFilter(accent_map) + >>> [t.text for t in chfilter(retokenizer(u'café'))] + [u'cafe'] + + Another way to get a character mapping object is to convert a Sphinx + charset table file using + :func:`whoosh.support.charset.charset_table_to_dict`. + + >>> from whoosh.support.charset import charset_table_to_dict + >>> from whoosh.support.charset import default_charset + >>> retokenizer = RegexTokenizer() + >>> charmap = charset_table_to_dict(default_charset) + >>> chfilter = CharsetFilter(charmap) + >>> [t.text for t in chfilter(retokenizer(u'Stra\\xdfe'))] + [u'strase'] + + The Sphinx charset table format is described at + http://www.sphinxsearch.com/docs/current.html#conf-charset-table. + """ + + __inittypes__ = dict(charmap=dict) + + def __init__(self, charmap): + """ + :param charmap: a dictionary mapping from integer character numbers to + unicode characters, as required by the unicode.translate() method. 
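+
+        A minimal hand-built map in that form, as a sketch (the particular
+        characters are just illustrative):
+
+        >>> charmap = {ord(u"é"): u"e", ord(u"ü"): u"u"}
+        >>> folding_filter = CharsetFilter(charmap)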
+ """ + + self.charmap = charmap + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.charmap == other.charmap) + + def __call__(self, tokens): + assert hasattr(tokens, "__iter__") + charmap = self.charmap + for t in tokens: + t.text = t.text.translate(charmap) + yield t + + +class DelimitedAttributeFilter(Filter): + """Looks for delimiter characters in the text of each token and stores the + data after the delimiter in a named attribute on the token. + + The defaults are set up to use the ``^`` character as a delimiter and store + the value after the ``^`` as the boost for the token. + + >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost") + >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter() + >>> for t in ana(u("image render^2 file^0.5")) + ... print("%r %f" % (t.text, t.boost)) + 'image' 1.0 + 'render' 2.0 + 'file' 0.5 + + Note that you need to make sure your tokenizer includes the delimiter and + data as part of the token! + """ + + def __init__(self, delimiter="^", attribute="boost", default=1.0, + type=float): + """ + :param delimiter: a string that, when present in a token's text, + separates the actual text from the "data" payload. + :param attribute: the name of the attribute in which to store the + data on the token. + :param default: the value to use for the attribute for tokens that + don't have delimited data. + :param type: the type of the data, for example ``str`` or ``float``. + This is used to convert the string value of the data before + storing it in the attribute. + """ + + self.delim = delimiter + self.attr = attribute + self.default = default + self.type = type + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.delim == other.delim + and self.attr == other.attr + and self.default == other.default) + + def __call__(self, tokens): + delim = self.delim + attr = self.attr + default = self.default + type_ = self.type + + for t in tokens: + text = t.text + pos = text.find(delim) + if pos > -1: + setattr(t, attr, type_(text[pos + 1:])) + if t.chars: + t.endchar -= len(t.text) - pos + t.text = text[:pos] + else: + setattr(t, attr, default) + + yield t + + +class SubstitutionFilter(Filter): + """Performs a regular expression substitution on the token text. + + This is especially useful for removing text from tokens, for example + hyphens:: + + ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "") + + Because it has the full power of the re.sub() method behind it, this filter + can perform some fairly complex transformations. For example, to take + tokens like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c', + 'f=e'``:: + + # Analyzer that swaps the text on either side of an equal sign + rt = RegexTokenizer(r"\\S+") + sf = SubstitutionFilter("([^/]*)/(./*)", r"\\2/\\1") + ana = rt | sf + """ + + def __init__(self, pattern, replacement): + """ + :param pattern: a pattern string or compiled regular expression object + describing the text to replace. + :param replacement: the substitution text. 
+ """ + + self.pattern = rcompile(pattern) + self.replacement = replacement + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.pattern == other.pattern + and self.replacement == other.replacement) + + def __call__(self, tokens): + pattern = self.pattern + replacement = self.replacement + + for t in tokens: + t.text = pattern.sub(replacement, t.text) + yield t diff --git a/src/whoosh/analysis/intraword.py b/src/whoosh/analysis/intraword.py new file mode 100644 index 0000000..601423e --- /dev/null +++ b/src/whoosh/analysis/intraword.py @@ -0,0 +1,494 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +import re +from collections import deque + +from whoosh.compat import u, text_type +from whoosh.compat import xrange +from whoosh.analysis.filters import Filter + + +class CompoundWordFilter(Filter): + """Given a set of words (or any object with a ``__contains__`` method), + break any tokens in the stream that are composites of words in the word set + into their individual parts. + + Given the correct set of words, this filter can break apart run-together + words and trademarks (e.g. "turbosquid", "applescript"). It can also be + useful for agglutinative languages such as German. + + The ``keep_compound`` argument lets you decide whether to keep the + compound word in the token stream along with the word segments. + + >>> cwf = CompoundWordFilter(wordset, keep_compound=True) + >>> analyzer = RegexTokenizer(r"\S+") | cwf + >>> [t.text for t in analyzer("I do not like greeneggs and ham") + ["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"] + >>> cwf.keep_compound = False + >>> [t.text for t in analyzer("I do not like greeneggs and ham") + ["I", "do", "not", "like", "green", "eggs", "and", "ham"] + """ + + def __init__(self, wordset, keep_compound=True): + """ + :param wordset: an object with a ``__contains__`` method, such as a + set, containing strings to look for inside the tokens. 
+ :param keep_compound: if True (the default), the original compound + token will be retained in the stream before the subwords. + """ + + self.wordset = wordset + self.keep_compound = keep_compound + + def subwords(self, s, memo): + if s in self.wordset: + return [s] + if s in memo: + return memo[s] + + for i in xrange(1, len(s)): + prefix = s[:i] + if prefix in self.wordset: + suffix = s[i:] + suffix_subs = self.subwords(suffix, memo) + if suffix_subs: + result = [prefix] + suffix_subs + memo[s] = result + return result + + return None + + def __call__(self, tokens): + keep_compound = self.keep_compound + memo = {} + subwords = self.subwords + for t in tokens: + subs = subwords(t.text, memo) + if subs: + if len(subs) > 1 and keep_compound: + yield t + for subword in subs: + t.text = subword + yield t + else: + yield t + + +class BiWordFilter(Filter): + """Merges adjacent tokens into "bi-word" tokens, so that for example:: + + "the", "sign", "of", "four" + + becomes:: + + "the-sign", "sign-of", "of-four" + + This can be used to create fields for pseudo-phrase searching, where if + all the terms match the document probably contains the phrase, but the + searching is faster than actually doing a phrase search on individual word + terms. + + The ``BiWordFilter`` is much faster than using the otherwise equivalent + ``ShingleFilter(2)``. + """ + + def __init__(self, sep="-"): + self.sep = sep + + def __call__(self, tokens): + sep = self.sep + prev_text = None + prev_startchar = None + prev_pos = None + atleastone = False + + for token in tokens: + # Save the original text of this token + text = token.text + + # Save the original position + positions = token.positions + if positions: + ps = token.pos + + # Save the original start char + chars = token.chars + if chars: + sc = token.startchar + + if prev_text is not None: + # Use the pos and startchar from the previous token + if positions: + token.pos = prev_pos + if chars: + token.startchar = prev_startchar + + # Join the previous token text and the current token text to + # form the biword token + token.text = "".join((prev_text, sep, text)) + yield token + atleastone = True + + # Save the originals and the new "previous" values + prev_text = text + if chars: + prev_startchar = sc + if positions: + prev_pos = ps + + # If no bi-words were emitted, that is, the token stream only had + # a single token, then emit that single token. + if not atleastone: + yield token + + +class ShingleFilter(Filter): + """Merges a certain number of adjacent tokens into multi-word tokens, so + that for example:: + + "better", "a", "witty", "fool", "than", "a", "foolish", "wit" + + with ``ShingleFilter(3, ' ')`` becomes:: + + 'better a witty', 'a witty fool', 'witty fool than', 'fool than a', + 'than a foolish', 'a foolish wit' + + This can be used to create fields for pseudo-phrase searching, where if + all the terms match the document probably contains the phrase, but the + searching is faster than actually doing a phrase search on individual word + terms. + + If you're using two-word shingles, you should use the functionally + equivalent ``BiWordFilter`` instead because it's faster than + ``ShingleFilter``. 
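+
+    A sketch of the example above as an actual pipeline (``RegexTokenizer``
+    is the plain word tokenizer from this package):
+
+    >>> ana = RegexTokenizer() | ShingleFilter(3, " ")
+    >>> [t.text for t in ana("better a witty fool than a foolish wit")]
+    ['better a witty', 'a witty fool', 'witty fool than', 'fool than a',
+     'than a foolish', 'a foolish wit']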
+ """ + + def __init__(self, size=2, sep="-"): + self.size = size + self.sep = sep + + def __call__(self, tokens): + size = self.size + sep = self.sep + buf = deque() + atleastone = False + + def make_token(): + tk = buf[0] + tk.text = sep.join([t.text for t in buf]) + if tk.chars: + tk.endchar = buf[-1].endchar + return tk + + for token in tokens: + if not token.stopped: + buf.append(token.copy()) + if len(buf) == size: + atleastone = True + yield make_token() + buf.popleft() + + # If no shingles were emitted, that is, the token stream had fewer than + # 'size' tokens, then emit a single token with whatever tokens there + # were + if not atleastone and buf: + yield make_token() + + +class IntraWordFilter(Filter): + """Splits words into subwords and performs optional transformations on + subword groups. This filter is funtionally based on yonik's + WordDelimiterFilter in Solr, but shares no code with it. + + * Split on intra-word delimiters, e.g. `Wi-Fi` -> `Wi`, `Fi`. + * When splitwords=True, split on case transitions, + e.g. `PowerShot` -> `Power`, `Shot`. + * When splitnums=True, split on letter-number transitions, + e.g. `SD500` -> `SD`, `500`. + * Leading and trailing delimiter characters are ignored. + * Trailing possesive "'s" removed from subwords, + e.g. `O'Neil's` -> `O`, `Neil`. + + The mergewords and mergenums arguments turn on merging of subwords. + + When the merge arguments are false, subwords are not merged. + + * `PowerShot` -> `0`:`Power`, `1`:`Shot` (where `0` and `1` are token + positions). + + When one or both of the merge arguments are true, consecutive runs of + alphabetic and/or numeric subwords are merged into an additional token with + the same position as the last sub-word. + + * `PowerShot` -> `0`:`Power`, `1`:`Shot`, `1`:`PowerShot` + * `A's+B's&C's` -> `0`:`A`, `1`:`B`, `2`:`C`, `2`:`ABC` + * `Super-Duper-XL500-42-AutoCoder!` -> `0`:`Super`, `1`:`Duper`, `2`:`XL`, + `2`:`SuperDuperXL`, + `3`:`500`, `4`:`42`, `4`:`50042`, `5`:`Auto`, `6`:`Coder`, + `6`:`AutoCoder` + + When using this filter you should use a tokenizer that only splits on + whitespace, so the tokenizer does not remove intra-word delimiters before + this filter can see them, and put this filter before any use of + LowercaseFilter. + + >>> rt = RegexTokenizer(r"\\S+") + >>> iwf = IntraWordFilter() + >>> lcf = LowercaseFilter() + >>> analyzer = rt | iwf | lcf + + One use for this filter is to help match different written representations + of a concept. For example, if the source text contained `wi-fi`, you + probably want `wifi`, `WiFi`, `wi-fi`, etc. to match. One way of doing this + is to specify mergewords=True and/or mergenums=True in the analyzer used + for indexing, and mergewords=False / mergenums=False in the analyzer used + for querying. + + >>> iwf_i = IntraWordFilter(mergewords=True, mergenums=True) + >>> iwf_q = IntraWordFilter(mergewords=False, mergenums=False) + >>> iwf = MultiFilter(index=iwf_i, query=iwf_q) + >>> analyzer = RegexTokenizer(r"\S+") | iwf | LowercaseFilter() + + (See :class:`MultiFilter`.) + """ + + is_morph = True + + __inittypes__ = dict(delims=text_type, splitwords=bool, splitnums=bool, + mergewords=bool, mergenums=bool) + + def __init__(self, delims=u("-_'\"()!@#$%^&*[]{}<>\|;:,./?`~=+"), + splitwords=True, splitnums=True, + mergewords=False, mergenums=False): + """ + :param delims: a string of delimiter characters. + :param splitwords: if True, split at case transitions, + e.g. 
`PowerShot` -> `Power`, `Shot` + :param splitnums: if True, split at letter-number transitions, + e.g. `SD500` -> `SD`, `500` + :param mergewords: merge consecutive runs of alphabetic subwords into + an additional token with the same position as the last subword. + :param mergenums: merge consecutive runs of numeric subwords into an + additional token with the same position as the last subword. + """ + + from whoosh.support.unicode import digits, lowercase, uppercase + + self.delims = re.escape(delims) + + # Expression for text between delimiter characters + self.between = re.compile(u("[^%s]+") % (self.delims,), re.UNICODE) + # Expression for removing "'s" from the end of sub-words + dispat = u("(?<=[%s%s])'[Ss](?=$|[%s])") % (lowercase, uppercase, + self.delims) + self.possessive = re.compile(dispat, re.UNICODE) + + # Expression for finding case and letter-number transitions + lower2upper = u("[%s][%s]") % (lowercase, uppercase) + letter2digit = u("[%s%s][%s]") % (lowercase, uppercase, digits) + digit2letter = u("[%s][%s%s]") % (digits, lowercase, uppercase) + if splitwords and splitnums: + splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit, + digit2letter) + self.boundary = re.compile(splitpat, re.UNICODE) + elif splitwords: + self.boundary = re.compile(text_type(lower2upper), re.UNICODE) + elif splitnums: + numpat = u("(%s|%s)") % (letter2digit, digit2letter) + self.boundary = re.compile(numpat, re.UNICODE) + + self.splitting = splitwords or splitnums + self.mergewords = mergewords + self.mergenums = mergenums + + def __eq__(self, other): + return other and self.__class__ is other.__class__\ + and self.__dict__ == other.__dict__ + + def _split(self, string): + bound = self.boundary + + # Yields (startchar, endchar) pairs for each indexable substring in + # the given string, e.g. "WikiWord" -> (0, 4), (4, 8) + + # Whether we're splitting on transitions (case changes, letter -> num, + # num -> letter, etc.) + splitting = self.splitting + + # Make a list (dispos, for "dispossessed") of (startchar, endchar) + # pairs for runs of text between "'s" + if "'" in string: + # Split on possessive 's + dispos = [] + prev = 0 + for match in self.possessive.finditer(string): + dispos.append((prev, match.start())) + prev = match.end() + if prev < len(string): + dispos.append((prev, len(string))) + else: + # Shortcut if there's no apostrophe in the string + dispos = ((0, len(string)),) + + # For each run between 's + for sc, ec in dispos: + # Split on boundary characters + for part_match in self.between.finditer(string, sc, ec): + part_start = part_match.start() + part_end = part_match.end() + + if splitting: + # The point to start splitting at + prev = part_start + # Find transitions (e.g. 
"iW" or "a0") + for bmatch in bound.finditer(string, part_start, part_end): + # The point in the middle of the transition + pivot = bmatch.start() + 1 + # Yield from the previous match to the transition + yield (prev, pivot) + # Make the transition the new starting point + prev = pivot + + # If there's leftover text at the end, yield it too + if prev < part_end: + yield (prev, part_end) + else: + # Not splitting on transitions, just yield the part + yield (part_start, part_end) + + def _merge(self, parts): + mergewords = self.mergewords + mergenums = self.mergenums + + # Current type (1=alpah, 2=digit) + last = 0 + # Where to insert a merged term in the original list + insertat = 0 + # Buffer for parts to merge + buf = [] + # Iterate on a copy of the parts list so we can modify the original as + # we go + + def insert_item(buf, at, newpos): + newtext = "".join(item[0] for item in buf) + newsc = buf[0][2] # start char of first item in buffer + newec = buf[-1][3] # end char of last item in buffer + parts.insert(insertat, (newtext, newpos, newsc, newec)) + + for item in list(parts): + # item = (text, pos, startchar, endchar) + text = item[0] + pos = item[1] + + # Set the type of this part + if text.isalpha(): + this = 1 + elif text.isdigit(): + this = 2 + else: + this = None + + # Is this the same type as the previous part? + if (buf and (this == last == 1 and mergewords) + or (this == last == 2 and mergenums)): + # This part is the same type as the previous. Add it to the + # buffer of parts to merge. + buf.append(item) + else: + # This part is different than the previous. + if len(buf) > 1: + # If the buffer has at least two parts in it, merge them + # and add them to the original list of parts. + insert_item(buf, insertat, pos - 1) + insertat += 1 + # Reset the buffer + buf = [item] + last = this + insertat += 1 + + # If there are parts left in the buffer at the end, merge them and add + # them to the original list. + if len(buf) > 1: + insert_item(buf, len(parts), pos) + + def __call__(self, tokens): + mergewords = self.mergewords + mergenums = self.mergenums + + # This filter renumbers tokens as it expands them. New position + # counter. + newpos = None + for t in tokens: + text = t.text + + # If this is the first token we've seen, use it to set the new + # position counter + if newpos is None: + if t.positions: + newpos = t.pos + else: + # Token doesn't have positions, just use 0 + newpos = 0 + + if ((text.isalpha() and (text.islower() or text.isupper())) + or text.isdigit()): + # Short-circuit the common cases of no delimiters, no case + # transitions, only digits, etc. + t.pos = newpos + yield t + newpos += 1 + else: + # Split the token text on delimiters, word and/or number + # boundaries into a list of (text, pos, startchar, endchar) + # tuples + ranges = self._split(text) + parts = [(text[sc:ec], i + newpos, sc, ec) + for i, (sc, ec) in enumerate(ranges)] + + # Did the split yield more than one part? + if len(parts) > 1: + # If the options are set, merge consecutive runs of all- + # letters and/or all-numbers. 
+ if mergewords or mergenums: + self._merge(parts) + + # Yield tokens for the parts + chars = t.chars + if chars: + base = t.startchar + for text, pos, startchar, endchar in parts: + t.text = text + t.pos = pos + if t.chars: + t.startchar = base + startchar + t.endchar = base + endchar + yield t + + if parts: + # Set the new position counter based on the last part + newpos = parts[-1][1] + 1 diff --git a/src/whoosh/analysis/morph.py b/src/whoosh/analysis/morph.py new file mode 100644 index 0000000..b7d644f --- /dev/null +++ b/src/whoosh/analysis/morph.py @@ -0,0 +1,267 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.analysis.filters import Filter +from whoosh.compat import integer_types +from whoosh.lang.dmetaphone import double_metaphone +from whoosh.lang.porter import stem +from whoosh.util.cache import lfu_cache, unbound_cache + + +class StemFilter(Filter): + """Stems (removes suffixes from) the text of tokens using the Porter + stemming algorithm. Stemming attempts to reduce multiple forms of the same + root word (for example, "rendering", "renders", "rendered", etc.) to a + single word in the index. + + >>> stemmer = RegexTokenizer() | StemFilter() + >>> [token.text for token in stemmer("fundamentally willows")] + ["fundament", "willow"] + + You can pass your own stemming function to the StemFilter. The default + is the Porter stemming algorithm for English. + + >>> stemfilter = StemFilter(stem_function) + + You can also use one of the Snowball stemming functions by passing the + `lang` keyword argument. + + >>> stemfilter = StemFilter(lang="ru") + + The list of available languages is in `whoosh.lang.languages`. + You can use :func:`whoosh.lang.has_stemmer` to check if a given language has + a stemming function available. + + By default, this class wraps an LRU cache around the stemming function. The + ``cachesize`` keyword argument sets the size of the cache. To make the + cache unbounded (the class caches every input), use ``cachesize=-1``. To + disable caching, use ``cachesize=None``. 
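+
+    An illustrative sketch (an assumption about typical usage, not from the
+    upstream docs) combining ``ignore`` and ``cachesize`` with the default
+    Porter stemmer:
+
+    >>> ana = RegexTokenizer() | StemFilter(ignore=["rendering"], cachesize=None)
+    >>> [token.text for token in ana("rendering renders")]
+    ["rendering", "render"]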
+ + If you compile and install the py-stemmer library, the + :class:`PyStemmerFilter` provides slightly easier access to the language + stemmers in that library. + """ + + __inittypes__ = dict(stemfn=object, ignore=list) + + is_morph = True + + def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): + """ + :param stemfn: the function to use for stemming. + :param lang: if not None, overrides the stemfn with a language stemmer + from the ``whoosh.lang.snowball`` package. + :param ignore: a set/list of words that should not be stemmed. This is + converted into a frozenset. If you omit this argument, all tokens + are stemmed. + :param cachesize: the maximum number of words to cache. Use ``-1`` for + an unbounded cache, or ``None`` for no caching. + """ + + self.stemfn = stemfn + self.lang = lang + self.ignore = frozenset() if ignore is None else frozenset(ignore) + self.cachesize = cachesize + # clear() sets the _stem attr to a cached wrapper around self.stemfn + self.clear() + + def __getstate__(self): + # Can't pickle a dynamic function, so we have to remove the _stem + # attribute from the state + return dict([(k, self.__dict__[k]) for k in self.__dict__ + if k != "_stem"]) + + def __setstate__(self, state): + # Check for old instances of StemFilter class, which didn't have a + # cachesize attribute and pickled the cache attribute + if "cachesize" not in state: + self.cachesize = 50000 + if "ignores" in state: + self.ignore = state["ignores"] + elif "ignore" not in state: + self.ignore = frozenset() + if "lang" not in state: + self.lang = None + if "cache" in state: + del state["cache"] + + self.__dict__.update(state) + # Set the _stem attribute + self.clear() + + def clear(self): + if self.lang: + from whoosh.lang import stemmer_for_language + stemfn = stemmer_for_language(self.lang) + else: + stemfn = self.stemfn + + if isinstance(self.cachesize, integer_types) and self.cachesize != 0: + if self.cachesize < 0: + self._stem = unbound_cache(stemfn) + elif self.cachesize > 1: + self._stem = lfu_cache(self.cachesize)(stemfn) + else: + self._stem = stemfn + + def cache_info(self): + if self.cachesize <= 1: + return None + return self._stem.cache_info() + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.stemfn == other.stemfn) + + def __call__(self, tokens): + stemfn = self._stem + ignore = self.ignore + + for t in tokens: + if not t.stopped: + text = t.text + if text not in ignore: + t.text = stemfn(text) + yield t + + +class PyStemmerFilter(StemFilter): + """This is a simple subclass of StemFilter that works with the py-stemmer + third-party library. You must have the py-stemmer library installed to use + this filter. + + >>> PyStemmerFilter("spanish") + """ + + def __init__(self, lang="english", ignore=None, cachesize=10000): + """ + :param lang: a string identifying the stemming algorithm to use. You + can get a list of available algorithms by with the + :meth:`PyStemmerFilter.algorithms` method. The identification + strings are directly from the py-stemmer library. + :param ignore: a set/list of words that should not be stemmed. This is + converted into a frozenset. If you omit this argument, all tokens + are stemmed. + :param cachesize: the maximum number of words to cache. 
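+
+        A usage sketch (this assumes the third-party ``Stemmer`` module from
+        the py-stemmer package is importable; the valid identification
+        strings come from :meth:`PyStemmerFilter.algorithms`):
+
+        >>> "german" in PyStemmerFilter().algorithms()
+        True
+        >>> ana = RegexTokenizer() | PyStemmerFilter("german")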
+ """ + + self.lang = lang + self.ignore = frozenset() if ignore is None else frozenset(ignore) + self.cachesize = cachesize + self._stem = self._get_stemmer_fn() + + def algorithms(self): + """Returns a list of stemming algorithms provided by the py-stemmer + library. + """ + + import Stemmer # @UnresolvedImport + + return Stemmer.algorithms() + + def cache_info(self): + return None + + def _get_stemmer_fn(self): + import Stemmer # @UnresolvedImport + + stemmer = Stemmer.Stemmer(self.lang) + stemmer.maxCacheSize = self.cachesize + return stemmer.stemWord + + def __getstate__(self): + # Can't pickle a dynamic function, so we have to remove the _stem + # attribute from the state + return dict([(k, self.__dict__[k]) for k in self.__dict__ + if k != "_stem"]) + + def __setstate__(self, state): + # Check for old instances of StemFilter class, which didn't have a + # cachesize attribute and pickled the cache attribute + if "cachesize" not in state: + self.cachesize = 10000 + if "ignores" in state: + self.ignore = state["ignores"] + elif "ignore" not in state: + self.ignore = frozenset() + if "cache" in state: + del state["cache"] + + self.__dict__.update(state) + # Set the _stem attribute + self._stem = self._get_stemmer_fn() + + +class DoubleMetaphoneFilter(Filter): + """Transforms the text of the tokens using Lawrence Philips's Double + Metaphone algorithm. This algorithm attempts to encode words in such a way + that similar-sounding words reduce to the same code. This may be useful for + fields containing the names of people and places, and other uses where + tolerance of spelling differences is desireable. + """ + + is_morph = True + + def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False): + """ + :param primary_boost: the boost to apply to the token containing the + primary code. + :param secondary_boost: the boost to apply to the token containing the + secondary code, if any. + :param combine: if True, the original unencoded tokens are kept in the + stream, preceding the encoded tokens. + """ + + self.primary_boost = primary_boost + self.secondary_boost = secondary_boost + self.combine = combine + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.primary_boost == other.primary_boost) + + def __call__(self, tokens): + primary_boost = self.primary_boost + secondary_boost = self.secondary_boost + combine = self.combine + + for t in tokens: + if combine: + yield t + + primary, secondary = double_metaphone(t.text) + b = t.boost + # Overwrite the token's text and boost and yield it + if primary: + t.text = primary + t.boost = b * primary_boost + yield t + if secondary: + t.text = secondary + t.boost = b * secondary_boost + yield t diff --git a/src/whoosh/analysis/ngrams.py b/src/whoosh/analysis/ngrams.py new file mode 100644 index 0000000..a57fcde --- /dev/null +++ b/src/whoosh/analysis/ngrams.py @@ -0,0 +1,237 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.compat import text_type +from whoosh.compat import xrange +from whoosh.analysis.acore import Token +from whoosh.analysis.filters import Filter, LowercaseFilter +from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer + + +# Tokenizer + +class NgramTokenizer(Tokenizer): + """Splits input text into N-grams instead of words. + + >>> ngt = NgramTokenizer(4) + >>> [token.text for token in ngt("hi there")] + ["hi t", "i th", " the", "ther", "here"] + + Note that this tokenizer does NOT use a regular expression to extract + words, so the grams emitted by it will contain whitespace, punctuation, + etc. You may want to massage the input or add a custom filter to this + tokenizer's output. + + Alternatively, if you only want sub-word grams without whitespace, you + could combine a RegexTokenizer with NgramFilter instead. + """ + + __inittypes__ = dict(minsize=int, maxsize=int) + + def __init__(self, minsize, maxsize=None): + """ + :param minsize: The minimum size of the N-grams. + :param maxsize: The maximum size of the N-grams. If you omit + this parameter, maxsize == minsize. + """ + + self.min = minsize + self.max = maxsize or minsize + + def __eq__(self, other): + if self.__class__ is other.__class__: + if self.min == other.min and self.max == other.max: + return True + return False + + def __call__(self, value, positions=False, chars=False, keeporiginal=False, + removestops=True, start_pos=0, start_char=0, mode='', + **kwargs): + assert isinstance(value, text_type), "%r is not unicode" % value + + inlen = len(value) + t = Token(positions, chars, removestops=removestops, mode=mode) + pos = start_pos + + if mode == "query": + size = min(self.max, inlen) + for start in xrange(0, inlen - size + 1): + end = start + size + if end > inlen: + continue + t.text = value[start:end] + if keeporiginal: + t.original = t.text + t.stopped = False + if positions: + t.pos = pos + if chars: + t.startchar = start_char + start + t.endchar = start_char + end + yield t + pos += 1 + else: + for start in xrange(0, inlen - self.min + 1): + for size in xrange(self.min, self.max + 1): + end = start + size + if end > inlen: + continue + t.text = value[start:end] + if keeporiginal: + t.original = t.text + t.stopped = False + if positions: + t.pos = pos + if chars: + t.startchar = start_char + start + t.endchar = start_char + end + + yield t + pos += 1 + + +# Filter + +class NgramFilter(Filter): + """Splits token text into N-grams. 
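+
+    Combining this filter with a word tokenizer gives sub-word grams with no
+    whitespace in them; the ``NgramWordAnalyzer`` function at the bottom of
+    this module wires that up. A sketch using its defaults plus ``at="start"``:
+
+    >>> ana = NgramWordAnalyzer(4, at="start")
+    >>> [token.text for token in ana("hello there")]
+    ["hell", "ther"]
+
+    Used directly on a token stream: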
+ + >>> rext = RegexTokenizer() + >>> stream = rext("hello there") + >>> ngf = NgramFilter(4) + >>> [token.text for token in ngf(stream)] + ["hell", "ello", "ther", "here"] + """ + + __inittypes__ = dict(minsize=int, maxsize=int) + + def __init__(self, minsize, maxsize=None, at=None): + """ + :param minsize: The minimum size of the N-grams. + :param maxsize: The maximum size of the N-grams. If you omit this + parameter, maxsize == minsize. + :param at: If 'start', only take N-grams from the start of each word. + if 'end', only take N-grams from the end of each word. Otherwise, + take all N-grams from the word (the default). + """ + + self.min = minsize + self.max = maxsize or minsize + self.at = 0 + if at == "start": + self.at = -1 + elif at == "end": + self.at = 1 + + def __eq__(self, other): + return other and self.__class__ is other.__class__\ + and self.min == other.min and self.max == other.max + + def __call__(self, tokens): + assert hasattr(tokens, "__iter__") + at = self.at + for t in tokens: + text = t.text + if len(text) < self.min: + continue + + chars = t.chars + if chars: + startchar = t.startchar + # Token positions don't mean much for N-grams, + # so we'll leave the token's original position + # untouched. + + if t.mode == "query": + size = min(self.max, len(t.text)) + if at == -1: + t.text = text[:size] + if chars: + t.endchar = startchar + size + yield t + elif at == 1: + t.text = text[0 - size:] + if chars: + t.startchar = t.endchar - size + yield t + else: + for start in xrange(0, len(text) - size + 1): + t.text = text[start:start + size] + if chars: + t.startchar = startchar + start + t.endchar = startchar + start + size + yield t + else: + if at == -1: + limit = min(self.max, len(text)) + for size in xrange(self.min, limit + 1): + t.text = text[:size] + if chars: + t.endchar = startchar + size + yield t + + elif at == 1: + if chars: + original_startchar = t.startchar + start = max(0, len(text) - self.max) + for i in xrange(start, len(text) - self.min + 1): + t.text = text[i:] + if chars: + t.startchar = original_startchar + i + yield t + else: + for start in xrange(0, len(text) - self.min + 1): + for size in xrange(self.min, self.max + 1): + end = start + size + if end > len(text): + continue + + t.text = text[start:end] + + if chars: + t.startchar = startchar + start + t.endchar = startchar + end + + yield t + + +# Analyzers + +def NgramAnalyzer(minsize, maxsize=None): + """Composes an NgramTokenizer and a LowercaseFilter. + + >>> ana = NgramAnalyzer(4) + >>> [token.text for token in ana("hi there")] + ["hi t", "i th", " the", "ther", "here"] + """ + + return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter() + + +def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None): + if not tokenizer: + tokenizer = RegexTokenizer() + return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at) diff --git a/src/whoosh/analysis/tokenizers.py b/src/whoosh/analysis/tokenizers.py new file mode 100644 index 0000000..630ad46 --- /dev/null +++ b/src/whoosh/analysis/tokenizers.py @@ -0,0 +1,338 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.compat import u, text_type +from whoosh.analysis.acore import Composable, Token +from whoosh.util.text import rcompile + + +default_pattern = rcompile(r"\w+(\.?\w+)*") + + +# Tokenizers + + +class Tokenizer(Composable): + """Base class for Tokenizers. + """ + + def __eq__(self, other): + return other and self.__class__ is other.__class__ + + +class IDTokenizer(Tokenizer): + """Yields the entire input string as a single token. For use in indexed but + untokenized fields, such as a document's path. + + >>> idt = IDTokenizer() + >>> [token.text for token in idt("/a/b 123 alpha")] + ["/a/b 123 alpha"] + """ + + def __call__(self, value, positions=False, chars=False, + keeporiginal=False, removestops=True, + start_pos=0, start_char=0, mode='', **kwargs): + assert isinstance(value, text_type), "%r is not unicode" % value + t = Token(positions, chars, removestops=removestops, mode=mode, + **kwargs) + t.text = value + t.boost = 1.0 + if keeporiginal: + t.original = value + if positions: + t.pos = start_pos + 1 + if chars: + t.startchar = start_char + t.endchar = start_char + len(value) + yield t + + +class RegexTokenizer(Tokenizer): + """ + Uses a regular expression to extract tokens from text. + + >>> rex = RegexTokenizer() + >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))] + ["hi", "there", "3.141", "big", "time", "under_score"] + """ + + def __init__(self, expression=default_pattern, gaps=False): + """ + :param expression: A regular expression object or string. Each match + of the expression equals a token. Group 0 (the entire matched text) + is used as the text of the token. If you require more complicated + handling of the expression match, simply write your own tokenizer. + :param gaps: If True, the tokenizer *splits* on the expression, rather + than matching on the expression. + """ + + self.expression = rcompile(expression) + self.gaps = gaps + + def __eq__(self, other): + if self.__class__ is other.__class__: + if self.expression.pattern == other.expression.pattern: + return True + return False + + def __call__(self, value, positions=False, chars=False, keeporiginal=False, + removestops=True, start_pos=0, start_char=0, tokenize=True, + mode='', **kwargs): + """ + :param value: The unicode string to tokenize. + :param positions: Whether to record token positions in the token. 
+ :param chars: Whether to record character offsets in the token. + :param start_pos: The position number of the first token. For example, + if you set start_pos=2, the tokens will be numbered 2,3,4,... + instead of 0,1,2,... + :param start_char: The offset of the first character of the first + token. For example, if you set start_char=2, the text "aaa bbb" + will have chars (2,5),(6,9) instead (0,3),(4,7). + :param tokenize: if True, the text should be tokenized. + """ + + assert isinstance(value, text_type), "%s is not unicode" % repr(value) + + t = Token(positions, chars, removestops=removestops, mode=mode, + **kwargs) + if not tokenize: + t.original = t.text = value + t.boost = 1.0 + if positions: + t.pos = start_pos + if chars: + t.startchar = start_char + t.endchar = start_char + len(value) + yield t + elif not self.gaps: + # The default: expression matches are used as tokens + for pos, match in enumerate(self.expression.finditer(value)): + t.text = match.group(0) + t.boost = 1.0 + if keeporiginal: + t.original = t.text + t.stopped = False + if positions: + t.pos = start_pos + pos + if chars: + t.startchar = start_char + match.start() + t.endchar = start_char + match.end() + yield t + else: + # When gaps=True, iterate through the matches and + # yield the text between them. + prevend = 0 + pos = start_pos + for match in self.expression.finditer(value): + start = prevend + end = match.start() + text = value[start:end] + if text: + t.text = text + t.boost = 1.0 + if keeporiginal: + t.original = t.text + t.stopped = False + if positions: + t.pos = pos + pos += 1 + if chars: + t.startchar = start_char + start + t.endchar = start_char + end + + yield t + + prevend = match.end() + + # If the last "gap" was before the end of the text, + # yield the last bit of text as a final token. + if prevend < len(value): + t.text = value[prevend:] + t.boost = 1.0 + if keeporiginal: + t.original = t.text + t.stopped = False + if positions: + t.pos = pos + if chars: + t.startchar = prevend + t.endchar = len(value) + yield t + + +class CharsetTokenizer(Tokenizer): + """Tokenizes and translates text according to a character mapping object. + Characters that map to None are considered token break characters. For all + other characters the map is used to translate the character. This is useful + for case and accent folding. + + This tokenizer loops character-by-character and so will likely be much + slower than :class:`RegexTokenizer`. + + One way to get a character mapping object is to convert a Sphinx charset + table file using :func:`whoosh.support.charset.charset_table_to_dict`. + + >>> from whoosh.support.charset import charset_table_to_dict + >>> from whoosh.support.charset import default_charset + >>> charmap = charset_table_to_dict(default_charset) + >>> chtokenizer = CharsetTokenizer(charmap) + >>> [t.text for t in chtokenizer(u'Stra\\xdfe ABC')] + [u'strase', u'abc'] + + The Sphinx charset table format is described at + http://www.sphinxsearch.com/docs/current.html#conf-charset-table. + """ + + __inittype__ = dict(charmap=str) + + def __init__(self, charmap): + """ + :param charmap: a mapping from integer character numbers to unicode + characters, as used by the unicode.translate() method. 
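+
+            A related sketch (an aside, not specific to this class): when all
+            you need is accent and case folding, the same kind of mapping can
+            be applied as a filter behind a faster regex tokenizer, assuming
+            the ``CharsetFilter`` and ``accent_map`` helpers shipped with
+            Whoosh::
+
+                from whoosh.analysis import RegexTokenizer, LowercaseFilter
+                from whoosh.analysis.filters import CharsetFilter
+                from whoosh.support.charset import accent_map
+
+                ana = RegexTokenizer() | LowercaseFilter() | CharsetFilter(accent_map)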
+ """ + self.charmap = charmap + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.charmap == other.charmap) + + def __call__(self, value, positions=False, chars=False, keeporiginal=False, + removestops=True, start_pos=0, start_char=0, tokenize=True, + mode='', **kwargs): + """ + :param value: The unicode string to tokenize. + :param positions: Whether to record token positions in the token. + :param chars: Whether to record character offsets in the token. + :param start_pos: The position number of the first token. For example, + if you set start_pos=2, the tokens will be numbered 2,3,4,... + instead of 0,1,2,... + :param start_char: The offset of the first character of the first + token. For example, if you set start_char=2, the text "aaa bbb" + will have chars (2,5),(6,9) instead (0,3),(4,7). + :param tokenize: if True, the text should be tokenized. + """ + + assert isinstance(value, text_type), "%r is not unicode" % value + + t = Token(positions, chars, removestops=removestops, mode=mode, + **kwargs) + if not tokenize: + t.original = t.text = value + t.boost = 1.0 + if positions: + t.pos = start_pos + if chars: + t.startchar = start_char + t.endchar = start_char + len(value) + yield t + else: + text = u("") + charmap = self.charmap + pos = start_pos + startchar = currentchar = start_char + for char in value: + tchar = charmap[ord(char)] + if tchar: + text += tchar + else: + if currentchar > startchar: + t.text = text + t.boost = 1.0 + if keeporiginal: + t.original = t.text + if positions: + t.pos = pos + pos += 1 + if chars: + t.startchar = startchar + t.endchar = currentchar + yield t + startchar = currentchar + 1 + text = u("") + + currentchar += 1 + + if currentchar > startchar: + t.text = value[startchar:currentchar] + t.boost = 1.0 + if keeporiginal: + t.original = t.text + if positions: + t.pos = pos + if chars: + t.startchar = startchar + t.endchar = currentchar + yield t + + +def SpaceSeparatedTokenizer(): + """Returns a RegexTokenizer that splits tokens by whitespace. + + >>> sst = SpaceSeparatedTokenizer() + >>> [token.text for token in sst("hi there big-time, what's up")] + ["hi", "there", "big-time,", "what's", "up"] + """ + + return RegexTokenizer(r"[^ \t\r\n]+") + + +def CommaSeparatedTokenizer(): + """Splits tokens by commas. + + Note that the tokenizer calls unicode.strip() on each match of the regular + expression. + + >>> cst = CommaSeparatedTokenizer() + >>> [token.text for token in cst("hi there, what's , up")] + ["hi there", "what's", "up"] + """ + + from whoosh.analysis.filters import StripFilter + + return RegexTokenizer(r"[^,]+") | StripFilter() + + +class PathTokenizer(Tokenizer): + """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens + ``["/a", "/a/b", "/a/b/c"]``. 
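+
+    For example (a small sketch of the default ``"[^/]+"`` expression):
+
+    >>> pt = PathTokenizer()
+    >>> [token.text for token in pt("/usr/local/bin")]
+    ["/usr", "/usr/local", "/usr/local/bin"]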
+ """ + + def __init__(self, expression="[^/]+"): + self.expr = rcompile(expression) + + def __call__(self, value, positions=False, start_pos=0, **kwargs): + assert isinstance(value, text_type), "%r is not unicode" % value + token = Token(positions, **kwargs) + pos = start_pos + for match in self.expr.finditer(value): + token.text = value[:match.end()] + if positions: + token.pos = pos + pos += 1 + yield token + diff --git a/src/whoosh/automata/__init__.py b/src/whoosh/automata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/whoosh/automata/fsa.py b/src/whoosh/automata/fsa.py new file mode 100644 index 0000000..187e562 --- /dev/null +++ b/src/whoosh/automata/fsa.py @@ -0,0 +1,714 @@ +from __future__ import print_function + +import itertools +import operator +import sys +from bisect import bisect_left +from collections import defaultdict + +from whoosh.compat import iteritems, next, text_type, unichr, xrange + + +unull = unichr(0) + + +# Marker constants + +class Marker(object): + def __init__(self, name): + self.name = name + + def __repr__(self): + return "<%s>" % self.name + + +EPSILON = Marker("EPSILON") +ANY = Marker("ANY") + + +# Base class + +class FSA(object): + def __init__(self, initial): + self.initial = initial + self.transitions = {} + self.final_states = set() + + def __len__(self): + return len(self.all_states()) + + def __eq__(self, other): + if self.initial != other.initial: + return False + if self.final_states != other.final_states: + return False + st = self.transitions + ot = other.transitions + if list(st) != list(ot): + return False + for key in st: + if st[key] != ot[key]: + return False + return True + + def all_states(self): + stateset = set(self.transitions) + for src, trans in iteritems(self.transitions): + stateset.update(trans.values()) + return stateset + + def all_labels(self): + labels = set() + for src, trans in iteritems(self.transitions): + labels.update(trans) + return labels + + def get_labels(self, src): + return iter(self.transitions.get(src, [])) + + def generate_all(self, state=None, sofar=""): + state = self.start() if state is None else state + if self.is_final(state): + yield sofar + for label in sorted(self.get_labels(state)): + newstate = self.next_state(state, label) + for string in self.generate_all(newstate, sofar + label): + yield string + + def start(self): + return self.initial + + def next_state(self, state, label): + raise NotImplementedError + + def is_final(self, state): + raise NotImplementedError + + def add_transition(self, src, label, dest): + raise NotImplementedError + + def add_final_state(self, state): + raise NotImplementedError + + def to_dfa(self): + raise NotImplementedError + + def accept(self, string, debug=False): + state = self.start() + + for label in string: + if debug: + print(" ", state, "->", label, "->") + + state = self.next_state(state, label) + if not state: + break + + return self.is_final(state) + + def append(self, fsa): + self.transitions.update(fsa.transitions) + for state in self.final_states: + self.add_transition(state, EPSILON, fsa.initial) + self.final_states = fsa.final_states + + +# Implementations + +class NFA(FSA): + def __init__(self, initial): + self.transitions = {} + self.final_states = set() + self.initial = initial + + def dump(self, stream=sys.stdout): + starts = self.start() + for src in self.transitions: + beg = "@" if src in starts else " " + print(beg, src, file=stream) + xs = self.transitions[src] + for label in xs: + dests = xs[label] + end = "||" if 
self.is_final(dests) else "" + + def start(self): + return frozenset(self._expand(set([self.initial]))) + + def add_transition(self, src, label, dest): + self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest) + + def add_final_state(self, state): + self.final_states.add(state) + + def triples(self): + for src, trans in iteritems(self.transitions): + for label, dests in iteritems(trans): + for dest in dests: + yield src, label, dest + + def is_final(self, states): + return bool(self.final_states.intersection(states)) + + def _expand(self, states): + transitions = self.transitions + frontier = set(states) + while frontier: + state = frontier.pop() + if state in transitions and EPSILON in transitions[state]: + new_states = transitions[state][EPSILON].difference(states) + frontier.update(new_states) + states.update(new_states) + return states + + def next_state(self, states, label): + transitions = self.transitions + dest_states = set() + for state in states: + if state in transitions: + xs = transitions[state] + if label in xs: + dest_states.update(xs[label]) + if ANY in xs: + dest_states.update(xs[ANY]) + return frozenset(self._expand(dest_states)) + + def get_labels(self, states): + transitions = self.transitions + labels = set() + for state in states: + if state in transitions: + labels.update(transitions[state]) + return labels + + def embed(self, other): + # Copy all transitions from the other NFA into this one + for s, othertrans in iteritems(other.transitions): + trans = self.transitions.setdefault(s, {}) + for label, otherdests in iteritems(othertrans): + dests = trans.setdefault(label, set()) + dests.update(otherdests) + + def insert(self, src, other, dest): + self.embed(other) + + # Connect src to the other NFA's initial state, and the other + # NFA's final states to dest + self.add_transition(src, EPSILON, other.initial) + for finalstate in other.final_states: + self.add_transition(finalstate, EPSILON, dest) + + def to_dfa(self): + dfa = DFA(self.start()) + frontier = [self.start()] + seen = set() + while frontier: + current = frontier.pop() + if self.is_final(current): + dfa.add_final_state(current) + labels = self.get_labels(current) + for label in labels: + if label is EPSILON: + continue + new_state = self.next_state(current, label) + if new_state not in seen: + frontier.append(new_state) + seen.add(new_state) + if self.is_final(new_state): + dfa.add_final_state(new_state) + if label is ANY: + dfa.set_default_transition(current, new_state) + else: + dfa.add_transition(current, label, new_state) + return dfa + + +class DFA(FSA): + def __init__(self, initial): + self.initial = initial + self.transitions = {} + self.defaults = {} + self.final_states = set() + self.outlabels = {} + + def dump(self, stream=sys.stdout): + for src in sorted(self.transitions): + beg = "@" if src == self.initial else " " + print(beg, src, file=stream) + xs = self.transitions[src] + for label in sorted(xs): + dest = xs[label] + end = "||" if self.is_final(dest) else "" + + def start(self): + return self.initial + + def add_transition(self, src, label, dest): + self.transitions.setdefault(src, {})[label] = dest + + def set_default_transition(self, src, dest): + self.defaults[src] = dest + + def add_final_state(self, state): + self.final_states.add(state) + + def is_final(self, state): + return state in self.final_states + + def next_state(self, src, label): + trans = self.transitions.get(src, {}) + return trans.get(label, self.defaults.get(src, None)) + + def next_valid_string(self, 
string, asbytes=False): + state = self.start() + stack = [] + + # Follow the DFA as far as possible + for i, label in enumerate(string): + stack.append((string[:i], state, label)) + state = self.next_state(state, label) + if not state: + break + else: + stack.append((string[:i + 1], state, None)) + + if self.is_final(state): + # Word is already valid + return string + + # Perform a 'wall following' search for the lexicographically smallest + # accepting state. + while stack: + path, state, label = stack.pop() + label = self.find_next_edge(state, label, asbytes=asbytes) + if label: + path += label + state = self.next_state(state, label) + if self.is_final(state): + return path + stack.append((path, state, None)) + return None + + def find_next_edge(self, s, label, asbytes): + if label is None: + label = b"\x00" if asbytes else u'\0' + else: + label = (label + 1) if asbytes else unichr(ord(label) + 1) + trans = self.transitions.get(s, {}) + if label in trans or s in self.defaults: + return label + + try: + labels = self.outlabels[s] + except KeyError: + self.outlabels[s] = labels = sorted(trans) + + pos = bisect_left(labels, label) + if pos < len(labels): + return labels[pos] + return None + + def reachable_from(self, src, inclusive=True): + transitions = self.transitions + + reached = set() + if inclusive: + reached.add(src) + + stack = [src] + seen = set() + while stack: + src = stack.pop() + seen.add(src) + for _, dest in iteritems(transitions[src]): + reached.add(dest) + if dest not in seen: + stack.append(dest) + return reached + + def minimize(self): + transitions = self.transitions + initial = self.initial + + # Step 1: Delete unreachable states + reachable = self.reachable_from(initial) + for src in list(transitions): + if src not in reachable: + del transitions[src] + final_states = self.final_states.intersection(reachable) + labels = self.all_labels() + + # Step 2: Partition the states into equivalence sets + changed = True + parts = [final_states, reachable - final_states] + while changed: + changed = False + for i in xrange(len(parts)): + part = parts[i] + changed_part = False + for label in labels: + next_part = None + new_part = set() + for state in part: + dest = transitions[state].get(label) + if dest is not None: + if next_part is None: + for p in parts: + if dest in p: + next_part = p + elif dest not in next_part: + new_part.add(state) + changed = True + changed_part = True + if changed_part: + old_part = part - new_part + parts.pop(i) + parts.append(old_part) + parts.append(new_part) + break + + # Choose one state from each equivalence set and map all equivalent + # states to it + new_trans = {} + + # Create mapping + mapping = {} + new_initial = None + for part in parts: + representative = part.pop() + if representative is initial: + new_initial = representative + mapping[representative] = representative + new_trans[representative] = {} + for state in part: + if state is initial: + new_initial = representative + mapping[state] = representative + assert new_initial is not None + + # Apply mapping to existing transitions + new_finals = set(mapping[s] for s in final_states) + for state, d in iteritems(new_trans): + trans = transitions[state] + for label, dest in iteritems(trans): + d[label] = mapping[dest] + + # Remove dead states - non-final states with no outgoing arcs except + # to themselves + non_final_srcs = [src for src in new_trans if src not in new_finals] + removing = set() + for src in non_final_srcs: + dests = set(new_trans[src].values()) + dests.discard(src) + 
if not dests: + removing.add(src) + del new_trans[src] + # Delete transitions to removed dead states + for t in new_trans.values(): + for label in list(t): + if t[label] in removing: + del t[label] + + self.transitions = new_trans + self.initial = new_initial + self.final_states = new_finals + + def to_dfa(self): + return self + + +# Useful functions + +def renumber_dfa(dfa, base=0): + c = itertools.count(base) + mapping = {} + + def remap(state): + if state in mapping: + newnum = mapping[state] + else: + newnum = next(c) + mapping[state] = newnum + return newnum + + newdfa = DFA(remap(dfa.initial)) + for src, trans in iteritems(dfa.transitions): + for label, dest in iteritems(trans): + newdfa.add_transition(remap(src), label, remap(dest)) + for finalstate in dfa.final_states: + newdfa.add_final_state(remap(finalstate)) + for src, dest in iteritems(dfa.defaults): + newdfa.set_default_transition(remap(src), remap(dest)) + return newdfa + + +def u_to_utf8(dfa, base=0): + c = itertools.count(base) + transitions = dfa.transitions + + for src, trans in iteritems(transitions): + trans = transitions[src] + for label, dest in list(iteritems(trans)): + if label is EPSILON: + continue + elif label is ANY: + raise Exception + else: + assert isinstance(label, text_type) + label8 = label.encode("utf8") + for i, byte in enumerate(label8): + if i < len(label8) - 1: + st = next(c) + dfa.add_transition(src, byte, st) + src = st + else: + dfa.add_transition(src, byte, dest) + del trans[label] + + +def find_all_matches(dfa, lookup_func, first=unull): + """ + Uses lookup_func to find all words within levenshtein distance k of word. + + Args: + word: The word to look up + k: Maximum edit distance + lookup_func: A single argument function that returns the first word in the + database that is greater than or equal to the input argument. + Yields: + Every matching word within levenshtein distance k from the database. 
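+
+    A self-contained sketch (illustrative only; a sorted word list plus
+    ``bisect`` stands in for the index's term dictionary, and the DFA comes
+    from ``whoosh.automata.lev.levenshtein_automaton``)::
+
+        from bisect import bisect_left
+        from whoosh.automata.lev import levenshtein_automaton
+
+        words = ["fork", "form", "fort", "forth"]
+
+        def lookup(key):
+            i = bisect_left(words, key)
+            return words[i] if i < len(words) else None
+
+        dfa = levenshtein_automaton("fore", 1).to_dfa()
+        print(list(find_all_matches(dfa, lookup)))
+        # expected: ['fork', 'form', 'fort']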
+ """ + + match = dfa.next_valid_string(first) + while match: + key = lookup_func(match) + if key is None: + return + if match == key: + yield match + key += unull + match = dfa.next_valid_string(key) + + +# Construction functions + +def reverse_nfa(n): + s = object() + nfa = NFA(s) + for src, trans in iteritems(n.transitions): + for label, destset in iteritems(trans): + for dest in destset: + nfa.add_transition(dest, label, src) + for finalstate in n.final_states: + nfa.add_transition(s, EPSILON, finalstate) + nfa.add_final_state(n.initial) + return nfa + + +def product(dfa1, op, dfa2): + dfa1 = dfa1.to_dfa() + dfa2 = dfa2.to_dfa() + start = (dfa1.start(), dfa2.start()) + dfa = DFA(start) + stack = [start] + while stack: + src = stack.pop() + state1, state2 = src + trans1 = set(dfa1.transitions[state1]) + trans2 = set(dfa2.transitions[state2]) + for label in trans1.intersection(trans2): + state1 = dfa1.next_state(state1, label) + state2 = dfa2.next_state(state2, label) + if op(state1 is not None, state2 is not None): + dest = (state1, state2) + dfa.add_transition(src, label, dest) + stack.append(dest) + if op(dfa1.is_final(state1), dfa2.is_final(state2)): + dfa.add_final_state(dest) + return dfa + + +def intersection(dfa1, dfa2): + return product(dfa1, operator.and_, dfa2) + + +def union(dfa1, dfa2): + return product(dfa1, operator.or_, dfa2) + + +def epsilon_nfa(): + return basic_nfa(EPSILON) + + +def dot_nfa(): + return basic_nfa(ANY) + + +def basic_nfa(label): + s = object() + e = object() + nfa = NFA(s) + nfa.add_transition(s, label, e) + nfa.add_final_state(e) + return nfa + + +def charset_nfa(labels): + s = object() + e = object() + nfa = NFA(s) + for label in labels: + nfa.add_transition(s, label, e) + nfa.add_final_state(e) + return nfa + + +def string_nfa(string): + s = object() + e = object() + nfa = NFA(s) + for label in string: + e = object() + nfa.add_transition(s, label, e) + s = e + nfa.add_final_state(e) + return nfa + + +def choice_nfa(n1, n2): + s = object() + e = object() + nfa = NFA(s) + # -> nfa1 - + # / \ + # s e + # \ / + # -> nfa2 - + nfa.insert(s, n1, e) + nfa.insert(s, n2, e) + nfa.add_final_state(e) + return nfa + + +def concat_nfa(n1, n2): + s = object() + m = object() + e = object() + nfa = NFA(s) + nfa.insert(s, n1, m) + nfa.insert(m, n2, e) + nfa.add_final_state(e) + return nfa + + +def star_nfa(n): + s = object() + e = object() + nfa = NFA(s) + # -----<----- + # / \ + # s ---> n ---> e + # \ / + # ----->----- + + nfa.insert(s, n, e) + nfa.add_transition(s, EPSILON, e) + for finalstate in n.final_states: + nfa.add_transition(finalstate, EPSILON, s) + nfa.add_final_state(e) + return nfa + + +def plus_nfa(n): + return concat_nfa(n, star_nfa(n)) + + +def optional_nfa(n): + return choice_nfa(n, epsilon_nfa()) + + +# Daciuk Mihov DFA construction algorithm + +class DMNode(object): + def __init__(self, n): + self.n = n + self.arcs = {} + self.final = False + + def __repr__(self): + return "<%s, %r>" % (self.n, self.tuple()) + + def __hash__(self): + return hash(self.tuple()) + + def tuple(self): + arcs = tuple(sorted(iteritems(self.arcs))) + return arcs, self.final + + +def strings_dfa(strings): + dfa = DFA(0) + c = itertools.count(1) + + last = "" + seen = {} + nodes = [DMNode(0)] + + for string in strings: + if string <= last: + raise Exception("Strings must be in order") + if not string: + raise Exception("Can't add empty string") + + # Find the common prefix with the previous string + i = 0 + while i < len(last) and i < len(string) and last[i] == string[i]: + 
i += 1 + prefixlen = i + + # Freeze the transitions after the prefix, since they're not shared + add_suffix(dfa, nodes, last, prefixlen + 1, seen) + + # Create new nodes for the substring after the prefix + for label in string[prefixlen:]: + node = DMNode(next(c)) + # Create an arc from the previous node to this node + nodes[-1].arcs[label] = node.n + nodes.append(node) + # Mark the last node as an accept state + nodes[-1].final = True + + last = string + + if len(nodes) > 1: + add_suffix(dfa, nodes, last, 0, seen) + return dfa + + +def add_suffix(dfa, nodes, last, downto, seen): + while len(nodes) > downto: + node = nodes.pop() + tup = node.tuple() + + # If a node just like this one (final/nonfinal, same arcs to same + # destinations) is already seen, replace with it + try: + this = seen[tup] + except KeyError: + this = node.n + if node.final: + dfa.add_final_state(this) + seen[tup] = this + else: + # If we replaced the node with an already seen one, fix the parent + # node's pointer to this + parent = nodes[-1] + inlabel = last[len(nodes) - 1] + parent.arcs[inlabel] = this + + # Add the node's transitions to the DFA + for label, dest in iteritems(node.arcs): + dfa.add_transition(this, label, dest) + + + + diff --git a/src/whoosh/automata/glob.py b/src/whoosh/automata/glob.py new file mode 100644 index 0000000..b8fbc87 --- /dev/null +++ b/src/whoosh/automata/glob.py @@ -0,0 +1,90 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.automata.fsa import ANY, EPSILON, NFA + + +# Constants for glob +_LIT = 0 +_STAR = 1 +_PLUS = 2 +_QUEST = 3 +_RANGE = 4 + + +def parse_glob(pattern, _glob_multi="*", _glob_single="?", + _glob_range1="[", _glob_range2="]"): + pos = 0 + last = None + while pos < len(pattern): + char = pattern[pos] + pos += 1 + if char == _glob_multi: # * + # (Ignore more than one star in a row) + if last is not _STAR: + yield _STAR, None + last = _STAR + elif char == _glob_single: # ? + # (Ignore ? 
after a star) + if last is not _STAR: + yield _QUEST, None + last = _QUEST + elif char == _glob_range1: # [ + chars = set() + negate = False + # Take the char range specification until the ] + while pos < len(pattern): + char = pattern[pos] + pos += 1 + if char == _glob_range2: + break + chars.add(char) + if chars: + yield _RANGE, (chars, negate) + last = _RANGE + else: + yield _LIT, char + last = _LIT + + +def glob_automaton(pattern): + nfa = NFA(0) + i = -1 + for i, (op, arg) in enumerate(parse_glob(pattern)): + if op is _LIT: + nfa.add_transition(i, arg, i + 1) + elif op is _STAR: + nfa.add_transition(i, ANY, i + 1) + nfa.add_transition(i, EPSILON, i + 1) + nfa.add_transition(i + 1, EPSILON, i) + elif op is _QUEST: + nfa.add_transition(i, ANY, i + 1) + elif op is _RANGE: + for char in arg[0]: + nfa.add_transition(i, char, i + 1) + nfa.add_final_state(i + 1) + return nfa diff --git a/src/whoosh/automata/lev.py b/src/whoosh/automata/lev.py new file mode 100644 index 0000000..7067c64 --- /dev/null +++ b/src/whoosh/automata/lev.py @@ -0,0 +1,30 @@ +from __future__ import print_function + +from whoosh.compat import unichr, xrange +from whoosh.automata.fsa import ANY, EPSILON, NFA, unull + + +def levenshtein_automaton(term, k, prefix=0): + nfa = NFA((0, 0)) + if prefix: + for i in xrange(prefix): + c = term[i] + nfa.add_transition((i, 0), c, (i + 1, 0)) + + for i in xrange(prefix, len(term)): + c = term[i] + for e in xrange(k + 1): + # Correct character + nfa.add_transition((i, e), c, (i + 1, e)) + if e < k: + # Deletion + nfa.add_transition((i, e), ANY, (i, e + 1)) + # Insertion + nfa.add_transition((i, e), EPSILON, (i + 1, e + 1)) + # Substitution + nfa.add_transition((i, e), ANY, (i + 1, e + 1)) + for e in xrange(k + 1): + if e < k: + nfa.add_transition((len(term), e), ANY, (len(term), e + 1)) + nfa.add_final_state((len(term), e)) + return nfa diff --git a/src/whoosh/automata/nfa.py b/src/whoosh/automata/nfa.py new file mode 100644 index 0000000..6ea72be --- /dev/null +++ b/src/whoosh/automata/nfa.py @@ -0,0 +1,388 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.automata.fst import Arc + + +class Instruction(object): + def __repr__(self): + return "%s()" % (self.__class__.__name__, ) + + +class Char(Instruction): + """ + Matches a literal character. + """ + + def __init__(self, c): + self.c = c + + def __repr__(self): + return "Char(%r)" % self.c + +class Lit(Instruction): + """ + Matches a literal string. + """ + + def __init__(self, c): + self.c = c + + def __repr__(self): + return "Lit(%r)" % self.c + + +class Any(Instruction): + """ + Matches any character. + """ + + +class Match(Instruction): + """ + Stop this thread: the string matched. + """ + + def __repr__(self): + return "Match()" + + +class Jmp(Instruction): + """ + Jump to a specified instruction. + """ + + def __init__(self, x): + self.x = x + + def __repr__(self): + return "Jmp(%s)" % self.x + + +class Split(Instruction): + """ + Split execution: continue at two separate specified instructions. + """ + + def __init__(self, x, y): + self.x = x + self.y = y + + def __repr__(self): + return "Split(%s, %s)" % (self.x, self.y) + + +class Label(Instruction): + """ + Placeholder to act as a target for JMP instructions + """ + + def __hash__(self): + return id(self) + + def __repr__(self): + return "L(%s)" % hex(id(self)) + + +def concat(e1, e2): + return e1 + e2 + + +def alt(e1, e2): + L1, L2, L3 = Label(), Label(), Label() + return [L1] + e1 + [Jmp(L3), L2] + e2 + [L3] + + +def zero_or_one(e): + L1, L2 = Label(), Label() + return [Split(L1, L2), L1] + e + [L2] + + +def zero_or_more(e): + L1, L2, L3 = Label(), Label(), Label() + return [L1, Split(L2, L3), L2] + e + [Jmp(L1), L3] + + +def one_or_more(e): + L1, L2 = Label(), Label() + return [L1] + e + [Split(L1, L2), L2] + + +def fixup(program): + refs = {} + i = 0 + while i < len(program): + op = program[i] + if isinstance(op, Label): + refs[op] = i + program.pop(i) + else: + i += 1 + + if refs: + for op in program: + if isinstance(op, (Jmp, Split)): + op.x = refs[op.x] + if isinstance(op, Split): + op.y = refs[op.y] + + return program + [Match] + + +class ThreadList(object): + def __init__(self, program, max=1000): + self.program = program + self.max = max + self.threads = [] + + def __nonzero__(self): + return bool(self.threads) + + def current(self): + return self.threads.pop() + + def add(self, thread): + op = self.program[thread.pc] + optype = type(op) + if optype is Jmp: + self.add(thread.at(op.x)) + elif optype is Split: + self.add(thread.copy_at(op.x)) + self.add(thread.at(op.y)) + else: + self.threads.append(thread) + + +class Thread(object): + def __init__(self, pc, address, sofar='', accept=False): + self.pc = pc + self.address = address + self.sofar = sofar + self.accept = accept + + def at(self, pc): + self.pc = pc + return self + + def copy_at(self, pc): + return Thread(pc, self.address, self.sofar, self.accept) + + def __repr__(self): + d = self.__dict__ + return "Thread(%s)" % ",".join("%s=%r" % (k, v) for k, v in d.items()) + + +def advance(thread, arc, c): + thread.pc += 1 + thread.address = arc.target + thread.sofar += c + thread.accept = arc.accept + + +def run(graph, program, address): + threads = ThreadList(program) + threads.add(Thread(0, address)) + arc = Arc() + while threads: + thread = threads.current() + address = thread.address + op = program[thread.pc] + optype = type(op) + + if 
optype is Char: + if address: + arc = graph.find_arc(address, op.c, arc) + if arc: + advance(thread, arc) + threads.add(thread) + elif optype is Lit: + if address: + c = op.c + arc = graph.find_path(c, arc, address) + if arc: + advance(thread, arc, c) + threads.add(thread) + elif optype is Any: + if address: + sofar = thread.sofar + pc = thread.pc + 1 + for arc in graph.iter_arcs(address, arc): + t = Thread(pc, arc.target, sofar + arc.label, arc.accept) + threads.add(t) + elif op is Match: + if thread.accept: + yield thread.sofar + else: + raise Exception("Don't know what to do with %r" % op) + + +LO = 0 +HI = 1 + + +def regex_limit(graph, mode, program, address): + low = mode == LO + output = [] + threads = ThreadList(program) + threads.add(Thread(0, address)) + arc = Arc() + while threads: + thread = threads.current() + address = thread.address + op = program[thread.pc] + optype = type(op) + + if optype is Char: + if address: + arc = graph.find_arc(address, op.c, arc) + if arc: + if low and arc.accept: + return thread.sofar + thread.label + advance(thread, arc) + threads.add(thread) + elif optype is Lit: + if address: + labels = op.c + for label in labels: + arc = graph.find_arc(address, label) + if arc is None: + return thread.sofar + elif thread.accept: + return thread.sofar + elif optype is Any: + if address: + if low: + arc = graph.arc_at(address, arc) + else: + for arc in graph.iter_arcs(address): + pass + advance(thread, arc, arc.label) + threads.add(thread) + elif thread.accept: + return thread.sofar + elif op is Match: + return thread.sofar + else: + raise Exception("Don't know what to do with %r" % op) + + +# if __name__ == "__main__": +# from whoosh import index, query +# from whoosh.filedb.filestore import RamStorage +# from whoosh.automata import fst +# from whoosh.util.testing import timing +# +# st = RamStorage() +# gw = fst.GraphWriter(st.create_file("test")) +# gw.start_field("test") +# for key in ["aaaa", "aaab", "aabb", "abbb", "babb", "bbab", "bbba"]: +# gw.insert(key) +# gw.close() +# gr = fst.GraphReader(st.open_file("test")) +# +# program = one_or_more([Lit("a")]) +# print program +# program = fixup(program) +# print program +# print list(run(gr, program, gr.root("test"))) +# +# ix = index.open_dir("e:/dev/src/houdini/help/index") +# r = ix.reader() +# gr = r._get_graph() +# +# # program = fixup([Any(), Any(), Any(), Any(), Any()]) +# # program = fixup(concat(zero_or_more([Any()]), [Char("/")])) +# # with timing(): +# # x = list(run(gr, program, gr.root("path"))) +# # print len(x) +# +# q = query.Regex("path", "^.[abc].*/$") +# with timing(): +# y = list(q._btexts(r)) +# print len(y) +# print y[0], y[-1] +# +# pr = [Any()] + alt([Lit("c")], alt([Lit("b")], [Lit("a")])) + zero_or_more([Any()]) + [Lit("/")] +# program = fixup(pr) +# # with timing(): +# # x = list(run(gr, program, gr.root("path"))) +# # print len(x), x +# +# with timing(): +# print "lo=", regex_limit(gr, LO, program, gr.root("path")) +# print "hi=", regex_limit(gr, HI, program, gr.root("path")) +# +# +# +# #int +# #backtrackingvm(Inst *prog, char *input) +# #{ +# # enum { MAXTHREAD = 1000 }; +# # Thread ready[MAXTHREAD]; +# # int nready; +# # Inst *pc; +# # char *sp; +# # +# # /* queue initial thread */ +# # ready[0] = thread(prog, input); +# # nready = 1; +# # +# # /* run threads in stack order */ +# # while(nready > 0){ +# # --nready; /* pop state for next thread to run */ +# # pc = ready[nready].pc; +# # sp = ready[nready].sp; +# # for(;;){ +# # switch(pc->opcode){ +# # case Char: +# # if(*sp != 
pc->c) +# # goto Dead; +# # pc++; +# # sp++; +# # continue; +# # case Match: +# # return 1; +# # case Jmp: +# # pc = pc->x; +# # continue; +# # case Split: +# # if(nready >= MAXTHREAD){ +# # fprintf(stderr, "regexp overflow"); +# # return -1; +# # } +# # /* queue new thread */ +# # ready[nready++] = thread(pc->y, sp); +# # pc = pc->x; /* continue current thread */ +# # continue; +# # } +# # } +# # Dead:; +# # } +# # return 0; +# #} +# +# diff --git a/src/whoosh/automata/reg.py b/src/whoosh/automata/reg.py new file mode 100644 index 0000000..578071e --- /dev/null +++ b/src/whoosh/automata/reg.py @@ -0,0 +1,135 @@ +# Copyright 2014 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
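+
+# NOTE: parse() below is currently only a stub, but the pieces provided by
+# RegexBuilder can be combined by hand. A rough sketch (an illustration, not
+# part of the upstream module):
+#
+#     b = RegexBuilder()
+#     nfa = b.concat(b.char("a"), b.star(b.char("b")))   # roughly "ab*"
+#     dfa = nfa.to_dfa()
+#     dfa.accept("abbb")   # expected: True
+#     dfa.accept("ba")     # expected: False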
+ +import re +from whoosh.automata.fsa import ANY, EPSILON, NFA + + +# Operator precedence +CHOICE = ("|", ) +ops = () + + +def parse(pattern): + stack = [] + ops = [] + + + + +class RegexBuilder(object): + def __init__(self): + self.statenum = 1 + + def new_state(self): + self.statenum += 1 + return self.statenum + + def epsilon(self): + s = self.new_state() + e = self.new_state() + nfa = NFA(s) + nfa.add_transition(s, EPSILON, e) + nfa.add_final_state(e) + return nfa + + def char(self, label): + s = self.new_state() + e = self.new_state() + nfa = NFA(s) + nfa.add_transition(s, label, e) + nfa.add_final_state(e) + return nfa + + def charset(self, chars): + s = self.new_state() + e = self.new_state() + nfa = NFA(s) + for char in chars: + nfa.add_transition(s, char, e) + nfa.add_final_state(e) + return e + + def dot(self): + s = self.new_state() + e = self.new_state() + nfa = NFA(s) + nfa.add_transition(s, ANY, e) + nfa.add_final_state(e) + return nfa + + def choice(self, n1, n2): + s = self.new_state() + s1 = self.new_state() + s2 = self.new_state() + e1 = self.new_state() + e2 = self.new_state() + e = self.new_state() + nfa = NFA(s) + nfa.add_transition(s, EPSILON, s1) + nfa.add_transition(s, EPSILON, s2) + nfa.insert(s1, n1, e1) + nfa.insert(s2, n2, e2) + nfa.add_transition(e1, EPSILON, e) + nfa.add_transition(e2, EPSILON, e) + nfa.add_final_state(e) + return nfa + + def concat(self, n1, n2): + s = self.new_state() + m = self.new_state() + e = self.new_state() + nfa = NFA(s) + nfa.insert(s, n1, m) + nfa.insert(m, n2, e) + nfa.add_final_state(e) + return nfa + + def star(self, n): + s = self.new_state() + m1 = self.new_state() + m2 = self.new_state() + e = self.new_state() + nfa = NFA(s) + nfa.add_transition(s, EPSILON, m1) + nfa.add_transition(s, EPSILON, e) + nfa.insert(m1, n, m2) + nfa.add_transition(m2, EPSILON, m1) + nfa.add_transition(m2, EPSILON, e) + nfa.add_final_state(e) + return nfa + + def plus(self, n): + return self.concat(n, self.star(n)) + + def question(self, n): + return self.choice(n, self.epsilon()) + + + + + diff --git a/src/whoosh/classify.py b/src/whoosh/classify.py new file mode 100755 index 0000000..628edf5 --- /dev/null +++ b/src/whoosh/classify.py @@ -0,0 +1,377 @@ +# Copyright 2008 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
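The RegexBuilder class above is a straightforward Thompson construction: each primitive returns a small NFA, and the combinators join them with epsilon transitions (note that charset() appears to return the final state e rather than the assembled NFA, unlike its siblings). A minimal sketch of composing a pattern by hand, assuming fsa.NFA exposes to_dfa() as the Levenshtein helpers elsewhere in this patch suggest:

    builder = RegexBuilder()
    # NFA for the pattern "ab*": an "a" followed by zero or more "b"s
    nfa = builder.concat(builder.char("a"),
                         builder.star(builder.char("b")))
    dfa = nfa.to_dfa()   # determinize before matching (assumed API)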
+# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +"""Classes and functions for classifying and extracting information from +documents. +""" + +from __future__ import division +import random +from collections import defaultdict +from math import log + +from whoosh.compat import xrange, iteritems + + +# Expansion models + +class ExpansionModel(object): + def __init__(self, doc_count, field_length): + self.N = doc_count + self.collection_total = field_length + + if self.N: + self.mean_length = self.collection_total / self.N + else: + self.mean_length = 0 + + def normalizer(self, maxweight, top_total): + raise NotImplementedError + + def score(self, weight_in_top, weight_in_collection, top_total): + raise NotImplementedError + + +class Bo1Model(ExpansionModel): + def normalizer(self, maxweight, top_total): + f = maxweight / self.N + return (maxweight * log((1.0 + f) / f) + log(1.0 + f)) / log(2.0) + + def score(self, weight_in_top, weight_in_collection, top_total): + f = weight_in_collection / self.N + return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2) + + +class Bo2Model(ExpansionModel): + def normalizer(self, maxweight, top_total): + f = maxweight * self.N / self.collection_total + return maxweight * log((1.0 + f) / f, 2) + log(1.0 + f, 2) + + def score(self, weight_in_top, weight_in_collection, top_total): + f = weight_in_top * top_total / self.collection_total + return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2) + + +class KLModel(ExpansionModel): + def normalizer(self, maxweight, top_total): + return (maxweight * log(self.collection_total / top_total) / log(2.0) + * top_total) + + def score(self, weight_in_top, weight_in_collection, top_total): + wit_over_tt = weight_in_top / top_total + wic_over_ct = weight_in_collection / self.collection_total + + if wit_over_tt < wic_over_ct: + return 0 + else: + return wit_over_tt * log(wit_over_tt + / (weight_in_top / self.collection_total), + 2) + + +class Expander(object): + """Uses an ExpansionModel to expand the set of query terms based on the top + N result documents. + """ + + def __init__(self, ixreader, fieldname, model=Bo1Model): + """ + :param reader: A :class:whoosh.reading.IndexReader object. + :param fieldname: The name of the field in which to search. + :param model: (classify.ExpansionModel) The model to use for expanding + the query terms. If you omit this parameter, the expander uses + :class:`Bo1Model` by default. + """ + + self.ixreader = ixreader + self.fieldname = fieldname + doccount = self.ixreader.doc_count_all() + fieldlen = self.ixreader.field_length(fieldname) + + if type(model) is type: + model = model(doccount, fieldlen) + self.model = model + + # Maps words to their weight in the top N documents. + self.topN_weight = defaultdict(float) + + # Total weight of all terms in the top N documents. + self.top_total = 0 + + def add(self, vector): + """Adds forward-index information about one of the "top N" documents. + + :param vector: A series of (text, weight) tuples, such as is + returned by Reader.vector_as("weight", docnum, fieldname). 
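The workflow implied by the Expander class above is: feed it the term vectors of the top-scoring documents, then ask for the highest-scoring expansion terms. A hypothetical usage sketch, assuming an existing index ix, a parsed query userquery, and a "content" field that is vectored or stored:

    from whoosh.classify import Expander, Bo2Model

    with ix.searcher() as searcher:
        results = searcher.search(userquery, limit=10)
        expander = Expander(searcher.reader(), "content", model=Bo2Model)
        for hit in results:
            expander.add_document(hit.docnum)
        # Five (term, normalized weight) pairs suggested for query expansion
        print(expander.expanded_terms(5))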
+ """ + + total_weight = 0 + topN_weight = self.topN_weight + + for word, weight in vector: + total_weight += weight + topN_weight[word] += weight + + self.top_total += total_weight + + def add_document(self, docnum): + ixreader = self.ixreader + if self.ixreader.has_vector(docnum, self.fieldname): + self.add(ixreader.vector_as("weight", docnum, self.fieldname)) + elif self.ixreader.schema[self.fieldname].stored: + self.add_text(ixreader.stored_fields(docnum).get(self.fieldname)) + else: + raise Exception("Field %r in document %s is not vectored or stored" + % (self.fieldname, docnum)) + + def add_text(self, string): + # Unfortunately since field.index() yields bytes texts, and we want + # unicode, we end up encoding and decoding unnecessarily. + # + # TODO: Find a way around this + + field = self.ixreader.schema[self.fieldname] + from_bytes = field.from_bytes + self.add((from_bytes(text), weight) for text, _, weight, _ + in field.index(string)) + + def expanded_terms(self, number, normalize=True): + """Returns the N most important terms in the vectors added so far. + + :param number: The number of terms to return. + :param normalize: Whether to normalize the weights. + :returns: A list of ("term", weight) tuples. + """ + + model = self.model + fieldname = self.fieldname + ixreader = self.ixreader + field = ixreader.schema[fieldname] + tlist = [] + maxweight = 0 + + # If no terms have been added, return an empty list + if not self.topN_weight: + return [] + + for word, weight in iteritems(self.topN_weight): + btext = field.to_bytes(word) + if (fieldname, btext) in ixreader: + cf = ixreader.frequency(fieldname, btext) + score = model.score(weight, cf, self.top_total) + if score > maxweight: + maxweight = score + tlist.append((score, word)) + + if normalize: + norm = model.normalizer(maxweight, self.top_total) + else: + norm = maxweight + tlist = [(weight / norm, t) for weight, t in tlist] + tlist.sort(key=lambda x: (0 - x[0], x[1])) + + return [(t, weight) for weight, t in tlist[:number]] + + +# Similarity functions + +def shingles(input, size=2): + d = defaultdict(int) + for shingle in (input[i:i + size] + for i in xrange(len(input) - (size - 1))): + d[shingle] += 1 + return iteritems(d) + + +def simhash(features, hashbits=32): + if hashbits == 32: + hashfn = hash + else: + hashfn = lambda s: _hash(s, hashbits) + + vs = [0] * hashbits + for feature, weight in features: + h = hashfn(feature) + for i in xrange(hashbits): + if h & (1 << i): + vs[i] += weight + else: + vs[i] -= weight + + out = 0 + for i, v in enumerate(vs): + if v > 0: + out |= 1 << i + return out + + +def _hash(s, hashbits): + # A variable-length version of Python's builtin hash + if s == "": + return 0 + else: + x = ord(s[0]) << 7 + m = 1000003 + mask = 2 ** hashbits - 1 + for c in s: + x = ((x * m) ^ ord(c)) & mask + x ^= len(s) + if x == -1: + x = -2 + return x + + +def hamming_distance(first_hash, other_hash, hashbits=32): + x = (first_hash ^ other_hash) & ((1 << hashbits) - 1) + tot = 0 + while x: + tot += 1 + x &= x - 1 + return tot + + +# Clustering + +def kmeans(data, k, t=0.0001, distfun=None, maxiter=50, centers=None): + """ + One-dimensional K-means clustering function. + + :param data: list of data points. + :param k: number of clusters. + :param t: tolerance; stop if changes between iterations are smaller than + this value. + :param distfun: a distance function. + :param centers: a list of centroids to start with. + :param maxiter: maximum number of iterations to run. 
+ """ + + # Adapted from a C version by Roger Zhang, + # http://cs.smu.ca/~r_zhang/code/kmeans.c + + DOUBLE_MAX = 1.797693e308 + n = len(data) + + error = DOUBLE_MAX # sum of squared euclidean distance + + counts = [0] * k # size of each cluster + labels = [0] * n # output cluster label for each data point + + # c1 is an array of len k of the temp centroids + c1 = [0] * k + + # choose k initial centroids + if centers: + c = centers + else: + c = random.sample(data, k) + + niter = 0 + # main loop + while True: + # save error from last step + old_error = error + error = 0 + + # clear old counts and temp centroids + for i in xrange(k): + counts[i] = 0 + c1[i] = 0 + + for h in xrange(n): + # identify the closest cluster + min_distance = DOUBLE_MAX + for i in xrange(k): + distance = (data[h] - c[i]) ** 2 + if distance < min_distance: + labels[h] = i + min_distance = distance + + # update size and temp centroid of the destination cluster + c1[labels[h]] += data[h] + counts[labels[h]] += 1 + # update standard error + error += min_distance + + for i in xrange(k): # update all centroids + c[i] = c1[i] / counts[i] if counts[i] else c1[i] + + niter += 1 + if (abs(error - old_error) < t) or (niter > maxiter): + break + + return labels, c + + +# Sliding window clusters + +def two_pass_variance(data): + n = 0 + sum1 = 0 + sum2 = 0 + + for x in data: + n += 1 + sum1 = sum1 + x + + mean = sum1 / n + + for x in data: + sum2 += (x - mean) * (x - mean) + + variance = sum2 / (n - 1) + return variance + + +def weighted_incremental_variance(data_weight_pairs): + mean = 0 + S = 0 + sumweight = 0 + for x, weight in data_weight_pairs: + temp = weight + sumweight + Q = x - mean + R = Q * weight / temp + S += sumweight * Q * R + mean += R + sumweight = temp + Variance = S / (sumweight - 1) # if sample is the population, omit -1 + return Variance + + +def swin(data, size): + clusters = [] + for i, left in enumerate(data): + j = i + right = data[j] + while j < len(data) - 1 and right - left < size: + j += 1 + right = data[j] + v = 99999 + if j - i > 1: + v = two_pass_variance(data[i:j + 1]) + clusters.append((left, right, j - i, v)) + clusters.sort(key=lambda x: (0 - x[2], x[3])) + return clusters diff --git a/src/whoosh/codec/__init__.py b/src/whoosh/codec/__init__.py new file mode 100644 index 0000000..7044563 --- /dev/null +++ b/src/whoosh/codec/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + + +def default_codec(*args, **kwargs): + from whoosh.codec.whoosh3 import W3Codec + + return W3Codec(*args, **kwargs) diff --git a/src/whoosh/codec/base.py b/src/whoosh/codec/base.py new file mode 100644 index 0000000..159a978 --- /dev/null +++ b/src/whoosh/codec/base.py @@ -0,0 +1,843 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module contains base classes/interfaces for "codec" objects. 
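A trivial sketch of the codec factory defined in codec/__init__.py above; it simply defers the import and instantiates the current default codec:

    from whoosh.codec import default_codec

    codec = default_codec()            # a whoosh3.W3Codec with default settings
    print(type(codec).__name__)        # expected: "W3Codec"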
+""" + +from bisect import bisect_right + +from whoosh import columns +from whoosh.automata import lev +from whoosh.compat import abstractmethod, izip, unichr, xrange +from whoosh.filedb.compound import CompoundStorage +from whoosh.system import emptybytes +from whoosh.util import random_name + + +# Exceptions + +class OutOfOrderError(Exception): + pass + + +# Base classes + +class Codec(object): + length_stats = True + + # Per document value writer + + @abstractmethod + def per_document_writer(self, storage, segment): + raise NotImplementedError + + # Inverted index writer + + @abstractmethod + def field_writer(self, storage, segment): + raise NotImplementedError + + # Postings + + @abstractmethod + def postings_writer(self, dbfile, byteids=False): + raise NotImplementedError + + @abstractmethod + def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): + raise NotImplementedError + + # Index readers + + def automata(self, storage, segment): + return Automata() + + @abstractmethod + def terms_reader(self, storage, segment): + raise NotImplementedError + + @abstractmethod + def per_document_reader(self, storage, segment): + raise NotImplementedError + + # Segments and generations + + @abstractmethod + def new_segment(self, storage, indexname): + raise NotImplementedError + + +class WrappingCodec(Codec): + def __init__(self, child): + self._child = child + + def per_document_writer(self, storage, segment): + return self._child.per_document_writer(storage, segment) + + def field_writer(self, storage, segment): + return self._child.field_writer(storage, segment) + + def postings_writer(self, dbfile, byteids=False): + return self._child.postings_writer(dbfile, byteids=byteids) + + def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): + return self._child.postings_reader(dbfile, terminfo, format_, term=term, + scorer=scorer) + + def automata(self, storage, segment): + return self._child.automata(storage, segment) + + def terms_reader(self, storage, segment): + return self._child.terms_reader(storage, segment) + + def per_document_reader(self, storage, segment): + return self._child.per_document_reader(storage, segment) + + def new_segment(self, storage, indexname): + return self._child.new_segment(storage, indexname) + + +# Writer classes + +class PerDocumentWriter(object): + @abstractmethod + def start_doc(self, docnum): + raise NotImplementedError + + @abstractmethod + def add_field(self, fieldname, fieldobj, value, length): + raise NotImplementedError + + @abstractmethod + def add_column_value(self, fieldname, columnobj, value): + raise NotImplementedError("Codec does not implement writing columns") + + @abstractmethod + def add_vector_items(self, fieldname, fieldobj, items): + raise NotImplementedError + + def add_vector_matcher(self, fieldname, fieldobj, vmatcher): + def readitems(): + while vmatcher.is_active(): + text = vmatcher.id() + weight = vmatcher.weight() + valuestring = vmatcher.value() + yield (text, weight, valuestring) + vmatcher.next() + self.add_vector_items(fieldname, fieldobj, readitems()) + + def finish_doc(self): + pass + + def close(self): + pass + + +class FieldWriter(object): + def add_postings(self, schema, lengths, items): + # This method translates a generator of (fieldname, btext, docnum, w, v) + # postings into calls to start_field(), start_term(), add(), + # finish_term(), finish_field(), etc. 
+ + start_field = self.start_field + start_term = self.start_term + add = self.add + finish_term = self.finish_term + finish_field = self.finish_field + + if lengths: + dfl = lengths.doc_field_length + else: + dfl = lambda docnum, fieldname: 0 + + # The fieldname of the previous posting + lastfn = None + # The bytes text of the previous posting + lasttext = None + # The (fieldname, btext) of the previous spelling posting + lastspell = None + # The field object for the current field + fieldobj = None + for fieldname, btext, docnum, weight, value in items: + # Check for out-of-order postings. This is convoluted because Python + # 3 removed the ability to compare a string to None + if lastfn is not None and fieldname < lastfn: + raise OutOfOrderError("Field %r .. %r" % (lastfn, fieldname)) + if fieldname == lastfn and lasttext and btext < lasttext: + raise OutOfOrderError("Term %s:%r .. %s:%r" + % (lastfn, lasttext, fieldname, btext)) + + # If the fieldname of this posting is different from the last one, + # tell the writer we're starting a new field + if fieldname != lastfn: + if lasttext is not None: + finish_term() + if lastfn is not None and fieldname != lastfn: + finish_field() + fieldobj = schema[fieldname] + start_field(fieldname, fieldobj) + lastfn = fieldname + lasttext = None + + # HACK: items where docnum == -1 indicate words that should be added + # to the spelling graph, not the postings + if docnum == -1: + # spellterm = (fieldname, btext) + # # There can be duplicates of spelling terms, so only add a spell + # # term if it's greater than the last one + # if lastspell is None or spellterm > lastspell: + # spellword = fieldobj.from_bytes(btext) + # self.add_spell_word(fieldname, spellword) + # lastspell = spellterm + continue + + # If this term is different from the term in the previous posting, + # tell the writer to start a new term + if btext != lasttext: + if lasttext is not None: + finish_term() + start_term(btext) + lasttext = btext + + # Add this posting + length = dfl(docnum, fieldname) + if value is None: + value = emptybytes + add(docnum, weight, value, length) + + if lasttext is not None: + finish_term() + if lastfn is not None: + finish_field() + + @abstractmethod + def start_field(self, fieldname, fieldobj): + raise NotImplementedError + + @abstractmethod + def start_term(self, text): + raise NotImplementedError + + @abstractmethod + def add(self, docnum, weight, vbytes, length): + raise NotImplementedError + + def add_spell_word(self, fieldname, text): + raise NotImplementedError + + @abstractmethod + def finish_term(self): + raise NotImplementedError + + def finish_field(self): + pass + + def close(self): + pass + + +# Postings + +class PostingsWriter(object): + @abstractmethod + def start_postings(self, format_, terminfo): + raise NotImplementedError + + @abstractmethod + def add_posting(self, id_, weight, vbytes, length=None): + raise NotImplementedError + + def finish_postings(self): + pass + + @abstractmethod + def written(self): + """Returns True if this object has already written to disk. 
+ """ + + raise NotImplementedError + + +# Reader classes + +class FieldCursor(object): + def first(self): + raise NotImplementedError + + def find(self, string): + raise NotImplementedError + + def next(self): + raise NotImplementedError + + def term(self): + raise NotImplementedError + + +class TermsReader(object): + @abstractmethod + def __contains__(self, term): + raise NotImplementedError + + @abstractmethod + def cursor(self, fieldname, fieldobj): + raise NotImplementedError + + @abstractmethod + def terms(self): + raise NotImplementedError + + @abstractmethod + def terms_from(self, fieldname, prefix): + raise NotImplementedError + + @abstractmethod + def items(self): + raise NotImplementedError + + @abstractmethod + def items_from(self, fieldname, prefix): + raise NotImplementedError + + @abstractmethod + def term_info(self, fieldname, text): + raise NotImplementedError + + @abstractmethod + def frequency(self, fieldname, text): + return self.term_info(fieldname, text).weight() + + @abstractmethod + def doc_frequency(self, fieldname, text): + return self.term_info(fieldname, text).doc_frequency() + + @abstractmethod + def matcher(self, fieldname, text, format_, scorer=None): + raise NotImplementedError + + @abstractmethod + def indexed_field_names(self): + raise NotImplementedError + + def close(self): + pass + + +class Automata(object): + @staticmethod + def levenshtein_dfa(uterm, maxdist, prefix=0): + return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa() + + @staticmethod + def find_matches(dfa, cur): + unull = unichr(0) + + term = cur.text() + if term is None: + return + + match = dfa.next_valid_string(term) + while match: + cur.find(match) + term = cur.text() + if term is None: + return + if match == term: + yield match + term += unull + match = dfa.next_valid_string(term) + + def terms_within(self, fieldcur, uterm, maxdist, prefix=0): + dfa = self.levenshtein_dfa(uterm, maxdist, prefix) + return self.find_matches(dfa, fieldcur) + + +# Per-doc value reader + +class PerDocumentReader(object): + def close(self): + pass + + @abstractmethod + def doc_count(self): + raise NotImplementedError + + @abstractmethod + def doc_count_all(self): + raise NotImplementedError + + # Deletions + + @abstractmethod + def has_deletions(self): + raise NotImplementedError + + @abstractmethod + def is_deleted(self, docnum): + raise NotImplementedError + + @abstractmethod + def deleted_docs(self): + raise NotImplementedError + + def all_doc_ids(self): + """ + Returns an iterator of all (undeleted) document IDs in the reader. 
+ """ + + is_deleted = self.is_deleted + return (docnum for docnum in xrange(self.doc_count_all()) + if not is_deleted(docnum)) + + def iter_docs(self): + for docnum in self.all_doc_ids(): + yield docnum, self.stored_fields(docnum) + + # Columns + + def supports_columns(self): + return False + + def has_column(self, fieldname): + return False + + def list_columns(self): + raise NotImplementedError + + # Don't need to override this if supports_columns() returns False + def column_reader(self, fieldname, column): + raise NotImplementedError + + # Bitmaps + + def field_docs(self, fieldname): + return None + + # Lengths + + @abstractmethod + def doc_field_length(self, docnum, fieldname, default=0): + raise NotImplementedError + + @abstractmethod + def field_length(self, fieldname): + raise NotImplementedError + + @abstractmethod + def min_field_length(self, fieldname): + raise NotImplementedError + + @abstractmethod + def max_field_length(self, fieldname): + raise NotImplementedError + + # Vectors + + def has_vector(self, docnum, fieldname): + return False + + # Don't need to override this if has_vector() always returns False + def vector(self, docnum, fieldname, format_): + raise NotImplementedError + + # Stored + + @abstractmethod + def stored_fields(self, docnum): + raise NotImplementedError + + def all_stored_fields(self): + for docnum in self.all_doc_ids(): + yield self.stored_fields(docnum) + + +# Segment base class + +class Segment(object): + """Do not instantiate this object directly. It is used by the Index object + to hold information about a segment. A list of objects of this class are + pickled as part of the TOC file. + + The TOC file stores a minimal amount of information -- mostly a list of + Segment objects. Segments are the real reverse indexes. Having multiple + segments allows quick incremental indexing: just create a new segment for + the new documents, and have the index overlay the new segment over previous + ones for purposes of reading/search. "Optimizing" the index combines the + contents of existing segments into one (removing any deleted documents + along the way). + """ + + # Extension for compound segment files + COMPOUND_EXT = ".seg" + + # self.indexname + # self.segid + + def __init__(self, indexname): + self.indexname = indexname + self.segid = self._random_id() + self.compound = False + + @classmethod + def _random_id(cls, size=16): + return random_name(size=size) + + def __repr__(self): + return "<%s %s>" % (self.__class__.__name__, self.segment_id()) + + def codec(self): + raise NotImplementedError + + def index_name(self): + return self.indexname + + def segment_id(self): + if hasattr(self, "name"): + # Old segment class + return self.name + else: + return "%s_%s" % (self.index_name(), self.segid) + + def is_compound(self): + if not hasattr(self, "compound"): + return False + return self.compound + + # File convenience methods + + def make_filename(self, ext): + return "%s%s" % (self.segment_id(), ext) + + def list_files(self, storage): + prefix = "%s." % self.segment_id() + return [name for name in storage.list() if name.startswith(prefix)] + + def create_file(self, storage, ext, **kwargs): + """Convenience method to create a new file in the given storage named + with this segment's ID and the given extension. Any keyword arguments + are passed to the storage's create_file method. 
+ """ + + fname = self.make_filename(ext) + return storage.create_file(fname, **kwargs) + + def open_file(self, storage, ext, **kwargs): + """Convenience method to open a file in the given storage named with + this segment's ID and the given extension. Any keyword arguments are + passed to the storage's open_file method. + """ + + fname = self.make_filename(ext) + return storage.open_file(fname, **kwargs) + + def create_compound_file(self, storage): + segfiles = self.list_files(storage) + assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles) + cfile = self.create_file(storage, self.COMPOUND_EXT) + CompoundStorage.assemble(cfile, storage, segfiles) + for name in segfiles: + storage.delete_file(name) + self.compound = True + + def open_compound_file(self, storage): + name = self.make_filename(self.COMPOUND_EXT) + dbfile = storage.open_file(name) + return CompoundStorage(dbfile, use_mmap=storage.supports_mmap) + + # Abstract methods + + @abstractmethod + def doc_count_all(self): + """ + Returns the total number of documents, DELETED OR UNDELETED, in this + segment. + """ + + raise NotImplementedError + + def doc_count(self): + """ + Returns the number of (undeleted) documents in this segment. + """ + + return self.doc_count_all() - self.deleted_count() + + def set_doc_count(self, doccount): + raise NotImplementedError + + def has_deletions(self): + """ + Returns True if any documents in this segment are deleted. + """ + + return self.deleted_count() > 0 + + @abstractmethod + def deleted_count(self): + """ + Returns the total number of deleted documents in this segment. + """ + + raise NotImplementedError + + @abstractmethod + def deleted_docs(self): + raise NotImplementedError + + @abstractmethod + def delete_document(self, docnum, delete=True): + """Deletes the given document number. The document is not actually + removed from the index until it is optimized. + + :param docnum: The document number to delete. + :param delete: If False, this undeletes a deleted document. + """ + + raise NotImplementedError + + @abstractmethod + def is_deleted(self, docnum): + """ + Returns True if the given document number is deleted. 
+ """ + + raise NotImplementedError + + def should_assemble(self): + return True + + +# Wrapping Segment + +class WrappingSegment(Segment): + def __init__(self, child): + self._child = child + + def codec(self): + return self._child.codec() + + def index_name(self): + return self._child.index_name() + + def segment_id(self): + return self._child.segment_id() + + def is_compound(self): + return self._child.is_compound() + + def should_assemble(self): + return self._child.should_assemble() + + def make_filename(self, ext): + return self._child.make_filename(ext) + + def list_files(self, storage): + return self._child.list_files(storage) + + def create_file(self, storage, ext, **kwargs): + return self._child.create_file(storage, ext, **kwargs) + + def open_file(self, storage, ext, **kwargs): + return self._child.open_file(storage, ext, **kwargs) + + def create_compound_file(self, storage): + return self._child.create_compound_file(storage) + + def open_compound_file(self, storage): + return self._child.open_compound_file(storage) + + def delete_document(self, docnum, delete=True): + return self._child.delete_document(docnum, delete=delete) + + def has_deletions(self): + return self._child.has_deletions() + + def deleted_count(self): + return self._child.deleted_count() + + def deleted_docs(self): + return self._child.deleted_docs() + + def is_deleted(self, docnum): + return self._child.is_deleted(docnum) + + def set_doc_count(self, doccount): + self._child.set_doc_count(doccount) + + def doc_count(self): + return self._child.doc_count() + + def doc_count_all(self): + return self._child.doc_count_all() + + +# Multi per doc reader + +class MultiPerDocumentReader(PerDocumentReader): + def __init__(self, readers, offset=0): + self._readers = readers + + self._doc_offsets = [] + self._doccount = 0 + for pdr in readers: + self._doc_offsets.append(self._doccount) + self._doccount += pdr.doc_count_all() + + self.is_closed = False + + def close(self): + for r in self._readers: + r.close() + self.is_closed = True + + def doc_count_all(self): + return self._doccount + + def doc_count(self): + total = 0 + for r in self._readers: + total += r.doc_count() + return total + + def _document_reader(self, docnum): + return max(0, bisect_right(self._doc_offsets, docnum) - 1) + + def _reader_and_docnum(self, docnum): + rnum = self._document_reader(docnum) + offset = self._doc_offsets[rnum] + return rnum, docnum - offset + + # Deletions + + def has_deletions(self): + return any(r.has_deletions() for r in self._readers) + + def is_deleted(self, docnum): + x, y = self._reader_and_docnum(docnum) + return self._readers[x].is_deleted(y) + + def deleted_docs(self): + for r, offset in izip(self._readers, self._doc_offsets): + for docnum in r.deleted_docs(): + yield docnum + offset + + def all_doc_ids(self): + for r, offset in izip(self._readers, self._doc_offsets): + for docnum in r.all_doc_ids(): + yield docnum + offset + + # Columns + + def has_column(self, fieldname): + return any(r.has_column(fieldname) for r in self._readers) + + def column_reader(self, fieldname, column): + if not self.has_column(fieldname): + raise ValueError("No column %r" % (fieldname,)) + + default = column.default_value() + colreaders = [] + for r in self._readers: + if r.has_column(fieldname): + cr = r.column_reader(fieldname, column) + else: + cr = columns.EmptyColumnReader(default, r.doc_count_all()) + colreaders.append(cr) + + if len(colreaders) == 1: + return colreaders[0] + else: + return columns.MultiColumnReader(colreaders) + + # 
Lengths + + def doc_field_length(self, docnum, fieldname, default=0): + x, y = self._reader_and_docnum(docnum) + return self._readers[x].doc_field_length(y, fieldname, default) + + def field_length(self, fieldname): + total = 0 + for r in self._readers: + total += r.field_length(fieldname) + return total + + def min_field_length(self): + return min(r.min_field_length() for r in self._readers) + + def max_field_length(self): + return max(r.max_field_length() for r in self._readers) + + +# Extended base classes + +class PerDocWriterWithColumns(PerDocumentWriter): + def __init__(self): + PerDocumentWriter.__init__(self) + # Implementations need to set these attributes + self._storage = None + self._segment = None + self._docnum = None + + @abstractmethod + def _has_column(self, fieldname): + raise NotImplementedError + + @abstractmethod + def _create_column(self, fieldname, column): + raise NotImplementedError + + @abstractmethod + def _get_column(self, fieldname): + raise NotImplementedError + + def add_column_value(self, fieldname, column, value): + if not self._has_column(fieldname): + self._create_column(fieldname, column) + self._get_column(fieldname).add(self._docnum, value) + + +# FieldCursor implementations + +class EmptyCursor(FieldCursor): + def first(self): + return None + + def find(self, term): + return None + + def next(self): + return None + + def text(self): + return None + + def term_info(self): + return None + + def is_valid(self): + return False diff --git a/src/whoosh/codec/memory.py b/src/whoosh/codec/memory.py new file mode 100644 index 0000000..5a5babe --- /dev/null +++ b/src/whoosh/codec/memory.py @@ -0,0 +1,334 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
+ +from __future__ import with_statement +from bisect import bisect_left +from threading import Lock, RLock + +from whoosh.compat import xrange +from whoosh.codec import base +from whoosh.matching import ListMatcher +from whoosh.reading import SegmentReader, TermInfo, TermNotFound +from whoosh.writing import SegmentWriter + + +class MemWriter(SegmentWriter): + def commit(self): + self._finalize_segment() + + +class MemoryCodec(base.Codec): + def __init__(self): + from whoosh.filedb.filestore import RamStorage + + self.storage = RamStorage() + self.segment = MemSegment(self, "blah") + + def writer(self, schema): + ix = self.storage.create_index(schema) + return MemWriter(ix, _lk=False, codec=self, + docbase=self.segment._doccount) + + def reader(self, schema): + return SegmentReader(self.storage, schema, self.segment, codec=self) + + def per_document_writer(self, storage, segment): + return MemPerDocWriter(self.storage, self.segment) + + def field_writer(self, storage, segment): + return MemFieldWriter(self.storage, self.segment) + + def per_document_reader(self, storage, segment): + return MemPerDocReader(self.storage, self.segment) + + def terms_reader(self, storage, segment): + return MemTermsReader(self.storage, self.segment) + + def new_segment(self, storage, indexname): + return self.segment + + +class MemPerDocWriter(base.PerDocWriterWithColumns): + def __init__(self, storage, segment): + self._storage = storage + self._segment = segment + self.is_closed = False + self._colwriters = {} + self._doccount = 0 + + def _has_column(self, fieldname): + return fieldname in self._colwriters + + def _create_column(self, fieldname, column): + colfile = self._storage.create_file("%s.c" % fieldname) + self._colwriters[fieldname] = (colfile, column.writer(colfile)) + + def _get_column(self, fieldname): + return self._colwriters[fieldname][1] + + def start_doc(self, docnum): + self._doccount += 1 + self._docnum = docnum + self._stored = {} + self._lengths = {} + self._vectors = {} + + def add_field(self, fieldname, fieldobj, value, length): + if value is not None: + self._stored[fieldname] = value + if length is not None: + self._lengths[fieldname] = length + + def add_vector_items(self, fieldname, fieldobj, items): + self._vectors[fieldname] = tuple(items) + + def finish_doc(self): + with self._segment._lock: + docnum = self._docnum + self._segment._stored[docnum] = self._stored + self._segment._lengths[docnum] = self._lengths + self._segment._vectors[docnum] = self._vectors + + def close(self): + colwriters = self._colwriters + for fieldname in colwriters: + colfile, colwriter = colwriters[fieldname] + colwriter.finish(self._doccount) + colfile.close() + self.is_closed = True + + +class MemPerDocReader(base.PerDocumentReader): + def __init__(self, storage, segment): + self._storage = storage + self._segment = segment + + def doc_count(self): + return self._segment.doc_count() + + def doc_count_all(self): + return self._segment.doc_count_all() + + def has_deletions(self): + return self._segment.has_deletions() + + def is_deleted(self, docnum): + return self._segment.is_deleted(docnum) + + def deleted_docs(self): + return self._segment.deleted_docs() + + def supports_columns(self): + return True + + def has_column(self, fieldname): + filename = "%s.c" % fieldname + return self._storage.file_exists(filename) + + def column_reader(self, fieldname, column): + filename = "%s.c" % fieldname + colfile = self._storage.open_file(filename) + length = self._storage.file_length(filename) + return 
column.reader(colfile, 0, length, self._segment.doc_count_all()) + + def doc_field_length(self, docnum, fieldname, default=0): + return self._segment._lengths[docnum].get(fieldname, default) + + def field_length(self, fieldname): + return sum(lens.get(fieldname, 0) for lens + in self._segment._lengths.values()) + + def min_field_length(self, fieldname): + return min(lens[fieldname] for lens in self._segment._lengths.values() + if fieldname in lens) + + def max_field_length(self, fieldname): + return max(lens[fieldname] for lens in self._segment._lengths.values() + if fieldname in lens) + + def has_vector(self, docnum, fieldname): + return (docnum in self._segment._vectors + and fieldname in self._segment._vectors[docnum]) + + def vector(self, docnum, fieldname, format_): + items = self._segment._vectors[docnum][fieldname] + ids, weights, values = zip(*items) + return ListMatcher(ids, weights, values, format_) + + def stored_fields(self, docnum): + return self._segment._stored[docnum] + + def close(self): + pass + + +class MemFieldWriter(base.FieldWriter): + def __init__(self, storage, segment): + self._storage = storage + self._segment = segment + self._fieldname = None + self._btext = None + self.is_closed = False + + def start_field(self, fieldname, fieldobj): + if self._fieldname is not None: + raise Exception("Called start_field in a field") + + with self._segment._lock: + invindex = self._segment._invindex + if fieldname not in invindex: + invindex[fieldname] = {} + + self._fieldname = fieldname + self._fieldobj = fieldobj + + def start_term(self, btext): + if self._btext is not None: + raise Exception("Called start_term in a term") + fieldname = self._fieldname + + fielddict = self._segment._invindex[fieldname] + terminfos = self._segment._terminfos + with self._segment._lock: + if btext not in fielddict: + fielddict[btext] = [] + + if (fieldname, btext) not in terminfos: + terminfos[fieldname, btext] = TermInfo() + + self._postings = fielddict[btext] + self._terminfo = terminfos[fieldname, btext] + self._btext = btext + + def add(self, docnum, weight, vbytes, length): + self._postings.append((docnum, weight, vbytes)) + self._terminfo.add_posting(docnum, weight, length) + + def finish_term(self): + if self._btext is None: + raise Exception("Called finish_term outside a term") + + self._postings = None + self._btext = None + self._terminfo = None + + def finish_field(self): + if self._fieldname is None: + raise Exception("Called finish_field outside a field") + self._fieldname = None + self._fieldobj = None + + def close(self): + self.is_closed = True + + +class MemTermsReader(base.TermsReader): + def __init__(self, storage, segment): + self._storage = storage + self._segment = segment + self._invindex = segment._invindex + + def __contains__(self, term): + return term in self._segment._terminfos + + def terms(self): + for fieldname in self._invindex: + for btext in self._invindex[fieldname]: + yield (fieldname, btext) + + def terms_from(self, fieldname, prefix): + if fieldname not in self._invindex: + raise TermNotFound("Unknown field %r" % (fieldname,)) + terms = sorted(self._invindex[fieldname]) + if not terms: + return + start = bisect_left(terms, prefix) + for i in xrange(start, len(terms)): + yield (fieldname, terms[i]) + + def term_info(self, fieldname, text): + return self._segment._terminfos[fieldname, text] + + def matcher(self, fieldname, btext, format_, scorer=None): + items = self._invindex[fieldname][btext] + ids, weights, values = zip(*items) + return ListMatcher(ids, 
weights, values, format_, scorer=scorer) + + def indexed_field_names(self): + return self._invindex.keys() + + def close(self): + pass + + +class MemSegment(base.Segment): + def __init__(self, codec, indexname): + base.Segment.__init__(self, indexname) + self._codec = codec + self._doccount = 0 + self._stored = {} + self._lengths = {} + self._vectors = {} + self._invindex = {} + self._terminfos = {} + self._lock = Lock() + + def codec(self): + return self._codec + + def set_doc_count(self, doccount): + self._doccount = doccount + + def doc_count(self): + return len(self._stored) + + def doc_count_all(self): + return self._doccount + + def delete_document(self, docnum, delete=True): + if not delete: + raise Exception("MemoryCodec can't undelete") + with self._lock: + del self._stored[docnum] + del self._lengths[docnum] + del self._vectors[docnum] + + def has_deletions(self): + with self._lock: + return self._doccount - len(self._stored) + + def is_deleted(self, docnum): + return docnum not in self._stored + + def deleted_docs(self): + stored = self._stored + for docnum in xrange(self.doc_count_all()): + if docnum not in stored: + yield docnum + + def should_assemble(self): + return False diff --git a/src/whoosh/codec/plaintext.py b/src/whoosh/codec/plaintext.py new file mode 100644 index 0000000..fda91cc --- /dev/null +++ b/src/whoosh/codec/plaintext.py @@ -0,0 +1,452 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
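To make the in-memory codec above concrete, here is a hypothetical round-trip sketch. It assumes the standard whoosh.fields schema API and that SegmentReader.stored_fields() behaves as elsewhere in Whoosh; it is not taken from the upstream sources.

    from whoosh import fields
    from whoosh.codec.memory import MemoryCodec

    schema = fields.Schema(title=fields.TEXT(stored=True))
    codec = MemoryCodec()

    w = codec.writer(schema)            # a MemWriter over the codec's RamStorage
    w.add_document(title=u"hello world")
    w.commit()                          # MemWriter.commit() just finalizes the segment

    r = codec.reader(schema)            # a SegmentReader over the same MemSegment
    print(r.stored_fields(0))           # expected: {'title': u'hello world'}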
+ +from ast import literal_eval + +from whoosh.compat import b, bytes_type, text_type, integer_types, PY3 +from whoosh.compat import iteritems, dumps, loads, xrange +from whoosh.codec import base +from whoosh.matching import ListMatcher +from whoosh.reading import TermInfo, TermNotFound + +if not PY3: + class memoryview: + pass + +_reprable = (bytes_type, text_type, integer_types, float) + + +# Mixin classes for producing and consuming the simple text format + +class LineWriter(object): + def _print_line(self, indent, command, **kwargs): + self._dbfile.write(b(" ") * indent) + self._dbfile.write(command.encode("latin1")) + for k, v in iteritems(kwargs): + if isinstance(v, memoryview): + v = bytes(v) + if v is not None and not isinstance(v, _reprable): + raise TypeError(type(v)) + self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1")) + self._dbfile.write(b("\n")) + + +class LineReader(object): + def __init__(self, dbfile): + self._dbfile = dbfile + + def _reset(self): + self._dbfile.seek(0) + + def _find_line(self, indent, command, **kwargs): + for largs in self._find_lines(indent, command, **kwargs): + return largs + + def _find_lines(self, indent, command, **kwargs): + while True: + line = self._dbfile.readline() + if not line: + return + + c = self._parse_line(line) + if c is None: + return + + lindent, lcommand, largs = c + if lindent == indent and lcommand == command: + matched = True + if kwargs: + for k in kwargs: + if kwargs[k] != largs.get(k): + matched = False + break + + if matched: + yield largs + elif lindent < indent: + return + + def _parse_line(self, line): + line = line.decode("latin1") + line = line.rstrip() + l = len(line) + line = line.lstrip() + if not line or line.startswith("#"): + return None + + indent = (l - len(line)) // 2 + + parts = line.split("\t") + command = parts[0] + args = {} + for i in xrange(1, len(parts)): + n, v = parts[i].split("=") + args[n] = literal_eval(v) + return (indent, command, args) + + def _find_root(self, command): + self._reset() + c = self._find_line(0, command) + if c is None: + raise Exception("No root section %r" % (command,)) + + +# Codec class + +class PlainTextCodec(base.Codec): + length_stats = False + + def per_document_writer(self, storage, segment): + return PlainPerDocWriter(storage, segment) + + def field_writer(self, storage, segment): + return PlainFieldWriter(storage, segment) + + def per_document_reader(self, storage, segment): + return PlainPerDocReader(storage, segment) + + def terms_reader(self, storage, segment): + return PlainTermsReader(storage, segment) + + def new_segment(self, storage, indexname): + return PlainSegment(indexname) + + +class PlainPerDocWriter(base.PerDocumentWriter, LineWriter): + def __init__(self, storage, segment): + self._dbfile = storage.create_file(segment.make_filename(".dcs")) + self._print_line(0, "DOCS") + self.is_closed = False + + def start_doc(self, docnum): + self._print_line(1, "DOC", dn=docnum) + + def add_field(self, fieldname, fieldobj, value, length): + if value is not None: + value = dumps(value, -1) + self._print_line(2, "DOCFIELD", fn=fieldname, v=value, len=length) + + def add_column_value(self, fieldname, columnobj, value): + self._print_line(2, "COLVAL", fn=fieldname, v=value) + + def add_vector_items(self, fieldname, fieldobj, items): + self._print_line(2, "VECTOR", fn=fieldname) + for text, weight, vbytes in items: + self._print_line(3, "VPOST", t=text, w=weight, v=vbytes) + + def finish_doc(self): + pass + + def close(self): + self._dbfile.close() + self.is_closed = 
True + + +class PlainPerDocReader(base.PerDocumentReader, LineReader): + def __init__(self, storage, segment): + self._dbfile = storage.open_file(segment.make_filename(".dcs")) + self._segment = segment + self.is_closed = False + + def doc_count(self): + return self._segment.doc_count() + + def doc_count_all(self): + return self._segment.doc_count() + + def has_deletions(self): + return False + + def is_deleted(self, docnum): + return False + + def deleted_docs(self): + return frozenset() + + def _find_doc(self, docnum): + self._find_root("DOCS") + c = self._find_line(1, "DOC") + while c is not None: + dn = c["dn"] + if dn == docnum: + return True + elif dn > docnum: + return False + c = self._find_line(1, "DOC") + return False + + def _iter_docs(self): + self._find_root("DOCS") + c = self._find_line(1, "DOC") + while c is not None: + yield c["dn"] + c = self._find_line(1, "DOC") + + def _iter_docfields(self, fieldname): + for _ in self._iter_docs(): + for c in self._find_lines(2, "DOCFIELD", fn=fieldname): + yield c + + def _iter_lengths(self, fieldname): + return (c.get("len", 0) for c in self._iter_docfields(fieldname)) + + def doc_field_length(self, docnum, fieldname, default=0): + for dn in self._iter_docs(): + if dn == docnum: + + c = self._find_line(2, "DOCFIELD", fn=fieldname) + if c is not None: + return c.get("len", default) + elif dn > docnum: + break + + return default + + def _column_values(self, fieldname): + for i, docnum in enumerate(self._iter_docs()): + if i != docnum: + raise Exception("Missing column value for field %r doc %d?" + % (fieldname, i)) + + c = self._find_line(2, "COLVAL", fn=fieldname) + if c is None: + raise Exception("Missing column value for field %r doc %d?" + % (fieldname, docnum)) + + yield c.get("v") + + def has_column(self, fieldname): + for _ in self._column_values(fieldname): + return True + return False + + def column_reader(self, fieldname, column): + return list(self._column_values(fieldname)) + + def field_length(self, fieldname): + return sum(self._iter_lengths(fieldname)) + + def min_field_length(self, fieldname): + return min(self._iter_lengths(fieldname)) + + def max_field_length(self, fieldname): + return max(self._iter_lengths(fieldname)) + + def has_vector(self, docnum, fieldname): + if self._find_doc(docnum): + if self._find_line(2, "VECTOR"): + return True + return False + + def vector(self, docnum, fieldname, format_): + if not self._find_doc(docnum): + raise Exception + if not self._find_line(2, "VECTOR"): + raise Exception + + ids = [] + weights = [] + values = [] + c = self._find_line(3, "VPOST") + while c is not None: + ids.append(c["t"]) + weights.append(c["w"]) + values.append(c["v"]) + c = self._find_line(3, "VPOST") + + return ListMatcher(ids, weights, values, format_,) + + def _read_stored_fields(self): + sfs = {} + c = self._find_line(2, "DOCFIELD") + while c is not None: + v = c.get("v") + if v is not None: + v = loads(v) + sfs[c["fn"]] = v + c = self._find_line(2, "DOCFIELD") + return sfs + + def stored_fields(self, docnum): + if not self._find_doc(docnum): + raise Exception + return self._read_stored_fields() + + def iter_docs(self): + return enumerate(self.all_stored_fields()) + + def all_stored_fields(self): + for _ in self._iter_docs(): + yield self._read_stored_fields() + + def close(self): + self._dbfile.close() + self.is_closed = True + + +class PlainFieldWriter(base.FieldWriter, LineWriter): + def __init__(self, storage, segment): + self._dbfile = storage.create_file(segment.make_filename(".trm")) + 
self._print_line(0, "TERMS") + + @property + def is_closed(self): + return self._dbfile.is_closed + + def start_field(self, fieldname, fieldobj): + self._fieldobj = fieldobj + self._print_line(1, "TERMFIELD", fn=fieldname) + + def start_term(self, btext): + self._terminfo = TermInfo() + self._print_line(2, "BTEXT", t=btext) + + def add(self, docnum, weight, vbytes, length): + self._terminfo.add_posting(docnum, weight, length) + self._print_line(3, "POST", dn=docnum, w=weight, v=vbytes) + + def finish_term(self): + ti = self._terminfo + self._print_line(3, "TERMINFO", + df=ti.doc_frequency(), weight=ti.weight(), + minlength=ti.min_length(), maxlength=ti.max_length(), + maxweight=ti.max_weight(), + minid=ti.min_id(), maxid=ti.max_id()) + + def add_spell_word(self, fieldname, text): + self._print_line(2, "SPELL", fn=fieldname, t=text) + + def close(self): + self._dbfile.close() + + +class PlainTermsReader(base.TermsReader, LineReader): + def __init__(self, storage, segment): + self._dbfile = storage.open_file(segment.make_filename(".trm")) + self._segment = segment + self.is_closed = False + + def _find_field(self, fieldname): + self._find_root("TERMS") + if self._find_line(1, "TERMFIELD", fn=fieldname) is None: + raise TermNotFound("No field %r" % fieldname) + + def _iter_fields(self): + self._find_root() + c = self._find_line(1, "TERMFIELD") + while c is not None: + yield c["fn"] + c = self._find_line(1, "TERMFIELD") + + def _iter_btexts(self): + c = self._find_line(2, "BTEXT") + while c is not None: + yield c["t"] + c = self._find_line(2, "BTEXT") + + def _find_term(self, fieldname, btext): + self._find_field(fieldname) + for t in self._iter_btexts(): + if t == btext: + return True + elif t > btext: + break + return False + + def _find_terminfo(self): + c = self._find_line(3, "TERMINFO") + return TermInfo(**c) + + def __contains__(self, term): + fieldname, btext = term + return self._find_term(fieldname, btext) + + def indexed_field_names(self): + return self._iter_fields() + + def terms(self): + for fieldname in self._iter_fields(): + for btext in self._iter_btexts(): + yield (fieldname, btext) + + def terms_from(self, fieldname, prefix): + self._find_field(fieldname) + for btext in self._iter_btexts(): + if btext < prefix: + continue + yield (fieldname, btext) + + def items(self): + for fieldname, btext in self.terms(): + yield (fieldname, btext), self._find_terminfo() + + def items_from(self, fieldname, prefix): + for fieldname, btext in self.terms_from(fieldname, prefix): + yield (fieldname, btext), self._find_terminfo() + + def term_info(self, fieldname, btext): + if not self._find_term(fieldname, btext): + raise TermNotFound((fieldname, btext)) + return self._find_terminfo() + + def matcher(self, fieldname, btext, format_, scorer=None): + if not self._find_term(fieldname, btext): + raise TermNotFound((fieldname, btext)) + + ids = [] + weights = [] + values = [] + c = self._find_line(3, "POST") + while c is not None: + ids.append(c["dn"]) + weights.append(c["w"]) + values.append(c["v"]) + c = self._find_line(3, "POST") + + return ListMatcher(ids, weights, values, format_, scorer=scorer) + + def close(self): + self._dbfile.close() + self.is_closed = True + + +class PlainSegment(base.Segment): + def __init__(self, indexname): + base.Segment.__init__(self, indexname) + self._doccount = 0 + + def codec(self): + return PlainTextCodec() + + def set_doc_count(self, doccount): + self._doccount = doccount + + def doc_count(self): + return self._doccount + + def should_assemble(self): + return 
False diff --git a/src/whoosh/codec/whoosh3.py b/src/whoosh/codec/whoosh3.py new file mode 100644 index 0000000..92dc636 --- /dev/null +++ b/src/whoosh/codec/whoosh3.py @@ -0,0 +1,1281 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module implements a "codec" for writing/reading Whoosh X indexes. 
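+
+A segment written with this codec is stored as several files: a term index
+(".trm"), a term postings file (".pst"), optional vector postings (".vps"),
+and one column file per per-document value column (".col").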
+""" + +import struct +from array import array +from collections import defaultdict + +from whoosh import columns, formats +from whoosh.compat import b, bytes_type, string_type, integer_types +from whoosh.compat import dumps, loads, iteritems, xrange +from whoosh.codec import base +from whoosh.filedb import compound, filetables +from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher +from whoosh.reading import TermInfo, TermNotFound +from whoosh.system import emptybytes +from whoosh.system import _SHORT_SIZE, _INT_SIZE, _LONG_SIZE, _FLOAT_SIZE +from whoosh.system import pack_ushort, unpack_ushort +from whoosh.system import pack_int, unpack_int, pack_long, unpack_long +from whoosh.util.numlists import delta_encode, delta_decode +from whoosh.util.numeric import length_to_byte, byte_to_length + +try: + import zlib +except ImportError: + zlib = None + + +# This byte sequence is written at the start of a posting list to identify the +# codec/version +WHOOSH3_HEADER_MAGIC = b("W3Bl") + +# Column type to store field length info +LENGTHS_COLUMN = columns.NumericColumn("B", default=0) +# Column type to store pointers to vector posting lists +VECTOR_COLUMN = columns.NumericColumn("I") +# Column type to store vector posting list lengths +VECTOR_LEN_COLUMN = columns.NumericColumn("i") +# Column type to store values of stored fields +STORED_COLUMN = columns.PickleColumn(columns.CompressedBytesColumn()) + + +class W3Codec(base.Codec): + # File extensions + TERMS_EXT = ".trm" # Term index + POSTS_EXT = ".pst" # Term postings + VPOSTS_EXT = ".vps" # Vector postings + COLUMN_EXT = ".col" # Per-document value columns + + def __init__(self, blocklimit=128, compression=3, inlinelimit=1): + self._blocklimit = blocklimit + self._compression = compression + self._inlinelimit = inlinelimit + + # def automata(self): + + # Per-document value writer + def per_document_writer(self, storage, segment): + return W3PerDocWriter(self, storage, segment) + + # Inverted index writer + def field_writer(self, storage, segment): + return W3FieldWriter(self, storage, segment) + + # Postings + + def postings_writer(self, dbfile, byteids=False): + return W3PostingsWriter(dbfile, blocklimit=self._blocklimit, + byteids=byteids, compression=self._compression, + inlinelimit=self._inlinelimit) + + def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): + if terminfo.is_inlined(): + # If the postings were inlined into the terminfo object, pull them + # out and use a ListMatcher to wrap them in a Matcher interface + ids, weights, values = terminfo.inlined_postings() + m = ListMatcher(ids, weights, values, format_, scorer=scorer, + term=term, terminfo=terminfo) + else: + offset, length = terminfo.extent() + m = W3LeafMatcher(dbfile, offset, length, format_, term=term, + scorer=scorer) + return m + + # Readers + + def per_document_reader(self, storage, segment): + return W3PerDocReader(storage, segment) + + def terms_reader(self, storage, segment): + tiname = segment.make_filename(self.TERMS_EXT) + tilen = storage.file_length(tiname) + tifile = storage.open_file(tiname) + + postfile = segment.open_file(storage, self.POSTS_EXT) + + return W3TermsReader(self, tifile, tilen, postfile) + + # Graph methods provided by CodecWithGraph + + # Columns + + def supports_columns(self): + return True + + @classmethod + def column_filename(cls, segment, fieldname): + ext = "".join((".", fieldname, cls.COLUMN_EXT)) + return segment.make_filename(ext) + + # Segments and generations + + def new_segment(self, storage, 
indexname): + return W3Segment(self, indexname) + + +# Common functions + +def _vecfield(fieldname): + return "_%s_vec" % fieldname + + +def _lenfield(fieldname): + return "_%s_len" % fieldname + + +# Per-doc information writer + +class W3PerDocWriter(base.PerDocWriterWithColumns): + def __init__(self, codec, storage, segment): + self._codec = codec + self._storage = storage + self._segment = segment + + tempst = storage.temp_storage("%s.tmp" % segment.indexname) + self._cols = compound.CompoundWriter(tempst) + self._colwriters = {} + self._create_column("_stored", STORED_COLUMN) + + self._fieldlengths = defaultdict(int) + self._doccount = 0 + self._docnum = None + self._storedfields = None + self._indoc = False + self.is_closed = False + + # We'll wait to create the vector file until someone actually tries + # to add a vector + self._vpostfile = None + + def _create_file(self, ext): + return self._segment.create_file(self._storage, ext) + + def _has_column(self, fieldname): + return fieldname in self._colwriters + + def _create_column(self, fieldname, column): + writers = self._colwriters + if fieldname in writers: + raise Exception("Already added column %r" % fieldname) + + f = self._cols.create_file(fieldname) + writers[fieldname] = column.writer(f) + + def _get_column(self, fieldname): + return self._colwriters[fieldname] + + def _prep_vectors(self): + self._vpostfile = self._create_file(W3Codec.VPOSTS_EXT) + # We'll use offset==0 as a marker for "no vectors", so we can't start + # postings at position 0, so just write a few header bytes :) + self._vpostfile.write(b("VPST")) + + def start_doc(self, docnum): + if self._indoc: + raise Exception("Called start_doc when already in a doc") + if docnum != self._doccount: + raise Exception("Called start_doc(%r) was expecting %r" + % (docnum, self._doccount)) + + self._docnum = docnum + self._doccount += 1 + self._storedfields = {} + self._indoc = True + + def add_field(self, fieldname, fieldobj, value, length): + if value is not None: + self._storedfields[fieldname] = value + if length: + # Add byte to length column + lenfield = _lenfield(fieldname) + lb = length_to_byte(length) + self.add_column_value(lenfield, LENGTHS_COLUMN, lb) + # Add length to total field length + self._fieldlengths[fieldname] += length + + def add_vector_items(self, fieldname, fieldobj, items): + if self._vpostfile is None: + self._prep_vectors() + + # Write vector postings + vpostwriter = self._codec.postings_writer(self._vpostfile, byteids=True) + vpostwriter.start_postings(fieldobj.vector, W3TermInfo()) + for text, weight, vbytes in items: + vpostwriter.add_posting(text, weight, vbytes) + # finish_postings() returns terminfo object + vinfo = vpostwriter.finish_postings() + + # Add row to vector lookup column + vecfield = _vecfield(fieldname) # Compute vector column name + offset, length = vinfo.extent() + self.add_column_value(vecfield, VECTOR_COLUMN, offset) + self.add_column_value(vecfield + "L", VECTOR_LEN_COLUMN, length) + + def finish_doc(self): + sf = self._storedfields + if sf: + self.add_column_value("_stored", STORED_COLUMN, sf) + sf.clear() + self._indoc = False + + def _column_filename(self, fieldname): + return W3Codec.column_filename(self._segment, fieldname) + + def close(self): + if self._indoc is not None: + # Called close without calling finish_doc + self.finish_doc() + + self._segment._fieldlengths = self._fieldlengths + + # Finish open columns and close the columns writer + for writer in self._colwriters.values(): + writer.finish(self._doccount) + 
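+        # Copy the temporary compound column storage out into one column file
+        # per column in the segment's storage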
self._cols.save_as_files(self._storage, self._column_filename) + + # If vectors were written, close the vector writers + if self._vpostfile: + self._vpostfile.close() + + self.is_closed = True + + +class W3FieldWriter(base.FieldWriter): + def __init__(self, codec, storage, segment): + self._codec = codec + self._storage = storage + self._segment = segment + + self._fieldname = None + self._fieldid = None + self._btext = None + self._fieldobj = None + self._format = None + + _tifile = self._create_file(W3Codec.TERMS_EXT) + self._tindex = filetables.OrderedHashWriter(_tifile) + self._fieldmap = self._tindex.extras["fieldmap"] = {} + + self._postfile = self._create_file(W3Codec.POSTS_EXT) + + self._postwriter = None + self._infield = False + self.is_closed = False + + def _create_file(self, ext): + return self._segment.create_file(self._storage, ext) + + def start_field(self, fieldname, fieldobj): + fmap = self._fieldmap + if fieldname in fmap: + self._fieldid = fmap[fieldname] + else: + self._fieldid = len(fmap) + fmap[fieldname] = self._fieldid + + self._fieldname = fieldname + self._fieldobj = fieldobj + self._format = fieldobj.format + self._infield = True + + # Start a new postwriter for this field + self._postwriter = self._codec.postings_writer(self._postfile) + + def start_term(self, btext): + if self._postwriter is None: + raise Exception("Called start_term before start_field") + self._btext = btext + self._postwriter.start_postings(self._fieldobj.format, W3TermInfo()) + + def add(self, docnum, weight, vbytes, length): + self._postwriter.add_posting(docnum, weight, vbytes, length) + + def finish_term(self): + terminfo = self._postwriter.finish_postings() + + # Add row to term info table + keybytes = pack_ushort(self._fieldid) + self._btext + valbytes = terminfo.to_bytes() + self._tindex.add(keybytes, valbytes) + + # FieldWriterWithGraph.add_spell_word + + def finish_field(self): + if not self._infield: + raise Exception("Called finish_field before start_field") + self._infield = False + self._postwriter = None + + def close(self): + self._tindex.close() + self._postfile.close() + self.is_closed = True + + +# Reader objects + +class W3PerDocReader(base.PerDocumentReader): + def __init__(self, storage, segment): + self._storage = storage + self._segment = segment + self._doccount = segment.doc_count_all() + + self._vpostfile = None + self._colfiles = {} + self._readers = {} + self._minlengths = {} + self._maxlengths = {} + + def close(self): + for colfile, _, _ in self._colfiles.values(): + colfile.close() + if self._vpostfile: + self._vpostfile.close() + + def doc_count(self): + return self._doccount - self._segment.deleted_count() + + def doc_count_all(self): + return self._doccount + + # Deletions + + def has_deletions(self): + return self._segment.has_deletions() + + def is_deleted(self, docnum): + return self._segment.is_deleted(docnum) + + def deleted_docs(self): + return self._segment.deleted_docs() + + # Columns + + def has_column(self, fieldname): + filename = W3Codec.column_filename(self._segment, fieldname) + return self._storage.file_exists(filename) + + def _get_column_file(self, fieldname): + filename = W3Codec.column_filename(self._segment, fieldname) + length = self._storage.file_length(filename) + colfile = self._storage.open_file(filename) + return colfile, 0, length + + def column_reader(self, fieldname, column): + if fieldname not in self._colfiles: + self._colfiles[fieldname] = self._get_column_file(fieldname) + colfile, offset, length = self._colfiles[fieldname] + 
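+        # Hand the (file, offset, length) extent to the column object, which
+        # builds a reader for the per-document values stored in that extent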
return column.reader(colfile, offset, length, self._doccount) + + # Lengths + + def _cached_reader(self, fieldname, column): + if fieldname in self._readers: + return self._readers[fieldname] + else: + if not self.has_column(fieldname): + return None + + reader = self.column_reader(fieldname, column) + self._readers[fieldname] = reader + return reader + + def doc_field_length(self, docnum, fieldname, default=0): + if docnum > self._doccount: + raise IndexError("Asked for docnum %r of %d" + % (docnum, self._doccount)) + + lenfield = _lenfield(fieldname) + reader = self._cached_reader(lenfield, LENGTHS_COLUMN) + if reader is None: + return default + + lbyte = reader[docnum] + if lbyte: + return byte_to_length(lbyte) + + def field_length(self, fieldname): + return self._segment._fieldlengths.get(fieldname, 0) + + def _minmax_length(self, fieldname, op, cache): + if fieldname in cache: + return cache[fieldname] + + lenfield = _lenfield(fieldname) + reader = self._cached_reader(lenfield, LENGTHS_COLUMN) + length = byte_to_length(op(reader)) + cache[fieldname] = length + return length + + def min_field_length(self, fieldname): + return self._minmax_length(fieldname, min, self._minlengths) + + def max_field_length(self, fieldname): + return self._minmax_length(fieldname, max, self._maxlengths) + + # Vectors + + def _prep_vectors(self): + f = self._segment.open_file(self._storage, W3Codec.VPOSTS_EXT) + self._vpostfile = f + + def _vector_extent(self, docnum, fieldname): + if docnum > self._doccount: + raise IndexError("Asked for document %r of %d" + % (docnum, self._doccount)) + vecfield = _vecfield(fieldname) # Compute vector column name + + # Get the offset from the vector offset column + offset = self._cached_reader(vecfield, VECTOR_COLUMN)[docnum] + + # Get the length from the length column, if it exists, otherwise return + # -1 for the length (backwards compatibility with old dev versions) + lreader = self._cached_reader(vecfield + "L", VECTOR_COLUMN) + if lreader: + length = lreader[docnum] + else: + length = -1 + + return offset, length + + def has_vector(self, docnum, fieldname): + return (self.has_column(_vecfield(fieldname)) + and self._vector_extent(docnum, fieldname)) + + def vector(self, docnum, fieldname, format_): + if self._vpostfile is None: + self._prep_vectors() + offset, length = self._vector_extent(docnum, fieldname) + m = W3LeafMatcher(self._vpostfile, offset, length, format_, + byteids=True) + return m + + # Stored fields + + def stored_fields(self, docnum): + reader = self._cached_reader("_stored", STORED_COLUMN) + v = reader[docnum] + if v is None: + v = {} + return v + + +class W3FieldCursor(base.FieldCursor): + def __init__(self, tindex, fieldname, keycoder, keydecoder, fieldobj): + self._tindex = tindex + self._fieldname = fieldname + self._keycoder = keycoder + self._keydecoder = keydecoder + self._fieldobj = fieldobj + + prefixbytes = keycoder(fieldname, b'') + self._startpos = self._tindex.closest_key_pos(prefixbytes) + + self._pos = self._startpos + self._text = None + self._datapos = None + self._datalen = None + self.next() + + def first(self): + self._pos = self._startpos + return self.next() + + def find(self, term): + if not isinstance(term, bytes_type): + term = self._fieldobj.to_bytes(term) + key = self._keycoder(self._fieldname, term) + self._pos = self._tindex.closest_key_pos(key) + return self.next() + + def next(self): + if self._pos is not None: + keyrng = self._tindex.key_and_range_at(self._pos) + if keyrng is not None: + keybytes, datapos, datalen = keyrng +
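+                # Split the key back into its field name and term bytes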
fname, text = self._keydecoder(keybytes) + if fname == self._fieldname: + self._pos = datapos + datalen + self._text = self._fieldobj.from_bytes(text) + self._datapos = datapos + self._datalen = datalen + return self._text + + self._text = self._pos = self._datapos = self._datalen = None + return None + + def text(self): + return self._text + + def term_info(self): + if self._pos is None: + return None + + databytes = self._tindex.dbfile.get(self._datapos, self._datalen) + return W3TermInfo.from_bytes(databytes) + + def is_valid(self): + return self._pos is not None + + +class W3TermsReader(base.TermsReader): + def __init__(self, codec, dbfile, length, postfile): + self._codec = codec + self._dbfile = dbfile + self._tindex = filetables.OrderedHashReader(dbfile, length) + self._fieldmap = self._tindex.extras["fieldmap"] + self._postfile = postfile + + self._fieldunmap = [None] * len(self._fieldmap) + for fieldname, num in iteritems(self._fieldmap): + self._fieldunmap[num] = fieldname + + def _keycoder(self, fieldname, tbytes): + assert isinstance(tbytes, bytes_type), "tbytes=%r" % tbytes + fnum = self._fieldmap.get(fieldname, 65535) + return pack_ushort(fnum) + tbytes + + def _keydecoder(self, keybytes): + fieldid = unpack_ushort(keybytes[:_SHORT_SIZE])[0] + return self._fieldunmap[fieldid], keybytes[_SHORT_SIZE:] + + def _range_for_key(self, fieldname, tbytes): + return self._tindex.range_for_key(self._keycoder(fieldname, tbytes)) + + def __contains__(self, term): + return self._keycoder(*term) in self._tindex + + def indexed_field_names(self): + return self._fieldmap.keys() + + def cursor(self, fieldname, fieldobj): + tindex = self._tindex + coder = self._keycoder + decoder = self._keydecoder + return W3FieldCursor(tindex, fieldname, coder, decoder, fieldobj) + + def terms(self): + keydecoder = self._keydecoder + return (keydecoder(keybytes) for keybytes in self._tindex.keys()) + + def terms_from(self, fieldname, prefix): + prefixbytes = self._keycoder(fieldname, prefix) + keydecoder = self._keydecoder + return (keydecoder(keybytes) for keybytes + in self._tindex.keys_from(prefixbytes)) + + def items(self): + tidecoder = W3TermInfo.from_bytes + keydecoder = self._keydecoder + return ((keydecoder(keybytes), tidecoder(valbytes)) + for keybytes, valbytes in self._tindex.items()) + + def items_from(self, fieldname, prefix): + prefixbytes = self._keycoder(fieldname, prefix) + tidecoder = W3TermInfo.from_bytes + keydecoder = self._keydecoder + return ((keydecoder(keybytes), tidecoder(valbytes)) + for keybytes, valbytes in self._tindex.items_from(prefixbytes)) + + def term_info(self, fieldname, tbytes): + key = self._keycoder(fieldname, tbytes) + try: + return W3TermInfo.from_bytes(self._tindex[key]) + except KeyError: + raise TermNotFound("No term %s:%r" % (fieldname, tbytes)) + + def frequency(self, fieldname, tbytes): + datapos = self._range_for_key(fieldname, tbytes)[0] + return W3TermInfo.read_weight(self._dbfile, datapos) + + def doc_frequency(self, fieldname, tbytes): + datapos = self._range_for_key(fieldname, tbytes)[0] + return W3TermInfo.read_doc_freq(self._dbfile, datapos) + + def matcher(self, fieldname, tbytes, format_, scorer=None): + terminfo = self.term_info(fieldname, tbytes) + m = self._codec.postings_reader(self._postfile, terminfo, format_, + term=(fieldname, tbytes), scorer=scorer) + return m + + def close(self): + self._tindex.close() + self._postfile.close() + + +# Postings + +class W3PostingsWriter(base.PostingsWriter): + """This object writes posting lists to the 
postings file. It groups postings + into blocks and tracks block level statistics to makes it easier to skip + through the postings. + """ + + def __init__(self, postfile, blocklimit, byteids=False, compression=3, + inlinelimit=1): + self._postfile = postfile + self._blocklimit = blocklimit + self._byteids = byteids + self._compression = compression + self._inlinelimit = inlinelimit + + self._blockcount = 0 + self._format = None + self._terminfo = None + + def written(self): + return self._blockcount > 0 + + def start_postings(self, format_, terminfo): + # Start a new term + if self._terminfo: + # If self._terminfo is not None, that means we are already in a term + raise Exception("Called start in a term") + + assert isinstance(format_, formats.Format) + self._format = format_ + # Reset block count + self._blockcount = 0 + # Reset block bufferg + self._new_block() + # Remember terminfo object passed to us + self._terminfo = terminfo + # Remember where we started in the posting file + self._startoffset = self._postfile.tell() + + def add_posting(self, id_, weight, vbytes, length=None): + # Add a posting to the buffered block + + # If the number of buffered postings == the block limit, write out the + # buffered block and reset before adding this one + if len(self._ids) >= self._blocklimit: + self._write_block() + + # Check types + if self._byteids: + assert isinstance(id_, string_type), "id_=%r" % id_ + else: + assert isinstance(id_, integer_types), "id_=%r" % id_ + assert isinstance(weight, (int, float)), "weight=%r" % weight + assert isinstance(vbytes, bytes_type), "vbytes=%r" % vbytes + assert length is None or isinstance(length, integer_types) + + self._ids.append(id_) + self._weights.append(weight) + + if weight > self._maxweight: + self._maxweight = weight + if vbytes: + self._values.append(vbytes) + if length: + minlength = self._minlength + if minlength is None or length < minlength: + self._minlength = length + if length > self._maxlength: + self._maxlength = length + + def finish_postings(self): + terminfo = self._terminfo + # If we have fewer than "inlinelimit" postings in this posting list, + # "inline" the postings into the terminfo instead of writing them to + # the posting file + if not self.written() and len(self) < self._inlinelimit: + terminfo.add_block(self) + terminfo.set_inline(self._ids, self._weights, self._values) + else: + # If there are leftover items in the current block, write them out + if self._ids: + self._write_block(last=True) + startoffset = self._startoffset + length = self._postfile.tell() - startoffset + terminfo.set_extent(startoffset, length) + + # Clear self._terminfo to indicate we're between terms + self._terminfo = None + # Return the current terminfo object + return terminfo + + def _new_block(self): + # Reset block buffer + + # List of IDs (docnums for regular posting list, terms for vector PL) + self._ids = [] if self._byteids else array("I") + # List of weights + self._weights = array("f") + # List of encoded payloads + self._values = [] + # Statistics + self._minlength = None + self._maxlength = 0 + self._maxweight = 0 + + def _write_block(self, last=False): + # Write the buffered block to the postings file + + # If this is the first block, write a small header first + if not self._blockcount: + self._postfile.write(WHOOSH3_HEADER_MAGIC) + + # Add this block's statistics to the terminfo object, which tracks the + # overall statistics for all term postings + self._terminfo.add_block(self) + + # Minify the IDs, weights, and values, and put them 
in a tuple + data = (self._mini_ids(), self._mini_weights(), self._mini_values()) + # Pickle the tuple + databytes = dumps(data) + # If the pickle is less than 20 bytes, don't bother compressing + if len(databytes) < 20: + comp = 0 + # Compress the pickle (if self._compression > 0) + comp = self._compression + if comp: + databytes = zlib.compress(databytes, comp) + + # Make a tuple of block info. The posting reader can check this info + # and decide whether to skip the block without having to decompress the + # full block data + # + # - Number of postings in block + # - Last ID in block + # - Maximum weight in block + # - Compression level + # - Minimum length byte + # - Maximum length byte + ids = self._ids + infobytes = dumps((len(ids), ids[-1], self._maxweight, comp, + length_to_byte(self._minlength), + length_to_byte(self._maxlength), + )) + + # Write block length + postfile = self._postfile + blocklength = len(infobytes) + len(databytes) + if last: + # If this is the last block, use a negative number + blocklength *= -1 + postfile.write_int(blocklength) + # Write block info + postfile.write(infobytes) + # Write block data + postfile.write(databytes) + + self._blockcount += 1 + # Reset block buffer + self._new_block() + + # Methods to reduce the byte size of the various lists + + def _mini_ids(self): + # Minify IDs + + ids = self._ids + if not self._byteids: + ids = delta_encode(ids) + return tuple(ids) + + def _mini_weights(self): + # Minify weights + + weights = self._weights + + if all(w == 1.0 for w in weights): + return None + elif all(w == weights[0] for w in weights): + return weights[0] + else: + return tuple(weights) + + def _mini_values(self): + # Minify values + + fixedsize = self._format.fixed_value_size() + values = self._values + + if fixedsize is None or fixedsize < 0: + vs = tuple(values) + elif fixedsize == 0: + vs = None + else: + vs = emptybytes.join(values) + return vs + + # Block stats methods + + def __len__(self): + # Returns the number of unwritten buffered postings + return len(self._ids) + + def min_id(self): + # First ID in the buffered block + return self._ids[0] + + def max_id(self): + # Last ID in the buffered block + return self._ids[-1] + + def min_length(self): + # Shortest field length in the buffered block + return self._minlength + + def max_length(self): + # Longest field length in the buffered block + return self._maxlength + + def max_weight(self): + # Highest weight in the buffered block + return self._maxweight + + +class W3LeafMatcher(LeafMatcher): + """Reads on-disk postings from the postings file and presents the + :class:`whoosh.matching.Matcher` interface. 
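+
+    Each block in the posting file (as written by :class:`W3PostingsWriter`)
+    is laid out roughly as::
+
+        block length  -- signed int; negated for the last block of the list
+        info tuple    -- pickled (count, last id, max weight, compression,
+                         min length byte, max length byte)
+        data tuple    -- pickled (ids, weights, values), zlib-compressed when
+                         the compression level is non-zero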
+ """ + + def __init__(self, postfile, startoffset, length, format_, term=None, + byteids=None, scorer=None): + self._postfile = postfile + self._startoffset = startoffset + self._length = length + self.format = format_ + self._term = term + self._byteids = byteids + self.scorer = scorer + + self._fixedsize = self.format.fixed_value_size() + # Read the header tag at the start of the postings + self._read_header() + # "Reset" to read the first block + self.reset() + + def _read_header(self): + # Seek to the start of the postings and check the header tag + postfile = self._postfile + + postfile.seek(self._startoffset) + magic = postfile.read(4) + if magic != WHOOSH3_HEADER_MAGIC: + raise Exception("Block tag error %r" % magic) + + # Remember the base offset (start of postings, after the header) + self._baseoffset = postfile.tell() + + def reset(self): + # Reset block stats + self._blocklength = None + self._maxid = None + self._maxweight = None + self._compression = None + self._minlength = None + self._maxlength = None + + self._lastblock = False + self._atend = False + # Consume first block + self._goto(self._baseoffset) + + def _goto(self, position): + # Read the posting block at the given position + + postfile = self._postfile + + # Reset block data -- we'll lazy load the data from the new block as + # needed + self._data = None + self._ids = None + self._weights = None + self._values = None + # Reset pointer into the block + self._i = 0 + + # Seek to the start of the block + postfile.seek(position) + # Read the block length + length = postfile.read_int() + # If the block length is negative, that means this is the last block + if length < 0: + self._lastblock = True + length *= -1 + + # Remember the offset of the next block + self._nextoffset = position + _INT_SIZE + length + # Read the pickled block info tuple + info = postfile.read_pickle() + # Remember the offset of the block's data + self._dataoffset = postfile.tell() + + # Decompose the info tuple to set the current block info + (self._blocklength, self._maxid, self._maxweight, self._compression, + mnlen, mxlen) = info + self._minlength = byte_to_length(mnlen) + self._maxlength = byte_to_length(mxlen) + + def _next_block(self): + if self._atend: + # We were already at the end, and yet somebody called _next_block() + # again, so something is wrong somewhere + raise Exception("No next block") + elif self._lastblock: + # Reached the end of the postings + self._atend = True + else: + # Go to the next block + self._goto(self._nextoffset) + + def _skip_to_block(self, skipwhile): + # Skip blocks as long as the skipwhile() function returns True + + skipped = 0 + while self.is_active() and skipwhile(): + self._next_block() + skipped += 1 + return skipped + + def is_active(self): + return not self._atend and self._i < self._blocklength + + def id(self): + # Get the current ID (docnum for regular postings, term for vector) + + # If we haven't loaded the block IDs yet, load them now + if self._ids is None: + self._read_ids() + + return self._ids[self._i] + + def weight(self): + # Get the weight for the current posting + + # If we haven't loaded the block weights yet, load them now + if self._weights is None: + self._read_weights() + + return self._weights[self._i] + + def value(self): + # Get the value for the current posting + + # If we haven't loaded the block values yet, load them now + if self._values is None: + self._read_values() + + return self._values[self._i] + + def next(self): + # Move to the next posting + + # Increment the in-block 
pointer + self._i += 1 + # If we reached the end of the block, move to the next block + if self._i == self._blocklength: + self._next_block() + return True + else: + return False + + def skip_to(self, targetid): + # Skip to the next ID equal to or greater than the given target ID + + if not self.is_active(): + raise ReadTooFar + + # If we're already at or past target ID, do nothing + if targetid <= self.id(): + return + + # Skip to the block that would contain the target ID + block_max_id = self.block_max_id + if targetid > block_max_id(): + self._skip_to_block(lambda: targetid > block_max_id()) + + # Iterate through the IDs in the block until we find or pass the + # target + while self.is_active() and self.id() < targetid: + self.next() + + def skip_to_quality(self, minquality): + # Skip blocks until we find one that might exceed the given minimum + # quality + + block_quality = self.block_quality + + # If the quality of this block is already higher than the minimum, + # do nothing + if block_quality() > minquality: + return 0 + + # Skip blocks as long as the block quality is not greater than the + # minimum + return self._skip_to_block(lambda: block_quality() <= minquality) + + def block_min_id(self): + if self._ids is None: + self._read_ids() + return self._ids[0] + + def block_max_id(self): + return self._maxid + + def block_min_length(self): + return self._minlength + + def block_max_length(self): + return self._maxlength + + def block_max_weight(self): + return self._maxweight + + def _read_data(self): + # Load block data tuple from disk + + datalen = self._nextoffset - self._dataoffset + b = self._postfile.get(self._dataoffset, datalen) + + # Decompress the pickled data if necessary + if self._compression: + b = zlib.decompress(b) + + # Unpickle the data tuple and save it in an attribute + self._data = loads(b) + + def _read_ids(self): + # If we haven't loaded the data from disk yet, load it now + if self._data is None: + self._read_data() + ids = self._data[0] + + # De-minify the IDs + if not self._byteids: + ids = tuple(delta_decode(ids)) + + self._ids = ids + + def _read_weights(self): + # If we haven't loaded the data from disk yet, load it now + if self._data is None: + self._read_data() + weights = self._data[1] + + # De-minify the weights + postcount = self._blocklength + if weights is None: + self._weights = array("f", (1.0 for _ in xrange(postcount))) + elif isinstance(weights, float): + self._weights = array("f", (weights for _ in xrange(postcount))) + else: + self._weights = weights + + def _read_values(self): + # If we haven't loaded the data from disk yet, load it now + if self._data is None: + self._read_data() + + # De-minify the values + fixedsize = self._fixedsize + vs = self._data[2] + if fixedsize is None or fixedsize < 0: + self._values = vs + elif fixedsize is 0: + self._values = (None,) * self._blocklength + else: + assert isinstance(vs, bytes_type) + self._values = tuple(vs[i:i + fixedsize] + for i in xrange(0, len(vs), fixedsize)) + + +# Term info implementation + +class W3TermInfo(TermInfo): + # B | Flags + # f | Total weight + # I | Total doc freq + # B | Min length (encoded as byte) + # B | Max length (encoded as byte) + # f | Max weight + # I | Minimum (first) ID + # I | Maximum (last) ID + _struct = struct.Struct("!BfIBBfII") + + def __init__(self, *args, **kwargs): + TermInfo.__init__(self, *args, **kwargs) + self._offset = None + self._length = None + self._inlined = None + + def add_block(self, block): + self._weight += sum(block._weights) + self._df 
+= len(block) + + ml = block.min_length() + if self._minlength is None: + self._minlength = ml + else: + self._minlength = min(self._minlength, ml) + + self._maxlength = max(self._maxlength, block.max_length()) + self._maxweight = max(self._maxweight, block.max_weight()) + if self._minid is None: + self._minid = block.min_id() + self._maxid = block.max_id() + + def set_extent(self, offset, length): + self._offset = offset + self._length = length + + def extent(self): + return self._offset, self._length + + def set_inlined(self, ids, weights, values): + self._inlined = (tuple(ids), tuple(weights), tuple(values)) + + def is_inlined(self): + return self._inlined is not None + + def inlined_postings(self): + return self._inlined + + def to_bytes(self): + isinlined = self.is_inlined() + + # Encode the lengths as 0-255 values + minlength = (0 if self._minlength is None + else length_to_byte(self._minlength)) + maxlength = length_to_byte(self._maxlength) + # Convert None values to the out-of-band NO_ID constant so they can be + # stored as unsigned ints + minid = 0xffffffff if self._minid is None else self._minid + maxid = 0xffffffff if self._maxid is None else self._maxid + + # Pack the term info into bytes + st = self._struct.pack(isinlined, self._weight, self._df, + minlength, maxlength, self._maxweight, + minid, maxid) + + if isinlined: + # Postings are inlined - dump them using the pickle protocol + postbytes = dumps(self._inlined, -1) + else: + postbytes = pack_long(self._offset) + pack_int(self._length) + st += postbytes + return st + + @classmethod + def from_bytes(cls, s): + st = cls._struct + vals = st.unpack(s[:st.size]) + terminfo = cls() + + flags = vals[0] + terminfo._weight = vals[1] + terminfo._df = vals[2] + terminfo._minlength = byte_to_length(vals[3]) + terminfo._maxlength = byte_to_length(vals[4]) + terminfo._maxweight = vals[5] + terminfo._minid = None if vals[6] == 0xffffffff else vals[6] + terminfo._maxid = None if vals[7] == 0xffffffff else vals[7] + + if flags: + # Postings are stored inline + terminfo._inlined = loads(s[st.size:]) + else: + # Last bytes are pointer into posting file and length + offpos = st.size + lenpos = st.size + _LONG_SIZE + terminfo._offset = unpack_long(s[offpos:lenpos])[0] + terminfo._length = unpack_int(s[lenpos:lenpos + _INT_SIZE]) + + return terminfo + + @classmethod + def read_weight(cls, dbfile, datapos): + return dbfile.get_float(datapos + 1) + + @classmethod + def read_doc_freq(cls, dbfile, datapos): + return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE) + + @classmethod + def read_min_and_max_length(cls, dbfile, datapos): + lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + ml = byte_to_length(dbfile.get_byte(lenpos)) + xl = byte_to_length(dbfile.get_byte(lenpos + 1)) + return ml, xl + + @classmethod + def read_max_weight(cls, dbfile, datapos): + weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2 + return dbfile.get_float(weightspos) + + +# Segment implementation + +class W3Segment(base.Segment): + def __init__(self, codec, indexname, doccount=0, segid=None, deleted=None): + self.indexname = indexname + self.segid = self._random_id() if segid is None else segid + + self._codec = codec + self._doccount = doccount + self._deleted = deleted + self.compound = False + + def codec(self, **kwargs): + return self._codec + + def set_doc_count(self, dc): + self._doccount = dc + + def doc_count_all(self): + return self._doccount + + def deleted_count(self): + if self._deleted is None: + return 0 + return len(self._deleted) + + def deleted_docs(self): 
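+        # Return an iterable over the numbers of deleted documents in this
+        # segment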
+ if self._deleted is None: + return () + else: + return iter(self._deleted) + + def delete_document(self, docnum, delete=True): + if delete: + if self._deleted is None: + self._deleted = set() + self._deleted.add(docnum) + elif self._deleted is not None and docnum in self._deleted: + self._deleted.remove(docnum) + + def is_deleted(self, docnum): + if self._deleted is None: + return False + return docnum in self._deleted diff --git a/src/whoosh/collectors.py b/src/whoosh/collectors.py new file mode 100644 index 0000000..fbfce3b --- /dev/null +++ b/src/whoosh/collectors.py @@ -0,0 +1,1162 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module contains "collector" objects. Collectors provide a way to gather +"raw" results from a :class:`whoosh.matching.Matcher` object, implement +sorting, filtering, collation, etc., and produce a +:class:`whoosh.searching.Results` object. + +The basic collectors are: + +TopCollector + Returns the top N matching results sorted by score, using block-quality + optimizations to skip blocks of documents that can't contribute to the top + N. The :meth:`whoosh.searching.Searcher.search` method uses this type of + collector by default or when you specify a ``limit``. + +UnlimitedCollector + Returns all matching results sorted by score. The + :meth:`whoosh.searching.Searcher.search` method uses this type of collector + when you specify ``limit=None`` or you specify a limit equal to or greater + than the number of documents in the searcher. + +SortingCollector + Returns all matching results sorted by a :class:`whoosh.sorting.Facet` + object. The :meth:`whoosh.searching.Searcher.search` method uses this type + of collector when you use the ``sortedby`` parameter.
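+
+As a rough sketch (``mysearcher`` and ``myquery`` stand in for your own
+searcher and query objects), these two searches are roughly equivalent::
+
+    # Using the convenience method
+    results = mysearcher.search(myquery, limit=20)
+
+    # Using a collector explicitly
+    tc = TopCollector(limit=20)
+    mysearcher.search_with_collector(myquery, tc)
+    results = tc.results()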
+ +Here's an example of a simple collector that instead of remembering the matched +documents just counts up the number of matches:: + + class CountingCollector(Collector): + def prepare(self, top_searcher, q, context): + # Always call super method in prepare + Collector.prepare(self, top_searcher, q, context) + + self.count = 0 + + def collect(self, sub_docnum): + self.count += 1 + + c = CountingCollector() + mysearcher.search_with_collector(myquery, c) + print(c.count) + +There are also several wrapping collectors that extend or modify the +functionality of other collectors. The meth:`whoosh.searching.Searcher.search` +method uses many of these when you specify various parameters. + +NOTE: collectors are not designed to be reentrant or thread-safe. It is +generally a good idea to create a new collector for each search. +""" + +import os +import threading +from array import array +from bisect import insort +from collections import defaultdict +from heapq import heapify, heappush, heapreplace + +from whoosh import sorting +from whoosh.compat import abstractmethod, iteritems, itervalues, xrange +from whoosh.searching import Results, TimeLimit +from whoosh.util import now + + +# Functions + +def ilen(iterator): + total = 0 + for _ in iterator: + total += 1 + return total + + +# Base class + +class Collector(object): + """Base class for collectors. + """ + + def prepare(self, top_searcher, q, context): + """This method is called before a search. + + Subclasses can override this to perform set-up work, but + they should still call the superclass's method because it sets several + necessary attributes on the collector object: + + self.top_searcher + The top-level searcher. + self.q + The query object + self.context + ``context.needs_current`` controls whether a wrapping collector + requires that this collector's matcher be in a valid state at every + call to ``collect()``. If this is ``False``, the collector is free + to use faster methods that don't necessarily keep the matcher + updated, such as ``matcher.all_ids()``. + + :param top_searcher: the top-level :class:`whoosh.searching.Searcher` + object. + :param q: the :class:`whoosh.query.Query` object being searched for. + :param context: a :class:`whoosh.searching.SearchContext` object + containing information about the search. + """ + + self.top_searcher = top_searcher + self.q = q + self.context = context + + self.starttime = now() + self.runtime = None + self.docset = set() + + def run(self): + # Collect matches for each sub-searcher + try: + for subsearcher, offset in self.top_searcher.leaf_searchers(): + self.set_subsearcher(subsearcher, offset) + self.collect_matches() + finally: + self.finish() + + def set_subsearcher(self, subsearcher, offset): + """This method is called each time the collector starts on a new + sub-searcher. + + Subclasses can override this to perform set-up work, but + they should still call the superclass's method because it sets several + necessary attributes on the collector object: + + self.subsearcher + The current sub-searcher. If the top-level searcher is atomic, this + is the same as the top-level searcher. + self.offset + The document number offset of the current searcher. You must add + this number to the document number passed to + :meth:`Collector.collect` to get the top-level document number + for use in results. + self.matcher + A :class:`whoosh.matching.Matcher` object representing the matches + for the query in the current sub-searcher. 
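+
+        For example, a collector that tallies matches per segment might extend
+        this hook (``seg_counts`` is an illustrative attribute, not part of
+        the base class)::
+
+            def set_subsearcher(self, subsearcher, offset):
+                Collector.set_subsearcher(self, subsearcher, offset)
+                # Start a fresh tally for the new segment
+                self.seg_counts.append(0)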
+ """ + + self.subsearcher = subsearcher + self.offset = offset + self.matcher = self.q.matcher(subsearcher, self.context) + + def computes_count(self): + """Returns True if the collector naturally computes the exact number of + matching documents. Collectors that use block optimizations will return + False since they might skip blocks containing matching documents. + + Note that if this method returns False you can still call :meth:`count`, + but it means that method might have to do more work to calculate the + number of matching documents. + """ + + return True + + def all_ids(self): + """Returns a sequence of docnums matched in this collector. (Only valid + after the collector is run.) + + The default implementation is based on the docset. If a collector does + not maintain the docset, it will need to override this method. + """ + + return self.docset + + def count(self): + """Returns the total number of documents matched in this collector. + (Only valid after the collector is run.) + + The default implementation is based on the docset. If a collector does + not maintain the docset, it will need to override this method. + """ + + return len(self.docset) + + def collect_matches(self): + """This method calls :meth:`Collector.matches` and then for each + matched document calls :meth:`Collector.collect`. Sub-classes that + want to intervene between finding matches and adding them to the + collection (for example, to filter out certain documents) can override + this method. + """ + + collect = self.collect + for sub_docnum in self.matches(): + collect(sub_docnum) + + @abstractmethod + def collect(self, sub_docnum): + """This method is called for every matched document. It should do the + work of adding a matched document to the results, and it should return + an object to use as a "sorting key" for the given document (such as the + document's score, a key generated by a facet, or just None). Subclasses + must implement this method. + + If you want the score for the current document, use + ``self.matcher.score()``. + + Overriding methods should add the current document offset + (``self.offset``) to the ``sub_docnum`` to get the top-level document + number for the matching document to add to results. + + :param sub_docnum: the document number of the current match within the + current sub-searcher. You must add ``self.offset`` to this number + to get the document's top-level document number. + """ + + raise NotImplementedError + + @abstractmethod + def sort_key(self, sub_docnum): + """Returns a sorting key for the current match. This should return the + same value returned by :meth:`Collector.collect`, but without the side + effect of adding the current document to the results. + + If the collector has been prepared with ``context.needs_current=True``, + this method can use ``self.matcher`` to get information, for example + the score. Otherwise, it should only use the provided ``sub_docnum``, + since the matcher may be in an inconsistent state. + + Subclasses must implement this method. + """ + + raise NotImplementedError + + def remove(self, global_docnum): + """Removes a document from the collector. Not that this method uses the + global document number as opposed to :meth:`Collector.collect` which + takes a segment-relative docnum. 
+ """ + + items = self.items + for i in xrange(len(items)): + if items[i][1] == global_docnum: + items.pop(i) + return + raise KeyError(global_docnum) + + def _step_through_matches(self): + matcher = self.matcher + while matcher.is_active(): + yield matcher.id() + matcher.next() + + def matches(self): + """Yields a series of relative document numbers for matches + in the current subsearcher. + """ + + # We jump through a lot of hoops to avoid stepping through the matcher + # "manually" if we can because all_ids() is MUCH faster + if self.context.needs_current: + return self._step_through_matches() + else: + return self.matcher.all_ids() + + def finish(self): + """This method is called after a search. + + Subclasses can override this to perform set-up work, but + they should still call the superclass's method because it sets several + necessary attributes on the collector object: + + self.runtime + The time (in seconds) the search took. + """ + + self.runtime = now() - self.starttime + + def _results(self, items, **kwargs): + # Fills in a Results object with the invariant information and the + # given "items" (a list of (score, docnum) tuples) + r = Results(self.top_searcher, self.q, items, **kwargs) + r.runtime = self.runtime + r.collector = self + return r + + @abstractmethod + def results(self): + """Returns a :class:`~whoosh.searching.Results` object containing the + results of the search. Subclasses must implement this method + """ + + raise NotImplementedError + + +# Scored collectors + +class ScoredCollector(Collector): + """Base class for collectors that sort the results based on document score. + """ + + def __init__(self, replace=10): + """ + :param replace: Number of matches between attempts to replace the + matcher with a more efficient version. + """ + + Collector.__init__(self) + self.replace = replace + + def prepare(self, top_searcher, q, context): + # This collector requires a valid matcher at each step + Collector.prepare(self, top_searcher, q, context) + + if top_searcher.weighting.use_final: + self.final_fn = top_searcher.weighting.final + else: + self.final_fn = None + + # Heap containing top N (score, 0-docnum) pairs + self.items = [] + # Minimum score a document must have to make it into the top N. 
This is + # used by the block-quality optimizations + self.minscore = 0 + # Number of times the matcher was replaced (for debugging) + self.replaced_times = 0 + # Number of blocks skipped by quality optimizations (for debugging) + self.skipped_times = 0 + + def sort_key(self, sub_docnum): + return 0 - self.matcher.score() + + def _collect(self, global_docnum, score): + # Concrete subclasses should override this method to collect matching + # documents + + raise NotImplementedError + + def _use_block_quality(self): + # Concrete subclasses should override this method to return True if the + # collector should use block quality optimizations + + return False + + def collect(self, sub_docnum): + # Do common work to calculate score and top-level document number + global_docnum = self.offset + sub_docnum + + score = self.matcher.score() + if self.final_fn: + score = self.final_fn(self.top_searcher, global_docnum, score) + + # Call specialized method on subclass + return self._collect(global_docnum, score) + + def matches(self): + minscore = self.minscore + matcher = self.matcher + usequality = self._use_block_quality() + replace = self.replace + replacecounter = 0 + + # A flag to indicate whether we should check block quality at the start + # of the next loop + checkquality = True + + while matcher.is_active(): + # If the replacement counter has reached 0, try replacing the + # matcher with a more efficient version + if replace: + if replacecounter == 0 or self.minscore != minscore: + self.matcher = matcher = matcher.replace(minscore or 0) + self.replaced_times += 1 + if not matcher.is_active(): + break + usequality = self._use_block_quality() + replacecounter = self.replace + + if self.minscore != minscore: + checkquality = True + minscore = self.minscore + + replacecounter -= 1 + + # If we're using block quality optimizations, and the checkquality + # flag is true, try to skip ahead to the next block with the + # minimum required quality + if usequality and checkquality and minscore is not None: + self.skipped_times += matcher.skip_to_quality(minscore) + # Skipping ahead might have moved the matcher to the end of the + # posting list + if not matcher.is_active(): + break + + yield matcher.id() + + # Move to the next document. This method returns True if the + # matcher has entered a new block, so we should check block quality + # again. + checkquality = matcher.next() + + +class TopCollector(ScoredCollector): + """A collector that only returns the top "N" scored results. + """ + + def __init__(self, limit=10, usequality=True, **kwargs): + """ + :param limit: the maximum number of results to return. + :param usequality: whether to use block-quality optimizations. This may + be useful for debugging. 
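+
+        A minimal sketch of using this collector directly (normally
+        :meth:`whoosh.searching.Searcher.search` creates it for you when you
+        pass a ``limit``)::
+
+            tc = TopCollector(limit=20)
+            mysearcher.search_with_collector(myquery, tc)
+            results = tc.results()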
+ """ + + ScoredCollector.__init__(self, **kwargs) + self.limit = limit + self.usequality = usequality + self.total = 0 + + def _use_block_quality(self): + return (self.usequality + and not self.top_searcher.weighting.use_final + and self.matcher.supports_block_quality()) + + def computes_count(self): + return not self._use_block_quality() + + def all_ids(self): + # Since this collector can skip blocks, it doesn't track the total + # number of matching documents, so if the user asks for all matched + # docs we need to re-run the search using docs_for_query + + return self.top_searcher.docs_for_query(self.q) + + def count(self): + if self.computes_count(): + return self.total + else: + return ilen(self.all_ids()) + + # ScoredCollector.collect calls this + def _collect(self, global_docnum, score): + items = self.items + self.total += 1 + + # Document numbers are negated before putting them in the heap so that + # higher document numbers have lower "priority" in the queue. Lower + # document numbers should always come before higher document numbers + # with the same score to keep the order stable. + if len(items) < self.limit: + # The heap isn't full, so add this document + heappush(items, (score, 0 - global_docnum)) + # Negate score to act as sort key so higher scores appear first + return 0 - score + elif score > items[0][0]: + # The heap is full, but if this document has a high enough + # score to make the top N, add it to the heap + heapreplace(items, (score, 0 - global_docnum)) + self.minscore = items[0][0] + # Negate score to act as sort key so higher scores appear first + return 0 - score + else: + return 0 + + def remove(self, global_docnum): + negated = 0 - global_docnum + items = self.items + + # Remove the document if it's on the list (it may not be since + # TopCollector forgets documents that don't make the top N list) + for i in xrange(len(items)): + if items[i][1] == negated: + items.pop(i) + # Restore the heap invariant + heapify(items) + self.minscore = items[0][0] if items else 0 + return + + def results(self): + # The items are stored (postive score, negative docnum) so the heap + # keeps the highest scores and lowest docnums, in order from lowest to + # highest. Since for the results we want the highest scores first, + # sort the heap in reverse order + items = self.items + items.sort(reverse=True) + # De-negate the docnums for presentation to the user + items = [(score, 0 - docnum) for score, docnum in items] + return self._results(items) + + +class UnlimitedCollector(ScoredCollector): + """A collector that returns **all** scored results. + """ + + def __init__(self, reverse=False): + ScoredCollector.__init__(self) + self.reverse = reverse + + # ScoredCollector.collect calls this + def _collect(self, global_docnum, score): + self.items.append((score, global_docnum)) + self.docset.add(global_docnum) + # Negate score to act as sort key so higher scores appear first + return 0 - score + + def results(self): + # Sort by negated scores so that higher scores go first, then by + # document number to keep the order stable when documents have the + # same score + self.items.sort(key=lambda x: (0 - x[0], x[1]), reverse=self.reverse) + return self._results(self.items, docset=self.docset) + + +# Sorting collector + +class SortingCollector(Collector): + """A collector that returns results sorted by a given + :class:`whoosh.sorting.Facet` object. See :doc:`/facets` for more + information. 
+ """ + + def __init__(self, sortedby, limit=10, reverse=False): + """ + :param sortedby: see :doc:`/facets`. + :param reverse: If True, reverse the overall results. Note that you + can reverse individual facets in a multi-facet sort key as well. + """ + + Collector.__init__(self) + self.sortfacet = sorting.MultiFacet.from_sortedby(sortedby) + self.limit = limit + self.reverse = reverse + + def prepare(self, top_searcher, q, context): + self.categorizer = self.sortfacet.categorizer(top_searcher) + # If the categorizer requires a valid matcher, then tell the child + # collector that we need it + rm = context.needs_current or self.categorizer.needs_current + Collector.prepare(self, top_searcher, q, context.set(needs_current=rm)) + + # List of (sortkey, docnum) pairs + self.items = [] + + def set_subsearcher(self, subsearcher, offset): + Collector.set_subsearcher(self, subsearcher, offset) + self.categorizer.set_searcher(subsearcher, offset) + + def sort_key(self, sub_docnum): + return self.categorizer.key_for(self.matcher, sub_docnum) + + def collect(self, sub_docnum): + global_docnum = self.offset + sub_docnum + sortkey = self.sort_key(sub_docnum) + self.items.append((sortkey, global_docnum)) + self.docset.add(global_docnum) + return sortkey + + def results(self): + items = self.items + items.sort(reverse=self.reverse) + if self.limit: + items = items[:self.limit] + return self._results(items, docset=self.docset) + + +class UnsortedCollector(Collector): + def prepare(self, top_searcher, q, context): + Collector.prepare(self, top_searcher, q, context.set(weighting=None)) + self.items = [] + + def collect(self, sub_docnum): + global_docnum = self.offset + sub_docnum + self.items.append((None, global_docnum)) + self.docset.add(global_docnum) + + def results(self): + items = self.items + return self._results(items, docset=self.docset) + + +# Wrapping collectors + +class WrappingCollector(Collector): + """Base class for collectors that wrap other collectors. 
+ """ + + def __init__(self, child): + self.child = child + + @property + def top_searcher(self): + return self.child.top_searcher + + @property + def context(self): + return self.child.context + + def prepare(self, top_searcher, q, context): + self.child.prepare(top_searcher, q, context) + + def set_subsearcher(self, subsearcher, offset): + self.child.set_subsearcher(subsearcher, offset) + self.subsearcher = subsearcher + self.matcher = self.child.matcher + self.offset = self.child.offset + + def all_ids(self): + return self.child.all_ids() + + def count(self): + return self.child.count() + + def collect_matches(self): + for sub_docnum in self.matches(): + self.collect(sub_docnum) + + def sort_key(self, sub_docnum): + return self.child.sort_key(sub_docnum) + + def collect(self, sub_docnum): + return self.child.collect(sub_docnum) + + def remove(self, global_docnum): + return self.child.remove(global_docnum) + + def matches(self): + return self.child.matches() + + def finish(self): + self.child.finish() + + def results(self): + return self.child.results() + + +# Allow and disallow collector + +class FilterCollector(WrappingCollector): + """A collector that lets you allow and/or restrict certain document numbers + in the results:: + + uc = collectors.UnlimitedCollector() + + ins = query.Term("chapter", "rendering") + outs = query.Term("status", "restricted") + fc = FilterCollector(uc, allow=ins, restrict=outs) + + mysearcher.search_with_collector(myquery, fc) + print(fc.results()) + + This collector discards a document if: + + * The allowed set is not None and a document number is not in the set, or + * The restrict set is not None and a document number is in the set. + + (So, if the same document number is in both sets, that document will be + discarded.) + + If you have a reference to the collector, you can use + ``FilterCollector.filtered_count`` to get the number of matching documents + filtered out of the results by the collector. + """ + + def __init__(self, child, allow=None, restrict=None): + """ + :param child: the collector to wrap. + :param allow: a query, Results object, or set-like object containing + docnument numbers that are allowed in the results, or None (meaning + everything is allowed). + :param restrict: a query, Results object, or set-like object containing + document numbers to disallow from the results, or None (meaning + nothing is disallowed). 
+ """ + + self.child = child + self.allow = allow + self.restrict = restrict + + def prepare(self, top_searcher, q, context): + self.child.prepare(top_searcher, q, context) + + allow = self.allow + restrict = self.restrict + ftc = top_searcher._filter_to_comb + + self._allow = ftc(allow) if allow else None + self._restrict = ftc(restrict) if restrict else None + self.filtered_count = 0 + + def all_ids(self): + child = self.child + + _allow = self._allow + _restrict = self._restrict + + for global_docnum in child.all_ids(): + if ((_allow and global_docnum not in _allow) + or (_restrict and global_docnum in _restrict)): + continue + yield global_docnum + + def count(self): + child = self.child + if child.computes_count(): + return child.count() - self.filtered_count + else: + return ilen(self.all_ids()) + + def collect_matches(self): + child = self.child + _allow = self._allow + _restrict = self._restrict + + if _allow is not None or _restrict is not None: + filtered_count = self.filtered_count + for sub_docnum in child.matches(): + global_docnum = self.offset + sub_docnum + if ((_allow is not None and global_docnum not in _allow) + or (_restrict is not None and global_docnum in _restrict)): + filtered_count += 1 + continue + child.collect(sub_docnum) + self.filtered_count = filtered_count + else: + # If there was no allow or restrict set, don't do anything special, + # just forward the call to the child collector + child.collect_matches() + + def results(self): + r = self.child.results() + r.filtered_count = self.filtered_count + r.allowed = self.allow + r.restricted = self.restrict + return r + + +# Facet grouping collector + +class FacetCollector(WrappingCollector): + """A collector that creates groups of documents based on + :class:`whoosh.sorting.Facet` objects. See :doc:`/facets` for more + information. + + This collector is used if you specify a ``groupedby`` parameter in the + :meth:`whoosh.searching.Searcher.search` method. You can use the + :meth:`whoosh.searching.Results.groups` method to access the facet groups. + + If you have a reference to the collector can also use + ``FacetedCollector.facetmaps`` to access the groups directly:: + + uc = collectors.UnlimitedCollector() + fc = FacetedCollector(uc, sorting.FieldFacet("category")) + mysearcher.search_with_collector(myquery, fc) + print(fc.facetmaps) + """ + + def __init__(self, child, groupedby, maptype=None): + """ + :param groupedby: see :doc:`/facets`. + :param maptype: a :class:`whoosh.sorting.FacetMap` type to use for any + facets that don't specify their own. 
+ """ + + self.child = child + self.facets = sorting.Facets.from_groupedby(groupedby) + self.maptype = maptype + + def prepare(self, top_searcher, q, context): + facets = self.facets + + # For each facet we're grouping by: + # - Create a facetmap (to hold the groups) + # - Create a categorizer (to generate document keys) + self.facetmaps = {} + self.categorizers = {} + + # Set needs_current to True if any of the categorizers require the + # current document to work + needs_current = context.needs_current + for facetname, facet in facets.items(): + self.facetmaps[facetname] = facet.map(self.maptype) + + ctr = facet.categorizer(top_searcher) + self.categorizers[facetname] = ctr + needs_current = needs_current or ctr.needs_current + context = context.set(needs_current=needs_current) + + self.child.prepare(top_searcher, q, context) + + def set_subsearcher(self, subsearcher, offset): + WrappingCollector.set_subsearcher(self, subsearcher, offset) + + # Tell each categorizer about the new subsearcher and offset + for categorizer in itervalues(self.categorizers): + categorizer.set_searcher(self.child.subsearcher, self.child.offset) + + def collect(self, sub_docnum): + matcher = self.child.matcher + global_docnum = sub_docnum + self.child.offset + + # We want the sort key for the document so we can (by default) sort + # the facet groups + sortkey = self.child.collect(sub_docnum) + + # For each facet we're grouping by + for name, categorizer in iteritems(self.categorizers): + add = self.facetmaps[name].add + + # We have to do more work if the facet allows overlapping groups + if categorizer.allow_overlap: + for key in categorizer.keys_for(matcher, sub_docnum): + add(categorizer.key_to_name(key), global_docnum, sortkey) + else: + key = categorizer.key_for(matcher, sub_docnum) + key = categorizer.key_to_name(key) + add(key, global_docnum, sortkey) + + return sortkey + + def results(self): + r = self.child.results() + r._facetmaps = self.facetmaps + return r + + +# Collapsing collector + +class CollapseCollector(WrappingCollector): + """A collector that collapses results based on a facet. That is, it + eliminates all but the top N results that share the same facet key. + Documents with an empty key for the facet are never eliminated. + + The "top" results within each group is determined by the result ordering + (e.g. highest score in a scored search) or an optional second "ordering" + facet. + + If you have a reference to the collector you can use + ``CollapseCollector.collapsed_counts`` to access the number of documents + eliminated based on each key:: + + tc = TopCollector(limit=20) + cc = CollapseCollector(tc, "group", limit=3) + mysearcher.search_with_collector(myquery, cc) + print(cc.collapsed_counts) + + See :ref:`collapsing` for more information. + """ + + def __init__(self, child, keyfacet, limit=1, order=None): + """ + :param child: the collector to wrap. + :param keyfacet: a :class:`whoosh.sorting.Facet` to use for collapsing. + All but the top N documents that share a key will be eliminated + from the results. + :param limit: the maximum number of documents to keep for each key. + :param order: an optional :class:`whoosh.sorting.Facet` to use + to determine the "top" document(s) to keep when collapsing. The + default (``orderfaceet=None``) uses the results order (e.g. the + highest score in a scored search). 
+ """ + + self.child = child + self.keyfacet = sorting.MultiFacet.from_sortedby(keyfacet) + + self.limit = limit + if order: + self.orderfacet = sorting.MultiFacet.from_sortedby(order) + else: + self.orderfacet = None + + def prepare(self, top_searcher, q, context): + # Categorizer for getting the collapse key of a document + self.keyer = self.keyfacet.categorizer(top_searcher) + # Categorizer for getting the collapse order of a document + self.orderer = None + if self.orderfacet: + self.orderer = self.orderfacet.categorizer(top_searcher) + + # Dictionary mapping keys to lists of (sortkey, global_docnum) pairs + # representing the best docs for that key + self.lists = defaultdict(list) + # Dictionary mapping keys to the number of documents that have been + # filtered out with that key + self.collapsed_counts = defaultdict(int) + # Total number of documents filtered out by collapsing + self.collapsed_total = 0 + + # If the keyer or orderer require a valid matcher, tell the child + # collector we need it + needs_current = (context.needs_current + or self.keyer.needs_current + or (self.orderer and self.orderer.needs_current)) + self.child.prepare(top_searcher, q, + context.set(needs_current=needs_current)) + + def set_subsearcher(self, subsearcher, offset): + WrappingCollector.set_subsearcher(self, subsearcher, offset) + + # Tell the keyer and (optional) orderer about the new subsearcher + self.keyer.set_searcher(subsearcher, offset) + if self.orderer: + self.orderer.set_searcher(subsearcher, offset) + + def all_ids(self): + child = self.child + limit = self.limit + counters = defaultdict(int) + + for subsearcher, offset in child.subsearchers(): + self.set_subsearcher(subsearcher, offset) + matcher = child.matcher + keyer = self.keyer + for sub_docnum in child.matches(): + ckey = keyer.key_for(matcher, sub_docnum) + if ckey is not None: + if ckey in counters and counters[ckey] >= limit: + continue + else: + counters[ckey] += 1 + yield offset + sub_docnum + + def count(self): + if self.child.computes_count(): + return self.child.count() - self.collapsed_total + else: + return ilen(self.all_ids()) + + def collect_matches(self): + lists = self.lists + limit = self.limit + keyer = self.keyer + orderer = self.orderer + collapsed_counts = self.collapsed_counts + + child = self.child + matcher = child.matcher + offset = child.offset + for sub_docnum in child.matches(): + # Collapsing category key + ckey = keyer.key_to_name(keyer.key_for(matcher, sub_docnum)) + if not ckey: + # If the document isn't in a collapsing category, just add it + child.collect(sub_docnum) + else: + global_docnum = offset + sub_docnum + + if orderer: + # If user specified a collapse order, use it + sortkey = orderer.key_for(child.matcher, sub_docnum) + else: + # Otherwise, use the results order + sortkey = child.sort_key(sub_docnum) + + # Current list of best docs for this collapse key + best = lists[ckey] + add = False + if len(best) < limit: + # If the heap is not full yet, just add this document + add = True + elif sortkey < best[-1][0]: + # If the heap is full but this document has a lower sort + # key than the highest key currently on the heap, replace + # the "least-best" document + # Tell the child collector to remove the document + child.remove(best.pop()[1]) + add = True + + if add: + insort(best, (sortkey, global_docnum)) + child.collect(sub_docnum) + else: + # Remember that a document was filtered + collapsed_counts[ckey] += 1 + self.collapsed_total += 1 + + def results(self): + r = self.child.results() + 
r.collapsed_counts = self.collapsed_counts + return r + + +# Time limit collector + +class TimeLimitCollector(WrappingCollector): + """A collector that raises a :class:`TimeLimit` exception if the search + does not complete within a certain number of seconds:: + + uc = collectors.UnlimitedCollector() + tlc = TimeLimitedCollector(uc, timelimit=5.8) + try: + mysearcher.search_with_collector(myquery, tlc) + except collectors.TimeLimit: + print("The search ran out of time!") + + # We can still get partial results from the collector + print(tlc.results()) + + IMPORTANT: On Unix systems (systems where signal.SIGALRM is defined), the + code uses signals to stop searching immediately when the time limit is + reached. On Windows, the OS does not support this functionality, so the + search only checks the time between each found document, so if a matcher + is slow the search could exceed the time limit. + """ + + def __init__(self, child, timelimit, greedy=False, use_alarm=True): + """ + :param child: the collector to wrap. + :param timelimit: the maximum amount of time (in seconds) to + allow for searching. If the search takes longer than this, it will + raise a ``TimeLimit`` exception. + :param greedy: if ``True``, the collector will finish adding the most + recent hit before raising the ``TimeLimit`` exception. + :param use_alarm: if ``True`` (the default), the collector will try to + use signal.SIGALRM (on UNIX). + """ + self.child = child + self.timelimit = timelimit + self.greedy = greedy + + if use_alarm: + import signal + self.use_alarm = use_alarm and hasattr(signal, "SIGALRM") + else: + self.use_alarm = False + + self.timer = None + self.timedout = False + + def prepare(self, top_searcher, q, context): + self.child.prepare(top_searcher, q, context) + + self.timedout = False + if self.use_alarm: + import signal + signal.signal(signal.SIGALRM, self._was_signaled) + + # Start a timer thread. If the timer fires, it will call this object's + # _timestop() method + self.timer = threading.Timer(self.timelimit, self._timestop) + self.timer.start() + + def _timestop(self): + # Called when the timer expires + self.timer = None + # Set an attribute that will be noticed in the collect_matches() loop + self.timedout = True + + if self.use_alarm: + import signal + os.kill(os.getpid(), signal.SIGALRM) + + def _was_signaled(self, signum, frame): + raise TimeLimit + + def collect_matches(self): + child = self.child + greedy = self.greedy + + for sub_docnum in child.matches(): + # If the timer fired since the last loop and we're not greedy, + # raise the exception + if self.timedout and not greedy: + raise TimeLimit + + child.collect(sub_docnum) + + # If the timer fired since we entered the loop or it fired earlier + # but we were greedy, raise now + if self.timedout: + raise TimeLimit + + def finish(self): + if self.timer: + self.timer.cancel() + self.timer = None + self.child.finish() + + +# Matched terms collector + +class TermsCollector(WrappingCollector): + """A collector that remembers which terms appeared in which terms appeared + in each matched document. + + This collector is used if you specify ``terms=True`` in the + :meth:`whoosh.searching.Searcher.search` method. 
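Because each wrapper simply forwards to its child, the collectors above compose. A sketch that collapses a scored ``TopCollector`` on an illustrative ``group`` facet and gives the whole search a time budget (note that the class defined here is ``TimeLimitCollector``; the ``TimeLimitedCollector`` spelling in the docstring example appears to be an older name)::

    from whoosh import collectors, index
    from whoosh.query import Term

    ix = index.open_dir("indexdir")
    with ix.searcher() as s:
        tc = collectors.TopCollector(limit=20)
        cc = collectors.CollapseCollector(tc, "group", limit=3)
        tlc = collectors.TimeLimitCollector(cc, timelimit=5.0)
        try:
            s.search_with_collector(Term("text", "whoosh"), tlc)
        except collectors.TimeLimit:
            pass  # partial results are still available below
        print(cc.collapsed_counts)
        print(tlc.results())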
+ + If you have a reference to the collector can also use + ``TermsCollector.termslist`` to access the term lists directly:: + + uc = collectors.UnlimitedCollector() + tc = TermsCollector(uc) + mysearcher.search_with_collector(myquery, tc) + # tc.termdocs is a dictionary mapping (fieldname, text) tuples to + # sets of document numbers + print(tc.termdocs) + # tc.docterms is a dictionary mapping docnums to lists of + # (fieldname, text) tuples + print(tc.docterms) + """ + + def __init__(self, child, settype=set): + self.child = child + self.settype = settype + + def prepare(self, top_searcher, q, context): + # This collector requires a valid matcher at each step + self.child.prepare(top_searcher, q, context.set(needs_current=True)) + + # A dictionary mapping (fieldname, text) pairs to arrays of docnums + self.termdocs = defaultdict(lambda: array("I")) + # A dictionary mapping docnums to lists of (fieldname, text) pairs + self.docterms = defaultdict(list) + + def set_subsearcher(self, subsearcher, offset): + WrappingCollector.set_subsearcher(self, subsearcher, offset) + + # Store a list of all the term matchers in the matcher tree + self.termmatchers = list(self.child.matcher.term_matchers()) + + def collect(self, sub_docnum): + child = self.child + termdocs = self.termdocs + docterms = self.docterms + + child.collect(sub_docnum) + + global_docnum = child.offset + sub_docnum + + # For each term matcher... + for tm in self.termmatchers: + # If the term matcher is matching the current document... + if tm.is_active() and tm.id() == sub_docnum: + # Add it to the list of matching documents for the term + term = tm.term() + termdocs[term].append(global_docnum) + docterms[global_docnum].append(term) + + def results(self): + r = self.child.results() + r.termdocs = dict(self.termdocs) + r.docterms = dict(self.docterms) + return r diff --git a/src/whoosh/columns.py b/src/whoosh/columns.py new file mode 100644 index 0000000..59b5f56 --- /dev/null +++ b/src/whoosh/columns.py @@ -0,0 +1,1411 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
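At the search level the ``TermsCollector`` above is switched on with ``terms=True``; the ``docterms`` mapping it builds is what backs ``Hit.matched_terms()``. A hedged sketch with illustrative field names::

    from whoosh import index
    from whoosh.qparser import QueryParser

    ix = index.open_dir("indexdir")
    with ix.searcher() as s:
        q = QueryParser("text", ix.schema).parse("render OR shade")
        results = s.search(q, terms=True)
        for hit in results:
            # matched_terms() lists the (fieldname, text) pairs that
            # actually matched this document
            print(hit["title"], hit.matched_terms())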
+ +""" +The API and implementation of columns may change in the next version of Whoosh! + +This module contains "Column" objects which you can use as the argument to a +Field object's ``sortable=`` keyword argument. Each field defines a default +column type for when the user specifies ``sortable=True`` (the object returned +by the field's ``default_column()`` method). + +The default column type for most fields is ``VarBytesColumn``, +although numeric and date fields use ``NumericColumn``. Expert users may use +other field types that may be faster or more storage efficient based on the +field contents. For example, if a field always contains one of a limited number +of possible values, a ``RefBytesColumn`` will save space by only storing the +values once. If a field's values are always a fixed length, the +``FixedBytesColumn`` saves space by not storing the length of each value. + +A ``Column`` object basically exists to store configuration information and +provides two important methods: ``writer()`` to return a ``ColumnWriter`` object +and ``reader()`` to return a ``ColumnReader`` object. +""" + +from __future__ import division, with_statement +import struct, warnings +from array import array +from bisect import bisect_right + +try: + import zlib +except ImportError: + zlib = None + +from whoosh.compat import b, bytes_type, BytesIO +from whoosh.compat import array_tobytes, xrange +from whoosh.compat import dumps, loads +from whoosh.filedb.structfile import StructFile +from whoosh.idsets import BitSet, OnDiskBitSet +from whoosh.system import emptybytes +from whoosh.util.cache import lru_cache +from whoosh.util.numeric import typecode_max, typecode_min +from whoosh.util.numlists import GrowableArray +from whoosh.util.varints import varint + + +# Utility functions + +def _mintype(maxn): + if maxn < 2 ** 8: + typecode = "B" + elif maxn < 2 ** 16: + typecode = "H" + elif maxn < 2 ** 31: + typecode = "i" + else: + typecode = "I" + + return typecode + + +# Python does not support arrays of long long see Issue 1172711 +# These functions help write/read a simulated an array of q/Q using lists + +def write_qsafe_array(typecode, arry, dbfile): + if typecode == "q": + for num in arry: + dbfile.write_long(num) + elif typecode == "Q": + for num in arry: + dbfile.write_ulong(num) + else: + dbfile.write_array(arry) + + +def read_qsafe_array(typecode, size, dbfile): + if typecode == "q": + arry = [dbfile.read_long() for _ in xrange(size)] + elif typecode == "Q": + arry = [dbfile.read_ulong() for _ in xrange(size)] + else: + arry = dbfile.read_array(typecode, size) + + return arry + + +def make_array(typecode, size=0, default=None): + if typecode.lower() == "q": + # Python does not support arrays of long long see Issue 1172711 + if default is not None and size: + arry = [default] * size + else: + arry = [] + else: + if default is not None and size: + arry = array(typecode, (default for _ in xrange(size))) + else: + arry = array(typecode) + return arry + + +# Base classes + +class Column(object): + """Represents a "column" of rows mapping docnums to document values. + + The interface requires that you store the start offset of the column, the + length of the column data, and the number of documents (rows) separately, + and pass them to the reader object. + """ + + reversible = False + + def writer(self, dbfile): + """Returns a :class:`ColumnWriter` object you can use to use to create + a column of this type on disk. + + :param dbfile: the :class:`~whoosh.filedb.structfile.StructFile` to + write to. 
+ """ + + return self.Writer(dbfile) + + def reader(self, dbfile, basepos, length, doccount): + """Returns a :class:`ColumnReader` object you can use to read a column + of this type from disk. + + :param dbfile: the :class:`~whoosh.filedb.structfile.StructFile` to + read from. + :param basepos: the offset within the file at which the column starts. + :param length: the length in bytes of the column occupies in the file. + :param doccount: the number of rows (documents) in the column. + """ + + return self.Reader(dbfile, basepos, length, doccount) + + def default_value(self, reverse=False): + """Returns the default value for this column type. + """ + + return self._default + + def stores_lists(self): + """Returns True if the column stores a list of values for each document + instead of a single value. + """ + + return False + + +class ColumnWriter(object): + def __init__(self, dbfile): + self._dbfile = dbfile + self._count = 0 + + def fill(self, docnum): + write = self._dbfile.write + default = self._defaultbytes + if docnum > self._count: + for _ in xrange(docnum - self._count): + write(default) + + def add(self, docnum, value): + raise NotImplementedError + + def finish(self, docnum): + pass + + +class ColumnReader(object): + def __init__(self, dbfile, basepos, length, doccount): + self._dbfile = dbfile + self._basepos = basepos + self._length = length + self._doccount = doccount + + def __len__(self): + return self._doccount + + def __getitem__(self, docnum): + raise NotImplementedError + + def sort_key(self, docnum): + return self[docnum] + + def __iter__(self): + for i in xrange(self._doccount): + yield self[i] + + def load(self): + return list(self) + + def set_reverse(self): + raise NotImplementedError + + +# Arbitrary bytes column + +class VarBytesColumn(Column): + """Stores variable length byte strings. See also :class:`RefBytesColumn`. + + The current implementation limits the total length of all document values + a segment to 2 GB. + + The default value (the value returned for a document that didn't have a + value assigned to it at indexing time) is an empty bytestring (``b''``). 
+ """ + + _default = emptybytes + + class Writer(ColumnWriter): + def __init__(self, dbfile): + assert isinstance(dbfile, StructFile) + self._dbfile = dbfile + self._count = 0 + self._lengths = GrowableArray(allow_longs=False) + + def __repr__(self): + return "" + + def fill(self, docnum): + if docnum > self._count: + self._lengths.extend(0 for _ in xrange(docnum - self._count)) + + def add(self, docnum, v): + self.fill(docnum) + self._dbfile.write(v) + self._lengths.append(len(v)) + self._count = docnum + 1 + + def finish(self, doccount): + self.fill(doccount) + lengths = self._lengths.array + + self._dbfile.write_array(lengths) + # Write the typecode for the lengths + self._dbfile.write_byte(ord(lengths.typecode)) + + class Reader(ColumnReader): + def __init__(self, dbfile, basepos, length, doccount): + self._dbfile = dbfile + self._basepos = basepos + self._length = length + self._doccount = doccount + + self._read_lengths() + # Create an array of offsets into the strings using the lengths + offsets = array("L", (0,)) + for length in self._lengths: + offsets.append(offsets[-1] + length) + self._offsets = offsets + + def __repr__(self): + return "" + + def _read_lengths(self): + dbfile = self._dbfile + basepos = self._basepos + length = self._length + doccount = self._doccount + + # The end of the lengths array is the end of the data minus the + # typecode byte + endoflens = basepos + length - 1 + # Load the length typecode from before the key length + typecode = chr(dbfile.get_byte(endoflens)) + # Load the length array from before the typecode + itemsize = struct.calcsize(typecode) + lengthsbase = endoflens - (itemsize * doccount) + self._lengths = dbfile.get_array(lengthsbase, typecode, doccount) + + @lru_cache() + def __getitem__(self, docnum): + length = self._lengths[docnum] + if not length: + return emptybytes + offset = self._offsets[docnum] + return self._dbfile.get(self._basepos + offset, length) + + def __iter__(self): + get = self._dbfile.get + pos = self._basepos + for length in self._lengths: + yield get(pos, length) + pos += length + + +class FixedBytesColumn(Column): + """Stores fixed-length byte strings. + """ + + def __init__(self, fixedlen, default=None): + """ + :param fixedlen: the fixed length of byte strings in this column. + :param default: the default value to use for documents that don't + specify a value. If you don't specify a default, the column will + use ``b'\\x00' * fixedlen``. 
+ """ + + self._fixedlen = fixedlen + + if default is None: + default = b("\x00") * fixedlen + elif len(default) != fixedlen: + raise ValueError + self._default = default + + def writer(self, dbfile): + return self.Writer(dbfile, self._fixedlen, self._default) + + def reader(self, dbfile, basepos, length, doccount): + return self.Reader(dbfile, basepos, length, doccount, self._fixedlen, + self._default) + + class Writer(ColumnWriter): + def __init__(self, dbfile, fixedlen, default): + self._dbfile = dbfile + self._fixedlen = fixedlen + self._default = self._defaultbytes = default + self._count = 0 + + def __repr__(self): + return "" + + def add(self, docnum, v): + if v == self._default: + return + if docnum > self._count: + self.fill(docnum) + assert len(v) == self._fixedlen + self._dbfile.write(v) + self._count = docnum + 1 + + class Reader(ColumnReader): + def __init__(self, dbfile, basepos, length, doccount, fixedlen, + default): + self._dbfile = dbfile + self._basepos = basepos + self._doccount = doccount + self._fixedlen = fixedlen + self._default = self._defaultbytes = default + self._count = length // fixedlen + + def __repr__(self): + return "" + + def __getitem__(self, docnum): + if docnum >= self._count: + return self._defaultbytes + pos = self._basepos + self._fixedlen * docnum + return self._dbfile.get(pos, self._fixedlen) + + def __iter__(self): + count = self._count + default = self._default + for i in xrange(self._doccount): + if i < count: + yield self[i] + else: + yield default + + +# Variable/fixed length reference (enum) column + +class RefBytesColumn(Column): + """Stores variable-length or fixed-length byte strings, similar to + :class:`VarBytesColumn` and :class:`FixedBytesColumn`. However, where those + columns stores a value for each document, this column keeps a list of all + the unique values in the field, and for each document stores a short + pointer into the unique list. For fields where the number of possible + values is smaller than the number of documents (for example, + "category" or "chapter"), this saves significant space. + + This column type supports a maximum of 65535 unique values across all + documents in a segment. You should generally use this column type where the + number of unique values is in no danger of approaching that number (for + example, a "tags" field). If you try to index too many unique values, the + column will convert additional unique values to the default value and issue + a warning using the ``warnings`` module (this will usually be preferable to + crashing the indexer and potentially losing indexed documents). + """ + + # NOTE that RefBytes is reversible within a single column (we could just + # negate the reference number), but it's NOT reversible ACROSS SEGMENTS + # (since different segments can have different uniques values in their + # columns), so we have to say that the column type is not reversible + reversible = False + + def __init__(self, fixedlen=0, default=None): + """ + :param fixedlen: an optional fixed length for the values. If you + specify a number other than 0, the column will require all values + to be the specified length. + :param default: a default value to use for documents that don't specify + one. If you don't specify a default, the column will use an empty + bytestring (``b''``), or if you specify a fixed length, + ``b'\\x00' * fixedlen``. 
+ """ + + self._fixedlen = fixedlen + + if default is None: + default = b("\x00") * fixedlen if fixedlen else emptybytes + elif fixedlen and len(default) != fixedlen: + raise ValueError + self._default = default + + def writer(self, dbfile): + return self.Writer(dbfile, self._fixedlen, self._default) + + def reader(self, dbfile, basepos, length, doccount): + return self.Reader(dbfile, basepos, length, doccount, self._fixedlen) + + class Writer(ColumnWriter): + def __init__(self, dbfile, fixedlen, default): + self._dbfile = dbfile + self._fixedlen = fixedlen + self._default = default + + # At first we'll buffer refs in a byte array. If the number of + # uniques stays below 256, we can just write the byte array. As + # soon as the ref count goes above 255, we know we're going to have + # to write shorts, so we'll switch to writing directly. + self._refs = array("B") + self._uniques = {default: 0} + self._count = 0 + + def __repr__(self): + return "" + + def fill(self, docnum): + if docnum > self._count: + if self._refs is not None: + self._refs.extend(0 for _ in xrange(docnum - self._count)) + else: + dbfile = self._dbfile + for _ in xrange(docnum - self._count): + dbfile.write_ushort(0) + + def add(self, docnum, v): + dbfile = self._dbfile + refs = self._refs + self.fill(docnum) + + uniques = self._uniques + try: + ref = uniques[v] + except KeyError: + uniques[v] = ref = len(uniques) + if refs is not None and ref >= 256: + # We won't be able to use bytes, we have to switch to + # writing unbuffered ushorts + for n in refs: + dbfile.write_ushort(n) + refs = self._refs = None + + if refs is not None: + self._refs.append(ref) + else: + if ref > 65535: + warnings.warn("RefBytesColumn dropped unique value %r" % v, + UserWarning) + ref = 0 + dbfile.write_ushort(ref) + + self._count = docnum + 1 + + def _write_uniques(self, typecode): + dbfile = self._dbfile + fixedlen = self._fixedlen + uniques = self._uniques + + dbfile.write_varint(len(uniques)) + # Sort unique values by position + vs = sorted(uniques.keys(), key=lambda key: uniques[key]) + for v in vs: + if not fixedlen: + dbfile.write_varint(len(v)) + dbfile.write(v) + + def finish(self, doccount): + dbfile = self._dbfile + refs = self._refs + self.fill(doccount) + + typecode = "H" + if refs is not None: + dbfile.write_array(refs) + typecode = refs.typecode + + self._write_uniques(typecode) + dbfile.write_byte(ord(typecode)) + + class Reader(ColumnReader): + def __init__(self, dbfile, basepos, length, doccount, fixedlen): + self._dbfile = dbfile + self._basepos = basepos + self._doccount = doccount + self._fixedlen = fixedlen + + self._typecode = chr(dbfile.get_byte(basepos + length - 1)) + + st = struct.Struct("!" 
+ self._typecode) + self._unpack = st.unpack + self._itemsize = st.size + + dbfile.seek(basepos + doccount * self._itemsize) + self._uniques = self._read_uniques() + + def __repr__(self): + return "" + + def _read_uniques(self): + dbfile = self._dbfile + fixedlen = self._fixedlen + + ucount = dbfile.read_varint() + length = fixedlen + uniques = [] + for _ in xrange(ucount): + if not fixedlen: + length = dbfile.read_varint() + uniques.append(dbfile.read(length)) + return uniques + + def __getitem__(self, docnum): + pos = self._basepos + docnum * self._itemsize + ref = self._unpack(self._dbfile.get(pos, self._itemsize))[0] + return self._uniques[ref] + + def __iter__(self): + get = self._dbfile.get + basepos = self._basepos + uniques = self._uniques + unpack = self._unpack + itemsize = self._itemsize + + for i in xrange(self._doccount): + pos = basepos + i * itemsize + ref = unpack(get(pos, itemsize))[0] + yield uniques[ref] + + +# Numeric column + +class NumericColumn(FixedBytesColumn): + """Stores numbers (integers and floats) as compact binary. + """ + + reversible = True + + def __init__(self, typecode, default=0): + """ + :param typecode: a typecode character (as used by the ``struct`` + module) specifying the number type. For example, ``"i"`` for + signed integers. + :param default: the default value to use for documents that don't + specify one. + """ + + self._typecode = typecode + self._default = default + + def writer(self, dbfile): + return self.Writer(dbfile, self._typecode, self._default) + + def reader(self, dbfile, basepos, length, doccount): + return self.Reader(dbfile, basepos, length, doccount, self._typecode, + self._default) + + def default_value(self, reverse=False): + v = self._default + if reverse: + v = 0 - v + return v + + class Writer(FixedBytesColumn.Writer): + def __init__(self, dbfile, typecode, default): + self._dbfile = dbfile + self._pack = struct.Struct("!" + typecode).pack + self._default = default + self._defaultbytes = self._pack(default) + self._fixedlen = struct.calcsize(typecode) + self._count = 0 + + def __repr__(self): + return "" + + def add(self, docnum, v): + if v == self._default: + return + if docnum > self._count: + self.fill(docnum) + self._dbfile.write(self._pack(v)) + self._count = docnum + 1 + + class Reader(FixedBytesColumn.Reader): + def __init__(self, dbfile, basepos, length, doccount, typecode, + default): + self._dbfile = dbfile + self._basepos = basepos + self._doccount = doccount + self._default = default + self._reverse = False + + self._typecode = typecode + self._unpack = struct.Struct("!" + typecode).unpack + self._defaultbytes = struct.pack("!" + typecode, default) + self._fixedlen = struct.calcsize(typecode) + self._count = length // self._fixedlen + + def __repr__(self): + return "" + + def __getitem__(self, docnum): + s = FixedBytesColumn.Reader.__getitem__(self, docnum) + return self._unpack(s)[0] + + def sort_key(self, docnum): + key = self[docnum] + if self._reverse: + key = 0 - key + return key + + def load(self): + if self._typecode in "qQ": + return list(self) + else: + return array(self._typecode, self) + + def set_reverse(self): + self._reverse = True + + +# Column of boolean values + +class BitColumn(Column): + """Stores a column of True/False values compactly. + """ + + reversible = True + _default = False + + def __init__(self, compress_at=2048): + """ + :param compress_at: columns with this number of values or fewer will + be saved compressed on disk, and loaded into RAM for reading. 
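The typecode passed to ``NumericColumn`` selects the packed width, and ``default_value(reverse=True)`` negates the default so that rows with no value keep their place in a reversed sort. A small sketch::

    from whoosh import columns

    col32 = columns.NumericColumn("i")               # signed 32-bit per row
    col64 = columns.NumericColumn("q", default=-1)   # signed 64-bit per row

    print(col64.default_value())               # -1
    print(col64.default_value(reverse=True))   # 1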
Set + this to 0 to disable compression. + """ + + self._compressat = compress_at + + def writer(self, dbfile): + return self.Writer(dbfile, self._compressat) + + def default_value(self, reverse=False): + return self._default ^ reverse + + class Writer(ColumnWriter): + def __init__(self, dbfile, compressat): + self._dbfile = dbfile + self._compressat = compressat + self._bitset = BitSet() + + def __repr__(self): + return "" + + def add(self, docnum, value): + if value: + self._bitset.add(docnum) + + def finish(self, doccount): + dbfile = self._dbfile + bits = self._bitset.bits + + if zlib and len(bits) <= self._compressat: + compressed = zlib.compress(array_tobytes(bits), 3) + dbfile.write(compressed) + dbfile.write_byte(1) + else: + dbfile.write_array(bits) + dbfile.write_byte(0) + + class Reader(ColumnReader): + def __init__(self, dbfile, basepos, length, doccount): + self._dbfile = dbfile + self._basepos = basepos + self._length = length + self._doccount = doccount + self._reverse = False + + compressed = dbfile.get_byte(basepos + (length - 1)) + if compressed: + bbytes = zlib.decompress(dbfile.get(basepos, length - 1)) + bitset = BitSet.from_bytes(bbytes) + else: + dbfile.seek(basepos) + bitset = OnDiskBitSet(dbfile, basepos, length - 1) + self._bitset = bitset + + def id_set(self): + return self._bitset + + def __repr__(self): + return "" + + def __getitem__(self, i): + return i in self._bitset + + def sort_key(self, docnum): + return int(self[docnum] ^ self._reverse) + + def __iter__(self): + i = 0 + for num in self._bitset: + if num > i: + for _ in xrange(num - i): + yield False + yield True + i = num + 1 + if self._doccount > i: + for _ in xrange(self._doccount - i): + yield False + + def load(self): + if isinstance(self._bitset, OnDiskBitSet): + bs = self._dbfile.get_array(self._basepos, "B", + self._length - 1) + self._bitset = BitSet.from_bytes(bs) + return self + + def set_reverse(self): + self._reverse = True + + +# Compressed variants + +class CompressedBytesColumn(Column): + """Stores variable-length byte strings compressed using deflate (by + default). + """ + + def __init__(self, level=3, module="zlib"): + """ + :param level: the compression level to use. + :param module: a string containing the name of the compression module + to use. The default is "zlib". The module should export "compress" + and "decompress" functions. 
+ """ + + self._level = level + self._module = module + + def writer(self, dbfile): + return self.Writer(dbfile, self._level, self._module) + + def reader(self, dbfile, basepos, length, doccount): + return self.Reader(dbfile, basepos, length, doccount, self._module) + + class Writer(VarBytesColumn.Writer): + def __init__(self, dbfile, level, module): + VarBytesColumn.Writer.__init__(self, dbfile) + self._level = level + self._compress = __import__(module).compress + + def __repr__(self): + return "" + + def add(self, docnum, v): + v = self._compress(v, self._level) + VarBytesColumn.Writer.add(self, docnum, v) + + class Reader(VarBytesColumn.Reader): + def __init__(self, dbfile, basepos, length, doccount, module): + VarBytesColumn.Reader.__init__(self, dbfile, basepos, length, + doccount) + self._decompress = __import__(module).decompress + + def __repr__(self): + return "" + + def __getitem__(self, docnum): + v = VarBytesColumn.Reader.__getitem__(self, docnum) + if v: + v = self._decompress(v) + return v + + def __iter__(self): + for v in VarBytesColumn.Reader.__iter__(self): + yield self._decompress(v) + + def load(self): + return list(self) + + +class CompressedBlockColumn(Column): + """An experimental column type that compresses and decompresses blocks of + values at a time. This can lead to high compression and decent performance + for columns with lots of very short values, but random access times are + usually terrible. + """ + + def __init__(self, level=3, blocksize=32, module="zlib"): + """ + :param level: the compression level to use. + :param blocksize: the size (in KB) of each compressed block. + :param module: a string containing the name of the compression module + to use. The default is "zlib". The module should export "compress" + and "decompress" functions. 
+ """ + + self._level = level + self._blocksize = blocksize + self._module = module + + def writer(self, dbfile): + return self.Writer(dbfile, self._level, self._blocksize, self._module) + + def reader(self, dbfile, basepos, length, doccount): + return self.Reader(dbfile, basepos, length, doccount, self._module) + + class Writer(ColumnWriter): + def __init__(self, dbfile, level, blocksize, module): + self._dbfile = dbfile + self._blocksize = blocksize * 1024 + self._level = level + self._compress = __import__(module).compress + + self._reset() + + def __repr__(self): + return "" + + def _reset(self): + self._startdoc = None + self._block = emptybytes + self._lengths = [] + + def _emit(self): + dbfile = self._dbfile + block = self._compress(self._block, self._level) + header = (self._startdoc, self._lastdoc, len(block), + tuple(self._lengths)) + dbfile.write_pickle(header) + dbfile.write(block) + + def add(self, docnum, v): + if self._startdoc is None: + self._startdoc = docnum + self._lengths.append((docnum, len(v))) + self._lastdoc = docnum + + self._block += v + if len(self._block) >= self._blocksize: + self._emit() + self._reset() + + def finish(self, doccount): + # If there's still a pending block, write it out + if self._startdoc is not None: + self._emit() + + class Reader(ColumnReader): + def __init__(self, dbfile, basepos, length, doccount, module): + ColumnReader.__init__(self, dbfile, basepos, length, doccount) + self._decompress = __import__(module).decompress + + self._blocks = [] + dbfile.seek(basepos) + pos = 0 + while pos < length: + startdoc, enddoc, blocklen, lengths = dbfile.read_pickle() + here = dbfile.tell() + self._blocks.append((startdoc, enddoc, here, blocklen, + lengths)) + dbfile.seek(blocklen, 1) + pos = here + blocklen + + def __repr__(self): + return "" + + def _find_block(self, docnum): + # TODO: use binary search instead of linear + for i, b in enumerate(self._blocks): + if docnum < b[0]: + return None + elif docnum <= b[1]: + return i + return None + + def _get_block(self, blocknum): + block = self._blocks[blocknum] + pos = block[2] + blocklen = block[3] + lengths = block[4] + + data = self._decompress(self._dbfile.get(self._basepos + pos, + blocklen)) + values = {} + base = 0 + for docnum, vlen in lengths: + values[docnum] = data[base:base + vlen] + base += vlen + return values + + def __getitem__(self, docnum): + i = self._find_block(docnum) + if i is None: + return emptybytes + return self._get_block(i)[docnum] + + def __iter__(self): + last = -1 + for i, block in enumerate(self._blocks): + startdoc = block[0] + enddoc = block[1] + if startdoc > (last + 1): + for _ in xrange(startdoc - last): + yield emptybytes + values = self._get_block(i) + for docnum in xrange(startdoc, enddoc + 1): + if docnum in values: + yield values[docnum] + else: + yield emptybytes + last = enddoc + if enddoc < self._doccount - 1: + for _ in xrange(self._doccount - enddoc): + yield emptybytes + + +class StructColumn(FixedBytesColumn): + def __init__(self, spec, default): + self._spec = spec + self._fixedlen = struct.calcsize(spec) + self._default = default + + def writer(self, dbfile): + return self.Writer(dbfile, self._spec, self._default) + + def reader(self, dbfile, basepos, length, doccount): + return self.Reader(dbfile, basepos, length, doccount, self._spec, + self._default) + + class Writer(FixedBytesColumn.Writer): + def __init__(self, dbfile, spec, default): + self._dbfile = dbfile + self._struct = struct.Struct(spec) + self._fixedlen = self._struct.size + self._default = 
default + self._defaultbytes = self._struct.pack(*default) + self._count = 0 + + def __repr__(self): + return "" + + def add(self, docnum, v): + b = self._struct.pack(*v) + FixedBytesColumn.Writer.add(self, docnum, b) + + class Reader(FixedBytesColumn.Reader): + def __init__(self, dbfile, basepos, length, doccount, spec, default): + self._dbfile = dbfile + self._basepos = basepos + self._doccount = doccount + self._struct = struct.Struct(spec) + self._fixedlen = self._struct.size + self._default = default + self._defaultbytes = self._struct.pack(*default) + self._count = length // self._fixedlen + + def __repr__(self): + return "" + + def __getitem__(self, docnum): + v = FixedBytesColumn.Reader.__getitem__(self, docnum) + return self._struct.unpack(v) + + +# Utility readers + +class EmptyColumnReader(ColumnReader): + """Acts like a reader for a column with no stored values. Always returns + the default. + """ + + def __init__(self, default, doccount): + """ + :param default: the value to return for all "get" requests. + :param doccount: the number of documents in the nominal column. + """ + + self._default = default + self._doccount = doccount + + def __getitem__(self, docnum): + return self._default + + def __iter__(self): + return (self._default for _ in xrange(self._doccount)) + + def load(self): + return self + + +class MultiColumnReader(ColumnReader): + """Serializes access to multiple column readers, making them appear to be + one large column. + """ + + def __init__(self, readers, offsets=None): + """ + :param readers: a sequence of column reader objects. + """ + + self._readers = readers + + self._doc_offsets = [] + self._doccount = 0 + + if offsets is None: + for r in readers: + self._doc_offsets.append(self._doccount) + self._doccount += len(r) + else: + assert len(offsets) == len(readers) + self._doc_offsets = offsets + + def _document_reader(self, docnum): + return max(0, bisect_right(self._doc_offsets, docnum) - 1) + + def _reader_and_docnum(self, docnum): + rnum = self._document_reader(docnum) + offset = self._doc_offsets[rnum] + return rnum, docnum - offset + + def __getitem__(self, docnum): + x, y = self._reader_and_docnum(docnum) + return self._readers[x][y] + + def __iter__(self): + for r in self._readers: + for v in r: + yield v + + +class TranslatingColumnReader(ColumnReader): + """Calls a function to "translate" values from an underlying column reader + object before returning them. + + ``IndexReader`` objects can wrap a column reader with this object to call + ``FieldType.from_column_value`` on the stored column value before returning + it the the user. + """ + + def __init__(self, reader, translate): + """ + :param reader: the underlying ColumnReader object to get values from. + :param translate: a function that takes a value from the underlying + reader and returns a translated value. + """ + + self._reader = reader + self._translate = translate + + def raw_column(self): + """Returns the underlying column reader. 
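To make the ``_document_reader``/``_reader_and_docnum`` arithmetic above concrete, a standalone sketch with made-up segment offsets::

    from bisect import bisect_right

    # Each entry is the global docnum at which that segment's reader starts
    doc_offsets = [0, 250, 600]

    def reader_and_docnum(docnum):
        rnum = max(0, bisect_right(doc_offsets, docnum) - 1)
        return rnum, docnum - doc_offsets[rnum]

    print(reader_and_docnum(0))     # (0, 0)    first reader, row 0
    print(reader_and_docnum(249))   # (0, 249)
    print(reader_and_docnum(250))   # (1, 0)    second reader, row 0
    print(reader_and_docnum(612))   # (2, 12)   third reader, row 12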
+ """ + + return self._reader + + def __len__(self): + return len(self._reader) + + def __getitem__(self, docnum): + return self._translate(self._reader[docnum]) + + def sort_key(self, docnum): + return self._reader.sort_key(docnum) + + def __iter__(self): + translate = self._translate + return (translate(v) for v in self._reader) + + def set_reverse(self): + self._reader.set_reverse() + + +# Column wrappers + +class WrappedColumn(Column): + def __init__(self, child): + self._child = child + + def writer(self, *args, **kwargs): + return self.Writer(self._child.writer(*args, **kwargs)) + + def reader(self, *args, **kwargs): + return self.Reader(self._child.reader(*args, **kwargs)) + + def stores_lists(self): + return self._child.stores_lists() + + +class WrappedColumnWriter(ColumnWriter): + def __init__(self, child): + self._child = child + + def fill(self, docnum): + return self._child.fill(docnum) + + def add(self, docnum, value): + return self._child.add(docnum, value) + + def finish(self, docnum): + return self._child.finish(docnum) + + +class WrappedColumnReader(ColumnReader): + def __init__(self, child): + self._child = child + + def __len__(self): + return len(self._child) + + def __getitem__(self, docnum): + return self._child[docnum] + + def sort_key(self, docnum): + return self._child.sort_key(docnum) + + def __iter__(self): + return iter(self._child) + + def load(self): + return list(self) + + def set_reverse(self): + self._child.set_reverse() + + +class ClampedNumericColumn(WrappedColumn): + """An experimental wrapper type for NumericColumn that clamps out-of-range + values instead of raising an exception. + """ + + def reader(self, *args, **kwargs): + return self._child.reader(*args, **kwargs) + + class Writer(WrappedColumnWriter): + def __init__(self, child): + self._child = child + self._min = typecode_min[child._typecode] + self._max = typecode_max[child._typecode] + + def add(self, docnum, v): + v = min(v, self._min) + v = max(v, self._max) + self._child.add(docnum, v) + + +class PickleColumn(WrappedColumn): + """Converts arbitrary objects to pickled bytestrings and stores them using + the wrapped column (usually a :class:`VarBytesColumn` or + :class:`CompressedBytesColumn`). + + If you can express the value you want to store as a number or bytestring, + you should use the appropriate column type to avoid the time and size + overhead of pickling and unpickling. 
+ """ + + class Writer(WrappedColumnWriter): + def __repr__(self): + return "" + + def add(self, docnum, v): + if v is None: + v = emptybytes + else: + v = dumps(v, -1) + self._child.add(docnum, v) + + class Reader(WrappedColumnReader): + def __repr__(self): + return "" + + def __getitem__(self, docnum): + v = self._child[docnum] + if not v: + return None + else: + return loads(v) + + def __iter__(self): + for v in self._child: + if not v: + yield None + else: + yield loads(v) + + +# List columns + +class ListColumn(WrappedColumn): + def stores_lists(self): + return True + + +class ListColumnReader(ColumnReader): + def sort_key(self, docnum): + return self[docnum][0] + + def __iter__(self): + for docnum in xrange(len(self)): + yield self[docnum] + + +class VarBytesListColumn(ListColumn): + def __init__(self): + self._child = VarBytesColumn() + + class Writer(WrappedColumnWriter): + def add(self, docnum, ls): + out = [varint(len(ls))] + for v in ls: + assert isinstance(v, bytes_type) + out.append(varint(len(v))) + out.append(v) + self._child.add(emptybytes.join(out)) + + class Reader(WrappedColumnReader, ListColumnReader): + def __getitem__(self, docnum): + bio = BytesIO(self._child[docnum]) + count = bio.read_varint() + out = [] + for _ in xrange(count): + vlen = bio.read_varint() + v = bio.read(vlen) + out.append(v) + return out + + +class FixedBytesListColumn(ListColumn): + def __init__(self, fixedlen): + self._fixedlen = fixedlen + self._child = VarBytesColumn() + + def writer(self, *args, **kwargs): + return self.Writer(self._child.writer(*args, **kwargs), self._fixedlen) + + def reader(self, *args, **kwargs): + return self.Reader(self._child.reader(*args, **kwargs), self._fixedlen) + + class Writer(WrappedColumnWriter): + def __init__(self, child, fixedlen): + self._child = child + self._fixedlen = fixedlen + self._lengths = GrowableArray() + self._count = 0 + + def add(self, docnum, ls): + out = [] + for v in ls: + assert len(v) == self._fixedlen + out.append(v) + b = emptybytes.join(out) + self._child.add(docnum, b) + + class Reader(WrappedColumnReader, ListColumnReader): + def __init__(self, child, fixedlen): + self._child = child + self._fixedlen = fixedlen + + def __getitem__(self, docnum): + fixedlen = self._fixedlen + v = self._child[docnum] + if not v: + return [] + ls = [v[i:i + fixedlen] for i in xrange(0, len(v), fixedlen)] + return ls + + +#class RefListColumn(Column): +# def __init__(self, fixedlen=0): +# """ +# :param fixedlen: an optional fixed length for the values. If you +# specify a number other than 0, the column will require all values +# to be the specified length. +# :param default: a default value to use for documents that don't specify +# one. If you don't specify a default, the column will use an empty +# bytestring (``b''``), or if you specify a fixed length, +# ``b'\\x00' * fixedlen``. 
+# """ +# +# self._fixedlen = fixedlen +# +# def stores_lists(self): +# return True +# +# def writer(self, dbfile): +# return self.Writer(dbfile, self._fixedlen) +# +# def reader(self, dbfile, basepos, length, doccount): +# return self.Reader(dbfile, basepos, length, doccount, self._fixedlen) +# +# class Writer(ColumnWriter): +# def __init__(self, dbfile, fixedlen): +# self._dbfile = dbfile +# self._fixedlen = fixedlen +# +# self._refs = GrowableArray(allow_longs=False) +# self._lengths = GrowableArray(allow_longs=False) +# self._count = 0 +# +# def __repr__(self): +# return "" +# +# def fill(self, docnum): +# if docnum > self._count: +# self._lengths.extend(0 for _ in xrange(docnum - self._count)) +# +# def add(self, docnum, ls): +# uniques = self._uniques +# refs = self._refs +# +# self.fill(docnum) +# self._lengths.append(len(ls)) +# for v in ls: +# try: +# i = uniques[v] +# except KeyError: +# uniques[v] = i = len(uniques) +# refs.append(i) +# +# self._count = docnum + 1 +# +# def finish(self, doccount): +# dbfile = self._dbfile +# refs = self._refs.array +# lengths = self._lengths.array +# +# self.fill(doccount) +# dbfile.write_byte(ord(lengths.typecode)) +# dbfile.write_array(lengths) +# dbfile.write_byte(ord(refs.typecode)) +# self._write_uniques(refs.typecode) +# dbfile.write_array(refs) +# +# class Reader(ListColumnReader): +# def __init__(self, dbfile, basepos, length, doccount, fixedlen): +# self._dbfile = dbfile +# self._basepos = basepos +# self._doccount = doccount +# self._fixedlen = fixedlen +# +# dbfile.seek(basepos) +# lencode = chr(dbfile.read_byte()) +# self._lengths = dbfile.read_array(lencode, doccount) +# +# self._typecode = chr(dbfile.read_byte()) +# refst = struct.Struct("!" + self._typecode) +# self._unpack = refst.unpack +# self._itemsize = refst.size +# +# self._read_uniques() +# self._refbase = dbfile.tell() +# +# # Create an array of offsets into the references using the lengths +# offsets = array("i", (0,)) +# for length in self._lengths: +# offsets.append(offsets[-1] + length) +# self._offsets = offsets +# +# def __repr__(self): +# return "" +# +# def _get_ref(self, docnum): +# pos = self._basepos + 1 + docnum * self._itemsize +# return self._unpack(self._dbfile.get(pos, self._itemsize))[0] +# +# def __getitem__(self, docnum): +# offset = self._offsets[docnum] +# length = self._lengths[docnum] +# +# pos = self._refbase + offset * self._itemsize +# reflist = self._dbfile.get_array(pos, self._typecode, length) +# return [self._uniques[ref] for ref in reflist] diff --git a/src/whoosh/compat.py b/src/whoosh/compat.py new file mode 100644 index 0000000..5154e6d --- /dev/null +++ b/src/whoosh/compat.py @@ -0,0 +1,206 @@ +import array, sys + + +# Run time aliasing of Python2/3 differences + +def htmlescape(s, quote=True): + # this is html.escape reimplemented with cgi.escape, + # so it works for python 2.x, 3.0 and 3.1 + import cgi + s = cgi.escape(s, quote) + if quote: + # python 3.2 also replaces the single quotes: + s = s.replace("'", "'") + return s + +if sys.version_info[0] < 3: + PY3 = False + + def b(s): + return s + + import cStringIO as StringIO + StringIO = BytesIO = StringIO.StringIO + callable = callable + integer_types = (int, long) + iteritems = lambda o: o.iteritems() + itervalues = lambda o: o.itervalues() + iterkeys = lambda o: o.iterkeys() + from itertools import izip + long_type = long + next = lambda o: o.next() + import cPickle as pickle + from cPickle import dumps, loads, dump, load + string_type = basestring + text_type = unicode + bytes_type 
= str + unichr = unichr + from urllib import urlretrieve + + def byte(num): + return chr(num) + + def u(s): + return unicode(s, "unicode_escape") + + def with_metaclass(meta, base=object): + class _WhooshBase(base): + __metaclass__ = meta + return _WhooshBase + + xrange = xrange + zip_ = zip + + def memoryview_(source, offset=None, length=None): + if offset or length: + return buffer(source, offset, length) + else: + return buffer(source) + +else: + PY3 = True + import collections + + def b(s): + return s.encode("latin-1") + + import io + BytesIO = io.BytesIO + callable = lambda o: isinstance(o, collections.Callable) + exec_ = eval("exec") + integer_types = (int,) + iteritems = lambda o: o.items() + itervalues = lambda o: o.values() + iterkeys = lambda o: iter(o.keys()) + izip = zip + long_type = int + next = next + import pickle + from pickle import dumps, loads, dump, load + StringIO = io.StringIO + string_type = str + text_type = str + bytes_type = bytes + unichr = chr + from urllib.request import urlretrieve + + def byte(num): + return bytes((num,)) + + def u(s): + if isinstance(s, bytes): + return s.decode("ascii") + return s + + def with_metaclass(meta, base=object): + ns = dict(base=base, meta=meta) + exec_("""class _WhooshBase(base, metaclass=meta): + pass""", ns) + return ns["_WhooshBase"] + + xrange = range + zip_ = lambda * args: list(zip(*args)) + + def memoryview_(source, offset=None, length=None): + mv = memoryview(source) + if offset or length: + return mv[offset:offset + length] + else: + return mv + + try: + # for python >= 3.2, avoid DeprecationWarning for cgi.escape + from html import escape as htmlescape + except ImportError: + pass + + +if hasattr(array.array, "tobytes"): + def array_tobytes(arry): + return arry.tobytes() + + def array_frombytes(arry, bs): + return arry.frombytes(bs) +else: + def array_tobytes(arry): + return arry.tostring() + + def array_frombytes(arry, bs): + return arry.fromstring(bs) + + +# Implementations missing from older versions of Python + +try: + from itertools import permutations # @UnusedImport +except ImportError: + # Python 2.5 + def permutations(iterable, r=None): + pool = tuple(iterable) + n = len(pool) + r = n if r is None else r + if r > n: + return + indices = range(n) + cycles = range(n, n - r, -1) + yield tuple(pool[i] for i in indices[:r]) + while n: + for i in reversed(range(r)): + cycles[i] -= 1 + if cycles[i] == 0: + indices[i:] = indices[i + 1:] + indices[i:i + 1] + cycles[i] = n - i + else: + j = cycles[i] + indices[i], indices[-j] = indices[-j], indices[i] + yield tuple(pool[i] for i in indices[:r]) + break + else: + return + + +try: + # Python 2.6-2.7 + from itertools import izip_longest # @UnusedImport +except ImportError: + try: + # Python 3.0 + from itertools import zip_longest as izip_longest # @UnusedImport + except ImportError: + # Python 2.5 + from itertools import chain, izip, repeat + + def izip_longest(*args, **kwds): + fillvalue = kwds.get('fillvalue') + + def sentinel(counter=([fillvalue] * (len(args) - 1)).pop): + yield counter() + + fillers = repeat(fillvalue) + iters = [chain(it, sentinel(), fillers) for it in args] + try: + for tup in izip(*iters): + yield tup + except IndexError: + pass + + +try: + from operator import methodcaller # @UnusedImport +except ImportError: + # Python 2.5 + def methodcaller(name, *args, **kwargs): + def caller(obj): + return getattr(obj, name)(*args, **kwargs) + return caller + + +try: + from abc import abstractmethod # @UnusedImport +except ImportError: + # Python 2.5 + def 
abstractmethod(funcobj): + """A decorator indicating abstract methods. + """ + funcobj.__isabstractmethod__ = True + return funcobj diff --git a/src/whoosh/externalsort.py b/src/whoosh/externalsort.py new file mode 100644 index 0000000..6490e28 --- /dev/null +++ b/src/whoosh/externalsort.py @@ -0,0 +1,240 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module implements a general external merge sort for Python objects. +""" + +from __future__ import with_statement + +import os, tempfile +from heapq import heapify, heappop, heapreplace + +from whoosh.compat import dump, load + + +## Python 3.2 had a bug that make marshal.load unusable +#if (hasattr(platform, "python_implementation") +# and platform.python_implementation() == "CPython" +# and platform.python_version() == "3.2.0"): +# # Use pickle instead of marshal on Python 3.2 +# from whoosh.compat import dump as dump_pickle +# from whoosh.compat import load +# +# def dump(obj, f): +# dump_pickle(obj, f, -1) +#else: +# from marshal import dump, load + + +try: + from heapq import merge + + def imerge(iterables): + return merge(*iterables) +except ImportError: + def imerge(iterables): + _hpop, _hreplace, _Stop = (heappop, heapreplace, StopIteration) + h = [] + h_append = h.append + for itnum, it in enumerate(map(iter, iterables)): + try: + nx = it.next + h_append([nx(), itnum, nx]) + except _Stop: + pass + heapify(h) + + while 1: + try: + while 1: + v, itnum, nx = s = h[0] + yield v + s[0] = nx() + _hreplace(h, s) + except _Stop: + _hpop(h) + except IndexError: + return + + +class SortingPool(object): + """This object implements a general K-way external merge sort for Python + objects. + + >>> pool = MergePool() + >>> # Add an unlimited number of items in any order + >>> for item in my_items: + ... pool.add(item) + ... + >>> # Get the items back in sorted order + >>> for item in pool.items(): + ... 
print(item) + + This class uses the `marshal` module to write the items to temporary files, + so you can only sort marshal-able types (generally: numbers, strings, + tuples, lists, and dicts). + """ + + def __init__(self, maxsize=1000000, tempdir=None, prefix="", + suffix=".run"): + """ + :param maxsize: the maximum number of items to keep in memory at once. + :param tempdir: the path of a directory to use for temporary file + storage. The default is to use the system's temp directory. + :param prefix: a prefix to add to temporary filenames. + :param suffix: a suffix to add to temporary filenames. + """ + + self.tempdir = tempdir + if maxsize < 1: + raise ValueError("maxsize=%s must be >= 1" % maxsize) + self.maxsize = maxsize + self.prefix = prefix + self.suffix = suffix + # Current run queue + self.current = [] + # List of run filenames + self.runs = [] + + def _new_run(self): + fd, path = tempfile.mkstemp(prefix=self.prefix, suffix=self.suffix, + dir=self.tempdir) + f = os.fdopen(fd, "wb") + return path, f + + def _open_run(self, path): + return open(path, "rb") + + def _remove_run(self, path): + os.remove(path) + + def _read_run(self, path): + f = self._open_run(path) + try: + while True: + yield load(f) + except EOFError: + return + finally: + f.close() + self._remove_run(path) + + def _merge_runs(self, paths): + iters = [self._read_run(path) for path in paths] + for item in imerge(iters): + yield item + + def add(self, item): + """Adds `item` to the pool to be sorted. + """ + + if len(self.current) >= self.maxsize: + self.save() + self.current.append(item) + + def _write_run(self, f, items): + for item in items: + dump(item, f, -1) + f.close() + + def _add_run(self, filename): + self.runs.append(filename) + + def save(self): + current = self.current + if current: + current.sort() + path, f = self._new_run() + self._write_run(f, current) + self._add_run(path) + self.current = [] + + def cleanup(self): + for path in self.runs: + try: + os.remove(path) + except OSError: + pass + + def reduce_to(self, target, k): + # Reduce the number of runs to "target" by merging "k" runs at a time + + if k < 2: + raise ValueError("k=%s must be > 2" % k) + if target < 1: + raise ValueError("target=%s must be >= 1" % target) + runs = self.runs + while len(runs) > target: + newpath, f = self._new_run() + # Take k runs off the end of the run list + tomerge = [] + while runs and len(tomerge) < k: + tomerge.append(runs.pop()) + # Merge them into a new run and add it at the start of the list + self._write_run(f, self._merge_runs(tomerge)) + runs.insert(0, newpath) + + def items(self, maxfiles=128): + """Returns a sorted list or iterator of the items in the pool. + + :param maxfiles: maximum number of files to open at once. + """ + + if maxfiles < 2: + raise ValueError("maxfiles=%s must be >= 2" % maxfiles) + + if not self.runs: + # We never wrote a run to disk, so just sort the queue in memory + # and return that + return sorted(self.current) + # Write a new run with the leftover items in the queue + self.save() + + # If we have more runs than allowed open files, merge some of the runs + if maxfiles < len(self.runs): + self.reduce_to(maxfiles, maxfiles) + + # Take all the runs off the run list and merge them + runs = self.runs + self.runs = [] # Minor detail, makes this object reusable + return self._merge_runs(runs) + + +def sort(items, maxsize=100000, tempdir=None, maxfiles=128): + """Sorts the given items using an external merge sort. 
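As a quick editor's sketch of the behaviour summarised above (not part of the patch; the sample items are made up), the module-level sort() simply feeds everything through a SortingPool and hands back the merged iterator:

    from whoosh.externalsort import SortingPool, sort

    items = [("b", 2), ("c", 3), ("a", 1)]

    # One-shot helper: spills sorted runs to temporary files whenever more
    # than maxsize items are buffered, then merges the runs back together
    print(list(sort(items, maxsize=2)))      # [('a', 1), ('b', 2), ('c', 3)]

    # Equivalent explicit use of the pool
    pool = SortingPool(maxsize=2)
    for item in items:
        pool.add(item)
    print(list(pool.items()))                # same sorted output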
+ + :param tempdir: the path of a directory to use for temporary file + storage. The default is to use the system's temp directory. + :param maxsize: the maximum number of items to keep in memory at once. + :param maxfiles: maximum number of files to open at once. + """ + + p = SortingPool(maxsize=maxsize, tempdir=tempdir) + for item in items: + p.add(item) + return p.items(maxfiles=maxfiles) diff --git a/src/whoosh/fields.py b/src/whoosh/fields.py new file mode 100644 index 0000000..b66d1b4 --- /dev/null +++ b/src/whoosh/fields.py @@ -0,0 +1,1603 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" + Contains functions and classes related to fields. +""" + +import datetime, fnmatch, re, struct, sys +from array import array +from decimal import Decimal + +from whoosh import analysis, columns, formats +from whoosh.compat import with_metaclass +from whoosh.compat import itervalues, xrange +from whoosh.compat import bytes_type, string_type, text_type +from whoosh.system import emptybytes +from whoosh.system import pack_byte, unpack_byte +from whoosh.util.numeric import to_sortable, from_sortable +from whoosh.util.numeric import typecode_max, NaN +from whoosh.util.text import utf8encode, utf8decode +from whoosh.util.times import datetime_to_long, long_to_datetime + + +# Exceptions + +class FieldConfigurationError(Exception): + pass + + +class UnknownFieldError(Exception): + pass + + +# Field Types + +class FieldType(object): + """ + Represents a field configuration. + + The FieldType object supports the following attributes: + + * format (formats.Format): the storage format for posting blocks. + + * analyzer (analysis.Analyzer): the analyzer to use to turn text into + terms. + + * scorable (boolean): whether searches against this field may be scored. + This controls whether the index stores per-document field lengths for + this field. + + * stored (boolean): whether the content of this field is stored for each + document. 
For example, in addition to indexing the title of a document, + you usually want to store the title so it can be presented as part of + the search results. + + * unique (boolean): whether this field's value is unique to each document. + For example, 'path' or 'ID'. IndexWriter.update_document() will use + fields marked as 'unique' to find the previous version of a document + being updated. + + * multitoken_query is a string indicating what kind of query to use when + a "word" in a user query parses into multiple tokens. The string is + interpreted by the query parser. The strings understood by the default + query parser are "first" (use first token only), "and" (join the tokens + with an AND query), "or" (join the tokens with OR), "phrase" (join + the tokens with a phrase query), and "default" (use the query parser's + default join type). + + * vector (formats.Format or boolean): the format to use to store term + vectors. If not a ``Format`` object, any true value means to use the + index format as the term vector format. Any flase value means don't + store term vectors for this field. + + The constructor for the base field type simply lets you supply your own + attribute values. Subclasses may configure some or all of this for you. + """ + + analyzer = format = scorable = stored = unique = vector = None + indexed = True + multitoken_query = "default" + sortable_typecode = None + column_type = None + + def __init__(self, format, analyzer, scorable=False, + stored=False, unique=False, multitoken_query="default", + sortable=False, vector=None): + self.format = format + self.analyzer = analyzer + self.scorable = scorable + self.stored = stored + self.unique = unique + self.multitoken_query = multitoken_query + self.set_sortable(sortable) + + if isinstance(vector, formats.Format): + self.vector = vector + elif vector: + self.vector = self.format + else: + self.vector = None + + def __repr__(self): + return ("%s(format=%r, scorable=%s, stored=%s, unique=%s)" + % (self.__class__.__name__, self.format, self.scorable, + self.stored, self.unique)) + + def __eq__(self, other): + return all((isinstance(other, FieldType), + (self.format == other.format), + (self.scorable == other.scorable), + (self.stored == other.stored), + (self.unique == other.unique), + (self.column_type == other.column_type))) + + def __ne__(self, other): + return not(self.__eq__(other)) + + # Text + + def index(self, value, **kwargs): + """Returns an iterator of (btext, frequency, weight, encoded_value) + tuples for each unique word in the input value. + + The default implementation uses the ``analyzer`` attribute to tokenize + the value into strings, then encodes them into bytes using UTF-8. + """ + + if not self.format: + raise Exception("%s field %r cannot index without a format" + % (self.__class__.__name__, self)) + if not isinstance(value, (text_type, list, tuple)): + raise ValueError("%r is not unicode or sequence" % value) + assert isinstance(self.format, formats.Format) + + if "mode" not in kwargs: + kwargs["mode"] = "index" + + word_values = self.format.word_values + ana = self.analyzer + for tstring, freq, wt, vbytes in word_values(value, ana, **kwargs): + yield (utf8encode(tstring)[0], freq, wt, vbytes) + + def tokenize(self, value, **kwargs): + """ + Analyzes the given string and returns an iterator of Token objects + (note: for performance reasons, actually the same token yielded over + and over with different attributes). 
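An editor's sketch of the tuples described above (not part of the patch; the sample text is arbitrary and the exact weights depend on the field's format and analyzer):

    from whoosh import fields

    field = fields.TEXT()
    # index() yields each unique term once, UTF-8 encoded, along with its
    # frequency, weight and any per-posting value bytes
    for btext, freq, weight, vbytes in field.index(u"alpha beta alpha"):
        print(btext, freq, weight)   # e.g. the term "alpha" has freq == 2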
+ """ + + if not self.analyzer: + raise Exception("%s field has no analyzer" % self.__class__) + return self.analyzer(value, **kwargs) + + def process_text(self, qstring, mode='', **kwargs): + """ + Analyzes the given string and returns an iterator of token texts. + + >>> field = fields.TEXT() + >>> list(field.process_text("The ides of March")) + ["ides", "march"] + """ + + if not self.format: + raise Exception("%s field has no format" % self) + return (t.text for t in self.tokenize(qstring, mode=mode, **kwargs)) + + # Conversion + + def to_bytes(self, value): + """ + Returns a bytes representation of the given value, appropriate to be + written to disk. The default implementation assumes a unicode value and + encodes it using UTF-8. + """ + + if isinstance(value, (list, tuple)): + value = value[0] + if not isinstance(value, bytes_type): + value = utf8encode(value)[0] + return value + + def to_column_value(self, value): + """ + Returns an object suitable to be inserted into the document values + column for this field. The default implementation simply calls + ``self.to_bytes(value)``. + """ + + return self.to_bytes(value) + + def from_bytes(self, bs): + return utf8decode(bs)[0] + + def from_column_value(self, value): + return self.from_bytes(value) + + # Columns/sorting + + def set_sortable(self, sortable): + if sortable: + if isinstance(sortable, columns.Column): + self.column_type = sortable + else: + self.column_type = self.default_column() + else: + self.column_type = None + + def sortable_terms(self, ixreader, fieldname): + """ + Returns an iterator of the "sortable" tokens in the given reader and + field. These values can be used for sorting. The default implementation + simply returns all tokens in the field. + + This can be overridden by field types such as NUMERIC where some values + in a field are not useful for sorting. + """ + + return ixreader.lexicon(fieldname) + + def default_column(self): + return columns.VarBytesColumn() + + # Parsing + + def self_parsing(self): + """ + Subclasses should override this method to return True if they want + the query parser to call the field's ``parse_query()`` method instead + of running the analyzer on text in this field. This is useful where + the field needs full control over how queries are interpreted, such + as in the numeric field type. + """ + + return False + + def parse_query(self, fieldname, qstring, boost=1.0): + """ + When ``self_parsing()`` returns True, the query parser will call + this method to parse basic query text. + """ + + raise NotImplementedError(self.__class__.__name__) + + def parse_range(self, fieldname, start, end, startexcl, endexcl, + boost=1.0): + """ + When ``self_parsing()`` returns True, the query parser will call + this method to parse range query text. If this method returns None + instead of a query object, the parser will fall back to parsing the + start and end terms using process_text(). + """ + + return None + + # Spelling + + def separate_spelling(self): + """ + Returns True if the field stores unstemmed words in a separate field for + spelling suggestions. + """ + + return False + + def spelling_fieldname(self, fieldname): + """ + Returns the name of a field to use for spelling suggestions instead of + this field. + + :param fieldname: the name of this field. + """ + + return fieldname + + def spellable_words(self, value): + """Returns an iterator of each unique word (in sorted order) in the + input value, suitable for inclusion in the field's word graph. 
+ + The default behavior is to call the field analyzer with the keyword + argument ``no_morph=True``, which should make the analyzer skip any + morphological transformation filters (e.g. stemming) to preserve the + original form of the words. Exotic field types may need to override + this behavior. + """ + + if isinstance(value, (list, tuple)): + words = value + else: + words = [token.text for token + in self.analyzer(value, no_morph=True)] + + return iter(sorted(set(words))) + + # Utility + + def subfields(self): + """ + Returns an iterator of ``(name_prefix, fieldobject)`` pairs for the + fields that need to be indexed when content is put in this field. The + default implementation simply yields ``("", self)``. + """ + + yield "", self + + def supports(self, name): + """ + Returns True if the underlying format supports the given posting + value type. + + >>> field = TEXT() + >>> field.supports("positions") + True + >>> field.supports("chars") + False + """ + + return self.format.supports(name) + + def clean(self): + """ + Clears any cached information in the field and any child objects. + """ + + if self.format and hasattr(self.format, "clean"): + self.format.clean() + + # Events + + def on_add(self, schema, fieldname): + pass + + def on_remove(self, schema, fieldname): + pass + + +# Wrapper base class + +class FieldWrapper(FieldType): + def __init__(self, subfield, prefix): + if isinstance(subfield, type): + subfield = subfield() + self.subfield = subfield + self.name_prefix = prefix + + # By default we'll copy all the subfield's attributes -- override these + # in subclass constructor for things you want to change + self.analyzer = subfield.analyzer + self.format = subfield.format + self.column_type = subfield.column_type + self.scorable = subfield.scorable + self.stored = subfield.stored + self.unique = subfield.unique + self.indexed = subfield.indexed + self.vector = subfield.vector + + def __eq__(self, other): + return self.subfield.__eq__(other) + + def __ne__(self, other): + return self.subfield.__ne__(other) + + # Text + + # def index(self, value, boost=1.0, **kwargs): + # return self.subfield.index(value, boost, **kwargs) + # + # def tokenize(self, value, **kwargs): + # return self.subfield.tokenize(value, **kwargs) + # + # def process_text(self, qstring, mode='', **kwargs): + # return self.subfield.process_text(qstring, mode, **kwargs) + + # Conversion + + def to_bytes(self, value): + return self.subfield.to_bytes(value) + + def to_column_value(self, value): + return self.subfield.to_column_value(value) + + def from_bytes(self, bs): + return self.subfield.from_bytes(bs) + + def from_column_value(self, value): + return self.subfield.from_column_value(value) + + # Sorting/columns + + def set_sortable(self, sortable): + self.subfield.set_sortable(sortable) + + def sortable_terms(self, ixreader, fieldname): + return self.subfield.sortable_terms(ixreader, fieldname) + + def default_column(self): + return self.subfield.default_column() + + # Parsing + + def self_parsing(self): + return self.subfield.self_parsing() + + def parse_query(self, fieldname, qstring, boost=1.0): + return self.subfield.parse_query(fieldname, qstring, boost) + + def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): + self.subfield.parse_range(fieldname, start, end, startexcl, endexcl, + boost) + + # Utility + + def subfields(self): + # The default FieldWrapper.subfields() implementation DOES NOT split + # out the subfield here -- you need to override if that's what you want + yield "", 
self + + def supports(self, name): + return self.subfield.supports(name) + + def clean(self): + self.subfield.clean() + + # Events + + def on_add(self, schema, fieldname): + self.subfield.on_add(schema, fieldname) + + def on_remove(self, schema, fieldname): + self.subfield.on_remove(schema, fieldname) + + +# Pre-configured field types + +class ID(FieldType): + """ + Configured field type that indexes the entire value of the field as one + token. This is useful for data you don't want to tokenize, such as the path + of a file. + """ + + def __init__(self, stored=False, unique=False, field_boost=1.0, + sortable=False, analyzer=None): + """ + :param stored: Whether the value of this field is stored with the + document. + """ + + self.analyzer = analyzer or analysis.IDAnalyzer() + # Don't store any information other than the doc ID + self.format = formats.Existence(field_boost=field_boost) + self.stored = stored + self.unique = unique + self.set_sortable(sortable) + + +class IDLIST(FieldType): + """ + Configured field type for fields containing IDs separated by whitespace + and/or punctuation (or anything else, using the expression param). + """ + + def __init__(self, stored=False, unique=False, expression=None, + field_boost=1.0): + """ + :param stored: Whether the value of this field is stored with the + document. + :param unique: Whether the value of this field is unique per-document. + :param expression: The regular expression object to use to extract + tokens. The default expression breaks tokens on CRs, LFs, tabs, + spaces, commas, and semicolons. + """ + + expression = expression or re.compile(r"[^\r\n\t ,;]+") + # Don't store any information other than the doc ID + self.format = formats.Existence(field_boost=field_boost) + self.stored = stored + self.unique = unique + + +class NUMERIC(FieldType): + """ + Special field type that lets you index integer or floating point + numbers in relatively short fixed-width terms. The field converts numbers + to sortable bytes for you before indexing. + + You specify the numeric type of the field (``int`` or ``float``) when you + create the ``NUMERIC`` object. The default is ``int``. For ``int``, you can + specify a size in bits (``32`` or ``64``). For both ``int`` and ``float`` + you can specify a ``signed`` keyword argument (default is ``True``). + + >>> schema = Schema(path=STORED, position=NUMERIC(int, 64, signed=False)) + >>> ix = storage.create_index(schema) + >>> with ix.writer() as w: + ... w.add_document(path="/a", position=5820402204) + ... + + You can also use the NUMERIC field to store Decimal instances by specifying + a type of ``int`` or ``long`` and the ``decimal_places`` keyword argument. + This simply multiplies each number by ``(10 ** decimal_places)`` before + storing it as an integer. Of course this may throw away decimal prcesision + (by truncating, not rounding) and imposes the same maximum value limits as + ``int``/``long``, but these may be acceptable for certain applications. + + >>> from decimal import Decimal + >>> schema = Schema(path=STORED, position=NUMERIC(int, decimal_places=4)) + >>> ix = storage.create_index(schema) + >>> with ix.writer() as w: + ... w.add_document(path="/a", position=Decimal("123.45") + ... + + """ + + def __init__(self, numtype=int, bits=32, stored=False, unique=False, + field_boost=1.0, decimal_places=0, shift_step=4, signed=True, + sortable=False, default=None): + """ + :param numtype: the type of numbers that can be stored in this field, + either ``int``, ``float``. 
If you use ``Decimal``, + use the ``decimal_places`` argument to control how many decimal + places the field will store. + :param bits: When ``numtype`` is ``int``, the number of bits to use to + store the number: 8, 16, 32, or 64. + :param stored: Whether the value of this field is stored with the + document. + :param unique: Whether the value of this field is unique per-document. + :param decimal_places: specifies the number of decimal places to save + when storing Decimal instances. If you set this, you will always + get Decimal instances back from the field. + :param shift_steps: The number of bits of precision to shift away at + each tiered indexing level. Values should generally be 1-8. Lower + values yield faster searches but take up more space. A value + of `0` means no tiered indexing. + :param signed: Whether the numbers stored in this field may be + negative. + """ + + # Allow users to specify strings instead of Python types in case + # docstring isn't clear + if numtype == "int": + numtype = int + if numtype == "float": + numtype = float + # Raise an error if the user tries to use a type other than int or + # float + if numtype is Decimal: + numtype = int + if not decimal_places: + raise TypeError("To store Decimal instances, you must set the " + "decimal_places argument") + elif numtype not in (int, float): + raise TypeError("Can't use %r as a type, use int or float" + % numtype) + # Sanity check + if numtype is float and decimal_places: + raise Exception("A float type and decimal_places argument %r are " + "incompatible" % decimal_places) + + intsizes = [8, 16, 32, 64] + intcodes = ["B", "H", "I", "Q"] + # Set up field configuration based on type and size + if numtype is float: + bits = 64 # Floats are converted to 64 bit ints + else: + if bits not in intsizes: + raise Exception("Invalid bits %r, use 8, 16, 32, or 64" + % bits) + # Type code for the *sortable* representation + self.sortable_typecode = intcodes[intsizes.index(bits)] + self._struct = struct.Struct(">" + str(self.sortable_typecode)) + + self.numtype = numtype + self.bits = bits + self.stored = stored + self.unique = unique + self.decimal_places = decimal_places + self.shift_step = shift_step + self.signed = signed + self.analyzer = analysis.IDAnalyzer() + # Don't store any information other than the doc ID + self.format = formats.Existence(field_boost=field_boost) + self.min_value, self.max_value = self._min_max() + + # Column configuration + if default is None: + if numtype is int: + default = typecode_max[self.sortable_typecode] + else: + default = NaN + elif not self.is_valid(default): + raise Exception("The default %r is not a valid number for this " + "field" % default) + + self.default = default + self.set_sortable(sortable) + + def __getstate__(self): + d = self.__dict__.copy() + if "_struct" in d: + del d["_struct"] + return d + + def __setstate__(self, d): + self.__dict__.update(d) + self._struct = struct.Struct(">" + str(self.sortable_typecode)) + if "min_value" not in d: + d["min_value"], d["max_value"] = self._min_max() + + def _min_max(self): + numtype = self.numtype + bits = self.bits + signed = self.signed + + # Calculate the minimum and maximum possible values for error checking + min_value = from_sortable(numtype, bits, signed, 0) + max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1) + + return min_value, max_value + + def default_column(self): + return columns.NumericColumn(self.sortable_typecode, + default=self.default) + + def is_valid(self, x): + try: + x = self.to_bytes(x) + 
except ValueError: + return False + except OverflowError: + return False + + return True + + def index(self, num, **kwargs): + # If the user gave us a list of numbers, recurse on the list + if isinstance(num, (list, tuple)): + for n in num: + for item in self.index(n): + yield item + return + + # word, freq, weight, valuestring + if self.shift_step: + for shift in xrange(0, self.bits, self.shift_step): + yield (self.to_bytes(num, shift), 1, 1.0, emptybytes) + else: + yield (self.to_bytes(num), 1, 1.0, emptybytes) + + def prepare_number(self, x): + if x == emptybytes or x is None: + return x + + dc = self.decimal_places + if dc and isinstance(x, (string_type, Decimal)): + x = Decimal(x) * (10 ** dc) + elif isinstance(x, Decimal): + raise TypeError("Can't index a Decimal object unless you specified " + "decimal_places on the field") + + try: + x = self.numtype(x) + except OverflowError: + raise ValueError("Value %r overflowed number type %r" + % (x, self.numtype)) + + if x < self.min_value or x > self.max_value: + raise ValueError("Numeric field value %s out of range [%s, %s]" + % (x, self.min_value, self.max_value)) + return x + + def unprepare_number(self, x): + dc = self.decimal_places + if dc: + s = str(x) + x = Decimal(s[:-dc] + "." + s[-dc:]) + return x + + def to_column_value(self, x): + if isinstance(x, (list, tuple, array)): + x = x[0] + x = self.prepare_number(x) + return to_sortable(self.numtype, self.bits, self.signed, x) + + def from_column_value(self, x): + x = from_sortable(self.numtype, self.bits, self.signed, x) + return self.unprepare_number(x) + + def to_bytes(self, x, shift=0): + # Try to avoid re-encoding; this sucks because on Python 2 we can't + # tell the difference between a string and encoded bytes, so we have + # to require the user use unicode when they mean string + if isinstance(x, bytes_type): + return x + + if x == emptybytes or x is None: + return self.sortable_to_bytes(0) + + x = self.prepare_number(x) + x = to_sortable(self.numtype, self.bits, self.signed, x) + return self.sortable_to_bytes(x, shift) + + def sortable_to_bytes(self, x, shift=0): + if shift: + x >>= shift + return pack_byte(shift) + self._struct.pack(x) + + def from_bytes(self, bs): + x = self._struct.unpack(bs[1:])[0] + x = from_sortable(self.numtype, self.bits, self.signed, x) + x = self.unprepare_number(x) + return x + + def process_text(self, text, **kwargs): + return (self.to_bytes(text),) + + def self_parsing(self): + return True + + def parse_query(self, fieldname, qstring, boost=1.0): + from whoosh import query + from whoosh.qparser.common import QueryParserError + + if qstring == "*": + return query.Every(fieldname, boost=boost) + + if not self.is_valid(qstring): + raise QueryParserError("%r is not a valid number" % qstring) + + token = self.to_bytes(qstring) + return query.Term(fieldname, token, boost=boost) + + def parse_range(self, fieldname, start, end, startexcl, endexcl, + boost=1.0): + from whoosh import query + from whoosh.qparser.common import QueryParserError + + if start is not None: + if not self.is_valid(start): + raise QueryParserError("Range start %r is not a valid number" + % start) + start = self.prepare_number(start) + if end is not None: + if not self.is_valid(end): + raise QueryParserError("Range end %r is not a valid number" + % end) + end = self.prepare_number(end) + return query.NumericRange(fieldname, start, end, startexcl, endexcl, + boost=boost) + + def sortable_terms(self, ixreader, fieldname): + zero = b"\x00" + for token in ixreader.lexicon(fieldname): + 
if token[0:1] != zero: + # Only yield the full-precision values + break + yield token + + +class DATETIME(NUMERIC): + """ + Special field type that lets you index datetime objects. The field + converts the datetime objects to sortable text for you before indexing. + + Since this field is based on Python's datetime module it shares all the + limitations of that module, such as the inability to represent dates before + year 1 in the proleptic Gregorian calendar. However, since this field + stores datetimes as an integer number of microseconds, it could easily + represent a much wider range of dates if the Python datetime implementation + ever supports them. + + >>> schema = Schema(path=STORED, date=DATETIME) + >>> ix = storage.create_index(schema) + >>> w = ix.writer() + >>> w.add_document(path="/a", date=datetime.now()) + >>> w.commit() + """ + + def __init__(self, stored=False, unique=False, sortable=False): + """ + :param stored: Whether the value of this field is stored with the + document. + :param unique: Whether the value of this field is unique per-document. + """ + + super(DATETIME, self).__init__(int, 64, stored=stored, + unique=unique, shift_step=8, + sortable=sortable) + + def prepare_datetime(self, x): + from whoosh.util.times import floor + + if isinstance(x, text_type): + # For indexing, support same strings as for query parsing -- + # convert unicode to datetime object + x = self._parse_datestring(x) + x = floor(x) # this makes most sense (unspecified = lowest) + + if isinstance(x, datetime.datetime): + return datetime_to_long(x) + elif isinstance(x, bytes_type): + return x + else: + raise Exception("%r is not a datetime" % (x,)) + + def to_column_value(self, x): + if isinstance(x, bytes_type): + raise Exception("%r is not a datetime" % (x,)) + if isinstance(x, (list, tuple)): + x = x[0] + return self.prepare_datetime(x) + + def from_column_value(self, x): + return long_to_datetime(x) + + def to_bytes(self, x, shift=0): + x = self.prepare_datetime(x) + return NUMERIC.to_bytes(self, x, shift=shift) + + def from_bytes(self, bs): + x = NUMERIC.from_bytes(self, bs) + return long_to_datetime(x) + + def _parse_datestring(self, qstring): + # This method parses a very simple datetime representation of the form + # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]] + from whoosh.util.times import adatetime, fix, is_void + + qstring = qstring.replace(" ", "").replace("-", "").replace(".", "") + year = month = day = hour = minute = second = microsecond = None + if len(qstring) >= 4: + year = int(qstring[:4]) + if len(qstring) >= 6: + month = int(qstring[4:6]) + if len(qstring) >= 8: + day = int(qstring[6:8]) + if len(qstring) >= 10: + hour = int(qstring[8:10]) + if len(qstring) >= 12: + minute = int(qstring[10:12]) + if len(qstring) >= 14: + second = int(qstring[12:14]) + if len(qstring) == 20: + microsecond = int(qstring[14:]) + + at = fix(adatetime(year, month, day, hour, minute, second, + microsecond)) + if is_void(at): + raise Exception("%r is not a parseable date" % qstring) + return at + + def parse_query(self, fieldname, qstring, boost=1.0): + from whoosh import query + from whoosh.util.times import is_ambiguous + + try: + at = self._parse_datestring(qstring) + except: + e = sys.exc_info()[1] + return query.error_query(e) + + if is_ambiguous(at): + startnum = datetime_to_long(at.floor()) + endnum = datetime_to_long(at.ceil()) + return query.NumericRange(fieldname, startnum, endnum) + else: + return query.Term(fieldname, at, boost=boost) + + def parse_range(self, fieldname, start, end, startexcl, 
endexcl, + boost=1.0): + from whoosh import query + + if start is None and end is None: + return query.Every(fieldname, boost=boost) + + if start is not None: + startdt = self._parse_datestring(start).floor() + start = datetime_to_long(startdt) + + if end is not None: + enddt = self._parse_datestring(end).ceil() + end = datetime_to_long(enddt) + + return query.NumericRange(fieldname, start, end, boost=boost) + + +class BOOLEAN(FieldType): + """ + Special field type that lets you index boolean values (True and False). + The field converts the boolean values to text for you before indexing. + + >>> schema = Schema(path=STORED, done=BOOLEAN) + >>> ix = storage.create_index(schema) + >>> w = ix.writer() + >>> w.add_document(path="/a", done=False) + >>> w.commit() + """ + + bytestrings = (b"f", b"t") + trues = frozenset(u"t true yes 1".split()) + falses = frozenset(u"f false no 0".split()) + + def __init__(self, stored=False, field_boost=1.0): + """ + :param stored: Whether the value of this field is stored with the + document. + """ + + self.stored = stored + # Don't store any information other than the doc ID + self.format = formats.Existence(field_boost=field_boost) + + def _obj_to_bool(self, x): + # We special case strings such as "true", "false", "yes", "no", but + # otherwise call bool() on the query value. This lets you pass objects + # as query values and do the right thing. + + if isinstance(x, string_type) and x.lower() in self.trues: + x = True + elif isinstance(x, string_type) and x.lower() in self.falses: + x = False + else: + x = bool(x) + return x + + def to_bytes(self, x): + if isinstance(x, bytes_type): + return x + elif isinstance(x, string_type): + x = x.lower() in self.trues + else: + x = bool(x) + bs = self.bytestrings[int(x)] + return bs + + def index(self, bit, **kwargs): + if isinstance(bit, string_type): + bit = bit.lower() in self.trues + else: + bit = bool(bit) + # word, freq, weight, valuestring + return [(self.bytestrings[int(bit)], 1, 1.0, emptybytes)] + + def self_parsing(self): + return True + + def parse_query(self, fieldname, qstring, boost=1.0): + from whoosh import query + + if qstring == "*": + return query.Every(fieldname, boost=boost) + + return query.Term(fieldname, self._obj_to_bool(qstring), boost=boost) + + +class STORED(FieldType): + """ + Configured field type for fields you want to store but not index. + """ + + indexed = False + stored = True + + def __init__(self): + pass + + +class COLUMN(FieldType): + """ + Configured field type for fields you want to store as a per-document + value column but not index. + """ + + indexed = False + stored = False + + def __init__(self, columnobj=None): + if columnobj is None: + columnobj = columns.VarBytesColumn() + if not isinstance(columnobj, columns.Column): + raise TypeError("%r is not a column object" % (columnobj,)) + self.column_type = columnobj + + def to_bytes(self, v): + return v + + def from_bytes(self, b): + return b + + +class KEYWORD(FieldType): + """ + Configured field type for fields containing space-separated or + comma-separated keyword-like data (such as tags). The default is to not + store positional information (so phrase searching is not allowed in this + field) and to not make the field scorable. + """ + + def __init__(self, stored=False, lowercase=False, commas=False, + scorable=False, unique=False, field_boost=1.0, sortable=False, + vector=None): + """ + :param stored: Whether to store the value of the field with the + document. + :param comma: Whether this is a comma-separated field. 
If this is False + (the default), it is treated as a space-separated field. + :param scorable: Whether this field is scorable. + """ + + self.analyzer = analysis.KeywordAnalyzer(lowercase=lowercase, + commas=commas) + # Store field lengths and weights along with doc ID + self.format = formats.Frequency(field_boost=field_boost) + self.scorable = scorable + self.stored = stored + self.unique = unique + + if isinstance(vector, formats.Format): + self.vector = vector + elif vector: + self.vector = self.format + else: + self.vector = None + + if sortable: + self.column_type = self.default_column() + + +class TEXT(FieldType): + """ + Configured field type for text fields (for example, the body text of an + article). The default is to store positional information to allow phrase + searching. This field type is always scorable. + """ + + def __init__(self, analyzer=None, phrase=True, chars=False, stored=False, + field_boost=1.0, multitoken_query="default", spelling=False, + sortable=False, lang=None, vector=None, + spelling_prefix="spell_"): + """ + :param analyzer: The analysis.Analyzer to use to index the field + contents. See the analysis module for more information. If you omit + this argument, the field uses analysis.StandardAnalyzer. + :param phrase: Whether the store positional information to allow phrase + searching. + :param chars: Whether to store character ranges along with positions. + If this is True, "phrase" is also implied. + :param stored: Whether to store the value of this field with the + document. Since this field type generally contains a lot of text, + you should avoid storing it with the document unless you need to, + for example to allow fast excerpts in the search results. + :param spelling: if True, and if the field's analyzer changes the form + of term text (such as a stemming analyzer), this field will store + extra information in a separate field (named using the + ``spelling_prefix`` keyword argument) to allow spelling suggestions + to use the unchanged word forms as spelling suggestions. + :param sortable: If True, make this field sortable using the default + column type. If you pass a :class:`whoosh.columns.Column` instance + instead of True, the field will use the given column type. + :param lang: automaticaly configure a + :class:`whoosh.analysis.LanguageAnalyzer` for the given language. + This is ignored if you also specify an ``analyzer``. + :param vector: if this value evaluates to true, store a list of the + terms in this field in each document. If the value is an instance + of :class:`whoosh.formats.Format`, the index will use the object to + store the term vector. Any other true value (e.g. ``vector=True``) + will use the field's index format to store the term vector as well. 
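An editor's sketch tying these keyword arguments together (not part of the patch; the field names and the choice of StemmingAnalyzer are arbitrary examples):

    from whoosh import analysis, fields

    schema = fields.Schema(
        # sortable=True stores a per-document column usable for sorting
        title=fields.TEXT(stored=True, sortable=True),
        # a morphing analyzer plus spelling=True adds a separate
        # "spell_body" subfield holding the unstemmed word forms;
        # vector=True reuses the index format to store term vectors
        body=fields.TEXT(analyzer=analysis.StemmingAnalyzer(),
                         spelling=True, vector=True),
    )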
+ """ + + if analyzer: + self.analyzer = analyzer + elif lang: + self.analyzer = analysis.LanguageAnalyzer(lang) + else: + self.analyzer = analysis.StandardAnalyzer() + + if chars: + formatclass = formats.Characters + elif phrase: + formatclass = formats.Positions + else: + formatclass = formats.Frequency + self.format = formatclass(field_boost=field_boost) + + if sortable: + if isinstance(sortable, columns.Column): + self.column_type = sortable + else: + self.column_type = columns.VarBytesColumn() + else: + self.column_type = None + + self.spelling = spelling + self.spelling_prefix = spelling_prefix + self.multitoken_query = multitoken_query + self.scorable = True + self.stored = stored + + if isinstance(vector, formats.Format): + self.vector = vector + elif vector: + self.vector = self.format + else: + self.vector = None + + def subfields(self): + yield "", self + + # If the user indicated this is a spellable field, and the analyzer + # is morphic, then also index into a spelling-only field that stores + # minimal information + if self.separate_spelling(): + yield self.spelling_prefix, SpellField(self.analyzer) + + def separate_spelling(self): + return self.spelling and self.analyzer.has_morph() + + def spelling_fieldname(self, fieldname): + if self.separate_spelling(): + return self.spelling_prefix + fieldname + else: + return fieldname + + +class SpellField(FieldType): + """ + This is a utility field type meant to be returned by ``TEXT.subfields()`` + when it needs a minimal field to store the spellable words. + """ + + def __init__(self, analyzer): + self.format = formats.Frequency() + self.analyzer = analyzer + self.column_type = None + self.scorabe = False + self.stored = False + self.unique = False + self.indexed = True + self.spelling = False + + # All the text analysis methods add "nomorph" to the keywords to get + # unmorphed term texts + + def index(self, value, boost=1.0, **kwargs): + kwargs["nomorph"] = True + return FieldType.index(self, value, boost=boost, **kwargs) + + def tokenzie(self, value, **kwargs): + kwargs["nomorph"] = True + return FieldType.tokenize(self, value, **kwargs) + + def process_text(self, qstring, mode='', **kwargs): + kwargs["nomorph"] = True + return FieldType.process_text(self, qstring, mode=mode, **kwargs) + + +class NGRAM(FieldType): + """ + Configured field that indexes text as N-grams. For example, with a field + type NGRAM(3,4), the value "hello" will be indexed as tokens + "hel", "hell", "ell", "ello", "llo". This field type chops the entire text + into N-grams, including whitespace and punctuation. See :class:`NGRAMWORDS` + for a field type that breaks the text into words first before chopping the + words into N-grams. + """ + + scorable = True + + def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0, + queryor=False, phrase=False, sortable=False): + """ + :param minsize: The minimum length of the N-grams. + :param maxsize: The maximum length of the N-grams. + :param stored: Whether to store the value of this field with the + document. Since this field type generally contains a lot of text, + you should avoid storing it with the document unless you need to, + for example to allow fast excerpts in the search results. + :param queryor: if True, combine the N-grams with an Or query. The + default is to combine N-grams with an And query. + :param phrase: store positions on the N-grams to allow exact phrase + searching. The default is off. 
+ """ + + formatclass = formats.Frequency + if phrase: + formatclass = formats.Positions + + self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) + self.format = formatclass(field_boost=field_boost) + self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) + self.stored = stored + self.queryor = queryor + self.set_sortable(sortable) + + def self_parsing(self): + return True + + def parse_query(self, fieldname, qstring, boost=1.0): + from whoosh import query + + terms = [query.Term(fieldname, g) + for g in self.process_text(qstring, mode='query')] + cls = query.Or if self.queryor else query.And + + return cls(terms, boost=boost) + + +class NGRAMWORDS(NGRAM): + """ + Configured field that chops text into words using a tokenizer, + lowercases the words, and then chops the words into N-grams. + """ + + scorable = True + + def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0, + tokenizer=None, at=None, queryor=False, sortable=False): + """ + :param minsize: The minimum length of the N-grams. + :param maxsize: The maximum length of the N-grams. + :param stored: Whether to store the value of this field with the + document. Since this field type generally contains a lot of text, + you should avoid storing it with the document unless you need to, + for example to allow fast excerpts in the search results. + :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer` + used to break the text into words. + :param at: if 'start', only takes N-grams from the start of the word. + If 'end', only takes N-grams from the end. Otherwise the default + is to take all N-grams from each word. + :param queryor: if True, combine the N-grams with an Or query. The + default is to combine N-grams with an And query. + """ + + self.analyzer = analysis.NgramWordAnalyzer(minsize, maxsize, tokenizer, + at=at) + self.format = formats.Frequency(field_boost=field_boost) + self.stored = stored + self.queryor = queryor + self.set_sortable(sortable) + + +# Other fields + +class ReverseField(FieldWrapper): + def __init__(self, subfield, prefix="rev_"): + FieldWrapper.__init__(self, subfield, prefix) + self.analyzer = subfield.analyzer | analysis.ReverseTextFilter() + self.format = BasicFormat(lengths=False, weights=False) + + self.scorable = False + self.set_sortable(False) + self.stored = False + self.unique = False + self.vector = False + + def subfields(self): + yield "", self.subfield + yield self.name_prefix, self + + +# Schema class + +class MetaSchema(type): + def __new__(cls, name, bases, attrs): + super_new = super(MetaSchema, cls).__new__ + if not any(b for b in bases if isinstance(b, MetaSchema)): + # If this isn't a subclass of MetaSchema, don't do anything special + return super_new(cls, name, bases, attrs) + + # Create the class + special_attrs = {} + for key in list(attrs.keys()): + if key.startswith("__"): + special_attrs[key] = attrs.pop(key) + new_class = super_new(cls, name, bases, special_attrs) + + fields = {} + for b in bases: + if hasattr(b, "_clsfields"): + fields.update(b._clsfields) + fields.update(attrs) + new_class._clsfields = fields + return new_class + + def schema(self): + return Schema(**self._clsfields) + + +class Schema(object): + """ + Represents the collection of fields in an index. Maps field names to + FieldType objects which define the behavior of each field. + + Low-level parts of the index use field numbers instead of field names for + compactness. This class has several methods for converting between the + field name, field number, and field object itself. 
+ """ + + def __init__(self, **fields): + """ + All keyword arguments to the constructor are treated as fieldname = + fieldtype pairs. The fieldtype can be an instantiated FieldType object, + or a FieldType sub-class (in which case the Schema will instantiate it + with the default constructor before adding it). + + For example:: + + s = Schema(content = TEXT, + title = TEXT(stored = True), + tags = KEYWORD(stored = True)) + """ + + self._fields = {} + self._subfields = {} + self._dyn_fields = {} + + for name in sorted(fields.keys()): + self.add(name, fields[name]) + + def copy(self): + """ + Returns a shallow copy of the schema. The field instances are not + deep copied, so they are shared between schema copies. + """ + + return self.__class__(**self._fields) + + def __eq__(self, other): + return (other.__class__ is self.__class__ + and list(self.items()) == list(other.items())) + + def __ne__(self, other): + return not(self.__eq__(other)) + + def __repr__(self): + return "<%s: %r>" % (self.__class__.__name__, self.names()) + + def __iter__(self): + """ + Returns the field objects in this schema. + """ + + return iter(self._fields.values()) + + def __getitem__(self, name): + """ + Returns the field associated with the given field name. + """ + + # If the name is in the dictionary, just return it + if name in self._fields: + return self._fields[name] + + # Check if the name matches a dynamic field + for expr, fieldtype in itervalues(self._dyn_fields): + if expr.match(name): + return fieldtype + + raise KeyError("No field named %r" % (name,)) + + def __len__(self): + """ + Returns the number of fields in this schema. + """ + + return len(self._fields) + + def __contains__(self, fieldname): + """ + Returns True if a field by the given name is in this schema. + """ + + # Defined in terms of __getitem__ so that there's only one method to + # override to provide dynamic fields + try: + field = self[fieldname] + return field is not None + except KeyError: + return False + + def to_bytes(self, fieldname, value): + return self[fieldname].to_bytes(value) + + def items(self): + """ + Returns a list of ("fieldname", field_object) pairs for the fields + in this schema. + """ + + return sorted(self._fields.items()) + + def names(self, check_names=None): + """ + Returns a list of the names of the fields in this schema. + + :param check_names: (optional) sequence of field names to check + whether the schema accepts them as (dynamic) field names - + acceptable names will also be in the result list. + Note: You may also have static field names in check_names, that + won't create duplicates in the result list. Unsupported names + will not be in the result list. + """ + + fieldnames = set(self._fields.keys()) + if check_names is not None: + check_names = set(check_names) - fieldnames + fieldnames.update(fieldname for fieldname in check_names + if fieldname in self) + return sorted(fieldnames) + + def clean(self): + for field in self: + field.clean() + + def add(self, name, fieldtype, glob=False): + """ + Adds a field to this schema. + + :param name: The name of the field. + :param fieldtype: An instantiated fields.FieldType object, or a + FieldType subclass. If you pass an instantiated object, the schema + will use that as the field configuration for this field. If you + pass a FieldType subclass, the schema will automatically + instantiate it with the default constructor. 
+ """ + + # If the user passed a type rather than an instantiated field object, + # instantiate it automatically + if type(fieldtype) is type: + try: + fieldtype = fieldtype() + except: + e = sys.exc_info()[1] + raise FieldConfigurationError("Error: %s instantiating field " + "%r: %r" % (e, name, fieldtype)) + + if not isinstance(fieldtype, FieldType): + raise FieldConfigurationError("%r is not a FieldType object" + % fieldtype) + + self._subfields[name] = sublist = [] + for prefix, subfield in fieldtype.subfields(): + fname = prefix + name + sublist.append(fname) + + # Check field name + if fname.startswith("_"): + raise FieldConfigurationError("Names cannot start with _") + elif " " in fname: + raise FieldConfigurationError("Names cannot contain spaces") + elif fname in self._fields or (glob and fname in self._dyn_fields): + raise FieldConfigurationError("%r already in schema" % fname) + + # Add the field + if glob: + expr = re.compile(fnmatch.translate(name)) + self._dyn_fields[fname] = (expr, subfield) + else: + fieldtype.on_add(self, fname) + self._fields[fname] = subfield + + def remove(self, fieldname): + if fieldname in self._fields: + self._fields[fieldname].on_remove(self, fieldname) + del self._fields[fieldname] + + if fieldname in self._subfields: + for subname in self._subfields[fieldname]: + if subname in self._fields: + del self._fields[subname] + del self._subfields[fieldname] + + elif fieldname in self._dyn_fields: + del self._dyn_fields[fieldname] + + else: + raise KeyError("No field named %r" % fieldname) + + def indexable_fields(self, fieldname): + if fieldname in self._subfields: + for subname in self._subfields[fieldname]: + yield subname, self._fields[subname] + else: + # Use __getitem__ here instead of getting it directly from _fields + # because it might be a glob + yield fieldname, self[fieldname] + + def has_scorable_fields(self): + return any(ftype.scorable for ftype in self) + + def stored_names(self): + """ + Returns a list of the names of fields that are stored. + """ + + return [name for name, field in self.items() if field.stored] + + def scorable_names(self): + """ + Returns a list of the names of fields that store field + lengths. + """ + + return [name for name, field in self.items() if field.scorable] + + +class SchemaClass(with_metaclass(MetaSchema, Schema)): + """ + Allows you to define a schema using declarative syntax, similar to + Django models:: + + class MySchema(SchemaClass): + path = ID + date = DATETIME + content = TEXT + + You can use inheritance to share common fields between schemas:: + + class Parent(SchemaClass): + path = ID(stored=True) + date = DATETIME + + class Child1(Parent): + content = TEXT(positions=False) + + class Child2(Parent): + tags = KEYWORD + + This class overrides ``__new__`` so instantiating your sub-class always + results in an instance of ``Schema``. + + >>> class MySchema(SchemaClass): + ... title = TEXT(stored=True) + ... content = TEXT + ... 
+ >>> s = MySchema() + >>> type(s) + + """ + + def __new__(cls, *args, **kwargs): + obj = super(Schema, cls).__new__(Schema) + kw = getattr(cls, "_clsfields", {}) + kw.update(kwargs) + obj.__init__(*args, **kw) + return obj + + +def ensure_schema(schema): + if isinstance(schema, type) and issubclass(schema, Schema): + schema = schema.schema() + if not isinstance(schema, Schema): + raise FieldConfigurationError("%r is not a Schema" % schema) + return schema + + +def merge_fielddict(d1, d2): + keyset = set(d1.keys()) | set(d2.keys()) + out = {} + for name in keyset: + field1 = d1.get(name) + field2 = d2.get(name) + if field1 and field2 and field1 != field2: + raise Exception("Inconsistent field %r: %r != %r" + % (name, field1, field2)) + out[name] = field1 or field2 + return out + + +def merge_schema(s1, s2): + schema = Schema() + schema._fields = merge_fielddict(s1._fields, s2._fields) + schema._dyn_fields = merge_fielddict(s1._dyn_fields, s2._dyn_fields) + return schema + + +def merge_schemas(schemas): + schema = schemas[0] + for i in xrange(1, len(schemas)): + schema = merge_schema(schema, schemas[i]) + return schema diff --git a/src/whoosh/filedb/__init__.py b/src/whoosh/filedb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/whoosh/filedb/compound.py b/src/whoosh/filedb/compound.py new file mode 100644 index 0000000..508077a --- /dev/null +++ b/src/whoosh/filedb/compound.py @@ -0,0 +1,331 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
+ +import errno +import os +import sys +from threading import Lock +from shutil import copyfileobj + +try: + import mmap +except ImportError: + mmap = None + +from whoosh.compat import BytesIO, memoryview_ +from whoosh.filedb.structfile import BufferFile, StructFile +from whoosh.filedb.filestore import FileStorage, StorageError +from whoosh.system import emptybytes +from whoosh.util import random_name + + +class CompoundStorage(FileStorage): + readonly = True + + def __init__(self, dbfile, use_mmap=True, basepos=0): + self._file = dbfile + self.is_closed = False + + # Seek to the end to get total file size (to check if mmap is OK) + dbfile.seek(0, os.SEEK_END) + filesize = self._file.tell() + dbfile.seek(basepos) + + self._diroffset = self._file.read_long() + self._dirlength = self._file.read_int() + self._file.seek(self._diroffset) + self._dir = self._file.read_pickle() + self._options = self._file.read_pickle() + self._locks = {} + self._source = None + + use_mmap = ( + use_mmap + and hasattr(self._file, "fileno") # check file is a real file + and filesize < sys.maxsize # check fit on 32-bit Python + ) + if mmap and use_mmap: + # Try to open the entire segment as a memory-mapped object + try: + fileno = self._file.fileno() + self._source = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ) + except (mmap.error, OSError): + e = sys.exc_info()[1] + # If we got an error because there wasn't enough memory to + # open the map, ignore it and fall through, we'll just use the + # (slower) "sub-file" implementation + if e.errno == errno.ENOMEM: + pass + else: + raise + else: + # If that worked, we can close the file handle we were given + self._file.close() + self._file = None + + def __repr__(self): + return "<%s (%s)>" % (self.__class__.__name__, self._name) + + def close(self): + if self.is_closed: + raise Exception("Already closed") + self.is_closed = True + + if self._source: + try: + self._source.close() + except BufferError: + del self._source + if self._file: + self._file.close() + + def range(self, name): + try: + fileinfo = self._dir[name] + except KeyError: + raise NameError("Unknown file %r" % (name,)) + return fileinfo["offset"], fileinfo["length"] + + def open_file(self, name, *args, **kwargs): + if self.is_closed: + raise StorageError("Storage was closed") + + offset, length = self.range(name) + if self._source: + # Create a memoryview/buffer from the mmap + buf = memoryview_(self._source, offset, length) + f = BufferFile(buf, name=name) + elif hasattr(self._file, "subset"): + f = self._file.subset(offset, length, name=name) + else: + f = StructFile(SubFile(self._file, offset, length), name=name) + return f + + def list(self): + return list(self._dir.keys()) + + def file_exists(self, name): + return name in self._dir + + def file_length(self, name): + info = self._dir[name] + return info["length"] + + def file_modified(self, name): + info = self._dir[name] + return info["modified"] + + def lock(self, name): + if name not in self._locks: + self._locks[name] = Lock() + return self._locks[name] + + @staticmethod + def assemble(dbfile, store, names, **options): + assert names, names + + directory = {} + basepos = dbfile.tell() + dbfile.write_long(0) # Directory position + dbfile.write_int(0) # Directory length + + # Copy the files into the compound file + for name in names: + if name.endswith(".toc") or name.endswith(".seg"): + raise Exception(name) + + for name in names: + offset = dbfile.tell() + length = store.file_length(name) + modified = store.file_modified(name) + directory[name] = 
{"offset": offset, "length": length, + "modified": modified} + f = store.open_file(name) + copyfileobj(f, dbfile) + f.close() + + CompoundStorage.write_dir(dbfile, basepos, directory, options) + + @staticmethod + def write_dir(dbfile, basepos, directory, options=None): + options = options or {} + + dirpos = dbfile.tell() # Remember the start of the directory + dbfile.write_pickle(directory) # Write the directory + dbfile.write_pickle(options) + endpos = dbfile.tell() # Remember the end of the directory + dbfile.flush() + dbfile.seek(basepos) # Seek back to the start + dbfile.write_long(dirpos) # Directory position + dbfile.write_int(endpos - dirpos) # Directory length + + dbfile.close() + + +class SubFile(object): + def __init__(self, parentfile, offset, length, name=None): + self._file = parentfile + self._offset = offset + self._length = length + self._end = offset + length + self._pos = 0 + + self.name = name + self.closed = False + + def close(self): + self.closed = True + + def subset(self, position, length, name=None): + start = self._offset + position + end = start + length + name = name or self.name + assert self._offset >= start >= self._end + assert self._offset >= end >= self._end + return SubFile(self._file, self._offset + position, length, name=name) + + def read(self, size=None): + if size is None: + size = self._length - self._pos + else: + size = min(size, self._length - self._pos) + if size < 0: + size = 0 + + if size > 0: + self._file.seek(self._offset + self._pos) + self._pos += size + return self._file.read(size) + else: + return emptybytes + + def readline(self): + maxsize = self._length - self._pos + self._file.seek(self._offset + self._pos) + data = self._file.readline() + if len(data) > maxsize: + data = data[:maxsize] + self._pos += len(data) + return data + + def seek(self, where, whence=0): + if whence == 0: # Absolute + pos = where + elif whence == 1: # Relative + pos = self._pos + where + elif whence == 2: # From end + pos = self._length - where + else: + raise ValueError + + self._pos = pos + + def tell(self): + return self._pos + + +class CompoundWriter(object): + def __init__(self, tempstorage, buffersize=32 * 1024): + assert isinstance(buffersize, int) + self._tempstorage = tempstorage + self._tempname = "%s.ctmp" % random_name() + self._temp = tempstorage.create_file(self._tempname, mode="w+b") + self._buffersize = buffersize + self._streams = {} + + def create_file(self, name): + ss = self.SubStream(self._temp, self._buffersize) + self._streams[name] = ss + return StructFile(ss) + + def _readback(self): + temp = self._temp + for name, substream in self._streams.items(): + substream.close() + + def gen(): + for f, offset, length in substream.blocks: + if f is None: + f = temp + f.seek(offset) + yield f.read(length) + + yield (name, gen) + temp.close() + self._tempstorage.delete_file(self._tempname) + + def save_as_compound(self, dbfile): + basepos = dbfile.tell() + dbfile.write_long(0) # Directory offset + dbfile.write_int(0) # Directory length + + directory = {} + for name, blocks in self._readback(): + filestart = dbfile.tell() + for block in blocks(): + dbfile.write(block) + directory[name] = {"offset": filestart, + "length": dbfile.tell() - filestart} + + CompoundStorage.write_dir(dbfile, basepos, directory) + + def save_as_files(self, storage, name_fn): + for name, blocks in self._readback(): + f = storage.create_file(name_fn(name)) + for block in blocks(): + f.write(block) + f.close() + + class SubStream(object): + def __init__(self, dbfile, 
buffersize): + self._dbfile = dbfile + self._buffersize = buffersize + self._buffer = BytesIO() + self.blocks = [] + + def tell(self): + return sum(b[2] for b in self.blocks) + self._buffer.tell() + + def write(self, inbytes): + bio = self._buffer + buflen = bio.tell() + length = buflen + len(inbytes) + if length >= self._buffersize: + offset = self._dbfile.tell() + self._dbfile.write(bio.getvalue()[:buflen]) + self._dbfile.write(inbytes) + + self.blocks.append((None, offset, length)) + self._buffer.seek(0) + else: + bio.write(inbytes) + + def close(self): + bio = self._buffer + length = bio.tell() + if length: + self.blocks.append((bio, 0, length)) diff --git a/src/whoosh/filedb/filestore.py b/src/whoosh/filedb/filestore.py new file mode 100644 index 0000000..e4cc916 --- /dev/null +++ b/src/whoosh/filedb/filestore.py @@ -0,0 +1,655 @@ +# Copyright 2009 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import with_statement +import errno, os, sys, tempfile +from threading import Lock + +from whoosh.compat import BytesIO, memoryview_ +from whoosh.filedb.structfile import BufferFile, StructFile +from whoosh.index import _DEF_INDEX_NAME, EmptyIndexError +from whoosh.util import random_name +from whoosh.util.filelock import FileLock + + +# Exceptions + +class StorageError(Exception): + pass + + +class ReadOnlyError(StorageError): + pass + + +# Base class + +class Storage(object): + """Abstract base class for storage objects. + + A storage object is a virtual flat filesystem, allowing the creation and + retrieval of file-like objects + (:class:`~whoosh.filedb.structfile.StructFile` objects). The default + implementation (:class:`FileStorage`) uses actual files in a directory. + + All access to files in Whoosh goes through this object. This allows more + different forms of storage (for example, in RAM, in a database, in a single + file) to be used transparently. 
+ + For example, to create a :class:`FileStorage` object:: + + # Create a storage object + st = FileStorage("indexdir") + # Create the directory if it doesn't already exist + st.create() + + The :meth:`Storage.create` method makes it slightly easier to swap storage + implementations. The ``create()`` method handles set-up of the storage + object. For example, ``FileStorage.create()`` creates the directory. A + database implementation might create tables. This is designed to let you + avoid putting implementation-specific setup code in your application. + """ + + readonly = False + supports_mmap = False + + def __iter__(self): + return iter(self.list()) + + def __enter__(self): + self.create() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def create(self): + """Creates any required implementation-specific resources. For example, + a filesystem-based implementation might create a directory, while a + database implementation might create tables. For example:: + + from whoosh.filedb.filestore import FileStorage + # Create a storage object + st = FileStorage("indexdir") + # Create any necessary resources + st.create() + + This method returns ``self`` so you can also say:: + + st = FileStorage("indexdir").create() + + Storage implementations should be written so that calling create() a + second time on the same storage + + :return: a :class:`Storage` instance. + """ + + return self + + def destroy(self, *args, **kwargs): + """Removes any implementation-specific resources related to this storage + object. For example, a filesystem-based implementation might delete a + directory, and a database implementation might drop tables. + + The arguments are implementation-specific. + """ + + pass + + def create_index(self, schema, indexname=_DEF_INDEX_NAME, indexclass=None): + """Creates a new index in this storage. + + >>> from whoosh import fields + >>> from whoosh.filedb.filestore import FileStorage + >>> schema = fields.Schema(content=fields.TEXT) + >>> # Create the storage directory + >>> st = FileStorage.create("indexdir") + >>> # Create an index in the storage + >>> ix = st.create_index(schema) + + :param schema: the :class:`whoosh.fields.Schema` object to use for the + new index. + :param indexname: the name of the index within the storage object. You + can use this option to store multiple indexes in the same storage. + :param indexclass: an optional custom ``Index`` sub-class to use to + create the index files. The default is + :class:`whoosh.index.FileIndex`. This method will call the + ``create`` class method on the given class to create the index. + :return: a :class:`whoosh.index.Index` instance. + """ + + if self.readonly: + raise ReadOnlyError + if indexclass is None: + import whoosh.index + indexclass = whoosh.index.FileIndex + return indexclass.create(self, schema, indexname) + + def open_index(self, indexname=_DEF_INDEX_NAME, schema=None, indexclass=None): + """Opens an existing index (created using :meth:`create_index`) in this + storage. + + >>> from whoosh.filedb.filestore import FileStorage + >>> st = FileStorage("indexdir") + >>> # Open an index in the storage + >>> ix = st.open_index() + + :param indexname: the name of the index within the storage object. You + can use this option to store multiple indexes in the same storage. + :param schema: if you pass in a :class:`whoosh.fields.Schema` object + using this argument, it will override the schema that was stored + with the index. 
+ :param indexclass: an optional custom ``Index`` sub-class to use to + open the index files. The default is + :class:`whoosh.index.FileIndex`. This method will instantiate the + class with this storage object. + :return: a :class:`whoosh.index.Index` instance. + """ + + if indexclass is None: + import whoosh.index + indexclass = whoosh.index.FileIndex + return indexclass(self, schema=schema, indexname=indexname) + + def index_exists(self, indexname=None): + """Returns True if a non-empty index exists in this storage. + + :param indexname: the name of the index within the storage object. You + can use this option to store multiple indexes in the same storage. + :rtype: bool + """ + + if indexname is None: + indexname = _DEF_INDEX_NAME + try: + ix = self.open_index(indexname) + gen = ix.latest_generation() + ix.close() + return gen > -1 + except EmptyIndexError: + pass + return False + + def create_file(self, name): + """Creates a file with the given name in this storage. + + :param name: the name for the new file. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. + """ + + raise NotImplementedError + + def open_file(self, name, *args, **kwargs): + """Opens a file with the given name in this storage. + + :param name: the name for the new file. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. + """ + + raise NotImplementedError + + def list(self): + """Returns a list of file names in this storage. + + :return: a list of strings + """ + raise NotImplementedError + + def file_exists(self, name): + """Returns True if the given file exists in this storage. + + :param name: the name to check. + :rtype: bool + """ + + raise NotImplementedError + + def file_modified(self, name): + """Returns the last-modified time of the given file in this storage (as + a "ctime" UNIX timestamp). + + :param name: the name to check. + :return: a "ctime" number. + """ + + raise NotImplementedError + + def file_length(self, name): + """Returns the size (in bytes) of the given file in this storage. + + :param name: the name to check. + :rtype: int + """ + + raise NotImplementedError + + def delete_file(self, name): + """Removes the given file from this storage. + + :param name: the name to delete. + """ + + raise NotImplementedError + + def rename_file(self, frm, to, safe=False): + """Renames a file in this storage. + + :param frm: The current name of the file. + :param to: The new name for the file. + :param safe: if True, raise an exception if a file with the new name + already exists. + """ + + raise NotImplementedError + + def lock(self, name): + """Return a named lock object (implementing ``.acquire()`` and + ``.release()`` methods). Different storage implementations may use + different lock types with different guarantees. For example, the + RamStorage object uses Python thread locks, while the FileStorage + object uses filesystem-based locks that are valid across different + processes. + + :param name: a name for the lock. + :return: a lock-like object. + """ + + raise NotImplementedError + + def close(self): + """Closes any resources opened by this storage object. For some storage + implementations this will be a no-op, but for others it is necessary + to release locks and/or prevent leaks, so it's a good idea to call it + when you're done with a storage object. + """ + + pass + + def optimize(self): + """Optimizes the storage object. The meaning and cost of "optimizing" + will vary by implementation. 
For example, a database implementation + might run a garbage collection procedure on the underlying database. + """ + + pass + + def temp_storage(self, name=None): + """Creates a new storage object for temporary files. You can call + :meth:`Storage.destroy` on the new storage when you're finished with + it. + + :param name: a name for the new storage. This may be optional or + required depending on the storage implementation. + :rtype: :class:`Storage` + """ + + raise NotImplementedError + + +class OverlayStorage(Storage): + """Overlays two storage objects. Reads are processed from the first if it + has the named file, otherwise the second. Writes always go to the second. + """ + + def __init__(self, a, b): + self.a = a + self.b = b + + def create_index(self, *args, **kwargs): + self.b.create_index(*args, **kwargs) + + def open_index(self, *args, **kwargs): + self.a.open_index(*args, **kwargs) + + def create_file(self, *args, **kwargs): + return self.b.create_file(*args, **kwargs) + + def open_file(self, name, *args, **kwargs): + if self.a.file_exists(name): + return self.a.open_file(name, *args, **kwargs) + else: + return self.b.open_file(name, *args, **kwargs) + + def list(self): + return list(set(self.a.list()) | set(self.b.list())) + + def file_exists(self, name): + return self.a.file_exists(name) or self.b.file_exists(name) + + def file_modified(self, name): + if self.a.file_exists(name): + return self.a.file_modified(name) + else: + return self.b.file_modified(name) + + def file_length(self, name): + if self.a.file_exists(name): + return self.a.file_length(name) + else: + return self.b.file_length(name) + + def delete_file(self, name): + return self.b.delete_file(name) + + def rename_file(self, *args, **kwargs): + raise NotImplementedError + + def lock(self, name): + return self.b.lock(name) + + def close(self): + self.a.close() + self.b.close() + + def optimize(self): + self.a.optimize() + self.b.optimize() + + def temp_storage(self, name=None): + return self.b.temp_storage(name=name) + + +class FileStorage(Storage): + """Storage object that stores the index as files in a directory on disk. + + Prior to version 3, the initializer would raise an IOError if the directory + did not exist. As of version 3, the object does not check if the + directory exists at initialization. This change is to support using the + :meth:`FileStorage.create` method. + """ + + supports_mmap = True + + def __init__(self, path, supports_mmap=True, readonly=False, debug=False): + """ + :param path: a path to a directory. + :param supports_mmap: if True (the default), use the ``mmap`` module to + open memory mapped files. You can open the storage object with + ``supports_mmap=False`` to force Whoosh to open files normally + instead of with ``mmap``. + :param readonly: If ``True``, the object will raise an exception if you + attempt to create or rename a file. + """ + + self.folder = path + self.supports_mmap = supports_mmap + self.readonly = readonly + self._debug = debug + self.locks = {} + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.folder) + + def create(self): + """Creates this storage object's directory path using ``os.makedirs`` if + it doesn't already exist. 
+ + >>> from whoosh.filedb.filestore import FileStorage + >>> st = FileStorage("indexdir") + >>> st.create() + + This method returns ``self``, you can say:: + + st = FileStorage("indexdir").create() + + Note that you can simply create handle the creation of the directory + yourself and open the storage object using the initializer:: + + dirname = "indexdir" + os.mkdir(dirname) + st = FileStorage(dirname) + + However, using the ``create()`` method allows you to potentially swap in + other storage implementations more easily. + + :return: a :class:`Storage` instance. + """ + + dirpath = os.path.abspath(self.folder) + # If the given directory does not already exist, try to create it + try: + os.makedirs(dirpath) + except OSError: + # This is necessary for compatibility between Py2 and Py3 + e = sys.exc_info()[1] + # If we get an error because the path already exists, ignore it + if e.errno != errno.EEXIST: + raise + + # Raise an exception if the given path is not a directory + if not os.path.isdir(dirpath): + e = IOError("%r is not a directory" % dirpath) + e.errno = errno.ENOTDIR + raise e + + return self + + def destroy(self): + """Removes any files in this storage object and then removes the + storage object's directory. What happens if any of the files or the + directory are in use depends on the underlying platform. + """ + + # Remove all files + self.clean() + # Try to remove the directory + os.rmdir(self.folder) + + def create_file(self, name, excl=False, mode="wb", **kwargs): + """Creates a file with the given name in this storage. + + :param name: the name for the new file. + :param excl: if True, try to open the file in "exclusive" mode. + :param mode: the mode flags with which to open the file. The default is + ``"wb"``. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. + """ + + if self.readonly: + raise ReadOnlyError + + path = self._fpath(name) + if excl: + flags = os.O_CREAT | os.O_EXCL | os.O_RDWR + if hasattr(os, "O_BINARY"): + flags |= os.O_BINARY + fd = os.open(path, flags) + fileobj = os.fdopen(fd, mode) + else: + fileobj = open(path, mode) + + f = StructFile(fileobj, name=name, **kwargs) + return f + + def open_file(self, name, **kwargs): + """Opens an existing file in this storage. + + :param name: the name of the file to open. + :param kwargs: additional keyword arguments are passed through to the + :class:`~whoosh.filedb.structfile.StructFile` initializer. + :return: a :class:`whoosh.filedb.structfile.StructFile` instance. 
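        For example, a round trip through ``create_file()`` and ``open_file()``
        (a minimal sketch; the directory name and file name are hypothetical)::

            st = FileStorage("indexdir").create()
            with st.create_file("hello.bin") as f:
                f.write_string(b"hello")          # length-prefixed bytes
            with st.open_file("hello.bin") as f:
                assert f.read_string() == b"hello"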
+ """ + + f = StructFile(open(self._fpath(name), "rb"), name=name, **kwargs) + return f + + def _fpath(self, fname): + return os.path.abspath(os.path.join(self.folder, fname)) + + def clean(self, ignore=False): + if self.readonly: + raise ReadOnlyError + + path = self.folder + files = self.list() + for fname in files: + try: + os.remove(os.path.join(path, fname)) + except OSError: + if not ignore: + raise + + def list(self): + try: + files = os.listdir(self.folder) + except IOError: + files = [] + + return files + + def file_exists(self, name): + return os.path.exists(self._fpath(name)) + + def file_modified(self, name): + return os.path.getmtime(self._fpath(name)) + + def file_length(self, name): + return os.path.getsize(self._fpath(name)) + + def delete_file(self, name): + if self.readonly: + raise ReadOnlyError + + os.remove(self._fpath(name)) + + def rename_file(self, oldname, newname, safe=False): + if self.readonly: + raise ReadOnlyError + + if os.path.exists(self._fpath(newname)): + if safe: + raise NameError("File %r exists" % newname) + else: + os.remove(self._fpath(newname)) + os.rename(self._fpath(oldname), self._fpath(newname)) + + def lock(self, name): + return FileLock(self._fpath(name)) + + def temp_storage(self, name=None): + name = name or "%s.tmp" % random_name() + path = os.path.join(self.folder, name) + tempstore = FileStorage(path) + return tempstore.create() + + +class RamStorage(Storage): + """Storage object that keeps the index in memory. + """ + + supports_mmap = False + + def __init__(self): + self.files = {} + self.locks = {} + self.folder = '' + + def destroy(self): + del self.files + del self.locks + + def list(self): + return list(self.files.keys()) + + def clean(self): + self.files = {} + + def total_size(self): + return sum(self.file_length(f) for f in self.list()) + + def file_exists(self, name): + return name in self.files + + def file_length(self, name): + if name not in self.files: + raise NameError(name) + return len(self.files[name]) + + def file_modified(self, name): + return -1 + + def delete_file(self, name): + if name not in self.files: + raise NameError(name) + del self.files[name] + + def rename_file(self, name, newname, safe=False): + if name not in self.files: + raise NameError(name) + if safe and newname in self.files: + raise NameError("File %r exists" % newname) + + content = self.files[name] + del self.files[name] + self.files[newname] = content + + def create_file(self, name, **kwargs): + def onclose_fn(sfile): + self.files[name] = sfile.file.getvalue() + f = StructFile(BytesIO(), name=name, onclose=onclose_fn) + return f + + def open_file(self, name, **kwargs): + if name not in self.files: + raise NameError(name) + buf = memoryview_(self.files[name]) + return BufferFile(buf, name=name, **kwargs) + + def lock(self, name): + if name not in self.locks: + self.locks[name] = Lock() + return self.locks[name] + + def temp_storage(self, name=None): + tdir = tempfile.gettempdir() + name = name or "%s.tmp" % random_name() + path = os.path.join(tdir, name) + tempstore = FileStorage(path) + return tempstore.create() + + +def copy_storage(sourcestore, deststore): + """Copies the files from the source storage object to the destination + storage object using ``shutil.copyfileobj``. 
+ """ + from shutil import copyfileobj + + for name in sourcestore.list(): + with sourcestore.open_file(name) as source: + with deststore.create_file(name) as dest: + copyfileobj(source, dest) + + +def copy_to_ram(storage): + """Copies the given FileStorage object into a new RamStorage object. + + :rtype: :class:`RamStorage` + """ + + ram = RamStorage() + copy_storage(storage, ram) + return ram diff --git a/src/whoosh/filedb/filetables.py b/src/whoosh/filedb/filetables.py new file mode 100644 index 0000000..f34f409 --- /dev/null +++ b/src/whoosh/filedb/filetables.py @@ -0,0 +1,735 @@ +# Copyright 2009 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +"""This module defines writer and reader classes for a fast, immutable +on-disk key-value database format. The current format is based heavily on +D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html). +""" + +import os, struct +from binascii import crc32 +from bisect import bisect_left +from hashlib import md5 # @UnresolvedImport + +from whoosh.compat import b, bytes_type +from whoosh.compat import xrange +from whoosh.util.numlists import GrowableArray +from whoosh.system import _INT_SIZE, emptybytes + + +# Exceptions + +class FileFormatError(Exception): + pass + + +# Hash functions + +def cdb_hash(key): + h = 5381 + for c in key: + h = (h + (h << 5)) & 0xffffffff ^ ord(c) + return h + + +def md5_hash(key): + return int(md5(key).hexdigest(), 16) & 0xffffffff + + +def crc_hash(key): + return crc32(key) & 0xffffffff + + +_hash_functions = (md5_hash, crc_hash, cdb_hash) + + +# Structs + +# Two uints before the key/value pair giving the length of the key and value +_lengths = struct.Struct("!ii") +# A pointer in a hash table, giving the hash value and the key position +_pointer = struct.Struct("!Iq") +# A pointer in the hash table directory, giving the position and number of slots +_dir_entry = struct.Struct("!qi") + +_directory_size = 256 * _dir_entry.size + + +# Basic hash file + +class HashWriter(object): + """Implements a fast on-disk key-value store. 
This hash uses a two-level + hashing scheme, where a key is hashed, the low eight bits of the hash value + are used to index into one of 256 hash tables. This is basically the CDB + algorithm, but unlike CDB this object writes all data serially (it doesn't + seek backwards to overwrite information at the end). + + Also unlike CDB, this format uses 64-bit file pointers, so the file length + is essentially unlimited. However, each key and value must be less than + 2 GB in length. + """ + + def __init__(self, dbfile, magic=b("HSH3"), hashtype=0): + """ + :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object + to write to. + :param magic: the format tag bytes to write at the start of the file. + :param hashtype: an integer indicating which hashing algorithm to use. + Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash). + """ + + self.dbfile = dbfile + self.hashtype = hashtype + self.hashfn = _hash_functions[self.hashtype] + # A place for subclasses to put extra metadata + self.extras = {} + + self.startoffset = dbfile.tell() + # Write format tag + dbfile.write(magic) + # Write hash type + dbfile.write_byte(self.hashtype) + # Unused future expansion bits + dbfile.write_int(0) + dbfile.write_int(0) + + # 256 lists of hashed keys and positions + self.buckets = [[] for _ in xrange(256)] + # List to remember the positions of the hash tables + self.directory = [] + + def tell(self): + return self.dbfile.tell() + + def add(self, key, value): + """Adds a key/value pair to the file. Note that keys DO NOT need to be + unique. You can store multiple values under the same key and retrieve + them using :meth:`HashReader.all`. + """ + + assert isinstance(key, bytes_type) + assert isinstance(value, bytes_type) + + dbfile = self.dbfile + pos = dbfile.tell() + dbfile.write(_lengths.pack(len(key), len(value))) + dbfile.write(key) + dbfile.write(value) + + # Get hash value for the key + h = self.hashfn(key) + # Add hash and on-disk position to appropriate bucket + self.buckets[h & 255].append((h, pos)) + + def add_all(self, items): + """Convenience method to add a sequence of ``(key, value)`` pairs. This + is the same as calling :meth:`HashWriter.add` on each pair in the + sequence. 
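        For example, building a small hash file (a minimal sketch; the storage
        object, file name, keys, and values are all hypothetical)::

            from whoosh.filedb.filestore import FileStorage

            st = FileStorage("indexdir").create()
            hw = HashWriter(st.create_file("table.hsh"))
            # Duplicate keys are allowed; HashReader.all() can retrieve every value
            hw.add_all([(b"alfa", b"bravo"), (b"alfa", b"charlie")])
            hw.close()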
+ """ + + add = self.add + for key, value in items: + add(key, value) + + def _write_hashes(self): + # Writes 256 hash tables containing pointers to the key/value pairs + + dbfile = self.dbfile + # Represent and empty slot in the hash table using 0,0 (no key can + # start at position 0 because of the header) + null = (0, 0) + + for entries in self.buckets: + # Start position of this bucket's hash table + pos = dbfile.tell() + # Remember the start position and the number of slots + numslots = 2 * len(entries) + self.directory.append((pos, numslots)) + + # Create the empty hash table + hashtable = [null] * numslots + # For each (hash value, key position) tuple in the bucket + for hashval, position in entries: + # Bitshift and wrap to get the slot for this entry + slot = (hashval >> 8) % numslots + # If the slot is taken, keep going until we find an empty slot + while hashtable[slot] != null: + slot = (slot + 1) % numslots + # Insert the entry into the hashtable + hashtable[slot] = (hashval, position) + + # Write the hash table for this bucket to disk + for hashval, position in hashtable: + dbfile.write(_pointer.pack(hashval, position)) + + def _write_directory(self): + # Writes a directory of pointers to the 256 hash tables + + dbfile = self.dbfile + for position, numslots in self.directory: + dbfile.write(_dir_entry.pack(position, numslots)) + + def _write_extras(self): + self.dbfile.write_pickle(self.extras) + + def close(self): + dbfile = self.dbfile + + # Write hash tables + self._write_hashes() + # Write directory of pointers to hash tables + self._write_directory() + + expos = dbfile.tell() + # Write extra information + self._write_extras() + # Write length of pickle + dbfile.write_int(dbfile.tell() - expos) + + endpos = dbfile.tell() + dbfile.close() + return endpos + + +class HashReader(object): + """Reader for the fast on-disk key-value files created by + :class:`HashWriter`. + """ + + def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0): + """ + :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object + to read from. + :param length: the length of the file data. This is necessary since the + hashing information is written at the end of the file. + :param magic: the format tag bytes to look for at the start of the + file. If the file's format tag does not match these bytes, the + object raises a :class:`FileFormatError` exception. + :param startoffset: the starting point of the file data. 
+ """ + + self.dbfile = dbfile + self.startoffset = startoffset + self.is_closed = False + + if length is None: + dbfile.seek(0, os.SEEK_END) + length = dbfile.tell() - startoffset + + dbfile.seek(startoffset) + # Check format tag + filemagic = dbfile.read(4) + if filemagic != magic: + raise FileFormatError("Unknown file header %r" % filemagic) + # Read hash type + self.hashtype = dbfile.read_byte() + self.hashfn = _hash_functions[self.hashtype] + # Skip unused future expansion bits + dbfile.read_int() + dbfile.read_int() + self.startofdata = dbfile.tell() + + exptr = startoffset + length - _INT_SIZE + # Get the length of extras from the end of the file + exlen = dbfile.get_int(exptr) + # Read the extras + expos = exptr - exlen + dbfile.seek(expos) + self._read_extras() + + # Calculate the directory base from the beginning of the extras + dbfile.seek(expos - _directory_size) + # Read directory of hash tables + self.tables = [] + entrysize = _dir_entry.size + unpackentry = _dir_entry.unpack + for _ in xrange(256): + # position, numslots + self.tables.append(unpackentry(dbfile.read(entrysize))) + # The position of the first hash table is the end of the key/value pairs + self.endofdata = self.tables[0][0] + + @classmethod + def open(cls, storage, name): + """Convenience method to open a hash file given a + :class:`whoosh.filedb.filestore.Storage` object and a name. This takes + care of opening the file and passing its length to the initializer. + """ + + length = storage.file_length(name) + dbfile = storage.open_file(name) + return cls(dbfile, length) + + def file(self): + return self.dbfile + + def _read_extras(self): + try: + self.extras = self.dbfile.read_pickle() + except EOFError: + self.extras = {} + + def close(self): + if self.is_closed: + raise Exception("Tried to close %r twice" % self) + self.dbfile.close() + self.is_closed = True + + def key_at(self, pos): + # Returns the key bytes at the given position + + dbfile = self.dbfile + keylen = dbfile.get_uint(pos) + return dbfile.get(pos + _lengths.size, keylen) + + def key_and_range_at(self, pos): + # Returns a (keybytes, datapos, datalen) tuple for the key at the given + # position + dbfile = self.dbfile + lenssize = _lengths.size + + if pos >= self.endofdata: + return None + + keylen, datalen = _lengths.unpack(dbfile.get(pos, lenssize)) + keybytes = dbfile.get(pos + lenssize, keylen) + datapos = pos + lenssize + keylen + return keybytes, datapos, datalen + + def _ranges(self, pos=None, eod=None): + # Yields a series of (keypos, keylength, datapos, datalength) tuples + # for the key/value pairs in the file + dbfile = self.dbfile + pos = pos or self.startofdata + eod = eod or self.endofdata + lenssize = _lengths.size + unpacklens = _lengths.unpack + + while pos < eod: + keylen, datalen = unpacklens(dbfile.get(pos, lenssize)) + keypos = pos + lenssize + datapos = keypos + keylen + yield (keypos, keylen, datapos, datalen) + pos = datapos + datalen + + def __getitem__(self, key): + for value in self.all(key): + return value + raise KeyError(key) + + def __iter__(self): + dbfile = self.dbfile + for keypos, keylen, datapos, datalen in self._ranges(): + key = dbfile.get(keypos, keylen) + value = dbfile.get(datapos, datalen) + yield (key, value) + + def __contains__(self, key): + for _ in self.ranges_for_key(key): + return True + return False + + def keys(self): + dbfile = self.dbfile + for keypos, keylen, _, _ in self._ranges(): + yield dbfile.get(keypos, keylen) + + def values(self): + dbfile = self.dbfile + for _, _, datapos, datalen in 
self._ranges(): + yield dbfile.get(datapos, datalen) + + def items(self): + dbfile = self.dbfile + for keypos, keylen, datapos, datalen in self._ranges(): + yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) + + def get(self, key, default=None): + for value in self.all(key): + return value + return default + + def all(self, key): + """Yields a sequence of values associated with the given key. + """ + + dbfile = self.dbfile + for datapos, datalen in self.ranges_for_key(key): + yield dbfile.get(datapos, datalen) + + def ranges_for_key(self, key): + """Yields a sequence of ``(datapos, datalength)`` tuples associated + with the given key. + """ + + if not isinstance(key, bytes_type): + raise TypeError("Key %r should be bytes" % key) + dbfile = self.dbfile + + # Hash the key + keyhash = self.hashfn(key) + # Get the position and number of slots for the hash table in which the + # key may be found + tablestart, numslots = self.tables[keyhash & 255] + # If the hash table is empty, we know the key doesn't exists + if not numslots: + return + + ptrsize = _pointer.size + unpackptr = _pointer.unpack + lenssize = _lengths.size + unpacklens = _lengths.unpack + + # Calculate where the key's slot should be + slotpos = tablestart + (((keyhash >> 8) % numslots) * ptrsize) + # Read slots looking for our key's hash value + for _ in xrange(numslots): + slothash, itempos = unpackptr(dbfile.get(slotpos, ptrsize)) + # If this slot is empty, we're done + if not itempos: + return + + # If the key hash in this slot matches our key's hash, we might have + # a match, so read the actual key and see if it's our key + if slothash == keyhash: + # Read the key and value lengths + keylen, datalen = unpacklens(dbfile.get(itempos, lenssize)) + # Only bother reading the actual key if the lengths match + if keylen == len(key): + keystart = itempos + lenssize + if key == dbfile.get(keystart, keylen): + # The keys match, so yield (datapos, datalen) + yield (keystart + keylen, datalen) + + slotpos += ptrsize + # If we reach the end of the hashtable, wrap around + if slotpos == tablestart + (numslots * ptrsize): + slotpos = tablestart + + def range_for_key(self, key): + for item in self.ranges_for_key(key): + return item + raise KeyError(key) + + +# Ordered hash file + +class OrderedHashWriter(HashWriter): + """Implements an on-disk hash, but requires that keys be added in order. + An :class:`OrderedHashReader` can then look up "nearest keys" based on + the ordering. + """ + + def __init__(self, dbfile): + HashWriter.__init__(self, dbfile) + # Keep an array of the positions of all keys + self.index = GrowableArray("H") + # Keep track of the last key added + self.lastkey = emptybytes + + def add(self, key, value): + if key <= self.lastkey: + raise ValueError("Keys must increase: %r..%r" + % (self.lastkey, key)) + self.index.append(self.dbfile.tell()) + HashWriter.add(self, key, value) + self.lastkey = key + + def _write_extras(self): + dbfile = self.dbfile + index = self.index + + # Store metadata about the index array + self.extras["indextype"] = index.typecode + self.extras["indexlen"] = len(index) + # Write the extras + HashWriter._write_extras(self) + # Write the index array + index.to_file(dbfile) + + +class OrderedHashReader(HashReader): + def closest_key(self, key): + """Returns the closest key equal to or greater than the given key. If + there is no key in the file equal to or greater than the given key, + returns None. 
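        For example (a minimal sketch; the storage object, file name, and keys
        are hypothetical)::

            from whoosh.filedb.filestore import FileStorage

            st = FileStorage("indexdir").create()
            ohw = OrderedHashWriter(st.create_file("ordered.hsh"))
            # Keys must be added in increasing order
            ohw.add_all((k, k) for k in [b"alfa", b"bravo", b"delta"])
            ohw.close()

            ohr = OrderedHashReader.open(st, "ordered.hsh")
            assert ohr.closest_key(b"charlie") == b"delta"
            assert list(ohr.keys_from(b"bravo")) == [b"bravo", b"delta"]
            ohr.close()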
+ """ + + pos = self.closest_key_pos(key) + if pos is None: + return None + return self.key_at(pos) + + def ranges_from(self, key): + """Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples + for the ordered series of keys equal or greater than the given key. + """ + + pos = self.closest_key_pos(key) + if pos is None: + return + + for item in self._ranges(pos=pos): + yield item + + def keys_from(self, key): + """Yields an ordered series of keys equal to or greater than the given + key. + """ + + dbfile = self.dbfile + for keypos, keylen, _, _ in self.ranges_from(key): + yield dbfile.get(keypos, keylen) + + def items_from(self, key): + """Yields an ordered series of ``(key, value)`` tuples for keys equal + to or greater than the given key. + """ + + dbfile = self.dbfile + for keypos, keylen, datapos, datalen in self.ranges_from(key): + yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) + + def _read_extras(self): + dbfile = self.dbfile + + # Read the extras + HashReader._read_extras(self) + + # Set up for reading the index array + indextype = self.extras["indextype"] + self.indexbase = dbfile.tell() + self.indexlen = self.extras["indexlen"] + self.indexsize = struct.calcsize(indextype) + # Set up the function to read values from the index array + if indextype == "B": + self._get_pos = dbfile.get_byte + elif indextype == "H": + self._get_pos = dbfile.get_ushort + elif indextype == "i": + self._get_pos = dbfile.get_int + elif indextype == "I": + self._get_pos = dbfile.get_uint + elif indextype == "q": + self._get_pos = dbfile.get_long + else: + raise Exception("Unknown index type %r" % indextype) + + def closest_key_pos(self, key): + # Given a key, return the position of that key OR the next highest key + # if the given key does not exist + if not isinstance(key, bytes_type): + raise TypeError("Key %r should be bytes" % key) + + indexbase = self.indexbase + indexsize = self.indexsize + key_at = self.key_at + _get_pos = self._get_pos + + # Do a binary search of the positions in the index array + lo = 0 + hi = self.indexlen + while lo < hi: + mid = (lo + hi) // 2 + midkey = key_at(_get_pos(indexbase + mid * indexsize)) + if midkey < key: + lo = mid + 1 + else: + hi = mid + + # If we went off the end, return None + if lo == self.indexlen: + return None + # Return the closest key + return _get_pos(indexbase + lo * indexsize) + + +# Fielded Ordered hash file + +class FieldedOrderedHashWriter(HashWriter): + """Implements an on-disk hash, but writes separate position indexes for + each field. 
+ """ + + def __init__(self, dbfile): + HashWriter.__init__(self, dbfile) + # Map field names to (startpos, indexpos, length, typecode) + self.fieldmap = self.extras["fieldmap"] = {} + + # Keep track of the last key added + self.lastkey = emptybytes + + def start_field(self, fieldname): + self.fieldstart = self.dbfile.tell() + self.fieldname = fieldname + # Keep an array of the positions of all keys + self.poses = GrowableArray("H") + self.lastkey = emptybytes + + def add(self, key, value): + if key <= self.lastkey: + raise ValueError("Keys must increase: %r..%r" + % (self.lastkey, key)) + self.poses.append(self.dbfile.tell() - self.fieldstart) + HashWriter.add(self, key, value) + self.lastkey = key + + def end_field(self): + dbfile = self.dbfile + fieldname = self.fieldname + poses = self.poses + self.fieldmap[fieldname] = (self.fieldstart, dbfile.tell(), len(poses), + poses.typecode) + poses.to_file(dbfile) + + +class FieldedOrderedHashReader(HashReader): + def __init__(self, *args, **kwargs): + HashReader.__init__(self, *args, **kwargs) + self.fieldmap = self.extras["fieldmap"] + # Make a sorted list of the field names with their start and end ranges + self.fieldlist = [] + for fieldname in sorted(self.fieldmap.keys()): + startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] + self.fieldlist.append((fieldname, startpos, ixpos)) + + def field_start(self, fieldname): + return self.fieldmap[fieldname][0] + + def fielded_ranges(self, pos=None, eod=None): + flist = self.fieldlist + fpos = 0 + fieldname, start, end = flist[fpos] + for keypos, keylen, datapos, datalen in self._ranges(pos, eod): + if keypos >= end: + fpos += 1 + fieldname, start, end = flist[fpos] + yield fieldname, keypos, keylen, datapos, datalen + + def iter_terms(self): + get = self.dbfile.get + for fieldname, keypos, keylen, _, _ in self.fielded_ranges(): + yield fieldname, get(keypos, keylen) + + def iter_term_items(self): + get = self.dbfile.get + for item in self.fielded_ranges(): + fieldname, keypos, keylen, datapos, datalen = item + yield fieldname, get(keypos, keylen), get(datapos, datalen) + + def contains_term(self, fieldname, btext): + try: + x = self.range_for_term(fieldname, btext) + return True + except KeyError: + return False + + def range_for_term(self, fieldname, btext): + start, ixpos, ixsize, code = self.fieldmap[fieldname] + for datapos, datalen in self.ranges_for_key(btext): + if start < datapos < ixpos: + return datapos, datalen + raise KeyError((fieldname, btext)) + + def term_data(self, fieldname, btext): + datapos, datalen = self.range_for_term(fieldname, btext) + return self.dbfile.get(datapos, datalen) + + def term_get(self, fieldname, btext, default=None): + try: + return self.term_data(fieldname, btext) + except KeyError: + return default + + def closest_term_pos(self, fieldname, key): + # Given a key, return the position of that key OR the next highest key + # if the given key does not exist + if not isinstance(key, bytes_type): + raise TypeError("Key %r should be bytes" % key) + + dbfile = self.dbfile + key_at = self.key_at + startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] + + if ixtype == "B": + get_pos = dbfile.get_byte + elif ixtype == "H": + get_pos = dbfile.get_ushort + elif ixtype == "i": + get_pos = dbfile.get_int + elif ixtype == "I": + get_pos = dbfile.get_uint + elif ixtype == "q": + get_pos = dbfile.get_long + else: + raise Exception("Unknown index type %r" % ixtype) + + # Do a binary search of the positions in the index array + lo = 0 + hi = ixsize + while lo < hi: 
+ mid = (lo + hi) // 2 + midkey = key_at(startpos + get_pos(ixpos + mid * ixsize)) + if midkey < key: + lo = mid + 1 + else: + hi = mid + + # If we went off the end, return None + if lo == ixsize: + return None + # Return the closest key + return startpos + get_pos(ixpos + lo * ixsize) + + def closest_term(self, fieldname, btext): + pos = self.closest_term_pos(fieldname, btext) + if pos is None: + return None + return self.key_at(pos) + + def term_ranges_from(self, fieldname, btext): + pos = self.closest_term_pos(fieldname, btext) + if pos is None: + return + + startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] + for item in self._ranges(pos, ixpos): + yield item + + def terms_from(self, fieldname, btext): + dbfile = self.dbfile + for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext): + yield dbfile.get(keypos, keylen) + + def term_items_from(self, fieldname, btext): + dbfile = self.dbfile + for item in self.term_ranges_from(fieldname, btext): + keypos, keylen, datapos, datalen = item + yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) + + + diff --git a/src/whoosh/filedb/gae.py b/src/whoosh/filedb/gae.py new file mode 100644 index 0000000..bf0d5d2 --- /dev/null +++ b/src/whoosh/filedb/gae.py @@ -0,0 +1,164 @@ +""" +This module contains EXPERIMENTAL support for storing a Whoosh index's files in +the Google App Engine blobstore. This will use a lot of RAM since all files are +loaded into RAM, but it potentially useful as a workaround for the lack of file +storage in Google App Engine. + +Use at your own risk, but please report any problems to me so I can fix them. + +To create a new index:: + + from whoosh.filedb.gae import DatastoreStorage + + ix = DatastoreStorage().create_index(schema) + +To open an existing index:: + + ix = DatastoreStorage().open_index() +""" + +import time + +from google.appengine.api import memcache # @UnresolvedImport +from google.appengine.ext import db # @UnresolvedImport + +from whoosh.compat import BytesIO +from whoosh.index import TOC, FileIndex, _DEF_INDEX_NAME +from whoosh.filedb.filestore import ReadOnlyError, Storage +from whoosh.filedb.structfile import StructFile + + +class DatastoreFile(db.Model): + """A file-like object that is backed by a BytesIO() object whose contents + is loaded from a BlobProperty in the app engine datastore. 
+ """ + + value = db.BlobProperty() + mtime = db.IntegerProperty(default=0) + + def __init__(self, *args, **kwargs): + super(DatastoreFile, self).__init__(*args, **kwargs) + self.data = BytesIO() + + @classmethod + def loadfile(cls, name): + value = memcache.get(name, namespace="DatastoreFile") + if value is None: + file = cls.get_by_key_name(name) + memcache.set(name, file.value, namespace="DatastoreFile") + else: + file = cls(value=value) + file.data = BytesIO(file.value) + return file + + def close(self): + oldvalue = self.value + self.value = self.getvalue() + if oldvalue != self.value: + self.mtime = int(time.time()) + self.put() + memcache.set(self.key().id_or_name(), self.value, + namespace="DatastoreFile") + + def tell(self): + return self.data.tell() + + def write(self, data): + return self.data.write(data) + + def read(self, length): + return self.data.read(length) + + def seek(self, *args): + return self.data.seek(*args) + + def readline(self): + return self.data.readline() + + def getvalue(self): + return self.data.getvalue() + + +class MemcacheLock(object): + def __init__(self, name): + self.name = name + + def acquire(self, blocking=False): + val = memcache.add(self.name, "L", 360, namespace="whooshlocks") + + if blocking and not val: + # Simulate blocking by retrying the acquire over and over + import time + while not val: + time.sleep(0.1) + val = memcache.add(self.name, "", 360, namespace="whooshlocks") + + return val + + def release(self): + memcache.delete(self.name, namespace="whooshlocks") + + +class DatastoreStorage(Storage): + """An implementation of :class:`whoosh.store.Storage` that stores files in + the app engine datastore as blob properties. + """ + + def create_index(self, schema, indexname=_DEF_INDEX_NAME): + if self.readonly: + raise ReadOnlyError + + TOC.create(self, schema, indexname) + return FileIndex(self, schema, indexname) + + def open_index(self, indexname=_DEF_INDEX_NAME, schema=None): + return FileIndex(self, schema=schema, indexname=indexname) + + def list(self): + query = DatastoreFile.all() + keys = [] + for file in query: + keys.append(file.key().id_or_name()) + return keys + + def clean(self): + pass + + def total_size(self): + return sum(self.file_length(f) for f in self.list()) + + def file_exists(self, name): + return DatastoreFile.get_by_key_name(name) is not None + + def file_modified(self, name): + return DatastoreFile.get_by_key_name(name).mtime + + def file_length(self, name): + return len(DatastoreFile.get_by_key_name(name).value) + + def delete_file(self, name): + memcache.delete(name, namespace="DatastoreFile") + return DatastoreFile.get_by_key_name(name).delete() + + def rename_file(self, name, newname, safe=False): + file = DatastoreFile.get_by_key_name(name) + newfile = DatastoreFile(key_name=newname) + newfile.value = file.value + newfile.mtime = file.mtime + newfile.put() + file.delete() + + def create_file(self, name, **kwargs): + f = StructFile(DatastoreFile(key_name=name), name=name, + onclose=lambda sfile: sfile.file.close()) + return f + + def open_file(self, name, *args, **kwargs): + return StructFile(DatastoreFile.loadfile(name)) + + def lock(self, name): + return MemcacheLock(name) + + def temp_storage(self, name=None): + tempstore = DatastoreStorage() + return tempstore.create() diff --git a/src/whoosh/filedb/structfile.py b/src/whoosh/filedb/structfile.py new file mode 100644 index 0000000..8bf1cbc --- /dev/null +++ b/src/whoosh/filedb/structfile.py @@ -0,0 +1,402 @@ +# Copyright 2009 Matt Chaput. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from array import array +from copy import copy +from struct import calcsize + +from whoosh.compat import BytesIO, bytes_type +from whoosh.compat import dump as dump_pickle +from whoosh.compat import load as load_pickle +from whoosh.compat import array_frombytes, array_tobytes +from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE +from whoosh.system import IS_LITTLE +from whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte +from whoosh.system import pack_ushort, unpack_ushort +from whoosh.system import pack_ushort_le, unpack_ushort_le +from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint +from whoosh.system import pack_uint_le, unpack_uint_le +from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong +from whoosh.system import pack_float, unpack_float +from whoosh.util.varints import varint, read_varint +from whoosh.util.varints import signed_varint, decode_signed_varint + + +_SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf") +_ORDERMAP = {"little": "<", "big": ">"} + +_types = (("sbyte", "b"), ("ushort", "H"), ("int", "i"), + ("long", "q"), ("float", "f")) + + +# Main function + +class StructFile(object): + """Returns a "structured file" object that wraps the given file object and + provides numerous additional methods for writing structured data, such as + "write_varint" and "write_long". 
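    For example, writing and then reading back a few typed values through an
    in-memory buffer (a minimal sketch)::

        from whoosh.compat import BytesIO
        from whoosh.filedb.structfile import StructFile

        f = StructFile(BytesIO())
        f.write_varint(1000)        # variable-length unsigned integer
        f.write_int(-5)             # fixed-size signed integer
        f.write_string(b"hello")    # length-prefixed bytes
        f.seek(0)
        assert f.read_varint() == 1000
        assert f.read_int() == -5
        assert f.read_string() == b"hello"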
+ """ + + def __init__(self, fileobj, name=None, onclose=None): + self.file = fileobj + self._name = name + self.onclose = onclose + self.is_closed = False + + self.is_real = hasattr(fileobj, "fileno") + if self.is_real: + self.fileno = fileobj.fileno + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self._name) + + def __str__(self): + return self._name + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def __iter__(self): + return iter(self.file) + + def raw_file(self): + return self.file + + def read(self, *args, **kwargs): + return self.file.read(*args, **kwargs) + + def readline(self, *args, **kwargs): + return self.file.readline(*args, **kwargs) + + def write(self, *args, **kwargs): + return self.file.write(*args, **kwargs) + + def tell(self, *args, **kwargs): + return self.file.tell(*args, **kwargs) + + def seek(self, *args, **kwargs): + return self.file.seek(*args, **kwargs) + + def truncate(self, *args, **kwargs): + return self.file.truncate(*args, **kwargs) + + def flush(self): + """Flushes the buffer of the wrapped file. This is a no-op if the + wrapped file does not have a flush method. + """ + + if hasattr(self.file, "flush"): + self.file.flush() + + def close(self): + """Closes the wrapped file. + """ + + if self.is_closed: + raise Exception("This file is already closed") + if self.onclose: + self.onclose(self) + if hasattr(self.file, "close"): + self.file.close() + self.is_closed = True + + def subset(self, offset, length, name=None): + from whoosh.filedb.compound import SubFile + + name = name or self._name + return StructFile(SubFile(self.file, offset, length), name=name) + + def write_string(self, s): + """Writes a string to the wrapped file. This method writes the length + of the string first, so you can read the string back without having to + know how long it was. + """ + self.write_varint(len(s)) + self.write(s) + + def write_string2(self, s): + self.write(pack_ushort(len(s)) + s) + + def write_string4(self, s): + self.write(pack_int(len(s)) + s) + + def read_string(self): + """Reads a string from the wrapped file. + """ + return self.read(self.read_varint()) + + def read_string2(self): + l = self.read_ushort() + return self.read(l) + + def read_string4(self): + l = self.read_int() + return self.read(l) + + def get_string2(self, pos): + l = self.get_ushort(pos) + base = pos + _SHORT_SIZE + return self.get(base, l), base + l + + def get_string4(self, pos): + l = self.get_int(pos) + base = pos + _INT_SIZE + return self.get(base, l), base + l + + def skip_string(self): + l = self.read_varint() + self.seek(l, 1) + + def write_varint(self, i): + """Writes a variable-length unsigned integer to the wrapped file. + """ + self.write(varint(i)) + + def write_svarint(self, i): + """Writes a variable-length signed integer to the wrapped file. + """ + self.write(signed_varint(i)) + + def read_varint(self): + """Reads a variable-length encoded unsigned integer from the wrapped + file. + """ + return read_varint(self.read) + + def read_svarint(self): + """Reads a variable-length encoded signed integer from the wrapped + file. + """ + return decode_signed_varint(read_varint(self.read)) + + def write_tagint(self, i): + """Writes a sometimes-compressed unsigned integer to the wrapped file. + This is similar to the varint methods but uses a less compressed but + faster format. + """ + + # Store numbers 0-253 in one byte. Byte 254 means "an unsigned 16-bit + # int follows." 
Byte 255 means "An unsigned 32-bit int follows." + if i <= 253: + self.write(chr(i)) + elif i <= 65535: + self.write("\xFE" + pack_ushort(i)) + else: + self.write("\xFF" + pack_uint(i)) + + def read_tagint(self): + """Reads a sometimes-compressed unsigned integer from the wrapped file. + This is similar to the varint methods but uses a less compressed but + faster format. + """ + + tb = ord(self.read(1)) + if tb == 254: + return self.read_ushort() + elif tb == 255: + return self.read_uint() + else: + return tb + + def write_byte(self, n): + """Writes a single byte to the wrapped file, shortcut for + ``file.write(chr(n))``. + """ + self.write(pack_byte(n)) + + def read_byte(self): + return ord(self.read(1)) + + def write_pickle(self, obj, protocol=-1): + """Writes a pickled representation of obj to the wrapped file. + """ + dump_pickle(obj, self.file, protocol) + + def read_pickle(self): + """Reads a pickled object from the wrapped file. + """ + return load_pickle(self.file) + + def write_sbyte(self, n): + self.write(pack_sbyte(n)) + + def write_int(self, n): + self.write(pack_int(n)) + + def write_uint(self, n): + self.write(pack_uint(n)) + + def write_uint_le(self, n): + self.write(pack_uint_le(n)) + + def write_ushort(self, n): + self.write(pack_ushort(n)) + + def write_ushort_le(self, n): + self.write(pack_ushort_le(n)) + + def write_long(self, n): + self.write(pack_long(n)) + + def write_ulong(self, n): + self.write(pack_ulong(n)) + + def write_float(self, n): + self.write(pack_float(n)) + + def write_array(self, arry): + if IS_LITTLE: + arry = copy(arry) + arry.byteswap() + if self.is_real: + arry.tofile(self.file) + else: + self.write(array_tobytes(arry)) + + def read_sbyte(self): + return unpack_sbyte(self.read(1))[0] + + def read_int(self): + return unpack_int(self.read(_INT_SIZE))[0] + + def read_uint(self): + return unpack_uint(self.read(_INT_SIZE))[0] + + def read_uint_le(self): + return unpack_uint_le(self.read(_INT_SIZE))[0] + + def read_ushort(self): + return unpack_ushort(self.read(_SHORT_SIZE))[0] + + def read_ushort_le(self): + return unpack_ushort_le(self.read(_SHORT_SIZE))[0] + + def read_long(self): + return unpack_long(self.read(_LONG_SIZE))[0] + + def read_ulong(self): + return unpack_ulong(self.read(_LONG_SIZE))[0] + + def read_float(self): + return unpack_float(self.read(_FLOAT_SIZE))[0] + + def read_array(self, typecode, length): + a = array(typecode) + if self.is_real: + a.fromfile(self.file, length) + else: + array_frombytes(a, self.read(length * _SIZEMAP[typecode])) + if IS_LITTLE: + a.byteswap() + return a + + def get(self, position, length): + self.seek(position) + return self.read(length) + + def get_byte(self, position): + return unpack_byte(self.get(position, 1))[0] + + def get_sbyte(self, position): + return unpack_sbyte(self.get(position, 1))[0] + + def get_int(self, position): + return unpack_int(self.get(position, _INT_SIZE))[0] + + def get_uint(self, position): + return unpack_uint(self.get(position, _INT_SIZE))[0] + + def get_ushort(self, position): + return unpack_ushort(self.get(position, _SHORT_SIZE))[0] + + def get_long(self, position): + return unpack_long(self.get(position, _LONG_SIZE))[0] + + def get_ulong(self, position): + return unpack_ulong(self.get(position, _LONG_SIZE))[0] + + def get_float(self, position): + return unpack_float(self.get(position, _FLOAT_SIZE))[0] + + def get_array(self, position, typecode, length): + self.seek(position) + return self.read_array(typecode, length) + + +class BufferFile(StructFile): + def __init__(self, 
buf, name=None, onclose=None): + self._buf = buf + self._name = name + self.file = BytesIO(buf) + self.onclose = onclose + + self.is_real = False + self.is_closed = False + + def subset(self, position, length, name=None): + name = name or self._name + return BufferFile(self.get(position, length), name=name) + + def get(self, position, length): + return bytes_type(self._buf[position:position + length]) + + def get_array(self, position, typecode, length): + a = array(typecode) + array_frombytes(a, self.get(position, length * _SIZEMAP[typecode])) + if IS_LITTLE: + a.byteswap() + return a + + +class ChecksumFile(StructFile): + def __init__(self, *args, **kwargs): + StructFile.__init__(self, *args, **kwargs) + self._check = 0 + self._crc32 = __import__("zlib").crc32 + + def __iter__(self): + for line in self.file: + self._check = self._crc32(line, self._check) + yield line + + def seek(self, *args): + raise Exception("Cannot seek on a ChecksumFile") + + def read(self, *args, **kwargs): + b = self.file.read(*args, **kwargs) + self._check = self._crc32(b, self._check) + return b + + def write(self, b): + self._check = self._crc32(b, self._check) + self.file.write(b) + + def checksum(self): + return self._check & 0xffffffff diff --git a/src/whoosh/formats.py b/src/whoosh/formats.py new file mode 100644 index 0000000..963e136 --- /dev/null +++ b/src/whoosh/formats.py @@ -0,0 +1,481 @@ +# Copyright 2009 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +The classes in this module encode and decode posting information for a field. +The field format essentially determines what information is stored about each +occurance of a term. 
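+
+A rough sketch of how a format object is used (the analyzer and sample
+text are arbitrary; each yielded item is a
+``(token_text, frequency, weight, encoded_value)`` tuple)::
+
+    from whoosh.analysis import StandardAnalyzer
+    from whoosh.formats import Frequency
+
+    fmt = Frequency()
+    ana = StandardAnalyzer()
+    postings = list(fmt.word_values(u"alfa bravo alfa", ana))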
+""" + +from collections import defaultdict + +from whoosh.analysis import unstopped, entoken +from whoosh.compat import iteritems, dumps, loads, b +from whoosh.system import emptybytes +from whoosh.system import _INT_SIZE, _FLOAT_SIZE +from whoosh.system import pack_uint, unpack_uint, pack_float, unpack_float + + +# Format base class + +class Format(object): + """Abstract base class representing a storage format for a field or vector. + Format objects are responsible for writing and reading the low-level + representation of a field. It controls what kind/level of information to + store about the indexed fields. + """ + + posting_size = -1 + textual = True + __inittypes__ = dict(field_boost=float) + + def __init__(self, field_boost=1.0, **options): + """ + :param field_boost: A constant boost factor to scale to the score + of all queries matching terms in this field. + """ + + self.field_boost = field_boost + self.options = options + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.__dict__ == other.__dict__) + + def __repr__(self): + return "%s(boost=%s)" % (self.__class__.__name__, self.field_boost) + + def fixed_value_size(self): + if self.posting_size < 0: + return None + return self.posting_size + + def word_values(self, value, analyzer, **kwargs): + """Takes the text value to be indexed and yields a series of + ("tokentext", frequency, weight, valuestring) tuples, where frequency + is the number of times "tokentext" appeared in the value, weight is the + weight (a float usually equal to frequency in the absence of per-term + boosts) and valuestring is encoded field-specific posting value for the + token. For example, in a Frequency format, the value string would be + the same as frequency; in a Positions format, the value string would + encode a list of token positions at which "tokentext" occured. + + :param value: The unicode text to index. + :param analyzer: The analyzer to use to process the text. + """ + + raise NotImplementedError + + def supports(self, name): + """Returns True if this format supports interpreting its posting + value as 'name' (e.g. "frequency" or "positions"). + """ + return hasattr(self, "decode_" + name) + + def decoder(self, name): + """Returns the bound method for interpreting value as 'name', + where 'name' is for example "frequency" or "positions". This + object must have a corresponding Format.decode_() method. + """ + return getattr(self, "decode_" + name) + + def decode_as(self, astype, valuestring): + """Interprets the encoded value string as 'astype', where 'astype' is + for example "frequency" or "positions". This object must have a + corresponding decode_() method. + """ + return self.decoder(astype)(valuestring) + + +# Concrete field classes + +# TODO: as a legacy thing most of these formats store the frequency but not the +# weight in the value string, so if you use field or term boosts +# postreader.value_as("weight") will not match postreader.weight() + +def tokens(value, analyzer, kwargs): + if isinstance(value, (tuple, list)): + gen = entoken(value, **kwargs) + else: + gen = analyzer(value, **kwargs) + return unstopped(gen) + + +class Existence(Format): + """Only indexes whether a given term occurred in a given document; it does + not store frequencies or positions. This is useful for fields that should + be searchable but not scorable, such as file path. + + Supports: frequency, weight (always reports frequency = 1). 
+ """ + + posting_size = 0 + __inittypes__ = dict(field_boost=float) + + def __init__(self, field_boost=1.0, **options): + self.field_boost = field_boost + self.options = options + + def word_values(self, value, analyzer, **kwargs): + fb = self.field_boost + wordset = set(t.text for t in tokens(value, analyzer, kwargs)) + return ((w, 1, fb, emptybytes) for w in wordset) + + def encode(self, value): + return emptybytes + + def decode_frequency(self, valuestring): + return 1 + + def decode_weight(self, valuestring): + return self.field_boost + + def combine(self, vs): + return emptybytes + + +class Frequency(Format): + """Stores frequency information for each posting. + + Supports: frequency, weight. + """ + + posting_size = _INT_SIZE + __inittypes__ = dict(field_boost=float, boost_as_freq=bool) + + def __init__(self, field_boost=1.0, boost_as_freq=False, + **options): + """ + :param field_boost: A constant boost factor to scale to the score of + all queries matching terms in this field. + """ + + assert isinstance(field_boost, float) + self.field_boost = field_boost + self.options = options + + def word_values(self, value, analyzer, **kwargs): + fb = self.field_boost + length = 0 + freqs = defaultdict(int) + weights = defaultdict(float) + + kwargs["boosts"] = True + for t in tokens(value, analyzer, kwargs): + length += 1 + freqs[t.text] += 1 + weights[t.text] += t.boost + + wvs = ((w, freq, weights[w] * fb, pack_uint(freq)) for w, freq + in iteritems(freqs)) + return wvs + + def decode_frequency(self, valuestring): + return unpack_uint(valuestring)[0] + + def decode_weight(self, valuestring): + freq = unpack_uint(valuestring)[0] + return freq * self.field_boost + + def combine(self, vs): + return pack_uint(sum(self.decode_value(v) for v in vs)) + + +class Positions(Format): + """Stores position information in each posting, to allow phrase searching + and "near" queries. + + Supports: frequency, weight, positions, position_boosts (always reports + position boost = 1.0). + """ + + def word_values(self, value, analyzer, **kwargs): + fb = self.field_boost + poses = defaultdict(list) + weights = defaultdict(float) + kwargs["positions"] = True + kwargs["boosts"] = True + for t in tokens(value, analyzer, kwargs): + poses[t.text].append(t.pos) + weights[t.text] += t.boost + + for w, poslist in iteritems(poses): + value = self.encode(poslist) + yield (w, len(poslist), weights[w] * fb, value) + + def encode(self, poslist): + deltas = [] + base = 0 + for pos in poslist: + deltas.append(pos - base) + base = pos + return pack_uint(len(deltas)) + dumps(deltas, -1) + + def decode_positions(self, valuestring): + if not valuestring.endswith(b(".")): + valuestring += b(".") + codes = loads(valuestring[_INT_SIZE:]) + position = 0 + positions = [] + for code in codes: + position += code + positions.append(position) + return positions + + def decode_frequency(self, valuestring): + return unpack_uint(valuestring[:_INT_SIZE])[0] + + def decode_weight(self, valuestring): + return self.decode_frequency(valuestring) * self.field_boost + + def decode_position_boosts(self, valuestring): + return [(pos, 1) for pos in self.decode_positions(valuestring)] + + def combine(self, vs): + s = set() + for v in vs: + s.update(self.decode_positions(v)) + return self.encode(sorted(s)) + + +class Characters(Positions): + """Stores token position and character start and end information for each + posting. + + Supports: frequency, weight, positions, position_boosts (always reports + position boost = 1.0), characters. 
+ """ + + def word_values(self, value, analyzer, **kwargs): + fb = self.field_boost + seen = defaultdict(list) + weights = defaultdict(float) + + kwargs["positions"] = True + kwargs["chars"] = True + kwargs["boosts"] = True + for t in tokens(value, analyzer, kwargs): + seen[t.text].append((t.pos, t.startchar, t.endchar)) + weights[t.text] += t.boost + + for w, poslist in iteritems(seen): + value = self.encode(poslist) + yield (w, len(poslist), weights[w] * fb, value) + + def encode(self, poslist): + deltas = [] + posbase = 0 + charbase = 0 + for pos, startchar, endchar in poslist: + deltas.append((pos - posbase, startchar - charbase, + endchar - startchar)) + posbase = pos + charbase = endchar + return pack_uint(len(deltas)) + dumps(deltas, -1) + + def decode_characters(self, valuestring): + if not valuestring.endswith(b(".")): + valuestring += b(".") + codes = loads(valuestring[_INT_SIZE:]) + position = 0 + endchar = 0 + posns_chars = [] + for code in codes: + position = code[0] + position + startchar = code[1] + endchar + endchar = code[2] + startchar + posns_chars.append((position, startchar, endchar)) + return posns_chars + + def decode_positions(self, valuestring): + if not valuestring.endswith(b(".")): + valuestring += b(".") + codes = loads(valuestring[_INT_SIZE:]) + position = 0 + posns = [] + for code in codes: + position = code[0] + position + posns.append(position) + return posns + + def combine(self, vs): + s = {} + for v in vs: + for pos, sc, ec in self.decode_characters(v): + if pos in s: + old_sc, old_ec = pos[s] + s[pos] = (min(sc, old_sc), max(ec, old_ec)) + else: + s[pos] = (sc, ec) + poses = [(pos, s[pos][0], s[pos][1]) for pos in sorted(s.keys())] + return self.encode(poses) + + +class PositionBoosts(Positions): + """A format that stores positions and per-position boost information + in each posting. + + Supports: frequency, weight, positions, position_boosts. 
+ """ + + def word_values(self, value, analyzer, **kwargs): + fb = self.field_boost + seen = defaultdict(list) + + kwargs["positions"] = True + kwargs["boosts"] = True + for t in tokens(value, analyzer, kwargs): + pos = t.pos + boost = t.boost + seen[t.text].append((pos, boost)) + + for w, poses in iteritems(seen): + value = self.encode(poses) + yield (w, len(poses), sum(p[1] for p in poses) * fb, value) + + def encode(self, poses): + codes = [] + base = 0 + summedboost = 0 + for pos, boost in poses: + summedboost += boost + codes.append((pos - base, boost)) + base = pos + return (pack_uint(len(poses)) + pack_float(summedboost) + + dumps(codes, -1)) + + def decode_position_boosts(self, valuestring): + if not valuestring.endswith(b(".")): + valuestring += b(".") + codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:]) + position = 0 + posns_boosts = [] + for code in codes: + position = code[0] + position + posns_boosts.append((position, code[1])) + return posns_boosts + + def decode_positions(self, valuestring): + if not valuestring.endswith(b(".")): + valuestring += b(".") + codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:]) + position = 0 + posns = [] + for code in codes: + position = code[0] + position + posns.append(position) + return posns + + def decode_weight(self, v): + summedboost = unpack_float(v[_INT_SIZE:_INT_SIZE + _FLOAT_SIZE])[0] + return summedboost * self.field_boost + + def combine(self, vs): + s = defaultdict(float) + for v in vs: + for pos, boost in self.decode_position_boosts(v): + s[pos] += boost + return self.encode(sorted(s.items())) + + +class CharacterBoosts(Characters): + """A format that stores positions, character start and end, and + per-position boost information in each posting. + + Supports: frequency, weight, positions, position_boosts, characters, + character_boosts. + """ + + def word_values(self, value, analyzer, **kwargs): + seen = defaultdict(list) + + kwargs["positions"] = True + kwargs["chars"] = True + kwargs["boosts"] = True + for t in tokens(value, analyzer, kwargs): + seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost)) + + for w, poses in iteritems(seen): + value, summedboost = self.encode(poses) + yield (w, len(poses), summedboost, value) + + def encode(self, poses): + fb = self.field_boost + # posns_chars_boosts = [(pos, startchar, endchar, boost), ...] 
+ codes = [] + posbase = 0 + charbase = 0 + summedboost = 0 + for pos, startchar, endchar, boost in poses: + codes.append((pos - posbase, startchar - charbase, + endchar - startchar, boost)) + posbase = pos + charbase = endchar + summedboost += boost + + return ((pack_uint(len(poses)) + pack_float(summedboost * fb) + + dumps(codes, -1)), summedboost) + + def decode_character_boosts(self, valuestring): + if not valuestring.endswith(b(".")): + valuestring += b(".") + codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:]) + position = 0 + endchar = 0 + posn_char_boosts = [] + for code in codes: + position = position + code[0] + startchar = endchar + code[1] + endchar = startchar + code[2] + posn_char_boosts.append((position, startchar, endchar, code[3])) + return posn_char_boosts + + def decode_positions(self, valuestring): + return [item[0] for item in self.decode_character_boosts(valuestring)] + + def decode_characters(self, valuestring): + return [(pos, startchar, endchar) for pos, startchar, endchar, _ + in self.decode_character_boosts(valuestring)] + + def decode_position_boosts(self, valuestring): + return [(pos, boost) for pos, _, _, boost + in self.decode_character_boosts(valuestring)] + + def combine(self, vs): + s = {} + for v in vs: + for pos, sc, ec, boost in self.decode_character_boosts(v): + if pos in s: + old_sc, old_ec, old_boost = pos[s] + s[pos] = (min(sc, old_sc), max(ec, old_ec), + old_boost + boost) + else: + s[pos] = (sc, ec, boost) + poses = [(pos, sc, ec, boost) for pos, (sc, ec, boost) + in sorted(s.items())] + return self.encode(poses)[0] # encode() returns value, summedboost diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py new file mode 100644 index 0000000..7088387 --- /dev/null +++ b/src/whoosh/highlight.py @@ -0,0 +1,952 @@ +# Copyright 2008 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +"""The highlight module contains classes and functions for displaying short +excerpts from hit documents in the search results you present to the user, with +query terms highlighted. 
+ +The highlighting system has four main elements. + +* **Fragmenters** chop up the original text into __fragments__, based on the + locations of matched terms in the text. + +* **Scorers** assign a score to each fragment, allowing the system to rank the + best fragments by whatever criterion. + +* **Order functions** control in what order the top-scoring fragments are + presented to the user. For example, you can show the fragments in the order + they appear in the document (FIRST) or show higher-scoring fragments first + (SCORE) + +* **Formatters** turn the fragment objects into human-readable output, such as + an HTML string. + +See :doc:`/highlight` for more information. +""" + +from __future__ import division +from collections import deque +from heapq import nlargest +from itertools import groupby + +from whoosh.compat import htmlescape +from whoosh.analysis import Token + + +# The default value for the maximum chars to examine when fragmenting +DEFAULT_CHARLIMIT = 2 ** 15 + + +# Fragment object + +def mkfrag(text, tokens, startchar=None, endchar=None, + charsbefore=0, charsafter=0): + """Returns a :class:`Fragment` object based on the :class:`analysis.Token` + objects in ``tokens`. + """ + + if startchar is None: + startchar = tokens[0].startchar if tokens else 0 + if endchar is None: + endchar = tokens[-1].endchar if tokens else len(text) + + startchar = max(0, startchar - charsbefore) + endchar = min(len(text), endchar + charsafter) + + return Fragment(text, tokens, startchar, endchar) + + +class Fragment(object): + """Represents a fragment (extract) from a hit document. This object is + mainly used to keep track of the start and end points of the fragment and + the "matched" character ranges inside; it does not contain the text of the + fragment or do much else. + + The useful attributes are: + + ``Fragment.text`` + The entire original text from which this fragment is taken. + + ``Fragment.matches`` + An ordered list of objects representing the matched terms in the + fragment. These objects have ``startchar`` and ``endchar`` attributes. + + ``Fragment.startchar`` + The index of the first character in the fragment. + + ``Fragment.endchar`` + The index of the last character in the fragment. + + ``Fragment.matched_terms`` + A ``set`` of the ``text`` of the matched terms in the fragment (if + available). + """ + + def __init__(self, text, matches, startchar=0, endchar= -1): + """ + :param text: the source text of the fragment. + :param matches: a list of objects which have ``startchar`` and + ``endchar`` attributes, and optionally a ``text`` attribute. + :param startchar: the index into ``text`` at which the fragment starts. + The default is 0. + :param endchar: the index into ``text`` at which the fragment ends. + The default is -1, which is interpreted as the length of ``text``. 
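+
+        A rough construction sketch (the token and text are made up)::
+
+            from whoosh.analysis import Token
+
+            t = Token(text=u"render", startchar=11, endchar=17)
+            frag = Fragment(u"We need to render the page", [t])
+            frag.matched_terms  # -> set([u"render"])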
+ """ + + self.text = text + self.matches = matches + + if endchar == -1: + endchar = len(text) + self.startchar = startchar + self.endchar = endchar + + self.matched_terms = set() + for t in matches: + if hasattr(t, "text"): + self.matched_terms.add(t.text) + + def __repr__(self): + return "" % (self.startchar, self.endchar, + len(self.matches)) + + def __len__(self): + return self.endchar - self.startchar + + def overlaps(self, fragment): + sc = self.startchar + ec = self.endchar + fsc = fragment.startchar + fec = fragment.endchar + return (sc < fsc < ec) or (sc < fec < ec) + + def overlapped_length(self, fragment): + sc = self.startchar + ec = self.endchar + fsc = fragment.startchar + fec = fragment.endchar + return max(ec, fec) - min(sc, fsc) + + def __lt__(self, other): + return id(self) < id(other) + + +# Tokenizing + +def set_matched_filter(tokens, termset): + for t in tokens: + t.matched = t.text in termset + yield t + + +# Fragmenters + +class Fragmenter(object): + def must_retokenize(self): + """Returns True if this fragmenter requires retokenized text. + + If this method returns True, the fragmenter's ``fragment_tokens`` + method will be called with an iterator of ALL tokens from the text, + with the tokens for matched terms having the ``matched`` attribute set + to True. + + If this method returns False, the fragmenter's ``fragment_matches`` + method will be called with a LIST of matching tokens. + """ + + return True + + def fragment_tokens(self, text, all_tokens): + """Yields :class:`Fragment` objects based on the tokenized text. + + :param text: the string being highlighted. + :param all_tokens: an iterator of :class:`analysis.Token` + objects from the string. + """ + + raise NotImplementedError + + def fragment_matches(self, text, matched_tokens): + """Yields :class:`Fragment` objects based on the text and the matched + terms. + + :param text: the string being highlighted. + :param matched_tokens: a list of :class:`analysis.Token` objects + representing the term matches in the string. + """ + + raise NotImplementedError + + +class WholeFragmenter(Fragmenter): + """Doesn't fragment the token stream. This object just returns the entire + entire stream as one "fragment". This is useful if you want to highlight + the entire text. + + Note that even if you use the `WholeFragmenter`, the highlight code will + return no fragment if no terms matched in the given field. To return the + whole fragment even in that case, call `highlights()` with `minscore=0`:: + + # Query where no terms match in the "text" field + q = query.Term("tag", "new") + + r = mysearcher.search(q) + r.fragmenter = highlight.WholeFragmenter() + r.formatter = highlight.UppercaseFormatter() + # Since no terms in the "text" field matched, we get no fragments back + assert r[0].highlights("text") == "" + + # If we lower the minimum score to 0, we get a fragment even though it + # has no matching terms + assert r[0].highlights("text", minscore=0) == "This is the text field." + + """ + + def __init__(self, charlimit=DEFAULT_CHARLIMIT): + self.charlimit = charlimit + + def fragment_tokens(self, text, tokens): + charlimit = self.charlimit + matches = [] + for t in tokens: + if charlimit and t.endchar > charlimit: + break + if t.matched: + matches.append(t.copy()) + return [Fragment(text, matches)] + + +# Backwards compatiblity +NullFragmeter = WholeFragmenter + + +class SentenceFragmenter(Fragmenter): + """Breaks the text up on sentence end punctuation characters + (".", "!", or "?"). 
This object works by looking in the original text for a + sentence end as the next character after each token's 'endchar'. + + When highlighting with this fragmenter, you should use an analyzer that + does NOT remove stop words, for example:: + + sa = StandardAnalyzer(stoplist=None) + """ + + def __init__(self, maxchars=200, sentencechars=".!?", + charlimit=DEFAULT_CHARLIMIT): + """ + :param maxchars: The maximum number of characters allowed in a + fragment. + """ + + self.maxchars = maxchars + self.sentencechars = frozenset(sentencechars) + self.charlimit = charlimit + + def fragment_tokens(self, text, tokens): + maxchars = self.maxchars + sentencechars = self.sentencechars + charlimit = self.charlimit + + textlen = len(text) + # startchar of first token in the current sentence + first = None + # Buffer for matched tokens in the current sentence + tks = [] + endchar = None + # Number of chars in the current sentence + currentlen = 0 + + for t in tokens: + startchar = t.startchar + endchar = t.endchar + if charlimit and endchar > charlimit: + break + + if first is None: + # Remember the startchar of the first token in a sentence + first = startchar + currentlen = 0 + + tlength = endchar - startchar + currentlen += tlength + + if t.matched: + tks.append(t.copy()) + + # If the character after the current token is end-of-sentence + # punctuation, finish the sentence and reset + if endchar < textlen and text[endchar] in sentencechars: + # Don't break for two periods in a row (e.g. ignore "...") + if endchar + 1 < textlen and text[endchar + 1] in sentencechars: + continue + + # If the sentence had matches and it's not too long, yield it + # as a token + if tks and currentlen <= maxchars: + yield mkfrag(text, tks, startchar=first, endchar=endchar) + # Reset the counts + tks = [] + first = None + currentlen = 0 + + # If we get to the end of the text and there's still a sentence + # in the buffer, yield it + if tks: + yield mkfrag(text, tks, startchar=first, endchar=endchar) + + +class ContextFragmenter(Fragmenter): + """Looks for matched terms and aggregates them with their surrounding + context. + """ + + def __init__(self, maxchars=200, surround=20, charlimit=DEFAULT_CHARLIMIT): + """ + :param maxchars: The maximum number of characters allowed in a + fragment. + :param surround: The number of extra characters of context to add both + before the first matched term and after the last matched term. + """ + + self.maxchars = maxchars + self.surround = surround + self.charlimit = charlimit + + def fragment_tokens(self, text, tokens): + maxchars = self.maxchars + surround = self.surround + charlimit = self.charlimit + + # startchar of the first token in the fragment + first = None + # Stack of startchars + firsts = deque() + # Each time we see a matched token, we reset the countdown to finishing + # the fragment. 
This also indicates whether we're currently inside a + # fragment (< 0 not in fragment, >= 0 in fragment) + countdown = -1 + # Tokens in current fragment + tks = [] + endchar = None + # Number of chars in the current fragment + currentlen = 0 + + for t in tokens: + startchar = t.startchar + endchar = t.endchar + tlength = endchar - startchar + if charlimit and endchar > charlimit: + break + + if countdown < 0 and not t.matched: + # We're not in a fragment currently, so just maintain the + # "charsbefore" buffer + firsts.append(startchar) + while firsts and endchar - firsts[0] > surround: + firsts.popleft() + elif currentlen + tlength > maxchars: + # We're in a fragment, but adding this token would put us past + # the maximum size. Zero the countdown so the code below will + # cause the fragment to be emitted + countdown = 0 + elif t.matched: + # Start/restart the countdown + countdown = surround + # Remember the first char of this fragment + if first is None: + if firsts: + first = firsts[0] + else: + first = startchar + # Add on unused front context + countdown += surround + tks.append(t.copy()) + + # If we're in a fragment... + if countdown >= 0: + # Update the counts + currentlen += tlength + countdown -= tlength + + # If the countdown is expired + if countdown <= 0: + # Finish the fragment + yield mkfrag(text, tks, startchar=first, endchar=endchar) + # Reset the counts + tks = [] + firsts = deque() + first = None + currentlen = 0 + + # If there's a fragment left over at the end, yield it + if tks: + yield mkfrag(text, tks, startchar=first, endchar=endchar) + + +class PinpointFragmenter(Fragmenter): + """This is a NON-RETOKENIZING fragmenter. It builds fragments from the + positions of the matched terms. + """ + + def __init__(self, maxchars=200, surround=20, autotrim=False, + charlimit=DEFAULT_CHARLIMIT): + """ + :param maxchars: The maximum number of characters allowed in a + fragment. + :param surround: The number of extra characters of context to add both + before the first matched term and after the last matched term. + :param autotrim: automatically trims text before the first space and + after the last space in the fragments, to try to avoid truncated + words at the start and end. For short fragments or fragments with + long runs between spaces this may give strange results. 
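+
+        For example, a sketch of plugging this fragmenter into search
+        results (pinpoint fragmentation only works when the field stores
+        character information and the matched terms are available)::
+
+            results.fragmenter = PinpointFragmenter(maxchars=150,
+                                                    surround=30,
+                                                    autotrim=True)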
+ """ + + self.maxchars = maxchars + self.surround = surround + self.autotrim = autotrim + self.charlimit = charlimit + + def must_retokenize(self): + return False + + def fragment_tokens(self, text, tokens): + matched = [t for t in tokens if t.matched] + return self.fragment_matches(text, matched) + + @staticmethod + def _autotrim(fragment): + text = fragment.text + startchar = fragment.startchar + endchar = fragment.endchar + + firstspace = text.find(" ", startchar, endchar) + if firstspace > 0: + startchar = firstspace + 1 + lastspace = text.rfind(" ", startchar, endchar) + if lastspace > 0: + endchar = lastspace + + if fragment.matches: + startchar = min(startchar, fragment.matches[0].startchar) + endchar = max(endchar, fragment.matches[-1].endchar) + + fragment.startchar = startchar + fragment.endchar = endchar + + def fragment_matches(self, text, tokens): + maxchars = self.maxchars + surround = self.surround + autotrim = self.autotrim + charlimit = self.charlimit + + j = -1 + + for i, t in enumerate(tokens): + if j >= i: + continue + j = i + left = t.startchar + right = t.endchar + if charlimit and right > charlimit: + break + + currentlen = right - left + while j < len(tokens) - 1 and currentlen < maxchars: + next = tokens[j + 1] + ec = next.endchar + if ec - right <= surround and ec - left <= maxchars: + j += 1 + right = ec + currentlen += (ec - next.startchar) + else: + break + + left = max(0, left - surround) + right = min(len(text), right + surround) + fragment = Fragment(text, tokens[i:j + 1], left, right) + if autotrim: + self._autotrim(fragment) + yield fragment + + +# Fragment scorers + +class FragmentScorer(object): + pass + + +class BasicFragmentScorer(FragmentScorer): + def __call__(self, f): + # Add up the boosts for the matched terms in this passage + score = sum(t.boost for t in f.matches) + + # Favor diversity: multiply score by the number of separate + # terms matched + score *= (len(f.matched_terms) * 100) or 1 + + return score + + +# Fragment sorters + +def SCORE(fragment): + "Sorts higher scored passages first." + return 1 + + +def FIRST(fragment): + "Sorts passages from earlier in the document first." + return fragment.startchar + + +def LONGER(fragment): + "Sorts longer passages first." + return 0 - len(fragment) + + +def SHORTER(fragment): + "Sort shorter passages first." + return len(fragment) + + +# Formatters + +def get_text(original, token, replace): + """Convenience function for getting the text to use for a match when + formatting. + + If ``replace`` is False, returns the part of ``original`` between + ``token.startchar`` and ``token.endchar``. If ``replace`` is True, returns + ``token.text``. + """ + + if replace: + return token.text + else: + return original[token.startchar:token.endchar] + + +class Formatter(object): + """Base class for formatters. + + For highlighters that return strings, it is usually only necessary to + override :meth:`Formatter.format_token`. + + Use the :func:`get_text` function as a convenience to get the token text:: + + class MyFormatter(Formatter): + def format_token(text, token, replace=False): + ttext = get_text(text, token, replace) + return "[%s]" % ttext + """ + + between = "..." + + def _text(self, text): + return text + + def format_token(self, text, token, replace=False): + """Returns a formatted version of the given "token" object, which + should have at least ``startchar`` and ``endchar`` attributes, and + a ``text`` attribute if ``replace`` is True. + + :param text: the original fragment text being highlighted. 
+ :param token: an object having ``startchar`` and ``endchar`` attributes + and optionally a ``text`` attribute (if ``replace`` is True). + :param replace: if True, the original text between the token's + ``startchar`` and ``endchar`` indices will be replaced with the + value of the token's ``text`` attribute. + """ + + raise NotImplementedError + + def format_fragment(self, fragment, replace=False): + """Returns a formatted version of the given text, using the "token" + objects in the given :class:`Fragment`. + + :param fragment: a :class:`Fragment` object representing a list of + matches in the text. + :param replace: if True, the original text corresponding to each + match will be replaced with the value of the token object's + ``text`` attribute. + """ + + output = [] + index = fragment.startchar + text = fragment.text + + for t in fragment.matches: + if t.startchar is None: + continue + if t.startchar < index: + continue + if t.startchar > index: + output.append(self._text(text[index:t.startchar])) + output.append(self.format_token(text, t, replace)) + index = t.endchar + output.append(self._text(text[index:fragment.endchar])) + + out_string = "".join(output) + return out_string + + def format(self, fragments, replace=False): + """Returns a formatted version of the given text, using a list of + :class:`Fragment` objects. + """ + + formatted = [self.format_fragment(f, replace=replace) + for f in fragments] + return self.between.join(formatted) + + def __call__(self, text, fragments): + # For backwards compatibility + return self.format(fragments) + + +class NullFormatter(Formatter): + """Formatter that does not modify the string. + """ + + def format_token(self, text, token, replace=False): + return get_text(text, token, replace) + + +class UppercaseFormatter(Formatter): + """Returns a string in which the matched terms are in UPPERCASE. + """ + + def __init__(self, between="..."): + """ + :param between: the text to add between fragments. + """ + + self.between = between + + def format_token(self, text, token, replace=False): + ttxt = get_text(text, token, replace) + return ttxt.upper() + + +class HtmlFormatter(Formatter): + """Returns a string containing HTML formatting around the matched terms. + + This formatter wraps matched terms in an HTML element with two class names. + The first class name (set with the constructor argument ``classname``) is + the same for each match. The second class name (set with the constructor + argument ``termclass`` is different depending on which term matched. This + allows you to give different formatting (for example, different background + colors) to the different terms in the excerpt. + + >>> hf = HtmlFormatter(tagname="span", classname="match", termclass="term") + >>> hf(mytext, myfragments) + "The template geometry is..." + + This object maintains a dictionary mapping terms to HTML class names (e.g. + ``term0`` and ``term1`` above), so that multiple excerpts will use the same + class for the same term. If you want to re-use the same HtmlFormatter + object with different searches, you should call HtmlFormatter.clear() + between searches to clear the mapping. + """ + + template = '<%(tag)s class=%(q)s%(cls)s%(tn)s%(q)s>%(t)s' + + def __init__(self, tagname="strong", between="...", + classname="match", termclass="term", maxclasses=5, + attrquote='"'): + """ + :param tagname: the tag to wrap around matching terms. + :param between: the text to add between fragments. 
+ :param classname: the class name to add to the elements wrapped around + matching terms. + :param termclass: the class name prefix for the second class which is + different for each matched term. + :param maxclasses: the maximum number of term classes to produce. This + limits the number of classes you have to define in CSS by recycling + term class names. For example, if you set maxclasses to 3 and have + 5 terms, the 5 terms will use the CSS classes ``term0``, ``term1``, + ``term2``, ``term0``, ``term1``. + """ + + self.between = between + self.tagname = tagname + self.classname = classname + self.termclass = termclass + self.attrquote = attrquote + self.maxclasses = maxclasses + self.seen = {} + self.htmlclass = " ".join((self.classname, self.termclass)) + + def _text(self, text): + return htmlescape(text, quote=False) + + def format_token(self, text, token, replace=False): + seen = self.seen + ttext = self._text(get_text(text, token, replace)) + if ttext in seen: + termnum = seen[ttext] + else: + termnum = len(seen) % self.maxclasses + seen[ttext] = termnum + + return self.template % {"tag": self.tagname, "q": self.attrquote, + "cls": self.htmlclass, "t": ttext, + "tn": termnum} + + def clean(self): + """Clears the dictionary mapping terms to HTML classnames. + """ + self.seen = {} + + +class GenshiFormatter(Formatter): + """Returns a Genshi event stream containing HTML formatting around the + matched terms. + """ + + def __init__(self, qname="strong", between="..."): + """ + :param qname: the QName for the tag to wrap around matched terms. + :param between: the text to add between fragments. + """ + + self.qname = qname + self.between = between + + from genshi.core import START, END, TEXT # @UnresolvedImport + from genshi.core import Attrs, Stream # @UnresolvedImport + self.START, self.END, self.TEXT = START, END, TEXT + self.Attrs, self.Stream = Attrs, Stream + + def _add_text(self, text, output): + if output and output[-1][0] == self.TEXT: + output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2]) + else: + output.append((self.TEXT, text, (None, -1, -1))) + + def format_token(self, text, token, replace=False): + qn = self.qname + txt = get_text(text, token, replace) + return self.Stream([(self.START, (qn, self.Attrs()), (None, -1, -1)), + (self.TEXT, txt, (None, -1, -1)), + (self.END, qn, (None, -1, -1))]) + + def format_fragment(self, fragment, replace=False): + output = [] + index = fragment.startchar + text = fragment.text + + for t in fragment.matches: + if t.startchar > index: + self._add_text(text[index:t.startchar], output) + output.append((text, t, replace)) + index = t.endchar + if index < len(text): + self._add_text(text[index:], output) + return self.Stream(output) + + def format(self, fragments, replace=False): + output = [] + first = True + for fragment in fragments: + if not first: + self._add_text(self.between, output) + output += self.format_fragment(fragment, replace=replace) + first = False + return self.Stream(output) + + +# Highlighting + +def top_fragments(fragments, count, scorer, order, minscore=1): + scored_fragments = ((scorer(f), f) for f in fragments) + scored_fragments = nlargest(count, scored_fragments) + best_fragments = [sf for score, sf in scored_fragments if score >= minscore] + best_fragments.sort(key=order) + return best_fragments + + +def highlight(text, terms, analyzer, fragmenter, formatter, top=3, + scorer=None, minscore=1, order=FIRST, mode="query"): + + if scorer is None: + scorer = BasicFragmentScorer() + + if type(fragmenter) is 
type: + fragmenter = fragmenter() + if type(formatter) is type: + formatter = formatter() + if type(scorer) is type: + scorer = scorer() + + if scorer is None: + scorer = BasicFragmentScorer() + + termset = frozenset(terms) + tokens = analyzer(text, chars=True, mode=mode, removestops=False) + tokens = set_matched_filter(tokens, termset) + fragments = fragmenter.fragment_tokens(text, tokens) + fragments = top_fragments(fragments, top, scorer, order, minscore) + return formatter(text, fragments) + + +class Highlighter(object): + def __init__(self, fragmenter=None, scorer=None, formatter=None, + always_retokenize=False, order=FIRST): + self.fragmenter = fragmenter or ContextFragmenter() + self.scorer = scorer or BasicFragmentScorer() + self.formatter = formatter or HtmlFormatter(tagname="b") + self.order = order + self.always_retokenize = always_retokenize + + def can_load_chars(self, results, fieldname): + # Is it possible to build a mapping between the matched terms/docs and + # their start and end chars for "pinpoint" highlighting (ie not require + # re-tokenizing text)? + + if self.always_retokenize: + # No, we've been configured to always retokenize some text + return False + if not results.has_matched_terms(): + # No, we don't know what the matched terms are yet + return False + if self.fragmenter.must_retokenize(): + # No, the configured fragmenter doesn't support it + return False + + # Maybe, if the field was configured to store characters + field = results.searcher.schema[fieldname] + return field.supports("characters") + + @staticmethod + def _load_chars(results, fieldname, texts, to_bytes): + # For each docnum, create a mapping of text -> [(startchar, endchar)] + # for the matched terms + + results._char_cache[fieldname] = cache = {} + sorted_ids = sorted(docnum for _, docnum in results.top_n) + + for docnum in sorted_ids: + cache[docnum] = {} + + for text in texts: + btext = to_bytes(text) + m = results.searcher.postings(fieldname, btext) + docset = set(results.termdocs[(fieldname, btext)]) + for docnum in sorted_ids: + if docnum in docset: + m.skip_to(docnum) + assert m.id() == docnum + cache[docnum][text] = m.value_as("characters") + + @staticmethod + def _merge_matched_tokens(tokens): + # Merges consecutive matched tokens together, so they are highlighted + # as one + + token = None + + for t in tokens: + if not t.matched: + if token is not None: + yield token + token = None + yield t + continue + + if token is None: + token = t.copy() + elif t.startchar <= token.endchar: + if t.endchar > token.endchar: + token.text += t.text[token.endchar-t.endchar:] + token.endchar = t.endchar + else: + yield token + token = None + + if token is not None: + yield token + + def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1): + results = hitobj.results + schema = results.searcher.schema + field = schema[fieldname] + to_bytes = field.to_bytes + from_bytes = field.from_bytes + + if text is None: + if fieldname not in hitobj: + raise KeyError("Field %r is not stored." % fieldname) + text = hitobj[fieldname] + + # Get the terms searched for/matched in this field + if results.has_matched_terms(): + bterms = (term for term in results.matched_terms() + if term[0] == fieldname) + else: + bterms = results.query_terms(expand=True, fieldname=fieldname) + # Convert bytes to unicode + words = frozenset(from_bytes(term[1]) for term in bterms) + + # If we can do "pinpoint" highlighting... 
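+        # ...that is, the field stores character data and the results know
+        # which terms matched, build Token objects directly from the stored
+        # (pos, startchar, endchar) values instead of re-running the field's
+        # analyzer over the stored text (the "else" branch below).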
+ if self.can_load_chars(results, fieldname): + # Build the docnum->[(startchar, endchar),] map + if fieldname not in results._char_cache: + self._load_chars(results, fieldname, words, to_bytes) + + hitterms = (from_bytes(term[1]) for term in hitobj.matched_terms() + if term[0] == fieldname) + + # Grab the word->[(startchar, endchar)] map for this docnum + cmap = results._char_cache[fieldname][hitobj.docnum] + # A list of Token objects for matched words + tokens = [] + charlimit = self.fragmenter.charlimit + for word in hitterms: + chars = cmap[word] + for pos, startchar, endchar in chars: + if charlimit and endchar > charlimit: + break + tokens.append(Token(text=word, pos=pos, + startchar=startchar, endchar=endchar)) + tokens.sort(key=lambda t: t.startchar) + tokens = [max(group, key=lambda t: t.endchar - t.startchar) + for key, group in groupby(tokens, lambda t: t.startchar)] + fragments = self.fragmenter.fragment_matches(text, tokens) + else: + # Retokenize the text + analyzer = results.searcher.schema[fieldname].analyzer + tokens = analyzer(text, positions=True, chars=True, mode="index", + removestops=False) + # Set Token.matched attribute for tokens that match a query term + tokens = set_matched_filter(tokens, words) + tokens = self._merge_matched_tokens(tokens) + fragments = self.fragmenter.fragment_tokens(text, tokens) + + fragments = top_fragments(fragments, top, self.scorer, self.order, + minscore=minscore) + output = self.formatter.format(fragments) + return output diff --git a/src/whoosh/idsets.py b/src/whoosh/idsets.py new file mode 100644 index 0000000..b505a39 --- /dev/null +++ b/src/whoosh/idsets.py @@ -0,0 +1,703 @@ +""" +An implementation of an object that acts like a collection of on/off bits. +""" + +import operator +from array import array +from bisect import bisect_left, bisect_right, insort + +from whoosh.compat import integer_types, izip, izip_longest, next, xrange +from whoosh.util.numeric import bytes_for_bits + + +# Number of '1' bits in each byte (0-255) +_1SPERBYTE = array('B', [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, +2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, +3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, +3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, +2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, +5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, +3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, +5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, +3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, +4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, +6, 7, 7, 8]) + + +class DocIdSet(object): + """Base class for a set of positive integers, implementing a subset of the + built-in ``set`` type's interface with extra docid-related methods. + + This is a superclass for alternative set implementations to the built-in + ``set`` which are more memory-efficient and specialized toward storing + sorted lists of positive integers, though they will inevitably be slower + than ``set`` for most operations since they're pure Python. 
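+
+    A quick sketch using two of the concrete implementations defined
+    later in this module::
+
+        a = BitSet([1, 5, 10])
+        b = SortedIntSet([5, 10, 20])
+        sorted(a & b)  # -> [5, 10]
+        sorted(a | b)  # -> [1, 5, 10, 20]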
+ """ + + def __eq__(self, other): + for a, b in izip(self, other): + if a != b: + return False + return True + + def __neq__(self, other): + return not self.__eq__(other) + + def __len__(self): + raise NotImplementedError + + def __iter__(self): + raise NotImplementedError + + def __contains__(self, i): + raise NotImplementedError + + def __or__(self, other): + return self.union(other) + + def __and__(self, other): + return self.intersection(other) + + def __sub__(self, other): + return self.difference(other) + + def copy(self): + raise NotImplementedError + + def add(self, n): + raise NotImplementedError + + def discard(self, n): + raise NotImplementedError + + def update(self, other): + add = self.add + for i in other: + add(i) + + def intersection_update(self, other): + for n in self: + if n not in other: + self.discard(n) + + def difference_update(self, other): + for n in other: + self.discard(n) + + def invert_update(self, size): + """Updates the set in-place to contain numbers in the range + ``[0 - size)`` except numbers that are in this set. + """ + + for i in xrange(size): + if i in self: + self.discard(i) + else: + self.add(i) + + def intersection(self, other): + c = self.copy() + c.intersection_update(other) + return c + + def union(self, other): + c = self.copy() + c.update(other) + return c + + def difference(self, other): + c = self.copy() + c.difference_update(other) + return c + + def invert(self, size): + c = self.copy() + c.invert_update(size) + return c + + def isdisjoint(self, other): + a = self + b = other + if len(other) < len(self): + a, b = other, self + for num in a: + if num in b: + return False + return True + + def before(self, i): + """Returns the previous integer in the set before ``i``, or None. + """ + raise NotImplementedError + + def after(self, i): + """Returns the next integer in the set after ``i``, or None. + """ + raise NotImplementedError + + def first(self): + """Returns the first (lowest) integer in the set. + """ + raise NotImplementedError + + def last(self): + """Returns the last (highest) integer in the set. 
+ """ + raise NotImplementedError + + +class BaseBitSet(DocIdSet): + # Methods to override + + def byte_count(self): + raise NotImplementedError + + def _get_byte(self, i): + raise NotImplementedError + + def _iter_bytes(self): + raise NotImplementedError + + # Base implementations + + def __len__(self): + return sum(_1SPERBYTE[b] for b in self._iter_bytes()) + + def __iter__(self): + base = 0 + for byte in self._iter_bytes(): + for i in xrange(8): + if byte & (1 << i): + yield base + i + base += 8 + + def __nonzero__(self): + return any(n for n in self._iter_bytes()) + + __bool__ = __nonzero__ + + def __contains__(self, i): + bucket = i // 8 + if bucket >= self.byte_count(): + return False + return bool(self._get_byte(bucket) & (1 << (i & 7))) + + def first(self): + return self.after(-1) + + def last(self): + return self.before(self.byte_count() * 8 + 1) + + def before(self, i): + _get_byte = self._get_byte + size = self.byte_count() * 8 + + if i <= 0: + return None + elif i >= size: + i = size - 1 + else: + i -= 1 + bucket = i // 8 + + while i >= 0: + byte = _get_byte(bucket) + if not byte: + bucket -= 1 + i = bucket * 8 + 7 + continue + if byte & (1 << (i & 7)): + return i + if i % 8 == 0: + bucket -= 1 + i -= 1 + + return None + + def after(self, i): + _get_byte = self._get_byte + size = self.byte_count() * 8 + + if i >= size: + return None + elif i < 0: + i = 0 + else: + i += 1 + bucket = i // 8 + + while i < size: + byte = _get_byte(bucket) + if not byte: + bucket += 1 + i = bucket * 8 + continue + if byte & (1 << (i & 7)): + return i + i += 1 + if i % 8 == 0: + bucket += 1 + + return None + + +class OnDiskBitSet(BaseBitSet): + """A DocIdSet backed by an array of bits on disk. + + >>> st = RamStorage() + >>> f = st.create_file("test.bin") + >>> bs = BitSet([1, 10, 15, 7, 2]) + >>> bytecount = bs.to_disk(f) + >>> f.close() + >>> # ... + >>> f = st.open_file("test.bin") + >>> odbs = OnDiskBitSet(f, bytecount) + >>> list(odbs) + [1, 2, 7, 10, 15] + """ + + def __init__(self, dbfile, basepos, bytecount): + """ + :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object + to read from. + :param basepos: the base position of the bytes in the given file. + :param bytecount: the number of bytes to use for the bit array. + """ + + self._dbfile = dbfile + self._basepos = basepos + self._bytecount = bytecount + + def __repr__(self): + return "%s(%s, %d, %d)" % (self.__class__.__name__, self.dbfile, + self._basepos, self.bytecount) + + def byte_count(self): + return self._bytecount + + def _get_byte(self, n): + return self._dbfile.get_byte(self._basepos + n) + + def _iter_bytes(self): + dbfile = self._dbfile + dbfile.seek(self._basepos) + for _ in xrange(self._bytecount): + yield dbfile.read_byte() + + +class BitSet(BaseBitSet): + """A DocIdSet backed by an array of bits. This can also be useful as a bit + array (e.g. for a Bloom filter). It is much more memory efficient than a + large built-in set of integers, but wastes memory for sparse sets. + """ + + def __init__(self, source=None, size=0): + """ + :param maxsize: the maximum size of the bit array. + :param source: an iterable of positive integers to add to this set. + :param bits: an array of unsigned bytes ("B") to use as the underlying + bit array. This is used by some of the object's methods. 
+ """ + + # If the source is a list, tuple, or set, we can guess the size + if not size and isinstance(source, (list, tuple, set, frozenset)): + size = max(source) + bytecount = bytes_for_bits(size) + self.bits = array("B", (0 for _ in xrange(bytecount))) + + if source: + add = self.add + for num in source: + add(num) + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, list(self)) + + def byte_count(self): + return len(self.bits) + + def _get_byte(self, n): + return self.bits[n] + + def _iter_bytes(self): + return iter(self.bits) + + def _trim(self): + bits = self.bits + last = len(self.bits) - 1 + while last >= 0 and not bits[last]: + last -= 1 + del self.bits[last + 1:] + + def _resize(self, tosize): + curlength = len(self.bits) + newlength = bytes_for_bits(tosize) + if newlength > curlength: + self.bits.extend((0,) * (newlength - curlength)) + elif newlength < curlength: + del self.bits[newlength + 1:] + + def _zero_extra_bits(self, size): + bits = self.bits + spill = size - ((len(bits) - 1) * 8) + if spill: + mask = 2 ** spill - 1 + bits[-1] = bits[-1] & mask + + def _logic(self, obj, op, other): + objbits = obj.bits + for i, (byte1, byte2) in enumerate(izip_longest(objbits, other.bits, + fillvalue=0)): + value = op(byte1, byte2) & 0xFF + if i >= len(objbits): + objbits.append(value) + else: + objbits[i] = value + + obj._trim() + return obj + + def to_disk(self, dbfile): + dbfile.write_array(self.bits) + return len(self.bits) + + @classmethod + def from_bytes(cls, bs): + b = cls() + b.bits = array("B", bs) + return b + + @classmethod + def from_disk(cls, dbfile, bytecount): + return cls.from_bytes(dbfile.read_array("B", bytecount)) + + def copy(self): + b = self.__class__() + b.bits = array("B", iter(self.bits)) + return b + + def clear(self): + for i in xrange(len(self.bits)): + self.bits[i] = 0 + + def add(self, i): + bucket = i >> 3 + if bucket >= len(self.bits): + self._resize(i + 1) + self.bits[bucket] |= 1 << (i & 7) + + def discard(self, i): + bucket = i >> 3 + self.bits[bucket] &= ~(1 << (i & 7)) + + def _resize_to_other(self, other): + if isinstance(other, (list, tuple, set, frozenset)): + maxbit = max(other) + if maxbit // 8 > len(self.bits): + self._resize(maxbit) + + def update(self, iterable): + self._resize_to_other(iterable) + DocIdSet.update(self, iterable) + + def intersection_update(self, other): + if isinstance(other, BitSet): + return self._logic(self, operator.__and__, other) + discard = self.discard + for n in self: + if n not in other: + discard(n) + + def difference_update(self, other): + if isinstance(other, BitSet): + return self._logic(self, lambda x, y: x & ~y, other) + discard = self.discard + for n in other: + discard(n) + + def invert_update(self, size): + bits = self.bits + for i in xrange(len(bits)): + bits[i] = ~bits[i] & 0xFF + self._zero_extra_bits(size) + + def union(self, other): + if isinstance(other, BitSet): + return self._logic(self.copy(), operator.__or__, other) + b = self.copy() + b.update(other) + return b + + def intersection(self, other): + if isinstance(other, BitSet): + return self._logic(self.copy(), operator.__and__, other) + return BitSet(source=(n for n in self if n in other)) + + def difference(self, other): + if isinstance(other, BitSet): + return self._logic(self.copy(), lambda x, y: x & ~y, other) + return BitSet(source=(n for n in self if n not in other)) + + +class SortedIntSet(DocIdSet): + """A DocIdSet backed by a sorted array of integers. 
+ """ + + def __init__(self, source=None, typecode="I"): + if source: + self.data = array(typecode, sorted(source)) + else: + self.data = array(typecode) + self.typecode = typecode + + def copy(self): + sis = SortedIntSet() + sis.data = array(self.typecode, self.data) + return sis + + def size(self): + return len(self.data) * self.data.itemsize + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.data) + + def __len__(self): + return len(self.data) + + def __iter__(self): + return iter(self.data) + + def __nonzero__(self): + return bool(self.data) + + __bool__ = __nonzero__ + + def __contains__(self, i): + data = self.data + if not data or i < data[0] or i > data[-1]: + return False + + pos = bisect_left(data, i) + if pos == len(data): + return False + return data[pos] == i + + def add(self, i): + data = self.data + if not data or i > data[-1]: + data.append(i) + else: + mn = data[0] + mx = data[-1] + if i == mn or i == mx: + return + elif i > mx: + data.append(i) + elif i < mn: + data.insert(0, i) + else: + pos = bisect_left(data, i) + if data[pos] != i: + data.insert(pos, i) + + def discard(self, i): + data = self.data + pos = bisect_left(data, i) + if data[pos] == i: + data.pop(pos) + + def clear(self): + self.data = array(self.typecode) + + def intersection_update(self, other): + self.data = array(self.typecode, (num for num in self if num in other)) + + def difference_update(self, other): + self.data = array(self.typecode, + (num for num in self if num not in other)) + + def intersection(self, other): + return SortedIntSet((num for num in self if num in other)) + + def difference(self, other): + return SortedIntSet((num for num in self if num not in other)) + + def first(self): + return self.data[0] + + def last(self): + return self.data[-1] + + def before(self, i): + data = self.data + pos = bisect_left(data, i) + if pos < 1: + return None + else: + return data[pos - 1] + + def after(self, i): + data = self.data + if not data or i >= data[-1]: + return None + elif i < data[0]: + return data[0] + + pos = bisect_right(data, i) + return data[pos] + + +class ReverseIdSet(DocIdSet): + """ + Wraps a DocIdSet object and reverses its semantics, so docs in the wrapped + set are not in this set, and vice-versa. + """ + + def __init__(self, idset, limit): + """ + :param idset: the DocIdSet object to wrap. + :param limit: the highest possible ID plus one. + """ + + self.idset = idset + self.limit = limit + + def __len__(self): + return self.limit - len(self.idset) + + def __contains__(self, i): + return i not in self.idset + + def __iter__(self): + ids = iter(self.idset) + try: + nx = next(ids) + except StopIteration: + nx = -1 + + for i in xrange(self.limit): + if i == nx: + try: + nx = next(ids) + except StopIteration: + nx = -1 + else: + yield i + + def add(self, n): + self.idset.discard(n) + + def discard(self, n): + self.idset.add(n) + + def first(self): + for i in self: + return i + + def last(self): + idset = self.idset + maxid = self.limit - 1 + if idset.last() < maxid - 1: + return maxid + + for i in xrange(maxid, -1, -1): + if i not in idset: + return i + +ROARING_CUTOFF = 1 << 12 + + +class RoaringIdSet(DocIdSet): + """ + Separates IDs into ranges of 2^16 bits, and stores each range in the most + efficient type of doc set, either a BitSet (if the range has >= 2^12 IDs) + or a sorted ID set of 16-bit shorts. 
+ """ + + cutoff = 2**12 + + def __init__(self, source=None): + self.idsets = [] + if source: + self.update(source) + + def __len__(self): + if not self.idsets: + return 0 + + return sum(len(idset) for idset in self.idsets) + + def __contains__(self, n): + bucket = n >> 16 + if bucket >= len(self.idsets): + return False + return (n - (bucket << 16)) in self.idsets[bucket] + + def __iter__(self): + for i, idset in self.idsets: + floor = i << 16 + for n in idset: + yield floor + n + + def _find(self, n): + bucket = n >> 16 + floor = n << 16 + if bucket >= len(self.idsets): + self.idsets.extend([SortedIntSet() for _ + in xrange(len(self.idsets), bucket + 1)]) + idset = self.idsets[bucket] + return bucket, floor, idset + + def add(self, n): + bucket, floor, idset = self._find(n) + oldlen = len(idset) + idset.add(n - floor) + if oldlen <= ROARING_CUTOFF < len(idset): + self.idsets[bucket] = BitSet(idset) + + def discard(self, n): + bucket, floor, idset = self._find(n) + oldlen = len(idset) + idset.discard(n - floor) + if oldlen > ROARING_CUTOFF >= len(idset): + self.idsets[bucket] = SortedIntSet(idset) + + +class MultiIdSet(DocIdSet): + """Wraps multiple SERIAL sub-DocIdSet objects and presents them as an + aggregated, read-only set. + """ + + def __init__(self, idsets, offsets): + """ + :param idsets: a list of DocIdSet objects. + :param offsets: a list of offsets corresponding to the DocIdSet objects + in ``idsets``. + """ + + assert len(idsets) == len(offsets) + self.idsets = idsets + self.offsets = offsets + + def _document_set(self, n): + offsets = self.offsets + return max(bisect_left(offsets, n), len(self.offsets) - 1) + + def _set_and_docnum(self, n): + setnum = self._document_set(n) + offset = self.offsets[setnum] + return self.idsets[setnum], n - offset + + def __len__(self): + return sum(len(idset) for idset in self.idsets) + + def __iter__(self): + for idset, offset in izip(self.idsets, self.offsets): + for docnum in idset: + yield docnum + offset + + def __contains__(self, item): + idset, n = self._set_and_docnum(item) + return n in idset + + diff --git a/src/whoosh/index.py b/src/whoosh/index.py new file mode 100644 index 0000000..158158e --- /dev/null +++ b/src/whoosh/index.py @@ -0,0 +1,707 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +"""Contains the main functions/classes for creating, maintaining, and using +an index. +""" + +from __future__ import division +import os.path, re, sys +from time import time, sleep + +from whoosh import __version__ +from whoosh.legacy import toc_loaders +from whoosh.compat import pickle, string_type +from whoosh.fields import ensure_schema +from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE + + +_DEF_INDEX_NAME = "MAIN" +_CURRENT_TOC_VERSION = -111 + + +# Exceptions + +class LockError(Exception): + pass + + +class IndexError(Exception): + """Generic index error.""" + + +class IndexVersionError(IndexError): + """Raised when you try to open an index using a format that the current + version of Whoosh cannot read. That is, when the index you're trying to + open is either not backward or forward compatible with this version of + Whoosh. + """ + + def __init__(self, msg, version, release=None): + Exception.__init__(self, msg) + self.version = version + self.release = release + + +class OutOfDateError(IndexError): + """Raised when you try to commit changes to an index which is not the + latest generation. + """ + + +class EmptyIndexError(IndexError): + """Raised when you try to work with an index that has no indexed terms. + """ + + +# Convenience functions + +def create_in(dirname, schema, indexname=None): + """Convenience function to create an index in a directory. Takes care of + creating a FileStorage object for you. + + :param dirname: the path string of the directory in which to create the + index. + :param schema: a :class:`whoosh.fields.Schema` object describing the + index's fields. + :param indexname: the name of the index to create; you only need to specify + this if you are creating multiple indexes within the same storage + object. + :returns: :class:`Index` + """ + + from whoosh.filedb.filestore import FileStorage + + if not indexname: + indexname = _DEF_INDEX_NAME + storage = FileStorage(dirname) + return FileIndex.create(storage, schema, indexname) + + +def open_dir(dirname, indexname=None, readonly=False, schema=None): + """Convenience function for opening an index in a directory. Takes care of + creating a FileStorage object for you. dirname is the filename of the + directory in containing the index. indexname is the name of the index to + create; you only need to specify this if you have multiple indexes within + the same storage object. + + :param dirname: the path string of the directory in which to create the + index. + :param indexname: the name of the index to create; you only need to specify + this if you have multiple indexes within the same storage object. 
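+
+    For example (illustrative; assumes an index was previously created in the
+    "indexdir" directory)::
+
+        ix = open_dir("indexdir")
+        print(ix.doc_count())  # number of undeleted documents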
+ """ + + from whoosh.filedb.filestore import FileStorage + + if indexname is None: + indexname = _DEF_INDEX_NAME + storage = FileStorage(dirname, readonly=readonly) + return FileIndex(storage, schema=schema, indexname=indexname) + + +def exists_in(dirname, indexname=None): + """Returns True if dirname contains a Whoosh index. + + :param dirname: the file path of a directory. + :param indexname: the name of the index. If None, the default index name is + used. + """ + + if os.path.exists(dirname): + try: + ix = open_dir(dirname, indexname=indexname) + return ix.latest_generation() > -1 + except EmptyIndexError: + pass + + return False + + +def exists(storage, indexname=None): + """Deprecated; use ``storage.index_exists()``. + + :param storage: a store.Storage object. + :param indexname: the name of the index. If None, the default index name is + used. + """ + + return storage.index_exists(indexname) + + +def version_in(dirname, indexname=None): + """Returns a tuple of (release_version, format_version), where + release_version is the release version number of the Whoosh code that + created the index -- e.g. (0, 1, 24) -- and format_version is the version + number of the on-disk format used for the index -- e.g. -102. + + You should avoid attaching significance to the second number (the index + version). This is simply a version number for the TOC file and probably + should not have been exposed in a public interface. The best way to check + if the current version of Whoosh can open an index is to actually try to + open it and see if it raises a ``whoosh.index.IndexVersionError`` exception. + + Note that the release and format version are available as attributes on the + Index object in Index.release and Index.version. + + :param dirname: the file path of a directory containing an index. + :param indexname: the name of the index. If None, the default index name is + used. + :returns: ((major_ver, minor_ver, build_ver), format_ver) + """ + + from whoosh.filedb.filestore import FileStorage + storage = FileStorage(dirname) + return version(storage, indexname=indexname) + + +def version(storage, indexname=None): + """Returns a tuple of (release_version, format_version), where + release_version is the release version number of the Whoosh code that + created the index -- e.g. (0, 1, 24) -- and format_version is the version + number of the on-disk format used for the index -- e.g. -102. + + You should avoid attaching significance to the second number (the index + version). This is simply a version number for the TOC file and probably + should not have been exposed in a public interface. The best way to check + if the current version of Whoosh can open an index is to actually try to + open it and see if it raises a ``whoosh.index.IndexVersionError`` exception. + + Note that the release and format version are available as attributes on the + Index object in Index.release and Index.version. + + :param storage: a store.Storage object. + :param indexname: the name of the index. If None, the default index name is + used. + :returns: ((major_ver, minor_ver, build_ver), format_ver) + """ + + try: + if indexname is None: + indexname = _DEF_INDEX_NAME + + ix = storage.open_index(indexname) + return (ix.release, ix.version) + except IndexVersionError: + e = sys.exc_info()[1] + return (None, e.version) + + +# Index base class + +class Index(object): + """Represents an indexed collection of documents. + """ + + def close(self): + """Closes any open resources held by the Index object itself. 
This may + not close all resources being used everywhere, for example by a + Searcher object. + """ + pass + + def add_field(self, fieldname, fieldspec): + """Adds a field to the index's schema. + + :param fieldname: the name of the field to add. + :param fieldspec: an instantiated :class:`whoosh.fields.FieldType` + object. + """ + + w = self.writer() + w.add_field(fieldname, fieldspec) + w.commit() + + def remove_field(self, fieldname): + """Removes the named field from the index's schema. Depending on the + backend implementation, this may or may not actually remove existing + data for the field from the index. Optimizing the index should always + clear out existing data for a removed field. + """ + + w = self.writer() + w.remove_field(fieldname) + w.commit() + + def latest_generation(self): + """Returns the generation number of the latest generation of this + index, or -1 if the backend doesn't support versioning. + """ + return -1 + + def refresh(self): + """Returns a new Index object representing the latest generation + of this index (if this object is the latest generation, or the backend + doesn't support versioning, returns self). + + :returns: :class:`Index` + """ + return self + + def up_to_date(self): + """Returns True if this object represents the latest generation of + this index. Returns False if this object is not the latest generation + (that is, someone else has updated the index since you opened this + object). + """ + return True + + def last_modified(self): + """Returns the last modified time of the index, or -1 if the backend + doesn't support last-modified times. + """ + return -1 + + def is_empty(self): + """Returns True if this index is empty (that is, it has never had any + documents successfully written to it. + """ + raise NotImplementedError + + def optimize(self): + """Optimizes this index, if necessary. + """ + pass + + def doc_count_all(self): + """Returns the total number of documents, DELETED OR UNDELETED, + in this index. + """ + + r = self.reader() + try: + return r.doc_count_all() + finally: + r.close() + + def doc_count(self): + """Returns the total number of UNDELETED documents in this index. + """ + + r = self.reader() + try: + return r.doc_count() + finally: + r.close() + + def searcher(self, **kwargs): + """Returns a Searcher object for this index. Keyword arguments are + passed to the Searcher object's constructor. + + :rtype: :class:`whoosh.searching.Searcher` + """ + + from whoosh.searching import Searcher + return Searcher(self.reader(), fromindex=self, **kwargs) + + def field_length(self, fieldname): + """Returns the total length of the field across all documents. + """ + + r = self.reader() + try: + return r.field_length(fieldname) + finally: + r.close() + + def max_field_length(self, fieldname): + """Returns the maximum length of the field across all documents. + """ + + r = self.reader() + try: + return r.max_field_length(fieldname) + finally: + r.close() + + def reader(self, reuse=None): + """Returns an IndexReader object for this index. + + :param reuse: an existing reader. Some implementations may recycle + resources from this existing reader to create the new reader. Note + that any resources in the "recycled" reader that are not used by + the new reader will be CLOSED, so you CANNOT use it afterward. + :rtype: :class:`whoosh.reading.IndexReader` + """ + + raise NotImplementedError + + def writer(self, **kwargs): + """Returns an IndexWriter object for this index. 
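+
+        For example (illustrative; assumes ``ix`` is an open index whose
+        schema has ``title`` and ``content`` fields)::
+
+            w = ix.writer()
+            w.add_document(title=u"First document", content=u"hello world")
+            w.commit()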
+ + :rtype: :class:`whoosh.writing.IndexWriter` + """ + raise NotImplementedError + + def delete_by_term(self, fieldname, text, searcher=None): + w = self.writer() + w.delete_by_term(fieldname, text, searcher=searcher) + w.commit() + + def delete_by_query(self, q, searcher=None): + w = self.writer() + w.delete_by_query(q, searcher=searcher) + w.commit() + + +# Codec-based index implementation + +def clean_files(storage, indexname, gen, segments): + # Attempts to remove unused index files (called when a new generation + # is created). If existing Index and/or reader objects have the files + # open, they may not be deleted immediately (i.e. on Windows) but will + # probably be deleted eventually by a later call to clean_files. + + current_segment_names = set(s.segment_id() for s in segments) + tocpattern = TOC._pattern(indexname) + segpattern = TOC._segment_pattern(indexname) + + todelete = set() + for filename in storage: + if filename.startswith("."): + continue + tocm = tocpattern.match(filename) + segm = segpattern.match(filename) + if tocm: + if int(tocm.group(1)) != gen: + todelete.add(filename) + elif segm: + name = segm.group(1) + if name not in current_segment_names: + todelete.add(filename) + + for filename in todelete: + try: + storage.delete_file(filename) + except OSError: + # Another process still has this file open, I guess + pass + + +class FileIndex(Index): + def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME): + from whoosh.filedb.filestore import Storage + + if not isinstance(storage, Storage): + raise ValueError("%r is not a Storage object" % storage) + if not isinstance(indexname, string_type): + raise ValueError("indexname %r is not a string" % indexname) + + if schema: + schema = ensure_schema(schema) + + self.storage = storage + self._schema = schema + self.indexname = indexname + + # Try reading the TOC to see if it's possible + TOC.read(self.storage, self.indexname, schema=self._schema) + + @classmethod + def create(cls, storage, schema, indexname=_DEF_INDEX_NAME): + TOC.create(storage, schema, indexname) + return cls(storage, schema, indexname) + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, + self.storage, self.indexname) + + def close(self): + pass + + # add_field + # remove_field + + def latest_generation(self): + return TOC._latest_generation(self.storage, self.indexname) + + # refresh + # up_to_date + + def last_modified(self): + gen = self.latest_generation() + filename = TOC._filename(self.indexname, gen) + return self.storage.file_modified(filename) + + def is_empty(self): + return len(self._read_toc().segments) == 0 + + def optimize(self, **kwargs): + w = self.writer(**kwargs) + w.commit(optimize=True) + + # searcher + + def writer(self, procs=1, **kwargs): + if procs > 1: + from whoosh.multiproc import MpWriter + return MpWriter(self, procs=procs, **kwargs) + else: + from whoosh.writing import SegmentWriter + return SegmentWriter(self, **kwargs) + + def lock(self, name): + """Returns a lock object that you can try to call acquire() on to + lock the index. 
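+
+        For example (illustrative; the lock name is arbitrary, and the
+        returned lock object supports ``acquire()`` and ``release()``)::
+
+            lock = ix.lock("WRITELOCK")
+            if lock.acquire():
+                try:
+                    pass  # ... modify the index ...
+                finally:
+                    lock.release()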
+ """ + + return self.storage.lock(self.indexname + "_" + name) + + def _read_toc(self): + return TOC.read(self.storage, self.indexname, schema=self._schema) + + def _segments(self): + return self._read_toc().segments + + def _current_schema(self): + return self._read_toc().schema + + @property + def schema(self): + return self._current_schema() + + @property + def release(self): + return self._read_toc().release + + @property + def version(self): + return self._read_toc().version + + @classmethod + def _reader(cls, storage, schema, segments, generation, reuse=None): + # Returns a reader for the given segments, possibly reusing already + # opened readers + from whoosh.reading import SegmentReader, MultiReader, EmptyReader + + reusable = {} + try: + if len(segments) == 0: + # This index has no segments! Return an EmptyReader object, + # which simply returns empty or zero to every method + return EmptyReader(schema) + + if reuse: + # Put all atomic readers in a dictionary keyed by their + # generation, so we can re-use them if them if possible + readers = [r for r, _ in reuse.leaf_readers()] + reusable = dict((r.generation(), r) for r in readers) + + # Make a function to open readers, which reuses reusable readers. + # It removes any readers it reuses from the "reusable" dictionary, + # so later we can close any readers left in the dictionary. + def segreader(segment): + segid = segment.segment_id() + if segid in reusable: + r = reusable[segid] + del reusable[segid] + return r + else: + return SegmentReader(storage, schema, segment, + generation=generation) + + if len(segments) == 1: + # This index has one segment, so return a SegmentReader object + # for the segment + return segreader(segments[0]) + else: + # This index has multiple segments, so create a list of + # SegmentReaders for the segments, then composite them with a + # MultiReader + + readers = [segreader(segment) for segment in segments] + return MultiReader(readers, generation=generation) + finally: + for r in reusable.values(): + r.close() + + def reader(self, reuse=None): + retries = 10 + while retries > 0: + # Read the information from the TOC file + try: + info = self._read_toc() + return self._reader(self.storage, info.schema, info.segments, + info.generation, reuse=reuse) + except IOError: + # Presume that we got a "file not found error" because a writer + # deleted one of the files just as we were trying to open it, + # and so retry a few times before actually raising the + # exception + e = sys.exc_info()[1] + retries -= 1 + if retries <= 0: + raise e + sleep(0.05) + + +# TOC class + +class TOC(object): + """Object representing the state of the index after a commit. Essentially + a container for the index's schema and the list of segment objects. 
+ """ + + def __init__(self, schema, segments, generation, + version=_CURRENT_TOC_VERSION, release=__version__): + self.schema = schema + self.segments = segments + self.generation = generation + self.version = version + self.release = release + + @classmethod + def _filename(cls, indexname, gen): + return "_%s_%s.toc" % (indexname, gen) + + @classmethod + def _pattern(cls, indexname): + return re.compile("^_%s_([0-9]+).toc$" % indexname) + + @classmethod + def _segment_pattern(cls, indexname): + return re.compile("(%s_[0-9a-z]+)[.][A-Za-z0-9_.]+" % indexname) + + @classmethod + def _latest_generation(cls, storage, indexname): + pattern = cls._pattern(indexname) + + mx = -1 + for filename in storage: + m = pattern.match(filename) + if m: + mx = max(int(m.group(1)), mx) + return mx + + @classmethod + def create(cls, storage, schema, indexname=_DEF_INDEX_NAME): + schema = ensure_schema(schema) + + # Clear existing files + prefix = "_%s_" % indexname + for filename in storage: + if filename.startswith(prefix): + storage.delete_file(filename) + + # Write a TOC file with an empty list of segments + toc = cls(schema, [], 0) + toc.write(storage, indexname) + + @classmethod + def read(cls, storage, indexname, gen=None, schema=None): + if gen is None: + gen = cls._latest_generation(storage, indexname) + if gen < 0: + raise EmptyIndexError("Index %r does not exist in %r" + % (indexname, storage)) + + # Read the content of this index from the .toc file. + tocfilename = cls._filename(indexname, gen) + stream = storage.open_file(tocfilename) + + def check_size(name, target): + sz = stream.read_varint() + if sz != target: + raise IndexError("Index was created on different architecture:" + " saved %s = %s, this computer = %s" + % (name, sz, target)) + + check_size("int", _INT_SIZE) + check_size("long", _LONG_SIZE) + check_size("float", _FLOAT_SIZE) + + if not stream.read_int() == -12345: + raise IndexError("Number misread: byte order problem") + + version = stream.read_int() + release = (stream.read_varint(), stream.read_varint(), + stream.read_varint()) + + if version != _CURRENT_TOC_VERSION: + if version in toc_loaders: + loader = toc_loaders[version] + schema, segments = loader(stream, gen, schema, version) + else: + raise IndexVersionError("Can't read format %s" % version, + version) + else: + # If the user supplied a schema object with the constructor, don't + # load the pickled schema from the saved index. + if schema: + stream.skip_string() + else: + schema = pickle.loads(stream.read_string()) + schema = ensure_schema(schema) + + # Generation + index_gen = stream.read_int() + assert gen == index_gen + + _ = stream.read_int() # Unused + segments = stream.read_pickle() + + stream.close() + return cls(schema, segments, gen, version=version, release=release) + + def write(self, storage, indexname): + schema = ensure_schema(self.schema) + schema.clean() + + # Use a temporary file for atomic write. 
+ tocfilename = self._filename(indexname, self.generation) + tempfilename = '%s.%s' % (tocfilename, time()) + stream = storage.create_file(tempfilename) + + stream.write_varint(_INT_SIZE) + stream.write_varint(_LONG_SIZE) + stream.write_varint(_FLOAT_SIZE) + stream.write_int(-12345) + + stream.write_int(_CURRENT_TOC_VERSION) + for num in __version__[:3]: + stream.write_varint(num) + + try: + stream.write_string(pickle.dumps(schema, -1)) + except pickle.PicklingError: + # Try to narrow down the error to a single field + for fieldname, field in schema.items(): + try: + pickle.dumps(field) + except pickle.PicklingError: + e = sys.exc_info()[1] + raise pickle.PicklingError("%s %s=%r" % (e, fieldname, field)) + # Otherwise, re-raise the original exception + raise + + stream.write_int(self.generation) + stream.write_int(0) # Unused + stream.write_pickle(self.segments) + stream.close() + + # Rename temporary file to the proper filename + storage.rename_file(tempfilename, tocfilename, safe=True) + diff --git a/src/whoosh/lang/__init__.py b/src/whoosh/lang/__init__.py new file mode 100644 index 0000000..72299f2 --- /dev/null +++ b/src/whoosh/lang/__init__.py @@ -0,0 +1,140 @@ +# coding=utf-8 + +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
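+
+"""Helpers for looking up language-specific resources (stemming functions and
+stop-word lists) by language name or two-letter code.
+
+For example (illustrative)::
+
+    from whoosh.lang import stemmer_for_language, stopwords_for_language
+
+    stem = stemmer_for_language("en")
+    stem("running")  # -> "run"
+    stopwords = stopwords_for_language("english")
+"""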
+ + +# Exceptions + +class NoStemmer(Exception): + pass + + +class NoStopWords(Exception): + pass + + +# Data and functions for language names + +languages = ("ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", + "ro", "ru", "es", "sv", "tr") + +aliases = { + # By ISO 639-1 three letter codes + "ara": "ar", + "dan": "da", "nld": "nl", "eng": "en", "fin": "fi", "fra": "fr", + "deu": "de", "hun": "hu", "ita": "it", "nor": "no", "por": "pt", + "ron": "ro", "rus": "ru", "spa": "es", "swe": "sv", "tur": "tr", + + # By name in English + "arabic": "ar", + "danish": "da", + "dutch": "nl", + "english": "en", + "finnish": "fi", + "french": "fr", + "german": "de", + "hungarian": "hu", + "italian": "it", + "norwegian": "no", + "portuguese": "pt", + "romanian": "ro", + "russian": "ru", + "spanish": "es", + "swedish": "sw", + "turkish": "tr", + + # By name in own language + "العربية": "ar", + "dansk": "da", + "nederlands": "nl", + "suomi": "fi", + "français": "fr", + "deutsch": "de", + "magyar": "hu", + "italiano": "it", + "norsk": "no", + "português": "pt", + "русский язык": "ru", + "español": "es", + "svenska": "sv", + "türkçe": "tr", + } + + +def two_letter_code(name): + if name in languages: + return name + if name in aliases: + return aliases[name] + return None + + +# Getter functions + +def has_stemmer(lang): + try: + return bool(stemmer_for_language(lang)) + except NoStemmer: + return False + + +def has_stopwords(lang): + try: + return bool(stopwords_for_language(lang)) + except NoStopWords: + return False + + +def stemmer_for_language(lang): + if lang == "en_porter": + # Original porter stemming algorithm is several times faster than the + # more correct porter2 algorithm in snowball package + from .porter import stem as porter_stem + return porter_stem + + tlc = two_letter_code(lang) + + if tlc == "ar": + from .isri import ISRIStemmer + return ISRIStemmer().stem + + from .snowball import classes as snowball_classes + if tlc in snowball_classes: + return snowball_classes[tlc]().stem + + raise NoStemmer("No stemmer available for %r" % lang) + + +def stopwords_for_language(lang): + from .stopwords import stoplists + + tlc = two_letter_code(lang) + if tlc in stoplists: + return stoplists[tlc] + + raise NoStopWords("No stop-word list available for %r" % lang) diff --git a/src/whoosh/lang/dmetaphone.py b/src/whoosh/lang/dmetaphone.py new file mode 100644 index 0000000..d586c29 --- /dev/null +++ b/src/whoosh/lang/dmetaphone.py @@ -0,0 +1,415 @@ +# coding= utf-8 + +# This script implements the Double Metaphone algorythm (c) 1998, 1999 by +# Lawrence Philips. It was translated to Python from the C source written by +# Kevin Atkinson (http://aspell.net/metaphone/) By Andrew Collins - January 12, +# 2007 who claims no rights to this work. +# http://atomboy.isa-geek.com:8080/plone/Members/acoil/programing/double-metaphone + +import re + +from whoosh.compat import u + +vowels = frozenset("AEIOUY") +slavo_germ_exp = re.compile("W|K|CZ|WITZ") +silent_starts = re.compile("GN|KN|PN|WR|PS") + + +def double_metaphone(text): + text = text.upper() + slavo_germanic = bool(slavo_germ_exp.search(text)) + + length = len(text) + text = "--" + text + " " + first = pos = 2 + last = first + length - 1 + primary = secondary = "" + + if silent_starts.match(text, pos): + pos += 1 + + while pos < length + 2: + ch = text[pos] + + if ch in vowels: + # all init vowels now map to 'A' + if pos != first: + next = (None, 1) + else: + next = ("A", 1) + elif ch == "B": + #"-mb", e.g", "dumb", already skipped over... 
see 'M' below + if text[pos + 1] == "B": + next = ("P", 2) + else: + next = ("P", 1) + elif ch == "C": + # various germanic + if (pos > (first + 1) and text[pos - 2] not in vowels and text[pos - 1:pos + 2] == 'ACH' and \ + (text[pos + 2] not in ['I', 'E'] or text[pos - 2:pos + 4] in ['BACHER', 'MACHER'])): + next = ('K', 2) + # special case 'CAESAR' + elif pos == first and text[first:first + 6] == 'CAESAR': + next = ('S', 2) + elif text[pos:pos + 4] == 'CHIA': # italian 'chianti' + next = ('K', 2) + elif text[pos:pos + 2] == 'CH': + # find 'michael' + if pos > first and text[pos:pos + 4] == 'CHAE': + next = ('K', 'X', 2) + elif pos == first and (text[pos + 1:pos + 6] in ['HARAC', 'HARIS'] or \ + text[pos + 1:pos + 4] in ["HOR", "HYM", "HIA", "HEM"]) and text[first:first + 5] != 'CHORE': + next = ('K', 2) + # germanic, greek, or otherwise 'ch' for 'kh' sound + elif text[first:first + 4] in ['VAN ', 'VON '] or text[first:first + 3] == 'SCH' \ + or text[pos - 2:pos + 4] in ["ORCHES", "ARCHIT", "ORCHID"] \ + or text[pos + 2] in ['T', 'S'] \ + or ((text[pos - 1] in ["A", "O", "U", "E"] or pos == first) \ + and text[pos + 2] in ["L", "R", "N", "M", "B", "H", "F", "V", "W", " "]): + next = ('K', 1) + else: + if pos > first: + if text[first:first + 2] == 'MC': + next = ('K', 2) + else: + next = ('X', 'K', 2) + else: + next = ('X', 2) + # e.g, 'czerny' + elif text[pos:pos + 2] == 'CZ' and text[pos - 2:pos + 2] != 'WICZ': + next = ('S', 'X', 2) + # e.g., 'focaccia' + elif text[pos + 1:pos + 4] == 'CIA': + next = ('X', 3) + # double 'C', but not if e.g. 'McClellan' + elif text[pos:pos + 2] == 'CC' and not (pos == (first + 1) and text[first] == 'M'): + # 'bellocchio' but not 'bacchus' + if text[pos + 2] in ["I", "E", "H"] and text[pos + 2:pos + 4] != 'HU': + # 'accident', 'accede' 'succeed' + if (pos == (first + 1) and text[first] == 'A') or \ + text[pos - 1:pos + 4] in ['UCCEE', 'UCCES']: + next = ('KS', 3) + # 'bacci', 'bertucci', other italian + else: + next = ('X', 3) + else: + next = ('K', 2) + elif text[pos:pos + 2] in ["CK", "CG", "CQ"]: + next = ('K', 'K', 2) + elif text[pos:pos + 2] in ["CI", "CE", "CY"]: + # italian vs. english + if text[pos:pos + 3] in ["CIO", "CIE", "CIA"]: + next = ('S', 'X', 2) + else: + next = ('S', 2) + else: + # name sent in 'mac caffrey', 'mac gregor + if text[pos + 1:pos + 3] in [" C", " Q", " G"]: + next = ('K', 3) + else: + if text[pos + 1] in ["C", "K", "Q"] and text[pos + 1:pos + 3] not in ["CE", "CI"]: + next = ('K', 2) + else: # default for 'C' + next = ('K', 1) + elif ch == u('\xc7'): + next = ('S', 1) + elif ch == 'D': + if text[pos:pos + 2] == 'DG': + if text[pos + 2] in ['I', 'E', 'Y']: # e.g. 
'edge' + next = ('J', 3) + else: + next = ('TK', 2) + elif text[pos:pos + 2] in ['DT', 'DD']: + next = ('T', 2) + else: + next = ('T', 1) + elif ch == 'F': + if text[pos + 1] == 'F': + next = ('F', 2) + else: + next = ('F', 1) + elif ch == 'G': + if text[pos + 1] == 'H': + if pos > first and text[pos - 1] not in vowels: + next = ('K', 2) + elif pos < (first + 3): + if pos == first: # 'ghislane', ghiradelli + if text[pos + 2] == 'I': + next = ('J', 2) + else: + next = ('K', 2) + # Parker's rule (with some further refinements) - e.g., 'hugh' + elif (pos > (first + 1) and text[pos - 2] in ['B', 'H', 'D']) \ + or (pos > (first + 2) and text[pos - 3] in ['B', 'H', 'D']) \ + or (pos > (first + 3) and text[pos - 4] in ['B', 'H']): + next = (None, 2) + else: + # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' + if pos > (first + 2) and text[pos - 1] == 'U' \ + and text[pos - 3] in ["C", "G", "L", "R", "T"]: + next = ('F', 2) + else: + if pos > first and text[pos - 1] != 'I': + next = ('K', 2) + elif text[pos + 1] == 'N': + if pos == (first + 1) and text[first] in vowels and not slavo_germanic: + next = ('KN', 'N', 2) + else: + # not e.g. 'cagney' + if text[pos + 2:pos + 4] != 'EY' and text[pos + 1] != 'Y' and not slavo_germanic: + next = ('N', 'KN', 2) + else: + next = ('KN', 2) + # 'tagliaro' + elif text[pos + 1:pos + 3] == 'LI' and not slavo_germanic: + next = ('KL', 'L', 2) + # -ges-,-gep-,-gel-, -gie- at beginning + elif pos == first and (text[pos + 1] == 'Y' \ + or text[pos + 1:pos + 3] in ["ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"]): + next = ('K', 'J', 2) + # -ger-, -gy- + elif (text[pos + 1:pos + 2] == 'ER' or text[pos + 1] == 'Y') \ + and text[first:first + 6] not in ["DANGER", "RANGER", "MANGER"] \ + and text[pos - 1] not in ['E', 'I'] and text[pos - 1:pos + 2] not in ['RGY', 'OGY']: + next = ('K', 'J', 2) + # italian e.g, 'biaggi' + elif text[pos + 1] in ['E', 'I', 'Y'] or text[pos - 1:pos + 3] in ["AGGI", "OGGI"]: + # obvious germanic + if text[first:first + 4] in ['VON ', 'VAN '] or text[first:first + 3] == 'SCH' \ + or text[pos + 1:pos + 3] == 'ET': + next = ('K', 2) + else: + # always soft if french ending + if text[pos + 1:pos + 5] == 'IER ': + next = ('J', 2) + else: + next = ('J', 'K', 2) + elif text[pos + 1] == 'G': + next = ('K', 2) + else: + next = ('K', 1) + elif ch == 'H': + # only keep if first & before vowel or btw. 2 vowels + if (pos == first or text[pos - 1] in vowels) and text[pos + 1] in vowels: + next = ('H', 2) + else: # (also takes care of 'HH') + next = (None, 1) + elif ch == 'J': + # obvious spanish, 'jose', 'san jacinto' + if text[pos:pos + 4] == 'JOSE' or text[first:first + 4] == 'SAN ': + if (pos == first and text[pos + 4] == ' ') or text[first:first + 4] == 'SAN ': + next = ('H',) + else: + next = ('J', 'H') + elif pos == first and text[pos:pos + 4] != 'JOSE': + next = ('J', 'A') # Yankelovich/Jankelowicz + else: + # spanish pron. of e.g. 'bajador' + if text[pos - 1] in vowels and not slavo_germanic \ + and text[pos + 1] in ['A', 'O']: + next = ('J', 'H') + else: + if pos == last: + next = ('J', ' ') + else: + if text[pos + 1] not in ["L", "T", "K", "S", "N", "M", "B", "Z"] \ + and text[pos - 1] not in ["S", "K", "L"]: + next = ('J',) + else: + next = (None,) + if text[pos + 1] == 'J': + next = next + (2,) + else: + next = next + (1,) + elif ch == 'K': + if text[pos + 1] == 'K': + next = ('K', 2) + else: + next = ('K', 1) + elif ch == 'L': + if text[pos + 1] == 'L': + # spanish e.g. 
'cabrillo', 'gallegos' + if (pos == (last - 2) and text[pos - 1:pos + 3] in ["ILLO", "ILLA", "ALLE"]) \ + or ((text[last - 1:last + 1] in ["AS", "OS"] or text[last] in ["A", "O"]) \ + and text[pos - 1:pos + 3] == 'ALLE'): + next = ('L', '', 2) + else: + next = ('L', 2) + else: + next = ('L', 1) + elif ch == 'M': + if text[pos + 1:pos + 4] == 'UMB' \ + and (pos + 1 == last or text[pos + 2:pos + 4] == 'ER') \ + or text[pos + 1] == 'M': + next = ('M', 2) + else: + next = ('M', 1) + elif ch == 'N': + if text[pos + 1] == 'N': + next = ('N', 2) + else: + next = ('N', 1) + elif ch == u('\xd1'): + next = ('N', 1) + elif ch == 'P': + if text[pos + 1] == 'H': + next = ('F', 2) + elif text[pos + 1] in ['P', 'B']: # also account for "campbell", "raspberry" + next = ('P', 2) + else: + next = ('P', 1) + elif ch == 'Q': + if text[pos + 1] == 'Q': + next = ('K', 2) + else: + next = ('K', 1) + elif ch == 'R': + # french e.g. 'rogier', but exclude 'hochmeier' + if pos == last and not slavo_germanic \ + and text[pos - 2:pos] == 'IE' and text[pos - 4:pos - 2] not in ['ME', 'MA']: + next = ('', 'R') + else: + next = ('R',) + if text[pos + 1] == 'R': + next = next + (2,) + else: + next = next + (1,) + elif ch == 'S': + # special cases 'island', 'isle', 'carlisle', 'carlysle' + if text[pos - 1:pos + 2] in ['ISL', 'YSL']: + next = (None, 1) + # special case 'sugar-' + elif pos == first and text[first:first + 5] == 'SUGAR': + next = ('X', 'S', 1) + elif text[pos:pos + 2] == 'SH': + # germanic + if text[pos + 1:pos + 5] in ["HEIM", "HOEK", "HOLM", "HOLZ"]: + next = ('S', 2) + else: + next = ('X', 2) + # italian & armenian + elif text[pos:pos + 3] in ["SIO", "SIA"] or text[pos:pos + 4] == 'SIAN': + if not slavo_germanic: + next = ('S', 'X', 3) + else: + next = ('S', 3) + # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' + # also, -sz- in slavic language altho in hungarian it is pronounced 's' + elif (pos == first and text[pos + 1] in ["M", "N", "L", "W"]) or text[pos + 1] == 'Z': + next = ('S', 'X') + if text[pos + 1] == 'Z': + next = next + (2,) + else: + next = next + (1,) + elif text[pos:pos + 2] == 'SC': + # Schlesinger's rule + if text[pos + 2] == 'H': + # dutch origin, e.g. 'school', 'schooner' + if text[pos + 3:pos + 5] in ["OO", "ER", "EN", "UY", "ED", "EM"]: + # 'schermerhorn', 'schenker' + if text[pos + 3:pos + 5] in ['ER', 'EN']: + next = ('X', 'SK', 3) + else: + next = ('SK', 3) + else: + if pos == first and text[first + 3] not in vowels and text[first + 3] != 'W': + next = ('X', 'S', 3) + else: + next = ('X', 3) + elif text[pos + 2] in ['I', 'E', 'Y']: + next = ('S', 3) + else: + next = ('SK', 3) + # french e.g. 
'resnais', 'artois' + elif pos == last and text[pos - 2:pos] in ['AI', 'OI']: + next = ('', 'S', 1) + else: + next = ('S',) + if text[pos + 1] in ['S', 'Z']: + next = next + (2,) + else: + next = next + (1,) + elif ch == 'T': + if text[pos:pos + 4] == 'TION': + next = ('X', 3) + elif text[pos:pos + 3] in ['TIA', 'TCH']: + next = ('X', 3) + elif text[pos:pos + 2] == 'TH' or text[pos:pos + 3] == 'TTH': + # special case 'thomas', 'thames' or germanic + if text[pos + 2:pos + 4] in ['OM', 'AM'] or text[first:first + 4] in ['VON ', 'VAN '] \ + or text[first:first + 3] == 'SCH': + next = ('T', 2) + else: + next = ('0', 'T', 2) + elif text[pos + 1] in ['T', 'D']: + next = ('T', 2) + else: + next = ('T', 1) + elif ch == 'V': + if text[pos + 1] == 'V': + next = ('F', 2) + else: + next = ('F', 1) + elif ch == 'W': + # can also be in middle of word + if text[pos:pos + 2] == 'WR': + next = ('R', 2) + elif pos == first and (text[pos + 1] in vowels or text[pos:pos + 2] == 'WH'): + # Wasserman should match Vasserman + if text[pos + 1] in vowels: + next = ('A', 'F', 1) + else: + next = ('A', 1) + # Arnow should match Arnoff + elif (pos == last and text[pos - 1] in vowels) \ + or text[pos - 1:pos + 5] in ["EWSKI", "EWSKY", "OWSKI", "OWSKY"] \ + or text[first:first + 3] == 'SCH': + next = ('', 'F', 1) + # polish e.g. 'filipowicz' + elif text[pos:pos + 4] in ["WICZ", "WITZ"]: + next = ('TS', 'FX', 4) + else: # default is to skip it + next = (None, 1) + elif ch == 'X': + # french e.g. breaux + next = (None,) + if not(pos == last and (text[pos - 3:pos] in ["IAU", "EAU"] \ + or text[pos - 2:pos] in ['AU', 'OU'])): + next = ('KS',) + if text[pos + 1] in ['C', 'X']: + next = next + (2,) + else: + next = next + (1,) + elif ch == 'Z': + # chinese pinyin e.g. 'zhao' + if text[pos + 1] == 'H': + next = ('J',) + elif text[pos + 1:pos + 3] in ["ZO", "ZI", "ZA"] \ + or (slavo_germanic and pos > first and text[pos - 1] != 'T'): + next = ('S', 'TS') + else: + next = ('S',) + if text[pos + 1] == 'Z': + next = next + (2,) + else: + next = next + (1,) + else: + next = (None, 1) + + if len(next) == 2: + if next[0]: + primary += next[0] + secondary += next[0] + pos += next[1] + elif len(next) == 3: + if next[0]: + primary += next[0] + if next[1]: + secondary += next[1] + pos += next[2] + + if primary == secondary: + return (primary, None) + else: + return (primary, secondary) + diff --git a/src/whoosh/lang/isri.py b/src/whoosh/lang/isri.py new file mode 100644 index 0000000..b5061e6 --- /dev/null +++ b/src/whoosh/lang/isri.py @@ -0,0 +1,382 @@ +# -*- coding: utf-8 -*- +# +# Natural Language Toolkit: The ISRI Arabic Stemmer +# +# Copyright (C) 2001-2012 NLTK Proejct +# Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005) +# Author: Hosam Algasaier +# URL: +# For license information, see LICENSE.TXT + +""" +ISRI Arabic Stemmer + +The algorithm for this stemmer is described in: + +Taghva, K., Elkoury, R., and Coombs, J. 2005. Arabic Stemming without a root +dictionary. Information Science Research Institute. University of Nevada, Las +Vegas, USA. + +The Information Science Research Institute’s (ISRI) Arabic stemmer shares many +features with the Khoja stemmer. However, the main difference is that ISRI +stemmer does not use root dictionary. Also, if a root is not found, ISRI +stemmer returned normalized form, rather than returning the original +unmodified word. + +Additional adjustments were made to improve the algorithm: + +1- Adding 60 stop words. + +2- Adding the pattern (تفاعيل) to ISRI pattern set. 
+ +3- The step 2 in the original algorithm was normalizing all hamza. This step is +discarded because it increases the word ambiguities and changes the original +root. +""" + +from __future__ import unicode_literals +import re + + +class ISRIStemmer(object): + ''' + ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. + Information Science Research Institute. University of Nevada, Las Vegas, USA. + + A few minor modifications have been made to ISRI basic algorithm. + See the source code of this module for more information. + + isri.stem(token) returns Arabic root for the given token. + + The ISRI Stemmer requires that all tokens have Unicode string types. + If you use Python IDLE on Arabic Windows you have to decode text first + using Arabic '1256' coding. + ''' + + def __init__(self): + self.stm = 'defult none' + + self.p3 = ['\u0643\u0627\u0644', '\u0628\u0627\u0644', + '\u0648\u0644\u0644', '\u0648\u0627\u0644'] # length three prefixes + self.p2 = ['\u0627\u0644', '\u0644\u0644'] # length two prefixes + self.p1 = ['\u0644', '\u0628', '\u0641', '\u0633', '\u0648', + '\u064a', '\u062a', '\u0646', '\u0627'] # length one prefixes + + self.s3 = ['\u062a\u0645\u0644', '\u0647\u0645\u0644', + '\u062a\u0627\u0646', '\u062a\u064a\u0646', + '\u0643\u0645\u0644'] # length three suffixes + self.s2 = ['\u0648\u0646', '\u0627\u062a', '\u0627\u0646', + '\u064a\u0646', '\u062a\u0646', '\u0643\u0645', + '\u0647\u0646', '\u0646\u0627', '\u064a\u0627', + '\u0647\u0627', '\u062a\u0645', '\u0643\u0646', + '\u0646\u064a', '\u0648\u0627', '\u0645\u0627', + '\u0647\u0645'] # length two suffixes + self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a', + '\u0627', '\u0646'] # length one suffixes + + self.pr4 = {0: ['\u0645'], 1:['\u0627'], + 2: ['\u0627', '\u0648', '\u064A'], 3:['\u0629']} # groups of length four patterns + self.pr53 = {0: ['\u0627', '\u062a'], + 1: ['\u0627', '\u064a', '\u0648'], + 2: ['\u0627', '\u062a', '\u0645'], + 3: ['\u0645', '\u064a', '\u062a'], + 4: ['\u0645', '\u062a'], + 5: ['\u0627', '\u0648'], + 6: ['\u0627', '\u0645']} # Groups of length five patterns and length three roots + + self.re_short_vowels = re.compile(r'[\u064B-\u0652]') + self.re_hamza = re.compile(r'[\u0621\u0624\u0626]') + self.re_intial_hamza = re.compile(r'^[\u0622\u0623\u0625]') + + self.stop_words = ['\u064a\u0643\u0648\u0646', + '\u0648\u0644\u064a\u0633', + '\u0648\u0643\u0627\u0646', + '\u0643\u0630\u0644\u0643', + '\u0627\u0644\u062a\u064a', + '\u0648\u0628\u064a\u0646', + '\u0639\u0644\u064a\u0647\u0627', + '\u0645\u0633\u0627\u0621', + '\u0627\u0644\u0630\u064a', + '\u0648\u0643\u0627\u0646\u062a', + '\u0648\u0644\u0643\u0646', + '\u0648\u0627\u0644\u062a\u064a', + '\u062a\u0643\u0648\u0646', + '\u0627\u0644\u064a\u0648\u0645', + '\u0627\u0644\u0644\u0630\u064a\u0646', + '\u0639\u0644\u064a\u0647', + '\u0643\u0627\u0646\u062a', + '\u0644\u0630\u0644\u0643', + '\u0623\u0645\u0627\u0645', + '\u0647\u0646\u0627\u0643', + '\u0645\u0646\u0647\u0627', + '\u0645\u0627\u0632\u0627\u0644', + '\u0644\u0627\u0632\u0627\u0644', + '\u0644\u0627\u064a\u0632\u0627\u0644', + '\u0645\u0627\u064a\u0632\u0627\u0644', + '\u0627\u0635\u0628\u062d', + '\u0623\u0635\u0628\u062d', + '\u0623\u0645\u0633\u0649', + '\u0627\u0645\u0633\u0649', + '\u0623\u0636\u062d\u0649', + '\u0627\u0636\u062d\u0649', + '\u0645\u0627\u0628\u0631\u062d', + '\u0645\u0627\u0641\u062a\u0626', + '\u0645\u0627\u0627\u0646\u0641\u0643', + '\u0644\u0627\u0633\u064a\u0645\u0627', + 
'\u0648\u0644\u0627\u064a\u0632\u0627\u0644', + '\u0627\u0644\u062d\u0627\u0644\u064a', + '\u0627\u0644\u064a\u0647\u0627', + '\u0627\u0644\u0630\u064a\u0646', + '\u0641\u0627\u0646\u0647', + '\u0648\u0627\u0644\u0630\u064a', + '\u0648\u0647\u0630\u0627', + '\u0644\u0647\u0630\u0627', + '\u0641\u0643\u0627\u0646', + '\u0633\u062a\u0643\u0648\u0646', + '\u0627\u0644\u064a\u0647', + '\u064a\u0645\u0643\u0646', + '\u0628\u0647\u0630\u0627', + '\u0627\u0644\u0630\u0649'] + + + def stem(self, token): + """ + Stemming a word token using the ISRI stemmer. + """ + + self.stm = token + self.norm(1) # remove diacritics which representing Arabic short vowels + if self.stm in self.stop_words: return self.stm # exclude stop words from being processed + self.pre32() # remove length three and length two prefixes in this order + self.suf32() # remove length three and length two suffixes in this order + self.waw() # remove connective ‘و’ if it precedes a word beginning with ‘و’ + self.norm(2) # normalize initial hamza to bare alif + if len(self.stm) <= 3: return self.stm # return stem if less than or equal to three + + if len(self.stm) == 4: # length 4 word + self.pro_w4() + return self.stm + elif len(self.stm) == 5: # length 5 word + self.pro_w53() + self.end_w5() + return self.stm + elif len(self.stm) == 6: # length 6 word + self.pro_w6() + self.end_w6() + return self.stm + elif len(self.stm) == 7: # length 7 word + self.suf1() + if len(self.stm) == 7: + self.pre1() + if len(self.stm) == 6: + self.pro_w6() + self.end_w6() + return self.stm + return self.stm # if word length >7 , then no stemming + + def norm(self, num): + """ + normalization: + num=1 normalize diacritics + num=2 normalize initial hamza + num=3 both 1&2 + """ + self.k = num + + if self.k == 1: + self.stm = self.re_short_vowels.sub('', self.stm) + return self.stm + elif self.k == 2: + self.stm = self.re_intial_hamza.sub(r'\u0627', self.stm) + return self.stm + elif self.k == 3: + self.stm = self.re_short_vowels.sub('', self.stm) + self.stm = self.re_intial_hamza.sub(r'\u0627', self.stm) + return self.stm + + def pre32(self): + """remove length three and length two prefixes in this order""" + if len(self.stm) >= 6: + for pre3 in self.p3: + if self.stm.startswith(pre3): + self.stm = self.stm[3:] + return self.stm + elif len(self.stm) >= 5: + for pre2 in self.p2: + if self.stm.startswith(pre2): + self.stm = self.stm[2:] + return self.stm + + def suf32(self): + """remove length three and length two suffixes in this order""" + if len(self.stm) >= 6: + for suf3 in self.s3: + if self.stm.endswith(suf3): + self.stm = self.stm[:-3] + return self.stm + elif len(self.stm) >= 5: + for suf2 in self.s2: + if self.stm.endswith(suf2): + self.stm = self.stm[:-2] + return self.stm + + + def waw(self): + """remove connective ‘و’ if it precedes a word beginning with ‘و’ """ + if (len(self.stm) >= 4) & (self.stm[:2] == '\u0648\u0648'): + self.stm = self.stm[1:] + return self.stm + + def pro_w4(self): + """process length four patterns and extract length three roots""" + if self.stm[0] in self.pr4[0]: # مفعل + self.stm = self.stm[1:] + return self.stm + elif self.stm[1] in self.pr4[1]: # فاعل + self.stm = self.stm[0] + self.stm[2:] + return self.stm + elif self.stm[2] in self.pr4[2]: # فعال - فعول - فعيل + self.stm = self.stm[:2] + self.stm[3] + return self.stm + elif self.stm[3] in self.pr4[3]: # فعلة + self.stm = self.stm[:-1] + return self.stm + else: + self.suf1() # do - normalize short sufix + if len(self.stm) == 4: + self.pre1() # do - normalize short 
prefix + return self.stm + + def pro_w53(self): + """process length five patterns and extract length three roots""" + if ((self.stm[2] in self.pr53[0]) & (self.stm[0] == '\u0627')): # افتعل - افاعل + self.stm = self.stm[1] + self.stm[3:] + return self.stm + elif ((self.stm[3] in self.pr53[1]) & (self.stm[0] == '\u0645')): # مفعول - مفعال - مفعيل + self.stm = self.stm[1:3] + self.stm[4] + return self.stm + elif ((self.stm[0] in self.pr53[2]) & (self.stm[4] == '\u0629')): # مفعلة - تفعلة - افعلة + self.stm = self.stm[1:4] + return self.stm + elif ((self.stm[0] in self.pr53[3]) & (self.stm[2] == '\u062a')): # مفتعل - يفتعل - تفتعل + self.stm = self.stm[1] + self.stm[3:] + return self.stm + elif ((self.stm[0] in self.pr53[4]) & (self.stm[2] == '\u0627')): #مفاعل - تفاعل + self.stm = self.stm[1] + self.stm[3:] + return self.stm + elif ((self.stm[2] in self.pr53[5]) & (self.stm[4] == '\u0629')): # فعولة - فعالة + self.stm = self.stm[:2] + self.stm[3] + return self.stm + elif ((self.stm[0] in self.pr53[6]) & (self.stm[1] == '\u0646')): # انفعل - منفعل + self.stm = self.stm[2:] + return self.stm + elif ((self.stm[3] == '\u0627') & (self.stm[0] == '\u0627')): # افعال + self.stm = self.stm[1:3] + self.stm[4] + return self.stm + elif ((self.stm[4] == '\u0646') & (self.stm[3] == '\u0627')): # فعلان + self.stm = self.stm[:3] + return self.stm + elif ((self.stm[3] == '\u064a') & (self.stm[0] == '\u062a')): # تفعيل + self.stm = self.stm[1:3] + self.stm[4] + return self.stm + elif ((self.stm[3] == '\u0648') & (self.stm[1] == '\u0627')): # فاعول + self.stm = self.stm[0] + self.stm[2] + self.stm[4] + return self.stm + elif ((self.stm[2] == '\u0627') & (self.stm[1] == '\u0648')): # فواعل + self.stm = self.stm[0] + self.stm[3:] + return self.stm + elif ((self.stm[3] == '\u0626') & (self.stm[2] == '\u0627')): # فعائل + self.stm = self.stm[:2] + self.stm[4] + return self.stm + elif ((self.stm[4] == '\u0629') & (self.stm[1] == '\u0627')): # فاعلة + self.stm = self.stm[0] + self.stm[2:4] + return self.stm + elif ((self.stm[4] == '\u064a') & (self.stm[2] == '\u0627')): # فعالي + self.stm = self.stm[:2] + self.stm[3] + return self.stm + else: + self.suf1() # do - normalize short sufix + if len(self.stm) == 5: + self.pre1() # do - normalize short prefix + return self.stm + + def pro_w54(self): + """process length five patterns and extract length four roots""" + if (self.stm[0] in self.pr53[2]): #تفعلل - افعلل - مفعلل + self.stm = self.stm[1:] + return self.stm + elif (self.stm[4] == '\u0629'): # فعللة + self.stm = self.stm[:4] + return self.stm + elif (self.stm[2] == '\u0627'): # فعالل + self.stm = self.stm[:2] + self.stm[3:] + return self.stm + + def end_w5(self): + """ending step (word of length five)""" + if len(self.stm) == 3: + return self.stm + elif len(self.stm) == 4: + self.pro_w4() + return self.stm + elif len(self.stm) == 5: + self.pro_w54() + return self.stm + + def pro_w6(self): + """process length six patterns and extract length three roots""" + if ((self.stm.startswith('\u0627\u0633\u062a')) or (self.stm.startswith('\u0645\u0633\u062a'))): # مستفعل - استفعل + self.stm = self.stm[3:] + return self.stm + elif (self.stm[0] == '\u0645' and self.stm[3] == '\u0627' and self.stm[5] == '\u0629'): # مفعالة + self.stm = self.stm[1:3] + self.stm[4] + return self.stm + elif (self.stm[0] == '\u0627' and self.stm[2] == '\u062a' and self.stm[4] == '\u0627'): # افتعال + self.stm = self.stm[1] + self.stm[3] + self.stm[5] + return self.stm + elif (self.stm[0] == '\u0627' and self.stm[3] == '\u0648' and self.stm[2] == 
self.stm[4]): # افعوعل + self.stm = self.stm[1] + self.stm[4:] + return self.stm + elif (self.stm[0] == '\u062a' and self.stm[2] == '\u0627' and self.stm[4] == '\u064a'): # تفاعيل new pattern + self.stm = self.stm[1] + self.stm[3] + self.stm[5] + return self.stm + else: + self.suf1() # do - normalize short sufix + if len(self.stm) == 6: + self.pre1() # do - normalize short prefix + return self.stm + + def pro_w64(self): + """process length six patterns and extract length four roots""" + if (self.stm[0] and self.stm[4]) == '\u0627': # افعلال + self.stm = self.stm[1:4] + self.stm[5] + return self.stm + elif (self.stm.startswith('\u0645\u062a')): # متفعلل + self.stm = self.stm[2:] + return self.stm + + def end_w6(self): + """ending step (word of length six)""" + if len(self.stm) == 3: + return self.stm + elif len(self.stm) == 5: + self.pro_w53() + self.end_w5() + return self.stm + elif len (self.stm) == 6: + self.pro_w64() + return self.stm + + def suf1(self): + """normalize short sufix""" + for sf1 in self.s1: + if self.stm.endswith(sf1): + self.stm = self.stm[:-1] + return self.stm + + def pre1(self): + """normalize short prefix""" + for sp1 in self.p1: + if self.stm.startswith(sp1): + self.stm = self.stm[1:] + return self.stm diff --git a/src/whoosh/lang/lovins.py b/src/whoosh/lang/lovins.py new file mode 100644 index 0000000..1e5a933 --- /dev/null +++ b/src/whoosh/lang/lovins.py @@ -0,0 +1,570 @@ +"""This module implements the Lovins stemming algorithm. Use the ``stem()`` +function:: + + stemmed_word = stem(word) +""" + +from collections import defaultdict + + +# Conditions + +def A(base): + # A No restrictions on stem + return True + + +def B(base): + # B Minimum stem length = 3 + return len(base) > 2 + + +def C(base): + # C Minimum stem length = 4 + return len(base) > 3 + + +def D(base): + # D Minimum stem length = 5 + return len(base) > 4 + + +def E(base): + # E Do not remove ending after e + return base[-1] != "e" + + +def F(base): + # F Minimum stem length = 3 and do not remove ending after e + return len(base) > 2 and base[-1] != "e" + + +def G(base): + # G Minimum stem length = 3 and remove ending only after f + return len(base) > 2 and base[-1] == "f" + + +def H(base): + # H Remove ending only after t or ll + c1, c2 = base[-2:] + return c2 == "t" or (c2 == "l" and c1 == "l") + + +def I(base): + # I Do not remove ending after o or e + c = base[-1] + return c != "o" and c != "e" + + +def J(base): + # J Do not remove ending after a or e + c = base[-1] + return c != "a" and c != "e" + + +def K(base): + # K Minimum stem length = 3 and remove ending only after l, i or u*e + c = base[-1] + cc = base[-3] + return len(base) > 2 and (c == "l" or c == "i" or (c == "e" and cc == "u")) + + +def L(base): + # L Do not remove ending after u, x or s, unless s follows o + c1, c2 = base[-2:] + return c2 != "u" and c2 != "x" and (c2 != "s" or c1 == "o") + + +def M(base): + # M Do not remove ending after a, c, e or m + c = base[-1] + return c != "a" and c != "c" and c != "e" and c != "m" + + +def N(base): + # N Minimum stem length = 4 after s**, elsewhere = 3 + return len(base) > 3 or (len(base) == 3 and base[-1] != "s") + + +def O(base): + # O Remove ending only after l or i + c = base[-1] + return c == "l" or c == "i" + + +def P(base): + # P Do not remove ending after c + return base[-1] != "c" + + +def Q(base): + # Q Minimum stem length = 3 and do not remove ending after l or n + c = base[-1] + return len(base) > 2 and (c != "l" and c != "n") + + +def R(base): + # R Remove ending only after n or r 
+ c = base[-1] + return c == "n" or c == "r" + + +def S(base): + # S Remove ending only after dr or t, unless t follows t + l2 = base[-2] + return l2 == "rd" or (base[-1] == "t" and l2 != "tt") + + +def T(base): + # T Remove ending only after s or t, unless t follows o + c1, c2 = base[-2:] + return c2 == "s" or (c2 == "t" and c1 != "o") + + +def U(base): + # U Remove ending only after l, m, n or r + c = base[-1] + return c == "l" or c == "m" or c == "n" or c == "r" + + +def V(base): + # V Remove ending only after c + return base[-1] == "c" + + +def W(base): + # W Do not remove ending after s or u + c = base[-1] + return c != "s" and c != "u" + + +def X(base): + # X Remove ending only after l, i or u*e + c = base[-1] + cc = base[-3] + return c == "l" or c == "i" or (c == "e" and cc == "u") + + +def Y(base): + # Y Remove ending only after in + return base[-2:] == "in" + + +def Z(base): + # Z Do not remove ending after f + return base[-1] != "f" + + +def a(base): + # a Remove ending only after d, f, ph, th, l, er, or, es or t + c = base[-1] + l2 = base[-2:] + return (c == "d" or c == "f" or l2 == "ph" or l2 == "th" or c == "l" + or l2 == "er" or l2 == "or" or l2 == "es" or c == "t") + + +def b(base): + # b Minimum stem length = 3 and do not remove ending after met or ryst + return len(base) > 2 and not (base.endswith("met") + or base.endswith("ryst")) + + +def c(base): + # c Remove ending only after l + return base[-1] == "l" + + +# Endings + +m = [None] * 12 + +m[11] = dict(( + ("alistically", B), + ("arizability", A), + ("izationally", B))) +m[10] = dict(( + ("antialness", A), + ("arisations", A), + ("arizations", A), + ("entialness", A))) +m[9] = dict(( + ("allically", C), + ("antaneous", A), + ("antiality", A), + ("arisation", A), + ("arization", A), + ("ationally", B), + ("ativeness", A), + ("eableness", E), + ("entations", A), + ("entiality", A), + ("entialize", A), + ("entiation", A), + ("ionalness", A), + ("istically", A), + ("itousness", A), + ("izability", A), + ("izational", A))) +m[8] = dict(( + ("ableness", A), + ("arizable", A), + ("entation", A), + ("entially", A), + ("eousness", A), + ("ibleness", A), + ("icalness", A), + ("ionalism", A), + ("ionality", A), + ("ionalize", A), + ("iousness", A), + ("izations", A), + ("lessness", A))) +m[7] = dict(( + ("ability", A), + ("aically", A), + ("alistic", B), + ("alities", A), + ("ariness", E), + ("aristic", A), + ("arizing", A), + ("ateness", A), + ("atingly", A), + ("ational", B), + ("atively", A), + ("ativism", A), + ("elihood", E), + ("encible", A), + ("entally", A), + ("entials", A), + ("entiate", A), + ("entness", A), + ("fulness", A), + ("ibility", A), + ("icalism", A), + ("icalist", A), + ("icality", A), + ("icalize", A), + ("ication", G), + ("icianry", A), + ("ination", A), + ("ingness", A), + ("ionally", A), + ("isation", A), + ("ishness", A), + ("istical", A), + ("iteness", A), + ("iveness", A), + ("ivistic", A), + ("ivities", A), + ("ization", F), + ("izement", A), + ("oidally", A), + ("ousness", A))) +m[6] = dict(( + ("aceous", A), + ("acious", B), + ("action", G), + ("alness", A), + ("ancial", A), + ("ancies", A), + ("ancing", B), + ("ariser", A), + ("arized", A), + ("arizer", A), + ("atable", A), + ("ations", B), + ("atives", A), + ("eature", Z), + ("efully", A), + ("encies", A), + ("encing", A), + ("ential", A), + ("enting", C), + ("entist", A), + ("eously", A), + ("ialist", A), + ("iality", A), + ("ialize", A), + ("ically", A), + ("icance", A), + ("icians", A), + ("icists", A), + ("ifully", A), + ("ionals", A), + 
("ionate", D), + ("ioning", A), + ("ionist", A), + ("iously", A), + ("istics", A), + ("izable", E), + ("lessly", A), + ("nesses", A), + ("oidism", A))) +m[5] = dict(( + ("acies", A), + ("acity", A), + ("aging", B), + ("aical", A), + ("alist", A), + ("alism", B), + ("ality", A), + ("alize", A), + ("allic", b), + ("anced", B), + ("ances", B), + ("antic", C), + ("arial", A), + ("aries", A), + ("arily", A), + ("arity", B), + ("arize", A), + ("aroid", A), + ("ately", A), + ("ating", I), + ("ation", B), + ("ative", A), + ("ators", A), + ("atory", A), + ("ature", E), + ("early", Y), + ("ehood", A), + ("eless", A), + ("elily", A), + ("ement", A), + ("enced", A), + ("ences", A), + ("eness", E), + ("ening", E), + ("ental", A), + ("ented", C), + ("ently", A), + ("fully", A), + ("ially", A), + ("icant", A), + ("ician", A), + ("icide", A), + ("icism", A), + ("icist", A), + ("icity", A), + ("idine", I), + ("iedly", A), + ("ihood", A), + ("inate", A), + ("iness", A), + ("ingly", B), + ("inism", J), + ("inity", c), + ("ional", A), + ("ioned", A), + ("ished", A), + ("istic", A), + ("ities", A), + ("itous", A), + ("ively", A), + ("ivity", A), + ("izers", F), + ("izing", F), + ("oidal", A), + ("oides", A), + ("otide", A), + ("ously", A))) +m[4] = dict(( + ("able", A), + ("ably", A), + ("ages", B), + ("ally", B), + ("ance", B), + ("ancy", B), + ("ants", B), + ("aric", A), + ("arly", K), + ("ated", I), + ("ates", A), + ("atic", B), + ("ator", A), + ("ealy", Y), + ("edly", E), + ("eful", A), + ("eity", A), + ("ence", A), + ("ency", A), + ("ened", E), + ("enly", E), + ("eous", A), + ("hood", A), + ("ials", A), + ("ians", A), + ("ible", A), + ("ibly", A), + ("ical", A), + ("ides", L), + ("iers", A), + ("iful", A), + ("ines", M), + ("ings", N), + ("ions", B), + ("ious", A), + ("isms", B), + ("ists", A), + ("itic", H), + ("ized", F), + ("izer", F), + ("less", A), + ("lily", A), + ("ness", A), + ("ogen", A), + ("ward", A), + ("wise", A), + ("ying", B), + ("yish", A))) +m[3] = dict(( + ("acy", A), + ("age", B), + ("aic", A), + ("als", b), + ("ant", B), + ("ars", O), + ("ary", F), + ("ata", A), + ("ate", A), + ("eal", Y), + ("ear", Y), + ("ely", E), + ("ene", E), + ("ent", C), + ("ery", E), + ("ese", A), + ("ful", A), + ("ial", A), + ("ian", A), + ("ics", A), + ("ide", L), + ("ied", A), + ("ier", A), + ("ies", P), + ("ily", A), + ("ine", M), + ("ing", N), + ("ion", Q), + ("ish", C), + ("ism", B), + ("ist", A), + ("ite", a), + ("ity", A), + ("ium", A), + ("ive", A), + ("ize", F), + ("oid", A), + ("one", R), + ("ous", A))) +m[2] = dict(( + ("ae", A), + ("al", b), + ("ar", X), + ("as", B), + ("ed", E), + ("en", F), + ("es", E), + ("ia", A), + ("ic", A), + ("is", A), + ("ly", B), + ("on", S), + ("or", T), + ("um", U), + ("us", V), + ("yl", R), + ("s'", A), + ("'s", A))) +m[1] = dict(( + ("a", A), + ("e", A), + ("i", A), + ("o", A), + ("s", W), + ("y", B))) + + +def remove_ending(word): + length = len(word) + el = 11 + while el > 0: + if length - el > 1: + ending = word[length - el:] + cond = m[el].get(ending) + if cond: + base = word[:length - el] + if cond(base): + return base + el -= 1 + return word + + +_endings = (("iev", "ief"), + ("uct", "uc"), + ("iev", "ief"), + ("uct", "uc"), + ("umpt", "um"), + ("rpt", "rb"), + ("urs", "ur"), + ("istr", "ister"), + ("metr", "meter"), + ("olv", "olut"), + ("ul", "l", "aoi"), + ("bex", "bic"), + ("dex", "dic"), + ("pex", "pic"), + ("tex", "tic"), + ("ax", "ac"), + ("ex", "ec"), + ("ix", "ic"), + ("lux", "luc"), + ("uad", "uas"), + ("vad", "vas"), + ("cid", "cis"), + ("lid", 
"lis"), + ("erid", "eris"), + ("pand", "pans"), + ("end", "ens", "s"), + ("ond", "ons"), + ("lud", "lus"), + ("rud", "rus"), + ("her", "hes", "pt"), + ("mit", "mis"), + ("ent", "ens", "m"), + ("ert", "ers"), + ("et", "es", "n"), + ("yt", "ys"), + ("yz", "ys")) + + +# Hash the ending rules by the last letter of the target ending +_endingrules = defaultdict(list) +for rule in _endings: + _endingrules[rule[0][-1]].append(rule) + +_doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt")) + + +def fix_ending(word): + if word[-2:] in _doubles: + word = word[:-1] + + for endingrule in _endingrules[word[-1]]: + target, newend = endingrule[:2] + if word.endswith(target): + if len(endingrule) > 2: + exceptafter = endingrule[2] + c = word[0 - (len(target) + 1)] + if c in exceptafter: + return word + + return word[:0 - len(target)] + newend + + return word + + +def stem(word): + """Returns the stemmed version of the argument string. + """ + return fix_ending(remove_ending(word)) diff --git a/src/whoosh/lang/morph_en.py b/src/whoosh/lang/morph_en.py new file mode 100644 index 0000000..bb63573 --- /dev/null +++ b/src/whoosh/lang/morph_en.py @@ -0,0 +1,933 @@ +""" +Contains the variations() function for expanding an English word into multiple +variations by programatically adding and removing suffixes. + +Translated to Python from the ``com.sun.labs.minion.lexmorph.LiteMorph_en`` +class of Sun's `Minion search engine `_. +""" + +import re + +from whoosh.compat import xrange, iteritems +# Rule exceptions + +exceptions = [ + "a", + "abandoner abandon abandons abandoned abandoning abandonings abandoners", + "abdomen abdomens", + "about", + "above", + "acid acids acidic acidity acidities", + "across", + "act acts acted acting actor actors", + "ad ads", + "add adds added adding addings addition additions adder adders", + "advertise advertises advertised advertising advertiser advertisers advertisement advertisements advertisings", + "after", + "again", + "against", + "ago", + "all", + "almost", + "along", + "already", + "also", + "although", + "alumna alumnae alumnus alumni", + "always", + "amen amens", + "amidships", + "amid amidst", + "among amongst", + "an", + "analysis analyses", + "and", + "another other others", + "antenna antennas antennae", + "antitheses antithesis", + "any", + "anyone anybody", + "anything", + "appendix appendixes appendices", + "apropos", + "aquarium aquariums aquaria", + "argument arguments argue argues argued arguing arguings arguer arguers", + "arise arises arose arisen ariser arisers arising arisings", + "around", + "as", + "asbestos", + "at", + "atlas atlases", + "auger augers augered augering augerings augerer augerers", + "augment augments augmented augmenting augmentings augmentation augmentations augmenter augmenters", + "automata automaton automatons", + "automation automating automate automates automated automatic", + "avoirdupois", + "awake awakes awoke awaked awoken awaker awakers awaking awakings awakening awakenings", + "away", + "awful awfully awfulness", + "axis axes axises", + "bacillus bacilli", + "bacterium bacteria", + "bad worse worst badly badness", + "bas", + "bases basis", + "bases base based basing basings basely baseness basenesses basement basements baseless basic basics", + "be am are is was were been being", + "bear bears bore borne bearing bearings bearer bearers", + "beat beats beaten beating beatings beater beaters", + "because", + "become becomes became becoming", + "beef beefs beeves beefed beefing", + "beer beers", + "before", + 
"begin begins began begun beginning beginnings beginner beginners", + "behalf behalves", + "being beings", + "bend bends bent bending bendings bender benders", + "bereave bereaves bereaved bereft bereaving bereavings bereavement bereavements", + "beside besides", + "best bests bested besting", + "bet bets betting bettor bettors", + "betimes", + "between", + "beyond", + "bid bids bade bidden bidding biddings bidder bidders", + "bier biers", + "bind binds bound binding bindings binder binders", + "bit bits", + "bite bites bit bitten biting bitings biter biters", + "blackfoot blackfeet", + "bleed bleeds bled bleeding bleedings bleeder bleeders", + "blow blows blew blown blowing blowings blower blowers", + "bookshelf bookshelves", + "both", + "bound bounds bounded bounding boundings bounder bounders boundless", + "bourgeois bourgeoisie", + "bra bras", + "brahman brahmans", + "break breaks broke broken breaking breakings breaker breakers", + "breed breeds bred breeding breedings breeder breeders", + "bring brings brought bringing bringings bringer bringers", + "build builds built building buildings builder builders", + "bus buses bused bussed busing bussing busings bussings buser busers busser bussers", + "buss busses bussed bussing bussings busser bussers", + "but", + "buy buys bought buying buyings buyer buyers", + "by", + "calf calves calved calving calvings calver calvers", + "can cans canned canning cannings canner canners", + "can could cannot", + "canoes canoe canoed canoeing canoeings canoer canoers", + "catch catches caught catching catchings catcher catchers", + "cement cements cemented cementing cementings cementer cementers", + "cent cents", + "center centers centered centering centerings centerless", + "child children childless childish childishly", + "choose chooses chose chosen choosing choosings chooser choosers", + "cling clings clung clinging clingings clinger clingers", + "colloquium colloquia colloquiums", + "come comes came coming comings comer comers", + "comment comments commented commenting commentings commenter commenters", + "compendium compendia compendiums", + "complement complements complemented complementing complementings complementer complementers complementary", + "compliment compliments complimented complimenting complimentings complimenter complimenters complimentary", + "concerto concertos concerti", + "condiment condiments", + "corps", + "cortex cortices cortexes cortical", + "couscous", + "creep creeps crept creeping creepings creeper creepers creepy", + "crisis crises", + "criterion criteria criterial", + "cryptanalysis cryptanalyses", + "curriculum curricula curriculums curricular", + "datum data", + "day days daily", + "deal deals dealt dealing dealings dealer dealers", + "decrement decrements decremented decrementing decrementings decrementer decrementers decremental", + "deer deers", + "demented dementia", + "desideratum desiderata", + "diagnosis diagnoses diagnose diagnosed diagnosing diagnostic", + "dialysis dialyses", + "dice dices diced dicing dicings dicer dicers", + "die dice", + "die dies died dying dyings", + "dig digs dug digging diggings digger diggers", + "dive dives diver divers dove dived diving divings", + "divest divests divester divesters divested divesting divestings divestment divestments", + "do does did done doing doings doer doers", + "document documents documented documenting documentings documenter documenters documentation documentations documentary", + "doe does", + "dove doves", + "downstairs", + "dozen", + "draw draws drew 
drawn drawing drawings drawer drawers", + "drink drinks drank drunk drinking drinkings drinker drinkers", + "drive drives drove driven driving drivings driver drivers driverless", + "due dues duly", + "during", + "e", + "each", + "eager eagerer eagerest eagerly eagerness eagernesses", + "early earlier earliest", + "easement easements", + "eat eats ate eaten eating eatings eater eaters", + "effluvium effluvia", + "either", + "element elements elementary", + "elf elves elfen", + "ellipse ellipses elliptic elliptical elliptically", + "ellipsis ellipses elliptic elliptical elliptically", + "else", + "embolus emboli embolic embolism", + "emolument emoluments", + "emphasis emphases", + "employ employs employed employing employer employers employee employees employment employments employable", + "enough", + "equilibrium equilibria equilibriums", + "erratum errata", + "ever", + "every", + "everything", + "exotic exotically exoticness exotica", + "experiment experiments experimented experimenting experimentings experimenter experimenters experimentation experimental", + "extra extras", + "fall falls fell fallen falling fallings faller fallers", + "far farther farthest", + "fee fees feeless", + "feed feeds fed feeding feedings feeder feeders", + "feel feels felt feeling feelings feeler feelers", + "ferment ferments fermented fermenting fermentings fermentation fermentations fermenter fermenters", + "few fewer fewest", + "fight fights fought fighting fightings fighter fighters", + "figment figments", + "filament filaments", + "find finds found finding findings finder finders", + "firmament firmaments", + "flee flees fled fleeing fleeings", + "fling flings flung flinging flingings flinger flingers", + "floe floes", + "fly flies flew flown flying flyings flier fliers flyer flyers", + "focus foci focuses focused focusing focusses focussed focussing focuser focal", + "foment foments fomented fomenting fomentings fomenter fomenters", + "foot feet", + "foot foots footed footing footer footers", + "footing footings footer footers", + "for", + "forbid forbids forbade forbidden forbidding forbiddings forbidder forbidders", + "foresee foresaw foreseen foreseeing foreseeings foreseer foreseers", + "forest forests forester foresting forestation forestations", + "forget forgets forgot forgotten forgetting forgettings forgetter forgetters forgetful", + "forsake forsakes forsook forsaken forsaking forsakings forsaker forsakers", + "found founds founded founding foundings founder founders", + "fragment fragments fragmented fragmenting fragmentings fragmentation fragmentations fragmenter fragmenters", + "free frees freer freest freed freeing freely freeness freenesses", + "freeze freezes froze frozen freezing freezings freezer freezers", + "from", + "full fully fuller fullest", + "fuller fullers full fulls fulled fulling fullings", + "fungus fungi funguses fungal", + "gallows", + "ganglion ganglia ganglions ganglionic", + "garment garments", + "gas gasses gassed gassing gassings gasser gassers", + "gas gases gasses gaseous gasless", + "gel gels gelled gelling gellings geller gellers", + "german germans germanic germany German Germans Germanic Germany", + "get gets got gotten getting gettings getter getters", + "give gives gave given giving givings giver givers", + "gladiolus gladioli gladioluses gladiola gladiolas gladiolae", + "glans glandes", + "gluiness gluey glue glues glued gluing gluings gluer gluers", + "go goes went gone going goings goer goers", + "godchild godchildren", + "good better best goodly goodness 
goodnesses", + "goods", + "goose geese", + "goose gooses goosed goosing goosings gooser goosers", + "grandchild grandchildren", + "grind grinds ground grinding grindings grinder grinders", + "ground grounds grounded grounding groundings grounder grounders groundless", + "grow grows grew grown growing growings grower growers growth", + "gum gums gummed gumming gummings gummer gummers", + "half halves", + "halve halves halved halving halvings halver halvers", + "hang hangs hung hanged hanging hangings hanger hangers", + "have has had having havings haver havers", + "he him his himself", + "hear hears heard hearing hearings hearer hearers", + "here", + "hide hides hid hidden hiding hidings hider hiders", + "hippopotamus hippopotami hippopotamuses", + "hold holds held holding holdings holder holders", + "honorarium honoraria honorariums", + "hoof hoofs hooves hoofed hoofing hoofer hoofers", + "how", + "hum hums hummed humming hummings hummer hummers", + "hymen hymens hymenal", + "hypotheses hypothesis hypothesize hypothesizes hypothesized hypothesizer hypothesizing hypothetical hypothetically", + "i", + "if iffy", + "impediment impediments", + "implement implements implemented implementing implementings implementation implementations implementer implementers", + "imply implies implied implying implyings implier impliers", + "in inner", + "inclement", + "increment increments incremented incrementing incrementings incrementer incrementers incremental incrementally", + "index indexes indexed indexing indexings indexer indexers", + "index indexes indices indexical indexicals", + "indoor indoors", + "instrument instruments instrumented instrumenting instrumentings instrumenter instrumenters instrumentation instrumentations instrumental", + "integument integumentary", + "into", + "it its itself", + "java", + "july julys", + "keep keeps kept keeping keepings keeper keepers", + "knife knifes knifed knifing knifings knifer knifers", + "knife knives", + "know knows knew known knowing knowings knower knowers knowledge", + "lament laments lamented lamenting lamentings lamentation lamentations lamenter lamenters lamentable lamentably", + "larva larvae larvas larval", + "late later latest lately lateness", + "latter latterly", + "lay lays laid laying layer layers", + "layer layers layered layering layerings", + "lead leads led leading leadings leader leaders leaderless", + "leaf leafs leafed leafing leafings leafer leafers", + "leaf leaves leafless", + "leave leaves left leaving leavings leaver leavers", + "lend lends lent lending lendings lender lenders", + "less lesser least", + "let lets letting lettings", + "lie lies lay lain lying lier liers", + "lie lies lied lying liar liars", + "life lives lifeless", + "light lights lit lighted lighting lightings lightly lighter lighters lightness lightnesses lightless", + "likely likelier likeliest", + "limen limens", + "lineament lineaments", + "liniment liniments", + "live alive living", + "live lives lived living livings", + "liver livers", + "loaf loafs loafed loafing loafings loafer loafers", + "loaf loaves", + "logic logics logical logically", + "lose loses lost losing loser losers loss losses", + "louse lice", + "lumen lumens", + "make makes made making makings maker makers", + "man mans manned manning mannings", + "man men", + "manly manlier manliest manliness manful manfulness manhood", + "manic manically", + "manner manners mannered mannerly mannerless mannerful", + "many", + "matrix matrices matrixes", + "may might", + "maximum maxima maximums maximal 
maximize maximizes maximized maximizing", + "mean means meant meaning meanings meaningless meaningful", + "mean meaner meanest meanly meanness meannesses", + "median medians medianly medial", + "medium media mediums", + "meet meets met meeting meetings", + "memorandum memoranda memorandums", + "mere merely", + "metal metals metallic", + "might mighty mightily", + "millenium millennia milleniums millennial", + "mine mines mined mining minings miner miners", + "mine my our ours", + "minimum minima minimums minimal", + "minus minuses", + "miscellaneous miscellanea miscellaneously miscellaneousness miscellany", + "molest molests molested molesting molestings molester molesters", + "moment moments", + "monument monuments monumental", + "more most", + "mouse mice mouseless", + "much", + "multiply multiplies multiplier multipliers multiple multiples multiplying multiplyings multiplication multiplications", + "mum mums mummed mumming mummings mummer mummers", + "must musts", + "neither", + "nemeses nemesis", + "neurosis neuroses neurotic neurotics", + "nomen", + "none", + "nos no noes", + "not", + "nothing nothings nothingness", + "now", + "nowadays", + "nucleus nuclei nucleuses nuclear", + "number numbers numbered numbering numberings numberless", + "nutriment nutriments nutrient nutrients nutrition nutritions", + "oasis oases", + "octopus octopi octopuses", + "of", + "off", + "offer offers offered offering offerings offerer offerers offeror offerors", + "often", + "oftentimes", + "ointment ointments", + "omen omens", + "on", + "once", + "only", + "ornament ornaments ornamented ornamenting ornamentings ornamentation ornamenter ornamenters ornamental", + "outdoor outdoors", + "outlay outlays", + "outlie outlies outlay outlied outlain outlying outlier outliers", + "ovum ova", + "ox oxen", + "parentheses parenthesis", + "parliament parliaments parliamentary", + "passerby passer-by passersby passers-by", + "past pasts", + "pay pays paid paying payings payer payers payee payees payment payments", + "per", + "perhaps", + "person persons people", + "phenomenon phenomena phenomenal", + "pi", + "picnic picnics picnicker picnickers picnicked picnicking picnickings", + "pigment pigments pigmented pigmenting pigmentings pigmenter pigmenters pigmentation pigmentations", + "please pleases pleased pleasing pleasings pleaser pleasers pleasure pleasures pleasuring pleasurings pleasant pleasantly pleasureless pleasureful", + "plus pluses plusses", + "polyhedra polyhedron polyhedral", + "priest priests priestly priestlier priestliest priestliness priestless", + "prognosis prognoses", + "prostheses prosthesis", + "prove proves proved proving provings proofs proof prover provers provable", + "psychosis psychoses psychotic psychotics", + "qed", + "quiz quizzes quizzed quizzing quizzings quizzer quizzers", + "raiment", + "rather", + "re", + "real really", + "redo redoes redid redone redoing redoings redoer redoers", + "regiment regiments regimented regimenting regimenter regimenters regimentation regimental", + "rendezvous", + "requiz requizzes requizzed requizzing requizzings requizzer requizzers", + "ride rides rode ridden riding ridings rider riders rideless", + "ring rings rang rung ringing ringings ringer ringers ringless", + "rise rises rose risen rising risings riser risers", + "rose roses", + "rudiment rudiments rudimentary", + "rum rums rummed rumming rummings rummer rummers", + "run runs ran running runnings runner runners", + "sacrament sacraments sacramental", + "same sameness", + "sans", + "saw saws sawed 
sawn sawing sawings sawyer sawyers", + "say says said saying sayings sayer sayers", + "scarf scarfs scarves scarfless", + "schema schemata schemas", + "sediment sediments sedimentary sedimentation sedimentations", + "see sees saw seen seeing seeings seer seers", + "seek seeks sought seeking seekings seeker seekers", + "segment segments segmented segmenting segmentings segmenter segmenters segmentation segmentations", + "self selves selfless", + "sell sells sold selling sellings seller sellers", + "semen", + "send sends sent sending sendings sender senders", + "sentiment sentiments sentimental", + "series", + "set sets setting settings", + "several severally", + "sew sews sewed sewn sewing sewings sewer sewers", + "sewer sewers sewerless", + "shake shakes shook shaken shaking shakings shaker shakers", + "shall should", + "shaman shamans", + "shave shaves shaved shaven shaving shavings shaver shavers shaveless", + "she her hers herself", + "sheaf sheaves sheafless", + "sheep", + "shelf shelves shelved shelfing shelvings shelver shelvers shelfless", + "shine shines shined shone shining shinings shiner shiners shineless", + "shoe shoes shoed shod shoeing shoeings shoer shoers shoeless", + "shoot shoots shot shooting shootings shooter shooters", + "shot shots", + "show shows showed shown showing showings shower showers", + "shower showers showery showerless", + "shrink shrinks shrank shrunk shrinking shrinkings shrinker shrinkers shrinkable", + "sideways", + "simply simple simpler simplest", + "since", + "sing sings sang sung singing singings singer singers singable", + "sink sinks sank sunk sinking sinkings sinker sinkers sinkable", + "sit sits sat sitting sittings sitter sitters", + "ski skis skied skiing skiings skier skiers skiless skiable", + "sky skies", + "slay slays slew slain slaying slayings slayer slayers", + "sleep sleeps slept sleeping sleepings sleeper sleepers sleepless", + "so", + "some", + "something", + "sometime sometimes", + "soon", + "spa spas", + "speak speaks spoke spoken speaking speakings speaker speakers", + "species specie", + "spectrum spectra spectrums", + "speed speeds sped speeded speeding speedings speeder speeders", + "spend spends spent spending spendings spender spenders spendable", + "spin spins spun spinning spinnings spinner spinners", + "spoke spokes", + "spring springs sprang sprung springing springings springer springers springy springiness", + "staff staffs staves staffed staffing staffings staffer staffers", + "stand stands stood standing standings", + "stasis stases", + "steal steals stole stolen stealing stealings stealer stealers", + "stick sticks stuck sticking stickings sticker stickers", + "stigma stigmata stigmas stigmatize stigmatizes stigmatized stigmatizing", + "stimulus stimuli", + "sting stings stung stinging stingings stinger stingers", + "stink stinks stank stunk stinking stinkings stinker stinkers", + "stomach stomachs", + "stratum strata stratums", + "stride strides strode stridden striding stridings strider striders", + "string strings strung stringing stringings stringer stringers stringless", + "strive strives strove striven striving strivings striver strivers", + "strum strums strummed strumming strummings strummer strummers strummable", + "such", + "suffer suffers suffered suffering sufferings sufferer sufferers sufferable", + "suggest suggests suggested suggesting suggestings suggester suggesters suggestor suggestors suggestive suggestion suggestions suggestible suggestable", + "sum sums summed summing summings summer summers", + 
"summer summers summered summering summerings", + "supplement supplements supplemented supplementing supplementings supplementation supplementer supplementers supplementary supplemental", + "supply supplies supplied supplying supplyings supplier suppliers", + "swear swears swore sworn swearing swearings swearer swearers", + "sweep sweeps swept sweeping sweepings sweeper sweepers", + "swell swells swelled swollen swelling swellings", + "swim swims swam swum swimming swimmings swimmer swimmers swimable", + "swine", + "swing swings swung swinging swingings swinger swingers", + "syllabus syllabi syllabuses", + "symposium symposia symposiums", + "synapse synapses", + "synapsis synapses", + "synopsis synopses", + "synthesis syntheses", + "tableau tableaux tableaus", + "take takes took taken taking takings taker takers takable", + "teach teaches taught teaching teachings teacher teachers teachable", + "tear tears tore torn tearing tearings tearer tearers tearable", + "tegument teguments", + "tell tells told telling tellings teller tellers tellable", + "temperament temperaments temperamental temperamentally", + "tenement tenements", + "the", + "there theres", + "theses thesis", + "they them their theirs themselves", + "thief thieves thieving thievings", + "think thinks thought thinking thinker thinkers thinkable", + "this that these those", + "thought thoughts thougtful thoughtless", + "throw throws threw thrown throwing throwings thrower throwers throwable", + "tic tics", + "tie ties tied tying tyings tier tiers tieable tieless", + "tier tiers tiered tiering tierings tierer tierers", + "to", + "toe toes toed toeing toeings toer toers toeless", + "together togetherness", + "too", + "tooth teeth toothless", + "topaz topazes", + "torment torments tormented tormenting tormentings tormenter tormenters tormentable", + "toward towards", + "tread treads trod trodden treading treadings treader treaders", + "tread treads treadless retread retreads", + "true truly trueness", + "two twos", + "u", + "under", + "underlay underlays underlaid underlaying underlayings underlayer underlayers", + "underlie underlies underlay underlain underlying underlier underliers", + "undo undoes undid undone undoing undoings undoer undoers undoable", + "unrest unrestful", + "until", + "unto", + "up", + "upon", + "upstairs", + "use uses user users used using useful useless", + "various variously", + "vehement vehemently vehemence", + "versus", + "very", + "visit visits visited visiting visitings visitor visitors", + "vortex vortexes vortices", + "wake wakes woke waked woken waking wakings waker wakers wakeful wakefulness wakefulnesses wakeable", + "wear wears wore worn wearing wearings wearer wearers wearable", + "weather weathers weathered weathering weatherly", + "weave weaves wove woven weaving weavings weaver weavers weaveable", + "weep weeps wept weeping weepings weeper weepers", + "wharf wharfs wharves", + "where wheres", + "whereas whereases", + "whether whethers", + "while whiles whilst whiled whiling", + "whiz whizzes whizzed whizzing whizzings whizzer whizzers", + "who whom whos whose whoses", + "why whys", + "wife wives wifeless", + "will wills willed willing willings willful", + "will would", + "win wins won winning winnings winner winners winnable", + "wind winds wound winding windings winder winders windable", + "wind winds windy windless", + "with", + "within", + "without", + "wolf wolves", + "woman women womanless womanly", + "wound wounds wounded wounding woundings", + "write writes wrote written writing 
writings writer writers writeable", + "yeses yes", + "yet yets", + "you your yours yourself" + ] + +_exdict = {} +for exlist in exceptions: + for ex in exlist.split(" "): + _exdict[ex] = exlist + +# Programmatic rules + +vowels = "aeiouy" +cons = "bcdfghjklmnpqrstvwxyz" + +rules = ( + # Words ending in S + + # (e.g., happiness, business) + (r"[%s].*[%s](iness)" % (vowels, cons), "y,ies,ier,iers,iest,ied,ying,yings,ily,inesses,iment,iments,iless,iful"), + # (e.g., baseless, shoeless) + (r"[%s].*(eless)" % vowels, "e,es,er,ers,est,ed,ing,ings,eing,eings,ely,eness,enesses,ement,ements,eness,enesses,eful"), + # (e.g., gutless, hatless, spotless) + (r"[%s][%s][bdgklmnprt]?(less)" % (cons, vowels), ",s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,ful"), + # (e.g., thoughtless, worthless) + (r"[%s].*?(less)" % vowels, ",s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,ful"), + # (e.g., baseness, toeness) + (r"[%s].*(eness)" % vowels, "e,es,er,ers,est,ed,ing,ings,eing,eings,ely,enesses,ement,ements,eless,eful"), + # (e.g., bluntness, grayness) + (r"[%s].*(ness)" % vowels, ",s,er,ers,est,ed,ing,ings,ly,nesses,ment,ments,less,ful"), + # (e.g., albatross, kiss) + (r"[%s]ss" % vowels, "es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., joyous, fractious, gaseous) + (r"[%s].*(ous)" % vowels, "ly,ness"), + # (e.g., tries, unties, jollies, beauties) + (r"(ies)", "y,ie,yer,yers,ier,iers,iest,ied,ying,yings,yness,iness,ieness,ynesses,inesses,ienesses,iment,iement,iments,iements,yless,iless,ieless,yful,iful,ieful"), + # (e.g., crisis, kinesis) + (r"[%s].*(sis)" % vowels, "ses,sises,sisness,sisment,sisments,sisless,sisful"), + # (e.g., bronchitis, bursitis) + (r"[%s].*(is)" % vowels, "es,ness,ment,ments,less,ful"), + (r"[%s].*[cs]h(es)" % vowels, ",e,er,ers,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"), + # (e.g., tokenizes) // adds British variations + (r"[%s].*[%s](izes)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations"), + # (e.g., tokenises) // British variant // ~expertise + (r"[%s].*[%s](ises)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations"), + # (e.g., aches, arches) + (r"[%s].*[jsxz](es)" % vowels, ",e,er,ers,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"), + # (e.g., judges, abridges) + (r"[%s].*dg(es)" % vowels, "e,er,ers,est,ed,ing,ings,ely,eness,enesses,ment,ments,ement,ements,eless,eful"), + # (e.g., trees, races, likes, agrees) covers all other -es words + (r"e(s)", ",*"), + # (e.g., segments, bisegments, cosegments) + (r"segment(s)", ",*"), + # (e.g., pigments, depigments, repigments) + (r"pigment(s)", ",*"), + # (e.g., judgments, abridgments) + (r"[%s].*dg(ments)" % vowels, "ment,*ments"), + # (e.g., merriments, embodiments) -iment in turn will generate y and *y (redo y) + (r"[%s].*[%s]iment(s)" % (vowels, cons), ",*"), + # (e.g., atonements, entrapments) + (r"[%s].*ment(s)" % vowels, ",*"), + # (e.g., viewers, meters, traders, transfers) + (r"[%s].*er(s)" % vowels, ",*"), + # (e.g., unflags) polysyllables + (r"[%s].*[%s][%s][bdglmnprt](s)" % (vowels, cons, vowels), ",*"), + # (e.g., frogs) monosyllables + (r"[%s][%s][bdglmnprt](s)" % (vowels, cons), ",*"), + # (e.g., killings, muggings) + (r"[%s].*ing(s)" % vowels, ",*"), + # (e.g., hulls, tolls) + (r"[%s].*ll(s)" % vowels, ",*"), + # e.g., boas, polkas, spas) 
don't generate latin endings + (r"a(s)", ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., beads, toads) + (r"[%s].*[%s].*(s)" % (vowels, cons), ",*"), + # (e.g., boas, zoos) + (r"[%s].*[%s](s)" % (cons, vowels), ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., ss, sss, ssss) no vowel (vowel case is already handled above) + (r"ss()", ""), + # (e.g., cds, lcds, m-16s) no vowel (can be a plural noun, but not verb) + (r"[%s].*[%s1234567890](s)" % (cons, cons), ""), + + # Words ending in E + + # (e.g., apple, so it doesn't include apply) + (r"appl(e)", "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ement,ements,eless,eful"), + # (e.g., supple, so it doesn't include supply) + (r"suppl(e)", "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ement,ements,eless,eful"), + # (e.g., able, abominable, fungible, table, enable, idle, subtle) + (r"[%s].*[%s]l(e)" % (vowels, cons), "es,er,ers,est,ed,ing,ings,y,ely,eness,enesses,ement,ements,eless,eful"), + # (e.g., bookie, magpie, vie) + (r"(ie)", "ies,ier,iers,iest,ied,ying,yings,iely,ieness,ienesses,iement,iements,ieless,ieful"), + # (e.g., dye, redye, redeye) + (r"ye()", "s,r,rs,st,d,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., judge, abridge) + (r"[%s].*dg(e)" % vowels, "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ment,ments,less,ful,ement,ements,eless,eful"), + # (e.g., true, due, imbue) + (r"u(e)", "es,er,ers,est,ed,ing,ings,eing,eings,ly,ely,eness,enesses,ment,ments,less,ful,ement,ements,eless,eful"), + # (e.g., tokenize) // adds British variations + (r"[%s].*[%s](ize)" % (vowels, cons), "izes,izer,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"), + # (e.g., tokenise) // British variant // ~expertise + (r"[%s].*[%s](ise)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ising,isings,isation,isations"), + # (e.g., tree, agree, rage, horse, hoarse) + (r"[%s].*[%s](e)" % (vowels, cons), "es,er,ers,est,ed,ing,ings,eing,eings,ely,eness,enesses,ement,ements,eless,eful"), + + # Words ending in -ED + + # (e.g., agreed, freed, decreed, treed) + (r"ree(d)", "ds,der,ders,ded,ding,dings,dly,dness,dnesses,dment,dments,dless,dful,,*"), + # (e.g., feed, seed, Xweed) + (r"ee(d)", "ds,der,ders,ded,ding,dings,dly,dness,dnesses,dment,dments,dless,dful"), + # (e.g., tried) + (r"[%s](ied)" % cons, "y,ie,ies,ier,iers,iest,ying,yings,ily,yly,iness,yness,inesses,ynesses,iment,iments,iless,iful,yment,yments,yless,yful"), + # (e.g., controlled, fulfilled, rebelled) + (r"[%s].*[%s].*l(led)" % (vowels, cons), ",s,er,ers,est,ing,ings,ly,ness,nesses,ment,ments,less,ful,&,&s,&er,&ers,&est,&ing,&ings,&y,&ness,&nesses,&ment,&ments,&ful"), + # (e.g., pulled, filled, fulled) + (r"[%s].*l(led)" % vowels, "&,&s,&er,&ers,&est,&ing,&ings,&y,&ness,&nesses,&ment,&ments,&ful"), + # (e.g., hissed, grossed) + (r"[%s].*s(sed)" % vowels, "&,&es,&er,&ers,&est,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"), + # (e.g., hugged, trekked) + (r"[%s][%s](?P[bdgklmnprt])((?P=ed1)ed)", ",s,&er,&ers,&est,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., tokenize) // adds British variations + (r"[%s].*[%s](ized)" % (vowels, cons), "izes,izer,izers,ize,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"), + # (e.g., tokenise) // British variant // ~expertise + (r"[%s].*[%s](ized)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ise,ising,isings,isation,isations"), + # 
(e.g., spoiled, tooled, tracked, roasted, atoned, abridged) + (r"[%s].*(ed)" % vowels, ",e,s,es,er,ers,est,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"), + # (e.g., bed, sled) words with a single e as the only vowel + (r"ed()", "s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), + + # Words ending in -ER + + # (e.g., altimeter, ammeter, odometer, perimeter) + (r"meter()", "s,er,ers,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., agreer, beer, budgeteer, engineer, freer) + (r"eer()", "eers,eered,eering,eerings,eerly,eerness,eernesses,eerment,eerments,eerless,eerful,ee,ees,eest,eed,eeing,eeings,eely,eeness,eenesses,eement,eements,eeless,eeful,eerer,eerers,eerest"), + # (e.g., acidifier, saltier) + (r"[%s].*[%s](ier)" % (vowels, cons), "y,ie,ies,iest,ied,ying,yings,ily,yly,iness,yness,inesses,ynesses,yment,yments,yless,yful,iment,iments,iless,iful,iers,iered,iering,ierings,ierly,ierness,iernesses,ierment,ierments,ierless,ierful,ierer,ierers,ierest"), + # (e.g., puller, filler, fuller) + (r"[%s].*l(ler)" % vowels, "&,&s,&est,&ed,&ing,&ings,ly,lely,&ness,&nesses,&ment,&ments,&ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erments,&erless,&erful"), + # (e.g., hisser, grosser) + (r"[%s].*s(ser)" % vowels, "&,&es,&est,&ed,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erment,&erments,&erless,&erful"), + # (e.g., bigger, trekker, hitter) + (r"[%s][%s](?P[bdgkmnprt])((?P=er1)er)" % (cons, vowels), "s,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erments,&erless,&erful"), + # (e.g., tokenize) // adds British variations + (r"[%s].*[%s](izer)" % (vowels, cons), "izes,ize,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"), + # (e.g., tokenise) // British variant // ~expertise + (r"[%s].*[%s](iser)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,ise,isers,ised,ising,isings,isation,isations"), + #(e.g., actioner, atoner, icer, trader, accruer, churchgoer, prefer) + (r"[%s].*(er)" % vowels, ",e,s,es,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ers,ered,erred,ering,erring,erings,errings,erly,erness,ernesses,erment,erments,erless,erful,erer,erers,erest,errer,errers,errest"), + + # Words ending in -EST + + # (e.g., sliest, happiest, wittiest) + (r"[%s](iest)" % cons, "y,ies,ier,iers,ied,ying,yings,ily,yly,iness,yness,inesses,ynesses,iment,iments,iless,iful"), + # (e.g., fullest) + (r"[%s].*l(lest)" % vowels, "&,&s,&er,&ers,&ed,&ing,&ings,ly,&ness,&nesses,&ment,&ments,&ful"), + # (e.g., grossest) + (r"[%s].*s(sest)" % vowels, "&,&es,&er,&ers,&ed,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"), + # (e.g., biggest) + (r"[%s][%s](?P[bdglmnprst])((?P=est1)est)" % (cons, vowels), ",s,&er,&ers,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., basest, archest, rashest) + (r"[%s].*([cs]h|[jsxz])(est)" % vowels, "e,es,er,ers,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"), + # (e.g., severest, Xinterest, merest) + (r"er(est)", "e,es,er,ers,ed,eing,eings,ely,eness,enesses,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"), + # (e.g., slickest, coolest, ablest, amplest, 
protest, quest) + (r"[%s].*(est)" % vowels, ",e,s,es,er,ers,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"), + # (e.g., rest, test) + (r"est", "s,er,ers,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + + # Words ending in -FUL + + # (e.g., beautiful, plentiful) + (r"[%s].*[%s](iful)" % (vowels, cons), "ifully,ifulness,*y"), + # (e.g., hopeful, sorrowful) + (r"[%s].*(ful)" % vowels, "fully,fulness,,*"), + + # Words ending in -ICAL + + (r"[%s].*(ical)" % vowels, "ic,ics,ically"), + + # Words ending in -IC + + (r"[%s].*(ic)" % vowels, "ics,ical,ically"), + + # Words ending in -ING + + # (e.g., dying, crying, supplying) + (r"[%s](ying)" % cons, "yings,ie,y,ies,ier,iers,iest,ied,iely,yly,ieness,yness,ienesses,ynesses,iment,iments,iless,iful"), + # (e.g., pulling, filling, fulling) + (r"[%s].*l(ling)" % vowels, ",*,&,&s,&er,&ers,&est,&ed,&ings,&ness,&nesses,&ment,&ments,&ful"), + # (e.g., hissing, grossing, processing) + (r"[%s].*s(sing)" % vowels, "&,&s,&er,&ers,&est,&ed,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"), + # (e.g., hugging, trekking) + (r"[%s][%s](?P[bdgklmnprt])((?P=ing1)ing)" % (cons, vowels), ",s,&er,&ers,&est,&ed,&ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., freeing, agreeing) + (r"eeing()", "ee,ees,eer,eers,eest,eed,eeings,eely,eeness,eenesses,eement,eements,eeless,eeful"), + # (e.g., ageing, aweing) + (r"[%s].*(eing)" % vowels, "e,es,er,ers,est,ed,eings,ely,eness,enesses,ement,ements,eless,eful"), + # (e.g., toying, playing) + (r"[%s].*y(ing)" % vowels, ",s,er,ers,est,ed,ings,ly,ingly,ness,nesses,ment,ments,less,ful"), + # (e.g., editing, crediting, expediting, siting, exciting) + (r"[%s].*[%s][eio]t(ing)" % (vowels, cons), ",*,*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), + # (e.g., robing, siding, doling, translating, flaking) + (r"[%s][%s][bdgklmt](ing)" % (cons, vowels), "*e,ings,inger,ingers,ingest,inged,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), + # (e.g., tokenize) // adds British variations + (r"[%s].*[%s](izing)" % (vowels, cons), "izes,izer,izers,ized,ize,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"), + # (e.g., tokenise) // British variant // ~expertise + (r"[%s].*[%s](ising)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ise,isings,isation,isations"), + # (e.g., icing, aging, achieving, amazing, housing) + (r"[%s][cgsvz](ing)" % vowels, "*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), + # (e.g., dancing, troubling, arguing, bluing, carving) + (r"[%s][clsuv](ing)" % cons, "*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), + # (e.g., charging, bulging) + (r"[%s].*[lr]g(ing)" % vowels, "*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), + # (e.g., farming, harping, interesting, bedspring, redwing) + (r"[%s].*[%s][bdfjkmnpqrtwxz](ing)" % (vowels, cons), ",*,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), + # (e.g., spoiling, reviling, autoing, egging, hanging, hingeing) + (r"[%s].*(ing)" % vowels, ",*,*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), + # (e.g., wing, thing) 
monosyllables + (r"(ing)", "ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), + + # -LEAF rules omitted + + # Words ending in -MAN + # (e.g., policewomen, hatchetmen, dolmen) + (r"(man)", "man,mens,mener,meners,menest,mened,mening,menings,menly,menness,mennesses,menless,menful"), + + # Words ending in -MENT + + # (e.g., segment, bisegment, cosegment, pigment, depigment, repigment) + (r"segment|pigment", "s,ed,ing,ings,er,ers,ly,ness,nesses,less,ful"), + # (e.g., judgment, abridgment) + (r"[%s].*dg(ment)" % vowels, "*e"), + # (e.g., merriment, embodiment) + (r"[%s].*[%s](iment)" % (vowels, cons), "*y"), + # (e.g., atonement, entrapment) + (r"[%s].*[%s](ment)" % (vowels, cons), ",*"), + + # Words ending in -O + + # (e.g., taboo, rodeo) + (r"[%s]o()" % vowels, "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., tomato, bonito) + (r"[%s].*o()" % vowels, "s,es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + + # Words ending in -UM + + # (e.g., datum, quantum, tedium, strum, [oil]drum, vacuum) + (r"[%s].*(um)" % vowels, "a,ums,umer,ummer,umers,ummers,umed,ummed,uming,umming,umings,ummings,umness,umments,umless,umful"), + + # Words ending in -Y + + # (e.g., ably, horribly, wobbly) + (r"[%s].*b(ly)" % vowels, "le,les,ler,lers,lest,led,ling,lings,leness,lenesses,lement,lements,leless,leful"), + # (e.g., happily, dizzily) + (r"[%s].*[%s](ily)" % (vowels, cons), "y,ies,ier,iers,iest,ied,ying,yings,yness,iness,ynesses,inesses,iment,iments,iless,iful"), + # (e.g., peaceful+ly) + (r"[%s].*ful(ly)" % vowels, ",*"), + # (e.g., fully, folly, coolly, fatally, dally) + (r"[%s].*l(ly)" % vowels, ",*,lies,lier,liers,liest,lied,lying,lyings,liness,linesses,liment,liments,liless,liful,*l"), + # (e.g., monopoly, Xcephaly, holy) + (r"[%s](ly)" % vowels, "lies,lier,liers,liest,lied,lying,lyings,liness,linesses,liment,liments,liless,liful"), + # (e.g., frequently, comely, deeply, apply, badly) + (r"[%s].*(ly)" % vowels, ",*,lies,lier,liers,liest,lied,lying,lyings,liness,linesses,lyless,lyful"), + # (e.g., happy, ply, spy, cry) + (r"[%s](y)" % cons, "ies,ier,iers,iest,ied,ying,yings,ily,yness,iness,ynesses,inesses,iment,iments,iless,iful,yment,yments,yless,yful"), + # (e.g., betray, gay, stay) + (r"[%s]y()" % vowels, "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + + # Root rules + + # (e.g., fix, arch, rash) + (r"[%s].*(ch|sh|[jxz])()" % vowels, "es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., unflag, open, besot) + (r"[%s].*[%s][%s][bdglmnprt]()" % (vowels, cons, vowels), "s,er,ers,est,ed,ing,ings,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., bed, cop) + (r"[%s][%s][bdglmnprt]()" % (cons, vowels), "s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), + # (e.g., schemata, automata) + (r"[%s].*[%s][%s]ma(ta)" % (vowels, cons, vowels), ",s,tas,tum,tums,ton,tons,tic,tical"), + # (e.g., chordata, data, errata, sonata, toccata) + (r"[%s].*t(a)" % vowels, "as,ae,um,ums,on,ons,ic,ical"), + # (e.g., polka, spa, schema, ova, polyhedra) + (r"[%s].*[%s](a)" % (vowels, cons), "as,aed,aing,ae,ata,um,ums,on,ons,al,atic,atical"), + # (e.g., full) + (r"[%s].*ll()" % vowels, "s,er,ers,est,ed,ing,ings,y,ness,nesses,ment,ments,-less,ful"), + # (e.g., spoon, rhythm) + (r"[%s].*()", "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), + ) + +# There are a limited number of named groups available in a single +# regular expression, so we'll 
partition the list of rules into +# smaller chunks. + +_partition_size = 20 +_partitions = [] +for p in xrange(0, len(rules) // _partition_size + 1): + start = p * _partition_size + end = (p + 1) * _partition_size + pattern = "|".join("(?P<_g%s>%s)$" % (i, r[0]) + for i, r in enumerate(rules[start:end])) + _partitions.append(re.compile(pattern)) + + +def variations(word): + """Given an English word, returns a collection of morphological variations + on the word by algorithmically adding and removing suffixes. The variation + list may contain non-words (e.g. render -> renderment). + + >>> variations("pull") + set(['pull', 'pullings', 'pullnesses', 'pullful', 'pullment', 'puller', ... ]) + """ + + if word in _exdict: + return _exdict[word].split(" ") + + for i, p in enumerate(_partitions): + match = p.search(word) + if match: + # Get the named group that matched + num = int([k for k, v in iteritems(match.groupdict()) + if v is not None and k.startswith("_g")][0][2:]) + # Get the positional groups for the matched group (all other + # positional groups are None) + groups = [g for g in match.groups() if g is not None] + ending = groups[-1] + root = word[:0 - len(ending)] if ending else word + + out = set((word,)) + results = rules[i * _partition_size + num][1] + for result in results.split(","): + if result.startswith("&"): + out.add(root + root[-1] + result[1:]) + elif result.startswith("*"): + out.union(variations(root + result[1:])) + else: + out.add(root + result) + return set(out) + + return [word] diff --git a/src/whoosh/lang/paicehusk.py b/src/whoosh/lang/paicehusk.py new file mode 100644 index 0000000..481c3e4 --- /dev/null +++ b/src/whoosh/lang/paicehusk.py @@ -0,0 +1,242 @@ +"""This module contains an object that implements the Paice-Husk stemming +algorithm. + +If you just want to use the standard Paice-Husk stemming rules, use the +module's ``stem()`` function:: + + stemmed_word = stem(word) + +If you want to use a custom rule set, read the rules into a string where the +rules are separated by newlines, and instantiate the object with the string, +then use the object's stem method to stem words:: + + stemmer = PaiceHuskStemmer(my_rules_string) + stemmed_word = stemmer.stem(word) +""" + +import re +from collections import defaultdict + + +class PaiceHuskStemmer(object): + """Implements the Paice-Husk stemming algorithm. + """ + + rule_expr = re.compile(r""" + ^(?P\w+) + (?P[*]?) + (?P\d+) + (?P\w*) + (?P[.>]) + """, re.UNICODE | re.VERBOSE) + + stem_expr = re.compile("^\w+", re.UNICODE) + + def __init__(self, ruletable): + """ + :param ruletable: a string containing the rule data, separated + by newlines. 
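# A usage sketch for the variations() function of whoosh.lang.morph_en defined
# above; the example word is an illustrative assumption. In the rule table, an
# expansion string starting with "&" doubles the root's final letter before
# adding the suffix, and one starting with "*" recursively expands the rebuilt
# word, as handled at the end of variations().
from whoosh.lang.morph_en import variations

for form in sorted(variations("render")):
    # May include machine-generated non-words such as "renderment"; the
    # module docstring notes that this is expected.
    print(form)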
+ """ + self.rules = defaultdict(list) + self.read_rules(ruletable) + + def read_rules(self, ruletable): + rule_expr = self.rule_expr + rules = self.rules + + for line in ruletable.split("\n"): + line = line.strip() + if not line: + continue + + match = rule_expr.match(line) + if match: + ending = match.group("ending")[::-1] + lastchar = ending[-1] + intact = match.group("intact") == "*" + num = int(match.group("num")) + append = match.group("append") + cont = match.group("cont") == ">" + + rules[lastchar].append((ending, intact, num, append, cont)) + else: + raise Exception("Bad rule: %r" % line) + + def first_vowel(self, word): + vp = min([p for p in [word.find(v) for v in "aeiou"] + if p > -1]) + yp = word.find("y") + if yp > 0 and yp < vp: + return yp + return vp + + def strip_prefix(self, word): + for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega", + "nano", "pico", "pseudo"): + if word.startswith(prefix): + return word[len(prefix):] + return word + + def stem(self, word): + """Returns a stemmed version of the argument string. + """ + + rules = self.rules + match = self.stem_expr.match(word) + if not match: + return word + stem = self.strip_prefix(match.group(0)) + + is_intact = True + continuing = True + while continuing: + pfv = self.first_vowel(stem) + rulelist = rules.get(stem[-1]) + if not rulelist: + break + + continuing = False + for ending, intact, num, append, cont in rulelist: + if stem.endswith(ending): + if intact and not is_intact: + continue + newlen = len(stem) - num + len(append) + + if ((pfv == 0 and newlen < 2) + or (pfv > 0 and newlen < 3)): + # If word starts with vowel, minimum stem length is 2. + # If word starts with consonant, minimum stem length is + # 3. + continue + + is_intact = False + stem = stem[:0 - num] + append + + continuing = cont + break + + return stem + +# The default rules for the Paice-Husk stemming algorithm + +defaultrules = """ +ai*2. { -ia > - if intact } +a*1. { -a > - if intact } +bb1. { -bb > -b } +city3s. { -ytic > -ys } +ci2> { -ic > - } +cn1t> { -nc > -nt } +dd1. { -dd > -d } +dei3y> { -ied > -y } +deec2ss. { -ceed > -cess } +dee1. { -eed > -ee } +de2> { -ed > - } +dooh4> { -hood > - } +e1> { -e > - } +feil1v. { -lief > -liev } +fi2> { -if > - } +gni3> { -ing > - } +gai3y. { -iag > -y } +ga2> { -ag > - } +gg1. { -gg > -g } +ht*2. { -th > - if intact } +hsiug5ct. { -guish > -ct } +hsi3> { -ish > - } +i*1. { -i > - if intact } +i1y> { -i > -y } +ji1d. { -ij > -id -- see nois4j> & vis3j> } +juf1s. { -fuj > -fus } +ju1d. { -uj > -ud } +jo1d. { -oj > -od } +jeh1r. { -hej > -her } +jrev1t. { -verj > -vert } +jsim2t. { -misj > -mit } +jn1d. { -nj > -nd } +j1s. { -j > -s } +lbaifi6. { -ifiabl > - } +lbai4y. { -iabl > -y } +lba3> { -abl > - } +lbi3. { -ibl > - } +lib2l> { -bil > -bl } +lc1. { -cl > c } +lufi4y. { -iful > -y } +luf3> { -ful > - } +lu2. { -ul > - } +lai3> { -ial > - } +lau3> { -ual > - } +la2> { -al > - } +ll1. { -ll > -l } +mui3. { -ium > - } +mu*2. { -um > - if intact } +msi3> { -ism > - } +mm1. { -mm > -m } +nois4j> { -sion > -j } +noix4ct. { -xion > -ct } +noi3> { -ion > - } +nai3> { -ian > - } +na2> { -an > - } +nee0. { protect -een } +ne2> { -en > - } +nn1. { -nn > -n } +pihs4> { -ship > - } +pp1. { -pp > -p } +re2> { -er > - } +rae0. { protect -ear } +ra2. { -ar > - } +ro2> { -or > - } +ru2> { -ur > - } +rr1. { -rr > -r } +rt1> { -tr > -t } +rei3y> { -ier > -y } +sei3y> { -ies > -y } +sis2. { -sis > -s } +si2> { -is > - } +ssen4> { -ness > - } +ss0. { protect -ss } +suo3> { -ous > - } +su*2. 
{ -us > - if intact } +s*1> { -s > - if intact } +s0. { -s > -s } +tacilp4y. { -plicat > -ply } +ta2> { -at > - } +tnem4> { -ment > - } +tne3> { -ent > - } +tna3> { -ant > - } +tpir2b. { -ript > -rib } +tpro2b. { -orpt > -orb } +tcud1. { -duct > -duc } +tpmus2. { -sumpt > -sum } +tpec2iv. { -cept > -ceiv } +tulo2v. { -olut > -olv } +tsis0. { protect -sist } +tsi3> { -ist > - } +tt1. { -tt > -t } +uqi3. { -iqu > - } +ugo1. { -ogu > -og } +vis3j> { -siv > -j } +vie0. { protect -eiv } +vi2> { -iv > - } +ylb1> { -bly > -bl } +yli3y> { -ily > -y } +ylp0. { protect -ply } +yl2> { -ly > - } +ygo1. { -ogy > -og } +yhp1. { -phy > -ph } +ymo1. { -omy > -om } +ypo1. { -opy > -op } +yti3> { -ity > - } +yte3> { -ety > - } +ytl2. { -lty > -l } +yrtsi5. { -istry > - } +yra3> { -ary > - } +yro3> { -ory > - } +yfi3. { -ify > - } +ycn2t> { -ncy > -nt } +yca3> { -acy > - } +zi2> { -iz > - } +zy1s. { -yz > -ys } +""" + +# Make the standard rules available as a module-level function + +stem = PaiceHuskStemmer(defaultrules).stem diff --git a/src/whoosh/lang/phonetic.py b/src/whoosh/lang/phonetic.py new file mode 100644 index 0000000..1cee86a --- /dev/null +++ b/src/whoosh/lang/phonetic.py @@ -0,0 +1,119 @@ +#encoding: utf-8 + +""" +This module contains quasi-phonetic encoders for words in different languages. +""" + +import re + +from whoosh.compat import iteritems + +# This soundex implementation is adapted from the recipe here: +# http://code.activestate.com/recipes/52213/ + +english_codes = '01230120022455012623010202' + + +def soundex_en(word): + # digits holds the soundex values for the alphabet + r = "" + if word: + # Remember first character + fc = None + prevcode = None + for char in word.lower(): + c = ord(char) + if c >= 97 and c <= 122: # a-z + if not fc: + fc = char + code = english_codes[c - 97] + # Don't append the code if it's the same as the previous + if code != prevcode: + r += code + prevcode = code + + # Replace first digit with first alpha character + r = fc + r[1:] + + return r + + +# Quasi-phonetic coder for Spanish, translated to Python from Sebastian +# Ferreyra's version here: +# http://www.javalobby.org/java/forums/t16936.html + +_esp_codes = (("\\Aw?[uh]?([aeiou])", ""), + ("c[eiéí]|z|ll|sh|ch|sch|cc|y[aeiouáéíóú]|ps|bs|x|j|g[eiéí]", "s"), + ("[aeiouhwáéíóúü]+", ""), + ("y", ""), + ("ñ|gn", "n"), + ("[dpc]t", "t"), + ("c[aouáóú]|ck|q", "k"), + ("v", "b"), + ("d$", "t"), # Change a trailing d to a t + ) +_esp_codes = tuple((re.compile(pat), repl) for pat, repl in _esp_codes) + + +def soundex_esp(word): + word = word.lower() + r = "" + + prevcode = None + i = 0 + while i < len(word): + code = None + for expr, ecode in _esp_codes: + match = expr.match(word, i) + if match: + i = match.end() + code = ecode + break + + if code is None: + code = word[i] + i += 1 + + if code != prevcode: + r += code + prevcode = code + + return r + + +# This version of soundex for Arabic is translated to Python from Tammam +# Koujan's C# version here: +# http://www.codeproject.com/KB/recipes/ArabicSoundex.aspx + +# Create a dictionary mapping arabic characters to digits +_arabic_codes = {} +for chars, code in iteritems({'\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a': "0", + '\u0641\u0628': "1", + '\u062c\u0632\u0633\u0635\u0638\u0642\u0643': "2", + '\u062a\u062b\u062f\u0630\u0636\u0637': "3", + '\u0644': "4", + '\u0645\u0646': "5", + '\u0631': "6", + }): + for char in chars: + _arabic_codes[char] = code + + +def soundex_ar(word): + if word[0] in "\u0627\u0623\u0625\u0622": + word 
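# A sketch of the custom-rule usage described in the whoosh.lang.paicehusk
# docstring above. Each rule is written with its ending reversed, then an
# optional "*" (match intact words only), the number of characters to remove,
# optional replacement text, and "." (stop) or ">" (continue stemming). The
# three toy rules below are illustrative assumptions, not the shipped table.
from whoosh.lang.paicehusk import PaiceHuskStemmer, stem

toy_rules = "gni3>\nde2>\ne1>"   # strip -ing, -ed and a trailing -e
toy = PaiceHuskStemmer(toy_rules)
print(toy.stem("looking"))   # -> "look"
print(toy.stem("looked"))    # -> "look"
print(stem("happiness"))     # the module-level stem() uses the default rule table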
= word[1:] + + r = "0" + prevcode = "0" + if len(word) > 1: + # Discard the first character + for char in word[1:]: + if char in _arabic_codes: + code = _arabic_codes.get(char, "0") + # Don't append the code if it's the same as the previous + if code != prevcode: + # If the code is a 0 (vowel), don't process it + if code != "0": + r += code + prevcode = code + return r diff --git a/src/whoosh/lang/porter.py b/src/whoosh/lang/porter.py new file mode 100755 index 0000000..65d169a --- /dev/null +++ b/src/whoosh/lang/porter.py @@ -0,0 +1,175 @@ +""" +Reimplementation of the +`Porter stemming algorithm `_ +in Python. + +In my quick tests, this implementation about 3.5 times faster than the +seriously weird Python linked from the official page. +""" + +import re + +# Suffix replacement lists + +_step2list = { + "ational": "ate", + "tional": "tion", + "enci": "ence", + "anci": "ance", + "izer": "ize", + "bli": "ble", + "alli": "al", + "entli": "ent", + "eli": "e", + "ousli": "ous", + "ization": "ize", + "ation": "ate", + "ator": "ate", + "alism": "al", + "iveness": "ive", + "fulness": "ful", + "ousness": "ous", + "aliti": "al", + "iviti": "ive", + "biliti": "ble", + "logi": "log", + } + +_step3list = { + "icate": "ic", + "ative": "", + "alize": "al", + "iciti": "ic", + "ical": "ic", + "ful": "", + "ness": "", + } + + +_cons = "[^aeiou]" +_vowel = "[aeiouy]" +_cons_seq = "[^aeiouy]+" +_vowel_seq = "[aeiou]+" + +# m > 0 +_mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq) +# m == 0 +_meq1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$") +# m > 1 +_mgr1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq) +# vowel in stem +_s_v = re.compile("^(" + _cons_seq + ")?" + _vowel) +# ??? +_c_v = re.compile("^" + _cons_seq + _vowel + "[^aeiouwxy]$") + +# Patterns used in the rules + +_ed_ing = re.compile("^(.*)(ed|ing)$") +_at_bl_iz = re.compile("(at|bl|iz)$") +_step1b = re.compile("([^aeiouylsz])\\1$") +_step2 = re.compile("^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$") +_step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$") +_step4_1 = re.compile("^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$") +_step4_2 = re.compile("^(.+?)(s|t)(ion)$") +_step5 = re.compile("^(.+?)e$") + + +# Stemming function + +def stem(w): + """Uses the Porter stemming algorithm to remove suffixes from English + words. 
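+
+    The input is expected to be a single lower-case word; words of fewer
+    than three letters are returned unchanged.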
+ + >>> stem("fundamentally") + "fundament" + """ + + if len(w) < 3: + return w + + first_is_y = w[0] == "y" + if first_is_y: + w = "Y" + w[1:] + + # Step 1a + if w.endswith("s"): + if w.endswith("sses"): + w = w[:-2] + elif w.endswith("ies"): + w = w[:-2] + elif w[-2] != "s": + w = w[:-1] + + # Step 1b + + if w.endswith("eed"): + s = w[:-3] + if _mgr0.match(s): + w = w[:-1] + else: + m = _ed_ing.match(w) + if m: + stem = m.group(1) + if _s_v.match(stem): + w = stem + if _at_bl_iz.match(w): + w += "e" + elif _step1b.match(w): + w = w[:-1] + elif _c_v.match(w): + w += "e" + + # Step 1c + + if w.endswith("y"): + stem = w[:-1] + if _s_v.match(stem): + w = stem + "i" + + # Step 2 + + m = _step2.match(w) + if m: + stem = m.group(1) + suffix = m.group(2) + if _mgr0.match(stem): + w = stem + _step2list[suffix] + + # Step 3 + + m = _step3.match(w) + if m: + stem = m.group(1) + suffix = m.group(2) + if _mgr0.match(stem): + w = stem + _step3list[suffix] + + # Step 4 + + m = _step4_1.match(w) + if m: + stem = m.group(1) + if _mgr1.match(stem): + w = stem + else: + m = _step4_2.match(w) + if m: + stem = m.group(1) + m.group(2) + if _mgr1.match(stem): + w = stem + + # Step 5 + + m = _step5.match(w) + if m: + stem = m.group(1) + if _mgr1.match(stem) or (_meq1.match(stem) and not _c_v.match(stem)): + w = stem + + if w.endswith("ll") and _mgr1.match(w): + w = w[:-1] + + if first_is_y: + w = "y" + w[1:] + + return w diff --git a/src/whoosh/lang/porter2.py b/src/whoosh/lang/porter2.py new file mode 100644 index 0000000..4c74047 --- /dev/null +++ b/src/whoosh/lang/porter2.py @@ -0,0 +1,313 @@ +"""An implementation of the Porter2 stemming algorithm. +See http://snowball.tartarus.org/algorithms/english/stemmer.html + +Adapted from pyporter2 by Michael Dirolf. + +This algorithm is more correct but (at least in this implementation) +several times slower than the original porter algorithm as implemented +in stemming.porter. 
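+
+Both this module and ``whoosh.lang.porter`` expose a module-level ``stem()``
+function, so the two are interchangeable from the caller's side. A minimal
+usage sketch (using nothing beyond the public ``stem()`` functions defined
+in these two files)::
+
+    from whoosh.lang import porter, porter2
+
+    porter.stem("generously")   # original Porter algorithm
+    porter2.stem("generously")  # this module (Porter2 / Snowball English)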
+""" + +import re + +r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)") +ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$") +ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$") +ccy_exp = re.compile(r"([aeiouy])y") +s1a_exp = re.compile(r"[aeiouy].") +s1b_exp = re.compile(r"[aeiouy]") + + +def get_r1(word): + # exceptional forms + if word.startswith('gener') or word.startswith('arsen'): + return 5 + if word.startswith('commun'): + return 6 + + # normal form + match = r_exp.match(word) + if match: + return match.start(1) + return len(word) + + +def get_r2(word): + match = r_exp.match(word, get_r1(word)) + if match: + return match.start(1) + return len(word) + + +def ends_with_short_syllable(word): + if len(word) == 2: + if ewss_exp1.match(word): + return True + if ewss_exp2.match(word): + return True + return False + + +def is_short_word(word): + if ends_with_short_syllable(word): + if get_r1(word) == len(word): + return True + return False + + +def remove_initial_apostrophe(word): + if word.startswith("'"): + return word[1:] + return word + + +def capitalize_consonant_ys(word): + if word.startswith('y'): + word = 'Y' + word[1:] + return ccy_exp.sub('\g<1>Y', word) + + +def step_0(word): + if word.endswith("'s'"): + return word[:-3] + if word.endswith("'s"): + return word[:-2] + if word.endswith("'"): + return word[:-1] + return word + + +def step_1a(word): + if word.endswith('sses'): + return word[:-4] + 'ss' + if word.endswith('ied') or word.endswith('ies'): + if len(word) > 4: + return word[:-3] + 'i' + else: + return word[:-3] + 'ie' + if word.endswith('us') or word.endswith('ss'): + return word + if word.endswith('s'): + preceding = word[:-1] + if s1a_exp.search(preceding): + return preceding + return word + return word + + +doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt') + + +def ends_with_double(word): + for double in doubles: + if word.endswith(double): + return True + return False + + +def step_1b_helper(word): + if word.endswith('at') or word.endswith('bl') or word.endswith('iz'): + return word + 'e' + if ends_with_double(word): + return word[:-1] + if is_short_word(word): + return word + 'e' + return word + + +s1b_suffixes = ('ed', 'edly', 'ing', 'ingly') + + +def step_1b(word, r1): + if word.endswith('eedly'): + if len(word) - 5 >= r1: + return word[:-3] + return word + if word.endswith('eed'): + if len(word) - 3 >= r1: + return word[:-1] + return word + + for suffix in s1b_suffixes: + if word.endswith(suffix): + preceding = word[:-len(suffix)] + if s1b_exp.search(preceding): + return step_1b_helper(preceding) + return word + + return word + + +def step_1c(word): + if word.endswith('y') or word.endswith('Y') and len(word) > 1: + if word[-2] not in 'aeiouy': + if len(word) > 2: + return word[:-1] + 'i' + return word + + +def step_2_helper(word, r1, end, repl, prev): + if word.endswith(end): + if len(word) - len(end) >= r1: + if prev == []: + return word[:-len(end)] + repl + for p in prev: + if word[:-len(end)].endswith(p): + return word[:-len(end)] + repl + return word + return None + + +s2_triples = (('ization', 'ize', []), + ('ational', 'ate', []), + ('fulness', 'ful', []), + ('ousness', 'ous', []), + ('iveness', 'ive', []), + ('tional', 'tion', []), + ('biliti', 'ble', []), + ('lessli', 'less', []), + ('entli', 'ent', []), + ('ation', 'ate', []), + ('alism', 'al', []), + ('aliti', 'al', []), + ('ousli', 'ous', []), + ('iviti', 'ive', []), + ('fulli', 'ful', []), + ('enci', 'ence', []), + ('anci', 'ance', []), + ('abli', 'able', []), + ('izer', 'ize', 
[]), + ('ator', 'ate', []), + ('alli', 'al', []), + ('bli', 'ble', []), + ('ogi', 'og', ['l']), + ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'])) + + +def step_2(word, r1): + for trip in s2_triples: + attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2]) + if attempt: + return attempt + return word + + +def step_3_helper(word, r1, r2, end, repl, r2_necessary): + if word.endswith(end): + if len(word) - len(end) >= r1: + if not r2_necessary: + return word[:-len(end)] + repl + else: + if len(word) - len(end) >= r2: + return word[:-len(end)] + repl + return word + return None + + +s3_triples = (('ational', 'ate', False), + ('tional', 'tion', False), + ('alize', 'al', False), + ('icate', 'ic', False), + ('iciti', 'ic', False), + ('ative', '', True), + ('ical', 'ic', False), + ('ness', '', False), + ('ful', '', False)) + + +def step_3(word, r1, r2): + for trip in s3_triples: + attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2]) + if attempt: + return attempt + return word + + +s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', + 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize') + + +def step_4(word, r2): + for end in s4_delete_list: + if word.endswith(end): + if len(word) - len(end) >= r2: + return word[:-len(end)] + return word + + if word.endswith('sion') or word.endswith('tion'): + if len(word) - 3 >= r2: + return word[:-3] + + return word + + +def step_5(word, r1, r2): + if word.endswith('l'): + if len(word) - 1 >= r2 and word[-2] == 'l': + return word[:-1] + return word + + if word.endswith('e'): + if len(word) - 1 >= r2: + return word[:-1] + if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]): + return word[:-1] + + return word + + +def normalize_ys(word): + return word.replace('Y', 'y') + + +exceptional_forms = {'skis': 'ski', + 'skies': 'sky', + 'dying': 'die', + 'lying': 'lie', + 'tying': 'tie', + 'idly': 'idl', + 'gently': 'gentl', + 'ugly': 'ugli', + 'early': 'earli', + 'only': 'onli', + 'singly': 'singl', + 'sky': 'sky', + 'news': 'news', + 'howe': 'howe', + 'atlas': 'atlas', + 'cosmos': 'cosmos', + 'bias': 'bias', + 'andes': 'andes'} + +exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring', + 'earring', 'proceed', 'exceed', 'succeed']) + + +def stem(word): + if len(word) <= 2: + return word + word = remove_initial_apostrophe(word) + + # handle some exceptional forms + if word in exceptional_forms: + return exceptional_forms[word] + + word = capitalize_consonant_ys(word) + r1 = get_r1(word) + r2 = get_r2(word) + word = step_0(word) + word = step_1a(word) + + # handle some more exceptional forms + if word in exceptional_early_exit_post_1a: + return word + + word = step_1b(word, r1) + word = step_1c(word) + word = step_2(word, r1) + word = step_3(word, r1, r2) + word = step_4(word, r2) + word = step_5(word, r1, r2) + word = normalize_ys(word) + + return word diff --git a/src/whoosh/lang/snowball/__init__.py b/src/whoosh/lang/snowball/__init__.py new file mode 100644 index 0000000..d450288 --- /dev/null +++ b/src/whoosh/lang/snowball/__init__.py @@ -0,0 +1,74 @@ +# Copyright (C) 2001-2012 NLTK Project +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Natural Language Toolkit: Snowball Stemmer +# +# Copyright (C) 2001-2012 NLTK Project +# Author: Peter Michael Stahl +# Peter Ljunglof (revisions) +# Algorithms: Dr Martin Porter +# URL: +# For license information, see LICENSE.TXT + +# HJ 2012/07/19 adapted from https://github.com/kmike/nltk.git (branch 2and3) +# 2.0.1rc4-256-g45768f8 + +""" +This module provides a port of the Snowball stemmers developed by Martin +Porter. + +At the moment, this port is able to stem words from fourteen languages: Danish, +Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, +Portuguese, Romanian, Russian, Spanish and Swedish. + +The algorithms have been developed by Martin Porter. These stemmers are called +Snowball, because he invented a programming language with this name for +creating new stemming algorithms. There is more information available at +http://snowball.tartarus.org/ +""" + +from .danish import DanishStemmer +from .dutch import DutchStemmer +from .english import EnglishStemmer +from .finnish import FinnishStemmer +from .french import FrenchStemmer +from .german import GermanStemmer +from .hungarian import HungarianStemmer +from .italian import ItalianStemmer +from .norwegian import NorwegianStemmer +from .portugese import PortugueseStemmer +from .romanian import RomanianStemmer +from .russian import RussianStemmer +from .spanish import SpanishStemmer +from .swedish import SwedishStemmer + + +# Map two-letter codes to stemming classes + +classes = {"da": DanishStemmer, + "nl": DutchStemmer, + "en": EnglishStemmer, + "fi": FinnishStemmer, + "fr": FrenchStemmer, + "de": GermanStemmer, + "hu": HungarianStemmer, + "it": ItalianStemmer, + "no": NorwegianStemmer, + "pt": PortugueseStemmer, + "ro": RomanianStemmer, + "ru": RussianStemmer, + "es": SpanishStemmer, + "sv": SwedishStemmer, + } diff --git a/src/whoosh/lang/snowball/bases.py b/src/whoosh/lang/snowball/bases.py new file mode 100644 index 0000000..0602385 --- /dev/null +++ b/src/whoosh/lang/snowball/bases.py @@ -0,0 +1,133 @@ +# Base classes + + +class _ScandinavianStemmer(object): + + """ + This subclass encapsulates a method for defining the string region R1. + It is used by the Danish, Norwegian, and Swedish stemmer. + + """ + + def _r1_scandinavian(self, word, vowels): + """ + Return the region R1 that is used by the Scandinavian stemmers. + + R1 is the region after the first non-vowel following a vowel, + or is the null region at the end of the word if there is no + such non-vowel. But then R1 is adjusted so that the region + before it contains at least three letters. + + :param word: The word whose region R1 is determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the region R1. + :type vowels: unicode + :return: the region R1 for the respective word. + :rtype: unicode + :note: This helper method is invoked by the respective stem method of + the subclasses DanishStemmer, NorwegianStemmer, and + SwedishStemmer. It is not to be invoked directly! 
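+        :note: For example, for the Danish word "undervisning" the first
+               non-vowel following a vowel is the "n" at index 1, so R1
+               would normally start at index 2 ("dervisning"); the
+               three-letter adjustment moves its start to index 3,
+               giving R1 = "ervisning".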
+ + """ + r1 = "" + for i in range(1, len(word)): + if word[i] not in vowels and word[i - 1] in vowels: + if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: + r1 = word[3:] + elif len(word[:i + 1]) >= 3: + r1 = word[i + 1:] + else: + return word + break + + return r1 + + +class _StandardStemmer(object): + """ + This subclass encapsulates two methods for defining the standard versions + of the string regions R1, R2, and RV. + """ + + def _r1r2_standard(self, word, vowels): + """ + Return the standard interpretations of the string regions R1 and R2. + + R1 is the region after the first non-vowel following a vowel, + or is the null region at the end of the word if there is no + such non-vowel. + + R2 is the region after the first non-vowel following a vowel + in R1, or is the null region at the end of the word if there + is no such non-vowel. + + :param word: The word whose regions R1 and R2 are determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the regions R1 and R2. + :type vowels: unicode + :return: (r1,r2), the regions R1 and R2 for the respective word. + :rtype: tuple + :note: This helper method is invoked by the respective stem method of + the subclasses DutchStemmer, FinnishStemmer, + FrenchStemmer, GermanStemmer, ItalianStemmer, + PortugueseStemmer, RomanianStemmer, and SpanishStemmer. + It is not to be invoked directly! + :note: A detailed description of how to define R1 and R2 + can be found at http://snowball.tartarus.org/texts/r1r2.html + + """ + r1 = "" + r2 = "" + for i in range(1, len(word)): + if word[i] not in vowels and word[i - 1] in vowels: + r1 = word[i + 1:] + break + + for i in range(1, len(r1)): + if r1[i] not in vowels and r1[i - 1] in vowels: + r2 = r1[i + 1:] + break + + return (r1, r2) + + def _rv_standard(self, word, vowels): + """ + Return the standard interpretation of the string region RV. + + If the second letter is a consonant, RV is the region after the + next following vowel. If the first two letters are vowels, RV is + the region after the next following consonant. Otherwise, RV is + the region after the third letter. + + :param word: The word whose region RV is determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the region RV. + :type vowels: unicode + :return: the region RV for the respective word. + :rtype: unicode + :note: This helper method is invoked by the respective stem method of + the subclasses ItalianStemmer, PortugueseStemmer, + RomanianStemmer, and SpanishStemmer. It is not to be + invoked directly! + + """ + rv = "" + if len(word) >= 2: + if word[1] not in vowels: + for i in range(2, len(word)): + if word[i] in vowels: + rv = word[i + 1:] + break + + elif word[:2] in vowels: + for i in range(2, len(word)): + if word[i] not in vowels: + rv = word[i + 1:] + break + else: + rv = word[3:] + + return rv diff --git a/src/whoosh/lang/snowball/danish.py b/src/whoosh/lang/snowball/danish.py new file mode 100644 index 0000000..759a1bf --- /dev/null +++ b/src/whoosh/lang/snowball/danish.py @@ -0,0 +1,115 @@ +from .bases import _ScandinavianStemmer + +from whoosh.compat import u + + +class DanishStemmer(_ScandinavianStemmer): + """ + The Danish Snowball stemmer. + + :cvar __vowels: The Danish vowels. + :type __vowels: unicode + :cvar __consonants: The Danish consonants. + :type __consonants: unicode + :cvar __double_consonants: The Danish double consonants. 
+ :type __double_consonants: tuple + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Danish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/danish/stemmer.html + + """ + + # The language's vowels and other important characters are defined. + __vowels = u("aeiouy\xE6\xE5\xF8") + __consonants = "bcdfghjklmnpqrstvwxz" + __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj", + "kk", "ll", "mm", "nn", "pp", "qq", "rr", + "ss", "tt", "vv", "ww", "xx", "zz") + __s_ending = u("abcdfghjklmnoprtvyz\xE5") + + # The different suffixes, divided into the algorithm's steps + # and organized by length, are listed in tuples. + __step1_suffixes = ("erendes", "erende", "hedens", "ethed", + "erede", "heden", "heder", "endes", + "ernes", "erens", "erets", "ered", + "ende", "erne", "eren", "erer", "heds", + "enes", "eres", "eret", "hed", "ene", "ere", + "ens", "ers", "ets", "en", "er", "es", "et", + "e", "s") + __step2_suffixes = ("gd", "dt", "gt", "kt") + __step3_suffixes = ("elig", u("l\xF8st"), "lig", "els", "ig") + + def stem(self, word): + """ + Stem a Danish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + # Every word is put into lower case for normalization. + word = word.lower() + + # After this, the required regions are generated + # by the respective helper method. + r1 = self._r1_scandinavian(word, self.__vowels) + + # Then the actual stemming process starts. + # Every new step is explicitly indicated + # according to the descriptions on the Snowball website. + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + if r1.endswith("igst"): + word = word[:-2] + r1 = r1[:-2] + + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix == u("l\xF8st"): + word = word[:-1] + r1 = r1[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + + if r1.endswith(self.__step2_suffixes): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 4: Undouble + for double_cons in self.__double_consonants: + if word.endswith(double_cons) and len(word) > 3: + word = word[:-1] + break + + return word diff --git a/src/whoosh/lang/snowball/dutch.py b/src/whoosh/lang/snowball/dutch.py new file mode 100644 index 0000000..22573f6 --- /dev/null +++ b/src/whoosh/lang/snowball/dutch.py @@ -0,0 +1,173 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class DutchStemmer(_StandardStemmer): + """ + The Dutch Snowball stemmer. + + :cvar __vowels: The Dutch vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm. 
+ :type __step3b_suffixes: tuple + :note: A detailed description of the Dutch + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/dutch/stemmer.html + """ + + __vowels = u("aeiouy\xE8") + __step1_suffixes = ("heden", "ene", "en", "se", "s") + __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig") + + def stem(self, word): + """ + Stem a Dutch word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + step2_success = False + + # Vowel accents are removed. + word = (word.replace(u("\xE4"), "a").replace(u("\xE1"), "a") + .replace(u("\xEB"), "e").replace(u("\xE9"), "e") + .replace(u("\xED"), "i").replace(u("\xEF"), "i") + .replace(u("\xF6"), "o").replace(u("\xF3"), "o") + .replace(u("\xFC"), "u").replace(u("\xFA"), "u")) + + # An initial 'y', a 'y' after a vowel, + # and an 'i' between self.__vowels is put into upper case. + # As from now these are treated as consonants. + if word.startswith("y"): + word = "".join(("Y", word[1:])) + + for i in range(1, len(word)): + if word[i - 1] in self.__vowels and word[i] == "y": + word = "".join((word[:i], "Y", word[i + 1:])) + + for i in range(1, len(word) - 1): + if (word[i - 1] in self.__vowels and word[i] == "i" and + word[i + 1] in self.__vowels): + word = "".join((word[:i], "I", word[i + 1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # R1 is adjusted so that the region before it + # contains at least 3 letters. + for i in range(1, len(word)): + if word[i] not in self.__vowels and word[i - 1] in self.__vowels: + if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: + r1 = word[3:] + elif len(word[:i + 1]) == 0: + return word + break + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "heden": + word = "".join((word[:-5], "heid")) + r1 = "".join((r1[:-5], "heid")) + if r2.endswith("heden"): + r2 = "".join((r2[:-5], "heid")) + + elif (suffix in ("ene", "en") and + not word.endswith("heden") and + word[-len(suffix) - 1] not in self.__vowels and + word[-len(suffix) - 3:-len(suffix)] != "gem"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif (suffix in ("se", "s") and + word[-len(suffix) - 1] not in self.__vowels and + word[-len(suffix) - 1] != "j"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 2 + if r1.endswith("e") and word[-2] not in self.__vowels: + step2_success = True + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + # STEP 3a + if r2.endswith("heid") and word[-5] != "c": + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + + if (r1.endswith("en") and word[-3] not in self.__vowels and + word[-5:-2] != "gem"): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + # STEP 3b: Derivational suffixes + for suffix in self.__step3b_suffixes: + if r2.endswith(suffix): + if suffix in ("end", "ing"): + word = word[:-3] + r2 = r2[:-3] + + if r2.endswith("ig") and word[-3] != "e": + word = word[:-2] + else: + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + + elif suffix == "ig" and word[-3] != "e": + word = word[:-2] + + elif suffix == "lijk": + word = word[:-4] + r1 = 
r1[:-4] + + if r1.endswith("e") and word[-2] not in self.__vowels: + word = word[:-1] + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + + elif suffix == "baar": + word = word[:-4] + + elif suffix == "bar" and step2_success: + word = word[:-3] + break + + # STEP 4: Undouble vowel + if len(word) >= 4: + if word[-1] not in self.__vowels and word[-1] != "I": + if word[-3:-1] in ("aa", "ee", "oo", "uu"): + if word[-4] not in self.__vowels: + word = "".join((word[:-3], word[-3], word[-1])) + + # All occurrences of 'I' and 'Y' are put back into lower case. + word = word.replace("I", "i").replace("Y", "y") + + return word diff --git a/src/whoosh/lang/snowball/english.py b/src/whoosh/lang/snowball/english.py new file mode 100644 index 0000000..0ed9240 --- /dev/null +++ b/src/whoosh/lang/snowball/english.py @@ -0,0 +1,465 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class EnglishStemmer(_StandardStemmer): + """ + The English Snowball stemmer. + + :cvar __vowels: The English vowels. + :type __vowels: unicode + :cvar __double_consonants: The English double consonants. + :type __double_consonants: tuple + :cvar __li_ending: Letters that may directly appear before a word final 'li'. + :type __li_ending: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm. + :type __step1a_suffixes: tuple + :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm. + :type __step1b_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. + :type __step5_suffixes: tuple + :cvar __special_words: A dictionary containing words + which have to be stemmed specially. 
+ :type __special_words: dict + :note: A detailed description of the English + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/english/stemmer.html + """ + + __vowels = "aeiouy" + __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", + "pp", "rr", "tt") + __li_ending = "cdeghkmnrt" + __step0_suffixes = ("'s'", "'s", "'") + __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s") + __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed") + __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness', + 'iveness', 'tional', 'biliti', 'lessli', + 'entli', 'ation', 'alism', 'aliti', 'ousli', + 'iviti', 'fulli', 'enci', 'anci', 'abli', + 'izer', 'ator', 'alli', 'bli', 'ogi', 'li') + __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti', + 'ative', 'ical', 'ness', 'ful') + __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment', + 'ant', 'ent', 'ism', 'ate', 'iti', 'ous', + 'ive', 'ize', 'ion', 'al', 'er', 'ic') + __step5_suffixes = ("e", "l") + __special_words = {"skis": "ski", + "skies": "sky", + "dying": "die", + "lying": "lie", + "tying": "tie", + "idly": "idl", + "gently": "gentl", + "ugly": "ugli", + "early": "earli", + "only": "onli", + "singly": "singl", + "sky": "sky", + "news": "news", + "howe": "howe", + "atlas": "atlas", + "cosmos": "cosmos", + "bias": "bias", + "andes": "andes", + "inning": "inning", + "innings": "inning", + "outing": "outing", + "outings": "outing", + "canning": "canning", + "cannings": "canning", + "herring": "herring", + "herrings": "herring", + "earring": "earring", + "earrings": "earring", + "proceed": "proceed", + "proceeds": "proceed", + "proceeded": "proceed", + "proceeding": "proceed", + "exceed": "exceed", + "exceeds": "exceed", + "exceeded": "exceed", + "exceeding": "exceed", + "succeed": "succeed", + "succeeds": "succeed", + "succeeded": "succeed", + "succeeding": "succeed"} + + def stem(self, word): + + """ + Stem an English word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + if word in self.__special_words: + return self.__special_words[word] + + # Map the different apostrophe characters to a single consistent one + word = (word.replace(u("\u2019"), u("\x27")) + .replace(u("\u2018"), u("\x27")) + .replace(u("\u201B"), u("\x27"))) + + if word.startswith(u("\x27")): + word = word[1:] + + if word.startswith("y"): + word = "".join(("Y", word[1:])) + + for i in range(1, len(word)): + if word[i - 1] in self.__vowels and word[i] == "y": + word = "".join((word[:i], "Y", word[i + 1:])) + + step1a_vowel_found = False + step1b_vowel_found = False + + r1 = "" + r2 = "" + + if word.startswith(("gener", "commun", "arsen")): + if word.startswith(("gener", "arsen")): + r1 = word[5:] + else: + r1 = word[6:] + + for i in range(1, len(r1)): + if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels: + r2 = r1[i + 1:] + break + else: + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # STEP 0 + for suffix in self.__step0_suffixes: + if word.endswith(suffix): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 1a + for suffix in self.__step1a_suffixes: + if word.endswith(suffix): + + if suffix == "sses": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("ied", "ies"): + if len(word[:-len(suffix)]) > 1: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif suffix == "s": + for letter in word[:-2]: + if letter in self.__vowels: + step1a_vowel_found = True + break + + if step1a_vowel_found: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + break + + # STEP 1b + for suffix in self.__step1b_suffixes: + if word.endswith(suffix): + if suffix in ("eed", "eedly"): + + if r1.endswith(suffix): + word = "".join((word[:-len(suffix)], "ee")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ee")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ee")) + else: + r2 = "" + else: + for letter in word[:-len(suffix)]: + if letter in self.__vowels: + step1b_vowel_found = True + break + + if step1b_vowel_found: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + + if word.endswith(("at", "bl", "iz")): + word = "".join((word, "e")) + r1 = "".join((r1, "e")) + + if len(word) > 5 or len(r1) >= 3: + r2 = "".join((r2, "e")) + + elif word.endswith(self.__double_consonants): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif ((r1 == "" and len(word) >= 3 and + word[-1] not in self.__vowels and + word[-1] not in "wxY" and + word[-2] in self.__vowels and + word[-3] not in self.__vowels) + or + (r1 == "" and len(word) == 2 and + word[0] in self.__vowels and + word[1] not in self.__vowels)): + + word = "".join((word, "e")) + + if len(r1) > 0: + r1 = "".join((r1, "e")) + + if len(r2) > 0: + r2 = "".join((r2, "e")) + break + + # STEP 1c + if (len(word) > 2 + and word[-1] in "yY" + and word[-2] not in self.__vowels): + word = "".join((word[:-1], "i")) + if len(r1) >= 1: + r1 = "".join((r1[:-1], "i")) + else: + r1 = "" + + if len(r2) >= 1: + r2 = "".join((r2[:-1], "i")) + else: + r2 = "" + + # STEP 2 + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix == "tional": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("enci", "anci", "abli"): + word = "".join((word[:-1], "e")) + + if len(r1) >= 1: + r1 = "".join((r1[:-1], "e")) + else: + r1 = "" + + if len(r2) >= 1: + r2 = 
"".join((r2[:-1], "e")) + else: + r2 = "" + + elif suffix == "entli": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("izer", "ization"): + word = "".join((word[:-len(suffix)], "ize")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ize")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ize")) + else: + r2 = "" + + elif suffix in ("ational", "ation", "ator"): + word = "".join((word[:-len(suffix)], "ate")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ate")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ate")) + else: + r2 = "e" + + elif suffix in ("alism", "aliti", "alli"): + word = "".join((word[:-len(suffix)], "al")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "al")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "al")) + else: + r2 = "" + + elif suffix == "fulness": + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + + elif suffix in ("ousli", "ousness"): + word = "".join((word[:-len(suffix)], "ous")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ous")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ous")) + else: + r2 = "" + + elif suffix in ("iveness", "iviti"): + word = "".join((word[:-len(suffix)], "ive")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ive")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ive")) + else: + r2 = "e" + + elif suffix in ("biliti", "bli"): + word = "".join((word[:-len(suffix)], "ble")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ble")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ble")) + else: + r2 = "" + + elif suffix == "ogi" and word[-4] == "l": + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif suffix in ("fulli", "lessli"): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "li" and word[-3] in self.__li_ending: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix == "tional": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "ational": + word = "".join((word[:-len(suffix)], "ate")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ate")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ate")) + else: + r2 = "" + + elif suffix == "alize": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + + elif suffix in ("icate", "iciti", "ical"): + word = "".join((word[:-len(suffix)], "ic")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ic")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ic")) + else: + r2 = "" + + elif suffix in ("ful", "ness"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + + elif suffix == "ative" and r2.endswith(suffix): + word = word[:-5] + r1 = r1[:-5] + r2 = r2[:-5] + break + + # STEP 4 + for suffix in self.__step4_suffixes: + if word.endswith(suffix): + if r2.endswith(suffix): + if suffix == "ion": + if word[-4] in "st": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 5 + if r2.endswith("l") and word[-2] == "l": + word = word[:-1] + elif r2.endswith("e"): + 
word = word[:-1] + elif r1.endswith("e"): + if len(word) >= 4 and (word[-2] in self.__vowels or + word[-2] in "wxY" or + word[-3] not in self.__vowels or + word[-4] in self.__vowels): + word = word[:-1] + + word = word.replace("Y", "y") + return word diff --git a/src/whoosh/lang/snowball/finnish.py b/src/whoosh/lang/snowball/finnish.py new file mode 100644 index 0000000..d05207f --- /dev/null +++ b/src/whoosh/lang/snowball/finnish.py @@ -0,0 +1,266 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class FinnishStemmer(_StandardStemmer): + """ + The Finnish Snowball stemmer. + + :cvar __vowels: The Finnish vowels. + :type __vowels: unicode + :cvar __restricted_vowels: A subset of the Finnish vowels. + :type __restricted_vowels: unicode + :cvar __long_vowels: The Finnish vowels in their long forms. + :type __long_vowels: tuple + :cvar __consonants: The Finnish consonants. + :type __consonants: unicode + :cvar __double_consonants: The Finnish double consonants. + :type __double_consonants: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :note: A detailed description of the Finnish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/finnish/stemmer.html + """ + + __vowels = u("aeiouy\xE4\xF6") + __restricted_vowels = u("aeiou\xE4\xF6") + __long_vowels = ("aa", "ee", "ii", "oo", "uu", u("\xE4\xE4"), + u("\xF6\xF6")) + __consonants = "bcdfghjklmnpqrstvwxz" + __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj", + "kk", "ll", "mm", "nn", "pp", "qq", "rr", + "ss", "tt", "vv", "ww", "xx", "zz") + __step1_suffixes = ('kaan', u('k\xE4\xE4n'), 'sti', 'kin', 'han', + u('h\xE4n'), 'ko', u('k\xF6'), 'pa', u('p\xE4')) + __step2_suffixes = ('nsa', u('ns\xE4'), 'mme', 'nne', 'si', 'ni', + 'an', u('\xE4n'), 'en') + __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin', + 'hon', u('h\xE4n'), u('h\xF6n'), 'den', 'tta', + u('tt\xE4'), 'ssa', u('ss\xE4'), 'sta', + u('st\xE4'), 'lla', u('ll\xE4'), 'lta', + u('lt\xE4'), 'lle', 'ksi', 'ine', 'ta', + u('t\xE4'), 'na', u('n\xE4'), 'a', u('\xE4'), + 'n') + __step4_suffixes = ('impi', 'impa', u('imp\xE4'), 'immi', 'imma', + u('imm\xE4'), 'mpi', 'mpa', u('mp\xE4'), 'mmi', + 'mma', u('mm\xE4'), 'eja', u('ej\xE4')) + + def stem(self, word): + """ + Stem a Finnish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + step3_success = False + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # STEP 1: Particles etc. 
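+        # For example, "taloakin" loses the particle ending "kin" here and
+        # becomes "taloa"; the remaining case ending is removed in step 3.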
+ for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "sti": + if suffix in r2: + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + if word[-len(suffix) - 1] in u("ntaeiouy\xE4\xF6"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 2: Possessives + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + if suffix == "si": + if word[-3] != "k": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "ni": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + if word.endswith("kse"): + word = "".join((word[:-3], "ksi")) + + if r1.endswith("kse"): + r1 = "".join((r1[:-3], "ksi")) + + if r2.endswith("kse"): + r2 = "".join((r2[:-3], "ksi")) + + elif suffix == "an": + if (word[-4:-2] in ("ta", "na") or + word[-5:-2] in ("ssa", "sta", "lla", "lta")): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == u("\xE4n"): + if (word[-4:-2] in (u("t\xE4"), u("n\xE4")) or + word[-5:-2] in (u("ss\xE4"), u("st\xE4"), + u("ll\xE4"), u("lt\xE4"))): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "en": + if word[-5:-2] in ("lle", "ine"): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + break + + # STEP 3: Cases + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix in ("han", "hen", "hin", "hon", u("h\xE4n"), + u("h\xF6n")): + if ((suffix == "han" and word[-4] == "a") or + (suffix == "hen" and word[-4] == "e") or + (suffix == "hin" and word[-4] == "i") or + (suffix == "hon" and word[-4] == "o") or + (suffix == u("h\xE4n") and word[-4] == u("\xE4")) or + (suffix == u("h\xF6n") and word[-4] == u("\xF6"))): + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + step3_success = True + + elif suffix in ("siin", "den", "tten"): + if (word[-len(suffix) - 1] == "i" and + word[-len(suffix) - 2] in self.__restricted_vowels): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + step3_success = True + else: + continue + + elif suffix == "seen": + if word[-6:-4] in self.__long_vowels: + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + step3_success = True + else: + continue + + elif suffix in ("a", u("\xE4")): + if word[-2] in self.__vowels and word[-3] in self.__consonants: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + step3_success = True + + elif suffix in ("tta", u("tt\xE4")): + if word[-4] == "e": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + step3_success = True + + elif suffix == "n": + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + step3_success = True + + if word[-2:] == "ie" or word[-2:] in self.__long_vowels: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + step3_success = True + break + + # STEP 4: Other endings + for suffix in self.__step4_suffixes: + if r2.endswith(suffix): + if suffix in ("mpi", "mpa", u("mp\xE4"), "mmi", "mma", + u("mm\xE4")): + if word[-5:-3] != "po": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 5: Plurals + if step3_success and len(r1) >= 1 and r1[-1] in "ij": + word = word[:-1] + r1 = r1[:-1] + + elif (not step3_success and len(r1) >= 2 and + r1[-1] == "t" and r1[-2] in self.__vowels): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + if r2.endswith("imma"): + word = word[:-4] + r1 = r1[:-4] + elif r2.endswith("mma") and r2[-5:-3] != "po": + 
word = word[:-3] + r1 = r1[:-3] + + # STEP 6: Tidying up + if r1[-2:] in self.__long_vowels: + word = word[:-1] + r1 = r1[:-1] + + if (len(r1) >= 2 and r1[-2] in self.__consonants and + r1[-1] in u("a\xE4ei")): + word = word[:-1] + r1 = r1[:-1] + + if r1.endswith(("oj", "uj")): + word = word[:-1] + r1 = r1[:-1] + + if r1.endswith("jo"): + word = word[:-1] + r1 = r1[:-1] + + # If the word ends with a double consonant + # followed by zero or more vowels, the last consonant is removed. + for i in range(1, len(word)): + if word[-i] in self.__vowels: + continue + else: + if i == 1: + if word[-i - 1:] in self.__double_consonants: + word = word[:-1] + else: + if word[-i - 1:-i + 1] in self.__double_consonants: + word = "".join((word[:-i], word[-i + 1:])) + break + + + return word diff --git a/src/whoosh/lang/snowball/french.py b/src/whoosh/lang/snowball/french.py new file mode 100644 index 0000000..72a7e30 --- /dev/null +++ b/src/whoosh/lang/snowball/french.py @@ -0,0 +1,348 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class FrenchStemmer(_StandardStemmer): + + """ + The French Snowball stemmer. + + :cvar __vowels: The French vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. + :type __step2a_suffixes: tuple + :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. + :type __step2b_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :note: A detailed description of the French + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/french/stemmer.html + """ + + __vowels = u("aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9") + __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice', + 'ateurs', 'ations', 'logies', 'usions', + 'utions', 'ements', 'amment', 'emment', + 'ances', 'iqUes', 'ismes', 'ables', 'istes', + 'ateur', 'ation', 'logie', 'usion', 'ution', + 'ences', 'ement', 'euses', 'ments', 'ance', + 'iqUe', 'isme', 'able', 'iste', 'ence', + u('it\xE9s'), 'ives', 'eaux', 'euse', 'ment', + 'eux', u('it\xE9'), 'ive', 'ifs', 'aux', 'if') + __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante', + 'issants', 'issions', 'irions', 'issais', + 'issait', 'issant', 'issent', 'issiez', 'issons', + 'irais', 'irait', 'irent', 'iriez', 'irons', + 'iront', 'isses', 'issez', u('\xEEmes'), + u('\xEEtes'), 'irai', 'iras', 'irez', 'isse', + 'ies', 'ira', u('\xEEt'), 'ie', 'ir', 'is', + 'it', 'i') + __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent', + 'assiez', u('\xE8rent'), 'erais', 'erait', + 'eriez', 'erons', 'eront', 'aIent', 'antes', + 'asses', 'ions', 'erai', 'eras', 'erez', + u('\xE2mes'), u('\xE2tes'), 'ante', 'ants', + 'asse', u('\xE9es'), 'era', 'iez', 'ais', + 'ait', 'ant', u('\xE9e'), u('\xE9s'), 'er', + 'ez', u('\xE2t'), 'ai', 'as', u('\xE9'), 'a') + __step4_suffixes = (u('i\xE8re'), u('I\xE8re'), 'ion', 'ier', 'Ier', + 'e', u('\xEB')) + + def stem(self, word): + """ + Stem a French word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + step1_success = False + rv_ending_found = False + step2a_success = False + step2b_success = False + + # Every occurrence of 'u' after 'q' is put into upper case. 
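+        # (The upper-case "U" marker, like the "I" and "Y" markers added
+        # below, is not in self.__vowels, so later steps treat it as a
+        # consonant; all markers are put back into lower case at the end
+        # of stem().)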
+ for i in range(1, len(word)): + if word[i - 1] == "q" and word[i] == "u": + word = "".join((word[:i], "U", word[i + 1:])) + + # Every occurrence of 'u' and 'i' + # between vowels is put into upper case. + # Every occurrence of 'y' preceded or + # followed by a vowel is also put into upper case. + for i in range(1, len(word) - 1): + if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i + 1:])) + + elif word[i] == "i": + word = "".join((word[:i], "I", word[i + 1:])) + + if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels: + if word[i] == "y": + word = "".join((word[:i], "Y", word[i + 1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self.__rv_french(word, self.__vowels) + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "eaux": + word = word[:-1] + step1_success = True + + elif suffix in ("euse", "euses"): + if suffix in r2: + word = word[:-len(suffix)] + step1_success = True + + elif suffix in r1: + word = "".join((word[:-len(suffix)], "eux")) + step1_success = True + + elif suffix in ("ement", "ements") and suffix in rv: + word = word[:-len(suffix)] + step1_success = True + + if word[-2:] == "iv" and "iv" in r2: + word = word[:-2] + + if word[-2:] == "at" and "at" in r2: + word = word[:-2] + + elif word[-3:] == "eus": + if "eus" in r2: + word = word[:-3] + elif "eus" in r1: + word = "".join((word[:-1], "x")) + + elif word[-3:] in ("abl", "iqU"): + if "abl" in r2 or "iqU" in r2: + word = word[:-3] + + elif word[-3:] in (u("i\xE8r"), u("I\xE8r")): + if u("i\xE8r") in rv or u("I\xE8r") in rv: + word = "".join((word[:-3], "i")) + + elif suffix == "amment" and suffix in rv: + word = "".join((word[:-6], "ant")) + rv = "".join((rv[:-6], "ant")) + rv_ending_found = True + + elif suffix == "emment" and suffix in rv: + word = "".join((word[:-6], "ent")) + rv_ending_found = True + + elif (suffix in ("ment", "ments") and suffix in rv and + not rv.startswith(suffix) and + rv[rv.rindex(suffix) - 1] in self.__vowels): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + rv_ending_found = True + + elif suffix == "aux" and suffix in r1: + word = "".join((word[:-2], "l")) + step1_success = True + + elif (suffix in ("issement", "issements") and suffix in r1 + and word[-len(suffix) - 1] not in self.__vowels): + word = word[:-len(suffix)] + step1_success = True + + elif suffix in ("ance", "iqUe", "isme", "able", "iste", + "eux", "ances", "iqUes", "ismes", + "ables", "istes") and suffix in r2: + word = word[:-len(suffix)] + step1_success = True + + elif suffix in ("atrice", "ateur", "ation", "atrices", + "ateurs", "ations") and suffix in r2: + word = word[:-len(suffix)] + step1_success = True + + if word[-2:] == "ic": + if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + + elif suffix in ("logie", "logies") and suffix in r2: + word = "".join((word[:-len(suffix)], "log")) + step1_success = True + + elif (suffix in ("usion", "ution", "usions", "utions") and + suffix in r2): + word = "".join((word[:-len(suffix)], "u")) + step1_success = True + + elif suffix in ("ence", "ences") and suffix in r2: + word = "".join((word[:-len(suffix)], "ent")) + step1_success = True + + elif suffix in (u("it\xE9"), u("it\xE9s")) and suffix in r2: + word = word[:-len(suffix)] + step1_success = True + + if word[-4:] == "abil": + if "abil" in r2: + word = word[:-4] + else: + word = "".join((word[:-2], "l")) + + elif word[-2:] == "ic": 
+ if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + + elif word[-2:] == "iv": + if "iv" in r2: + word = word[:-2] + + elif (suffix in ("if", "ive", "ifs", "ives") and + suffix in r2): + word = word[:-len(suffix)] + step1_success = True + + if word[-2:] == "at" and "at" in r2: + word = word[:-2] + + if word[-2:] == "ic": + if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + break + + # STEP 2a: Verb suffixes beginning 'i' + if not step1_success or rv_ending_found: + for suffix in self.__step2a_suffixes: + if word.endswith(suffix): + if (suffix in rv and len(rv) > len(suffix) and + rv[rv.rindex(suffix) - 1] not in self.__vowels): + word = word[:-len(suffix)] + step2a_success = True + break + + # STEP 2b: Other verb suffixes + if not step2a_success: + for suffix in self.__step2b_suffixes: + if rv.endswith(suffix): + if suffix == "ions" and "ions" in r2: + word = word[:-4] + step2b_success = True + + elif suffix in ('eraIent', 'erions', u('\xE8rent'), + 'erais', 'erait', 'eriez', + 'erons', 'eront', 'erai', 'eras', + 'erez', u('\xE9es'), 'era', 'iez', + u('\xE9e'), u('\xE9s'), 'er', 'ez', + u('\xE9')): + word = word[:-len(suffix)] + step2b_success = True + + elif suffix in ('assions', 'assent', 'assiez', + 'aIent', 'antes', 'asses', + u('\xE2mes'), u('\xE2tes'), 'ante', + 'ants', 'asse', 'ais', 'ait', + 'ant', u('\xE2t'), 'ai', 'as', + 'a'): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + step2b_success = True + if rv.endswith("e"): + word = word[:-1] + break + + # STEP 3 + if step1_success or step2a_success or step2b_success: + if word[-1] == "Y": + word = "".join((word[:-1], "i")) + elif word[-1] == u("\xE7"): + word = "".join((word[:-1], "c")) + + # STEP 4: Residual suffixes + else: + if (len(word) >= 2 and word[-1] == "s" and + word[-2] not in u("aiou\xE8s")): + word = word[:-1] + + for suffix in self.__step4_suffixes: + if word.endswith(suffix): + if suffix in rv: + if (suffix == "ion" and suffix in r2 and + rv[-4] in "st"): + word = word[:-3] + + elif suffix in ("ier", u("i\xE8re"), "Ier", + u("I\xE8re")): + word = "".join((word[:-len(suffix)], "i")) + + elif suffix == "e": + word = word[:-1] + + elif suffix == u("\xEB") and word[-3:-1] == "gu": + word = word[:-1] + break + + # STEP 5: Undouble + if word.endswith(("enn", "onn", "ett", "ell", "eill")): + word = word[:-1] + + # STEP 6: Un-accent + for i in range(1, len(word)): + if word[-i] not in self.__vowels: + i += 1 + else: + if i != 1 and word[-i] in (u("\xE9"), u("\xE8")): + word = "".join((word[:-i], "e", word[-i + 1:])) + break + + word = (word.replace("I", "i") + .replace("U", "u") + .replace("Y", "y")) + return word + + def __rv_french(self, word, vowels): + """ + Return the region RV that is used by the French stemmer. + + If the word begins with two vowels, RV is the region after + the third letter. Otherwise, it is the region after the first + vowel not at the beginning of the word, or the end of the word + if these positions cannot be found. (Exceptionally, u'par', + u'col' or u'tap' at the beginning of a word is also taken to + define RV as the region to their right.) + + :param word: The French word whose region RV is determined. + :type word: str or unicode + :param vowels: The French vowels that are used to determine + the region RV. + :type vowels: unicode + :return: the region RV for the respective French word. + :rtype: unicode + :note: This helper method is invoked by the stem method of + the subclass FrenchStemmer. It is not to be invoked directly! 
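+        :note: For example, "parure" begins with "par", so RV is "ure";
+               "aimer" begins with two vowels, so RV is the region after
+               the third letter, "er"; and for "fameusement" RV is the
+               region after the first vowel not at the beginning of the
+               word, "meusement".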
+ + """ + rv = "" + if len(word) >= 2: + if (word.startswith(("par", "col", "tap")) or + (word[0] in vowels and word[1] in vowels)): + rv = word[3:] + else: + for i in range(1, len(word)): + if word[i] in vowels: + rv = word[i + 1:] + break + + return rv diff --git a/src/whoosh/lang/snowball/german.py b/src/whoosh/lang/snowball/german.py new file mode 100644 index 0000000..73743f1 --- /dev/null +++ b/src/whoosh/lang/snowball/german.py @@ -0,0 +1,144 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class GermanStemmer(_StandardStemmer): + + """ + The German Snowball stemmer. + + :cvar __vowels: The German vowels. + :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __st_ending: Letter that may directly appear before a word final 'st'. + :type __st_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the German + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/german/stemmer.html + + """ + + __vowels = u("aeiouy\xE4\xF6\xFC") + __s_ending = "bdfghklmnrt" + __st_ending = "bdfghklmnt" + + __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s") + __step2_suffixes = ("est", "en", "er", "st") + __step3_suffixes = ("isch", "lich", "heit", "keit", + "end", "ung", "ig", "ik") + + def stem(self, word): + """ + Stem a German word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + word = word.replace(u("\xDF"), "ss") + + # Every occurrence of 'u' and 'y' + # between vowels is put into upper case. + for i in range(1, len(word) - 1): + if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i + 1:])) + + elif word[i] == "y": + word = "".join((word[:i], "Y", word[i + 1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # R1 is adjusted so that the region before it + # contains at least 3 letters. 
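+        # For example, for "absatz" the unadjusted R1 would be "satz";
+        # the adjustment moves its start to index 3, giving "atz".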
+ for i in range(1, len(word)): + if word[i] not in self.__vowels and word[i - 1] in self.__vowels: + if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: + r1 = word[3:] + elif len(word[:i + 1]) == 0: + return word + break + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if (suffix in ("en", "es", "e") and + word[-len(suffix) - 4:-len(suffix)] == "niss"): + word = word[:-len(suffix) - 1] + r1 = r1[:-len(suffix) - 1] + r2 = r2[:-len(suffix) - 1] + + elif suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + if suffix == "st": + if word[-3] in self.__st_ending and len(word[:-3]) >= 3: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 3: Derivational suffixes + for suffix in self.__step3_suffixes: + if r2.endswith(suffix): + if suffix in ("end", "ung"): + if ("ig" in r2[-len(suffix) - 2:-len(suffix)] and + "e" not in r2[-len(suffix) - 3:-len(suffix) - 2]): + word = word[:-len(suffix) - 2] + else: + word = word[:-len(suffix)] + + elif (suffix in ("ig", "ik", "isch") and + "e" not in r2[-len(suffix) - 1:-len(suffix)]): + word = word[:-len(suffix)] + + elif suffix in ("lich", "heit"): + if ("er" in r1[-len(suffix) - 2:-len(suffix)] or + "en" in r1[-len(suffix) - 2:-len(suffix)]): + word = word[:-len(suffix) - 2] + else: + word = word[:-len(suffix)] + + elif suffix == "keit": + if "lich" in r2[-len(suffix) - 4:-len(suffix)]: + word = word[:-len(suffix) - 4] + + elif "ig" in r2[-len(suffix) - 2:-len(suffix)]: + word = word[:-len(suffix) - 2] + else: + word = word[:-len(suffix)] + break + + # Umlaut accents are removed and + # 'u' and 'y' are put back into lower case. + word = (word.replace(u("\xE4"), "a").replace(u("\xF6"), "o") + .replace(u("\xFC"), "u").replace("U", "u") + .replace("Y", "y")) + return word diff --git a/src/whoosh/lang/snowball/hungarian.py b/src/whoosh/lang/snowball/hungarian.py new file mode 100644 index 0000000..ed98b6f --- /dev/null +++ b/src/whoosh/lang/snowball/hungarian.py @@ -0,0 +1,268 @@ +from whoosh.compat import u + +class HungarianStemmer(object): + + """ + The Hungarian Snowball stemmer. + + :cvar __vowels: The Hungarian vowels. + :type __vowels: unicode + :cvar __digraphs: The Hungarian digraphs. + :type __digraphs: tuple + :cvar __double_consonants: The Hungarian double consonants. + :type __double_consonants: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. + :type __step5_suffixes: tuple + :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm. + :type __step6_suffixes: tuple + :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm. + :type __step7_suffixes: tuple + :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm. 
+ :type __step8_suffixes: tuple + :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm. + :type __step9_suffixes: tuple + :note: A detailed description of the Hungarian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/hungarian/stemmer.html + + """ + + __vowels = u("aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB") + __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs") + __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg", + "ggy", "jj", "kk", "ll", "lly", "mm", + "nn", "nny", "pp", "rr", "ss", "ssz", + "tt", "tty", "vv", "zz", "zzs") + + __step1_suffixes = ("al", "el") + __step2_suffixes = (u('k\xE9ppen'), u('onk\xE9nt'), u('enk\xE9nt'), + u('ank\xE9nt'), u('k\xE9pp'), u('k\xE9nt'), 'ban', + 'ben', 'nak', 'nek', 'val', 'vel', u('t\xF3l'), + u('t\xF5l'), u('r\xF3l'), u('r\xF5l'), u('b\xF3l'), + u('b\xF5l'), 'hoz', 'hez', u('h\xF6z'), + u('n\xE1l'), u('n\xE9l'), u('\xE9rt'), 'kor', + 'ba', 'be', 'ra', 're', 'ig', 'at', 'et', + 'ot', u('\xF6t'), 'ul', u('\xFCl'), u('v\xE1'), + u('v\xE9'), 'en', 'on', 'an', u('\xF6n'), + 'n', 't') + __step3_suffixes = (u("\xE1nk\xE9nt"), u("\xE1n"), u("\xE9n")) + __step4_suffixes = ('astul', u('est\xFCl'), u('\xE1stul'), + u('\xE9st\xFCl'), 'stul', u('st\xFCl')) + __step5_suffixes = (u("\xE1"), u("\xE9")) + __step6_suffixes = (u('ok\xE9'), u('\xF6k\xE9'), u('ak\xE9'), + u('ek\xE9'), u('\xE1k\xE9'), u('\xE1\xE9i'), + u('\xE9k\xE9'), u('\xE9\xE9i'), u('k\xE9'), + u('\xE9i'), u('\xE9\xE9'), u('\xE9')) + __step7_suffixes = (u('\xE1juk'), u('\xE9j\xFCk'), u('\xFCnk'), + 'unk', 'juk', u('j\xFCk'), u('\xE1nk'), + u('\xE9nk'), 'nk', 'uk', u('\xFCk'), 'em', + 'om', 'am', 'od', 'ed', 'ad', u('\xF6d'), + 'ja', 'je', u('\xE1m'), u('\xE1d'), u('\xE9m'), + u('\xE9d'), 'm', 'd', 'a', 'e', 'o', + u('\xE1'), u('\xE9')) + __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok', + 'eitek', u('\xE1itok'), u('\xE9itek'), 'jaim', + 'jeim', 'jaid', 'jeid', 'eink', 'aink', + 'itek', 'jeik', 'jaik', u('\xE1ink'), + u('\xE9ink'), 'aim', 'eim', 'aid', 'eid', + 'jai', 'jei', 'ink', 'aik', 'eik', + u('\xE1im'), u('\xE1id'), u('\xE1ik'), u('\xE9im'), + u('\xE9id'), u('\xE9ik'), 'im', 'id', 'ai', + 'ei', 'ik', u('\xE1i'), u('\xE9i'), 'i') + __step9_suffixes = (u("\xE1k"), u("\xE9k"), u("\xF6k"), "ok", + "ek", "ak", "k") + + def stem(self, word): + """ + Stem an Hungarian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs) + + # STEP 1: Remove instrumental case + if r1.endswith(self.__step1_suffixes): + for double_cons in self.__double_consonants: + if word[-2 - len(double_cons):-2] == double_cons: + word = "".join((word[:-4], word[-3])) + + if r1[-2 - len(double_cons):-2] == double_cons: + r1 = "".join((r1[:-4], r1[-3])) + break + + # STEP 2: Remove frequent cases + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + + if r1.endswith(u("\xE1")): + word = "".join((word[:-1], "a")) + r1 = "".join((r1[:-1], "a")) + + elif r1.endswith(u("\xE9")): + word = "".join((word[:-1], "e")) + r1 = "".join((r1[:-1], "e")) + break + + # STEP 3: Remove special cases + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix == u("\xE9n"): + word = "".join((word[:-2], "e")) + r1 = "".join((r1[:-2], "e")) + else: + word = "".join((word[:-len(suffix)], "a")) + r1 = "".join((r1[:-len(suffix)], "a")) + break + + # STEP 4: Remove other cases + for suffix in self.__step4_suffixes: + if r1.endswith(suffix): + if suffix == u("\xE1stul"): + word = "".join((word[:-5], "a")) + r1 = "".join((r1[:-5], "a")) + + elif suffix == u("\xE9st\xFCl"): + word = "".join((word[:-5], "e")) + r1 = "".join((r1[:-5], "e")) + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 5: Remove factive case + for suffix in self.__step5_suffixes: + if r1.endswith(suffix): + for double_cons in self.__double_consonants: + if word[-1 - len(double_cons):-1] == double_cons: + word = "".join((word[:-3], word[-2])) + + if r1[-1 - len(double_cons):-1] == double_cons: + r1 = "".join((r1[:-3], r1[-2])) + break + + # STEP 6: Remove owned + for suffix in self.__step6_suffixes: + if r1.endswith(suffix): + if suffix in (u("\xE1k\xE9"), u("\xE1\xE9i")): + word = "".join((word[:-3], "a")) + r1 = "".join((r1[:-3], "a")) + + elif suffix in (u("\xE9k\xE9"), u("\xE9\xE9i"), + u("\xE9\xE9")): + word = "".join((word[:-len(suffix)], "e")) + r1 = "".join((r1[:-len(suffix)], "e")) + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 7: Remove singular owner suffixes + for suffix in self.__step7_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix in (u("\xE1nk"), u("\xE1juk"), u("\xE1m"), + u("\xE1d"), u("\xE1")): + word = "".join((word[:-len(suffix)], "a")) + r1 = "".join((r1[:-len(suffix)], "a")) + + elif suffix in (u("\xE9nk"), u("\xE9j\xFCk"), + u("\xE9m"), u("\xE9d"), u("\xE9")): + word = "".join((word[:-len(suffix)], "e")) + r1 = "".join((r1[:-len(suffix)], "e")) + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 8: Remove plural owner suffixes + for suffix in self.__step8_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix in (u("\xE1im"), u("\xE1id"), u("\xE1i"), + u("\xE1ink"), u("\xE1itok"), u("\xE1ik")): + word = "".join((word[:-len(suffix)], "a")) + r1 = "".join((r1[:-len(suffix)], "a")) + + elif suffix in (u("\xE9im"), u("\xE9id"), u("\xE9i"), + u("\xE9ink"), u("\xE9itek"), u("\xE9ik")): + word = "".join((word[:-len(suffix)], "e")) + r1 = "".join((r1[:-len(suffix)], "e")) + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 9: Remove plural suffixes + for suffix in self.__step9_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix == u("\xE1k"): + word = 
"".join((word[:-2], "a")) + elif suffix == u("\xE9k"): + word = "".join((word[:-2], "e")) + else: + word = word[:-len(suffix)] + break + + return word + + def __r1_hungarian(self, word, vowels, digraphs): + """ + Return the region R1 that is used by the Hungarian stemmer. + + If the word begins with a vowel, R1 is defined as the region + after the first consonant or digraph (= two letters stand for + one phoneme) in the word. If the word begins with a consonant, + it is defined as the region after the first vowel in the word. + If the word does not contain both a vowel and consonant, R1 + is the null region at the end of the word. + + :param word: The Hungarian word whose region R1 is determined. + :type word: str or unicode + :param vowels: The Hungarian vowels that are used to determine + the region R1. + :type vowels: unicode + :param digraphs: The digraphs that are used to determine the + region R1. + :type digraphs: tuple + :return: the region R1 for the respective word. + :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + HungarianStemmer. It is not to be invoked directly! + + """ + r1 = "" + if word[0] in vowels: + for digraph in digraphs: + if digraph in word[1:]: + r1 = word[word.index(digraph[-1]) + 1:] + return r1 + + for i in range(1, len(word)): + if word[i] not in vowels: + r1 = word[i + 1:] + break + else: + for i in range(1, len(word)): + if word[i] in vowels: + r1 = word[i + 1:] + break + + return r1 diff --git a/src/whoosh/lang/snowball/italian.py b/src/whoosh/lang/snowball/italian.py new file mode 100644 index 0000000..8a98146 --- /dev/null +++ b/src/whoosh/lang/snowball/italian.py @@ -0,0 +1,230 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class ItalianStemmer(_StandardStemmer): + + """ + The Italian Snowball stemmer. + + :cvar __vowels: The Italian vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
+ :type __step2_suffixes: tuple + :note: A detailed description of the Italian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/italian/stemmer.html + + """ + + __vowels = u("aeiou\xE0\xE8\xEC\xF2\xF9") + __step0_suffixes = ('gliela', 'gliele', 'glieli', 'glielo', + 'gliene', 'sene', 'mela', 'mele', 'meli', + 'melo', 'mene', 'tela', 'tele', 'teli', + 'telo', 'tene', 'cela', 'cele', 'celi', + 'celo', 'cene', 'vela', 'vele', 'veli', + 'velo', 'vene', 'gli', 'ci', 'la', 'le', + 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi') + __step1_suffixes = ('atrice', 'atrici', 'azione', 'azioni', + 'uzione', 'uzioni', 'usione', 'usioni', + 'amento', 'amenti', 'imento', 'imenti', + 'amente', 'abile', 'abili', 'ibile', 'ibili', + 'mente', 'atore', 'atori', 'logia', 'logie', + 'anza', 'anze', 'iche', 'ichi', 'ismo', + 'ismi', 'ista', 'iste', 'isti', u('ist\xE0'), + u('ist\xE8'), u('ist\xEC'), 'ante', 'anti', + 'enza', 'enze', 'ico', 'ici', 'ica', 'ice', + 'oso', 'osi', 'osa', 'ose', u('it\xE0'), + 'ivo', 'ivi', 'iva', 'ive') + __step2_suffixes = ('erebbero', 'irebbero', 'assero', 'assimo', + 'eranno', 'erebbe', 'eremmo', 'ereste', + 'eresti', 'essero', 'iranno', 'irebbe', + 'iremmo', 'ireste', 'iresti', 'iscano', + 'iscono', 'issero', 'arono', 'avamo', 'avano', + 'avate', 'eremo', 'erete', 'erono', 'evamo', + 'evano', 'evate', 'iremo', 'irete', 'irono', + 'ivamo', 'ivano', 'ivate', 'ammo', 'ando', + 'asse', 'assi', 'emmo', 'enda', 'ende', + 'endi', 'endo', 'erai', 'erei', 'Yamo', + 'iamo', 'immo', 'irai', 'irei', 'isca', + 'isce', 'isci', 'isco', 'ano', 'are', 'ata', + 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', + u('er\xE0'), 'ere', u('er\xF2'), 'ete', 'eva', + 'evi', 'evo', u('ir\xE0'), 'ire', u('ir\xF2'), + 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', + 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', + 'ar', 'ir') + + def stem(self, word): + """ + Stem an Italian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + step1_success = False + + # All acute accents are replaced by grave accents. + word = (word.replace(u("\xE1"), u("\xE0")) + .replace(u("\xE9"), u("\xE8")) + .replace(u("\xED"), u("\xEC")) + .replace(u("\xF3"), u("\xF2")) + .replace(u("\xFA"), u("\xF9"))) + + # Every occurrence of 'u' after 'q' + # is put into upper case. + for i in range(1, len(word)): + if word[i - 1] == "q" and word[i] == "u": + word = "".join((word[:i], "U", word[i + 1:])) + + # Every occurrence of 'u' and 'i' + # between vowels is put into upper case. 
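+        # Upper-casing marks these letters as consonants so that they are
+        # ignored when the R1/R2/RV regions are computed below; they are
+        # lowered back at the very end of stem().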
+ for i in range(1, len(word) - 1): + if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i + 1:])) + elif word[i] == "i": + word = "".join((word[:i], "I", word[i + 1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Attached pronoun + for suffix in self.__step0_suffixes: + if rv.endswith(suffix): + if rv[-len(suffix) - 4:-len(suffix)] in ("ando", "endo"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + elif (rv[-len(suffix) - 2:-len(suffix)] in + ("ar", "er", "ir")): + word = "".join((word[:-len(suffix)], "e")) + r1 = "".join((r1[:-len(suffix)], "e")) + r2 = "".join((r2[:-len(suffix)], "e")) + rv = "".join((rv[:-len(suffix)], "e")) + break + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic")): + word = word[:-2] + rv = rv[:-2] + + elif r2 .endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif (suffix in ("amento", "amenti", + "imento", "imenti") and + rv.endswith(suffix)): + step1_success = True + word = word[:-6] + rv = rv[:-6] + + elif r2.endswith(suffix): + step1_success = True + if suffix in ("azione", "azioni", "atore", "atori"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + + elif suffix in ("logia", "logie"): + word = word[:-2] + rv = word[:-2] + + elif suffix in ("uzione", "uzioni", + "usione", "usioni"): + word = word[:-5] + rv = rv[:-5] + + elif suffix in ("enza", "enze"): + word = "".join((word[:-2], "te")) + rv = "".join((rv[:-2], "te")) + + elif suffix == u("it\xE0"): + word = word[:-3] + r2 = r2[:-3] + rv = rv[:-3] + + if r2.endswith(("ic", "iv")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("ivo", "ivi", "iva", "ive"): + word = word[:-3] + r2 = r2[:-3] + rv = rv[:-3] + + if r2.endswith("at"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 2: Verb suffixes + if not step1_success: + for suffix in self.__step2_suffixes: + if rv.endswith(suffix): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 3a + if rv.endswith(("a", "e", "i", "o", u("\xE0"), u("\xE8"), + u("\xEC"), u("\xF2"))): + word = word[:-1] + rv = rv[:-1] + + if rv.endswith("i"): + word = word[:-1] + rv = rv[:-1] + + # STEP 3b + if rv.endswith(("ch", "gh")): + word = word[:-1] + + word = word.replace("I", "i").replace("U", "u") + return word diff --git a/src/whoosh/lang/snowball/norwegian.py b/src/whoosh/lang/snowball/norwegian.py new file mode 100644 index 0000000..79f872a --- /dev/null +++ b/src/whoosh/lang/snowball/norwegian.py @@ -0,0 +1,84 @@ +from .bases import _ScandinavianStemmer + +from whoosh.compat import u + + +class NorwegianStemmer(_ScandinavianStemmer): + + """ + The Norwegian Snowball stemmer. + + :cvar __vowels: The Norwegian vowels. 
+ :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Norwegian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/norwegian/stemmer.html + + """ + + __vowels = u("aeiouy\xE6\xE5\xF8") + __s_ending = "bcdfghjlmnoprtvyz" + __step1_suffixes = ("hetenes", "hetene", "hetens", "heter", + "heten", "endes", "ande", "ende", "edes", + "enes", "erte", "ede", "ane", "ene", "ens", + "ers", "ets", "het", "ast", "ert", "en", + "ar", "er", "as", "es", "et", "a", "e", "s") + + __step2_suffixes = ("dt", "vt") + + __step3_suffixes = ("hetslov", "eleg", "elig", "elov", "slov", + "leg", "eig", "lig", "els", "lov", "ig") + + def stem(self, word): + """ + Stem a Norwegian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + r1 = self._r1_scandinavian(word, self.__vowels) + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix in ("erte", "ert"): + word = "".join((word[:-len(suffix)], "er")) + r1 = "".join((r1[:-len(suffix)], "er")) + + elif suffix == "s": + if (word[-2] in self.__s_ending or + (word[-2] == "k" and word[-3] not in self.__vowels)): + word = word[:-1] + r1 = r1[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + word = word[:-len(suffix)] + break + + return word diff --git a/src/whoosh/lang/snowball/portugese.py b/src/whoosh/lang/snowball/portugese.py new file mode 100644 index 0000000..c5fe0c3 --- /dev/null +++ b/src/whoosh/lang/snowball/portugese.py @@ -0,0 +1,205 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class PortugueseStemmer(_StandardStemmer): + + """ + The Portuguese Snowball stemmer. + + :cvar __vowels: The Portuguese vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. 
+ :type __step4_suffixes: tuple + :note: A detailed description of the Portuguese + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/portuguese/stemmer.html + + """ + + __vowels = u("aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4") + __step1_suffixes = ('amentos', 'imentos', 'uciones', 'amento', + 'imento', 'adoras', 'adores', u('a\xE7o~es'), + u('log\xEDas'), u('\xEAncias'), 'amente', + 'idades', 'ismos', 'istas', 'adora', + u('a\xE7a~o'), 'antes', u('\xE2ncia'), + u('log\xEDa'), u('uci\xF3n'), u('\xEAncia'), + 'mente', 'idade', 'ezas', 'icos', 'icas', + 'ismo', u('\xE1vel'), u('\xEDvel'), 'ista', + 'osos', 'osas', 'ador', 'ante', 'ivas', + 'ivos', 'iras', 'eza', 'ico', 'ica', + 'oso', 'osa', 'iva', 'ivo', 'ira') + __step2_suffixes = (u('ar\xEDamos'), u('er\xEDamos'), u('ir\xEDamos'), + u('\xE1ssemos'), u('\xEAssemos'), u('\xEDssemos'), + u('ar\xEDeis'), u('er\xEDeis'), u('ir\xEDeis'), + u('\xE1sseis'), u('\xE9sseis'), u('\xEDsseis'), + u('\xE1ramos'), u('\xE9ramos'), u('\xEDramos'), + u('\xE1vamos'), 'aremos', 'eremos', 'iremos', + 'ariam', 'eriam', 'iriam', 'assem', 'essem', + 'issem', 'ara~o', 'era~o', 'ira~o', 'arias', + 'erias', 'irias', 'ardes', 'erdes', 'irdes', + 'asses', 'esses', 'isses', 'astes', 'estes', + 'istes', u('\xE1reis'), 'areis', u('\xE9reis'), + 'ereis', u('\xEDreis'), 'ireis', u('\xE1veis'), + u('\xEDamos'), 'armos', 'ermos', 'irmos', + 'aria', 'eria', 'iria', 'asse', 'esse', + 'isse', 'aste', 'este', 'iste', 'arei', + 'erei', 'irei', 'aram', 'eram', 'iram', + 'avam', 'arem', 'erem', 'irem', + 'ando', 'endo', 'indo', 'adas', 'idas', + u('ar\xE1s'), 'aras', u('er\xE1s'), 'eras', + u('ir\xE1s'), 'avas', 'ares', 'eres', 'ires', + u('\xEDeis'), 'ados', 'idos', u('\xE1mos'), + 'amos', 'emos', 'imos', 'iras', 'ada', 'ida', + u('ar\xE1'), 'ara', u('er\xE1'), 'era', + u('ir\xE1'), 'ava', 'iam', 'ado', 'ido', + 'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am', + 'em', 'ar', 'er', 'ir', 'as', + 'es', 'is', 'eu', 'iu', 'ou') + __step4_suffixes = ("os", "a", "i", "o", u("\xE1"), + u("\xED"), u("\xF3")) + + def stem(self, word): + """ + Stem a Portuguese word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + step1_success = False + step2_success = False + + word = (word.replace(u("\xE3"), "a~") + .replace(u("\xF5"), "o~")) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic", "ad")): + word = word[:-2] + rv = rv[:-2] + + elif (suffix in ("ira", "iras") and rv.endswith(suffix) and + word[-len(suffix) - 1:-len(suffix)] == "e"): + step1_success = True + + word = "".join((word[:-len(suffix)], "ir")) + rv = "".join((rv[:-len(suffix)], "ir")) + + elif r2.endswith(suffix): + step1_success = True + + if suffix in (u("log\xEDa"), u("log\xEDas")): + word = word[:-2] + rv = rv[:-2] + + elif suffix in (u("uci\xF3n"), "uciones"): + word = "".join((word[:-len(suffix)], "u")) + rv = "".join((rv[:-len(suffix)], "u")) + + elif suffix in (u("\xEAncia"), u("\xEAncias")): + word = "".join((word[:-len(suffix)], "ente")) + rv = "".join((rv[:-len(suffix)], "ente")) + + elif suffix == "mente": + word = word[:-5] + r2 = r2[:-5] + rv = rv[:-5] + + if r2.endswith(("ante", "avel", u("\xEDvel"))): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("idade", "idades"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + if r2.endswith(("ic", "iv")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("iva", "ivo", "ivas", "ivos"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 2: Verb suffixes + if not step1_success: + for suffix in self.__step2_suffixes: + if rv.endswith(suffix): + step2_success = True + + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 3 + if step1_success or step2_success: + if rv.endswith("i") and word[-2] == "c": + word = word[:-1] + rv = rv[:-1] + + ### STEP 4: Residual suffix + if not step1_success and not step2_success: + for suffix in self.__step4_suffixes: + if rv.endswith(suffix): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 5 + if rv.endswith(("e", u("\xE9"), u("\xEA"))): + word = word[:-1] + rv = rv[:-1] + + if ((word.endswith("gu") and rv.endswith("u")) or + (word.endswith("ci") and rv.endswith("i"))): + word = word[:-1] + + elif word.endswith(u("\xE7")): + word = "".join((word[:-1], "c")) + + word = word.replace("a~", u("\xE3")).replace("o~", u("\xF5")) + return word diff --git a/src/whoosh/lang/snowball/romanian.py b/src/whoosh/lang/snowball/romanian.py new file mode 100644 index 0000000..2b9e1b7 --- /dev/null +++ b/src/whoosh/lang/snowball/romanian.py @@ -0,0 +1,253 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class RomanianStemmer(_StandardStemmer): + + """ + The Romanian Snowball stemmer. + + :cvar __vowels: The Romanian vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. 
+ :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Romanian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/romanian/stemmer.html + + """ + + __vowels = u("aeiou\u0103\xE2\xEE") + __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor', + 'atei', u('a\u0163ie'), u('a\u0163ia'), 'aua', + 'ele', 'iua', 'iei', 'ile', 'ul', 'ea', + 'ii') + __step1_suffixes = ('abilitate', 'abilitati', u('abilit\u0103\u0163i'), + 'ibilitate', u('abilit\u0103i'), 'ivitate', + 'ivitati', u('ivit\u0103\u0163i'), 'icitate', + 'icitati', u('icit\u0103\u0163i'), 'icatori', + u('ivit\u0103i'), u('icit\u0103i'), 'icator', + u('a\u0163iune'), 'atoare', u('\u0103toare'), + u('i\u0163iune'), 'itoare', 'iciva', 'icive', + 'icivi', u('iciv\u0103'), 'icala', 'icale', + 'icali', u('ical\u0103'), 'ativa', 'ative', + 'ativi', u('ativ\u0103'), 'atori', u('\u0103tori'), + 'itiva', 'itive', 'itivi', u('itiv\u0103'), + 'itori', 'iciv', 'ical', 'ativ', 'ator', + u('\u0103tor'), 'itiv', 'itor') + __step2_suffixes = ('abila', 'abile', 'abili', u('abil\u0103'), + 'ibila', 'ibile', 'ibili', u('ibil\u0103'), + 'atori', 'itate', 'itati', u('it\u0103\u0163i'), + 'abil', 'ibil', 'oasa', u('oas\u0103'), 'oase', + 'anta', 'ante', 'anti', u('ant\u0103'), 'ator', + u('it\u0103i'), 'iune', 'iuni', 'isme', 'ista', + 'iste', 'isti', u('ist\u0103'), u('i\u015Fti'), + 'ata', u('at\u0103'), 'ati', 'ate', 'uta', + u('ut\u0103'), 'uti', 'ute', 'ita', u('it\u0103'), + 'iti', 'ite', 'ica', 'ice', 'ici', u('ic\u0103'), + 'osi', u('o\u015Fi'), 'ant', 'iva', 'ive', 'ivi', + u('iv\u0103'), 'ism', 'ist', 'at', 'ut', 'it', + 'ic', 'os', 'iv') + __step3_suffixes = (u('seser\u0103\u0163i'), u('aser\u0103\u0163i'), + u('iser\u0103\u0163i'), u('\xE2ser\u0103\u0163i'), + u('user\u0103\u0163i'), u('seser\u0103m'), + u('aser\u0103m'), u('iser\u0103m'), u('\xE2ser\u0103m'), + u('user\u0103m'), u('ser\u0103\u0163i'), u('sese\u015Fi'), + u('seser\u0103'), u('easc\u0103'), u('ar\u0103\u0163i'), + u('ur\u0103\u0163i'), u('ir\u0103\u0163i'), + u('\xE2r\u0103\u0163i'), u('ase\u015Fi'), + u('aser\u0103'), u('ise\u015Fi'), u('iser\u0103'), + u('\xe2se\u015Fi'), u('\xE2ser\u0103'), + u('use\u015Fi'), u('user\u0103'), u('ser\u0103m'), + 'sesem', 'indu', '\xE2ndu', u('eaz\u0103'), + u('e\u015Fti'), u('e\u015Fte'), u('\u0103\u015Fti'), + u('\u0103\u015Fte'), u('ea\u0163i'), u('ia\u0163i'), + u('ar\u0103m'), u('ur\u0103m'), u('ir\u0103m'), + u('\xE2r\u0103m'), 'asem', 'isem', + '\xE2sem', 'usem', u('se\u015Fi'), u('ser\u0103'), + 'sese', 'are', 'ere', 'ire', '\xE2re', + 'ind', '\xE2nd', 'eze', 'ezi', 'esc', + u('\u0103sc'), 'eam', 'eai', 'eau', 'iam', + 'iai', 'iau', u('a\u015Fi'), u('ar\u0103'), + u('u\u015Fi'), u('ur\u0103'), u('i\u015Fi'), u('ir\u0103'), + u('\xE2\u015Fi'), u('\xe2r\u0103'), 'ase', + 'ise', '\xE2se', 'use', u('a\u0163i'), + u('e\u0163i'), u('i\u0163i'), u('\xe2\u0163i'), 'sei', + 'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui', + '\xE2i', u('\u0103m'), 'em', 'im', '\xE2m', + 'se') + + def stem(self, word): + """ + Stem a Romanian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + step1_success = False + step2_success = False + + for i in range(1, len(word) - 1): + if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i + 1:])) + + elif word[i] == "i": + word = "".join((word[:i], "I", word[i + 1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Removal of plurals and other simplifications + for suffix in self.__step0_suffixes: + if word.endswith(suffix): + if suffix in r1: + if suffix in ("ul", "ului"): + word = word[:-len(suffix)] + + if suffix in rv: + rv = rv[:-len(suffix)] + else: + rv = "" + + elif (suffix == "aua" or suffix == "atei" or + (suffix == "ile" and word[-5:-3] != "ab")): + word = word[:-2] + + elif suffix in ("ea", "ele", "elor"): + word = "".join((word[:-len(suffix)], "e")) + + if suffix in rv: + rv = "".join((rv[:-len(suffix)], "e")) + else: + rv = "" + + elif suffix in ("ii", "iua", "iei", + "iile", "iilor", "ilor"): + word = "".join((word[:-len(suffix)], "i")) + + if suffix in rv: + rv = "".join((rv[:-len(suffix)], "i")) + else: + rv = "" + + elif suffix in ("a\u0163ie", "a\u0163ia"): + word = word[:-1] + break + + # STEP 1: Reduction of combining suffixes + while True: + + replacement_done = False + + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix in r1: + step1_success = True + replacement_done = True + + if suffix in ("abilitate", "abilitati", + "abilit\u0103i", + "abilit\u0103\u0163i"): + word = "".join((word[:-len(suffix)], "abil")) + + elif suffix == "ibilitate": + word = word[:-5] + + elif suffix in ("ivitate", "ivitati", + "ivit\u0103i", + "ivit\u0103\u0163i"): + word = "".join((word[:-len(suffix)], "iv")) + + elif suffix in ("icitate", "icitati", "icit\u0103i", + "icit\u0103\u0163i", "icator", + "icatori", "iciv", "iciva", + "icive", "icivi", "iciv\u0103", + "ical", "icala", "icale", "icali", + "ical\u0103"): + word = "".join((word[:-len(suffix)], "ic")) + + elif suffix in ("ativ", "ativa", "ative", "ativi", + "ativ\u0103", "a\u0163iune", + "atoare", "ator", "atori", + "\u0103toare", + "\u0103tor", "\u0103tori"): + word = "".join((word[:-len(suffix)], "at")) + + if suffix in r2: + r2 = "".join((r2[:-len(suffix)], "at")) + + elif suffix in ("itiv", "itiva", "itive", "itivi", + "itiv\u0103", "i\u0163iune", + "itoare", "itor", "itori"): + word = "".join((word[:-len(suffix)], "it")) + + if suffix in r2: + r2 = "".join((r2[:-len(suffix)], "it")) + else: + step1_success = False + break + + if not replacement_done: + break + + # STEP 2: Removal of standard suffixes + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if suffix in r2: + step2_success = True + + if suffix in ("iune", "iuni"): + if word[-5] == "\u0163": + word = "".join((word[:-5], "t")) + + elif suffix in ("ism", "isme", "ist", "ista", "iste", + "isti", "ist\u0103", "i\u015Fti"): + word = "".join((word[:-len(suffix)], "ist")) + + else: + word = word[:-len(suffix)] + break + + # STEP 3: Removal of verb suffixes + if not step1_success and not step2_success: + for suffix in self.__step3_suffixes: + if word.endswith(suffix): + if suffix in rv: + if suffix in (u('seser\u0103\u0163i'), u('seser\u0103m'), + u('ser\u0103\u0163i'), u('sese\u015Fi'), + u('seser\u0103'), u('ser\u0103m'), 'sesem', + u('se\u015Fi'), u('ser\u0103'), 'sese', + u('a\u0163i'), u('e\u0163i'), u('i\u0163i'), + u('\xE2\u0163i'), 'sei', u('\u0103m'), + 'em', 'im', '\xE2m', 'se'): + 
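+                            # These endings are stripped outright; the
+                            # remaining verb suffixes (else branch below)
+                            # are removed only when RV does not start with
+                            # the suffix and the letter before it is a
+                            # consonant or 'u'.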
word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + else: + if (not rv.startswith(suffix) and + rv[rv.index(suffix) - 1] not in + "aeio\u0103\xE2\xEE"): + word = word[:-len(suffix)] + break + + # STEP 4: Removal of final vowel + for suffix in ("ie", "a", "e", "i", "\u0103"): + if word.endswith(suffix): + if suffix in rv: + word = word[:-len(suffix)] + break + + word = word.replace("I", "i").replace("U", "u") + return word diff --git a/src/whoosh/lang/snowball/russian.py b/src/whoosh/lang/snowball/russian.py new file mode 100644 index 0000000..80cf0dc --- /dev/null +++ b/src/whoosh/lang/snowball/russian.py @@ -0,0 +1,422 @@ +from whoosh.compat import u + +class RussianStemmer(object): + """ + The Russian Snowball stemmer. + + :cvar __perfective_gerund_suffixes: Suffixes to be deleted. + :type __perfective_gerund_suffixes: tuple + :cvar __adjectival_suffixes: Suffixes to be deleted. + :type __adjectival_suffixes: tuple + :cvar __reflexive_suffixes: Suffixes to be deleted. + :type __reflexive_suffixes: tuple + :cvar __verb_suffixes: Suffixes to be deleted. + :type __verb_suffixes: tuple + :cvar __noun_suffixes: Suffixes to be deleted. + :type __noun_suffixes: tuple + :cvar __superlative_suffixes: Suffixes to be deleted. + :type __superlative_suffixes: tuple + :cvar __derivational_suffixes: Suffixes to be deleted. + :type __derivational_suffixes: tuple + :note: A detailed description of the Russian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/russian/stemmer.html + + """ + + __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'", + "ivshi", "yvshi", "vshi", "iv", + "yv", "v") + __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a', + 'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego', + 'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu', + 'ui^ushchikh', 'ui^ushchykh', + 'ui^ushchui^u', 'ui^ushchaia', + 'ui^ushchoi^u', 'ui^ushchei^u', + 'i^ushchi^ui^u', 'i^ushchi^ai^a', + 'ui^ushchee', 'ui^ushchie', + 'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`', + 'ui^ushchii`', 'ui^ushchyi`', + 'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim', + 'ui^ushchym', 'ui^ushchom', 'i^ushchimi', + 'i^ushchymi', 'i^ushchego', 'i^ushchogo', + 'i^ushchemu', 'i^ushchomu', 'i^ushchikh', + 'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a', + 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee', + 'i^ushchie', 'i^ushchye', 'i^ushchoe', + 'i^ushchei`', 'i^ushchii`', + 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem', + 'i^ushchim', 'i^ushchym', 'i^ushchom', + 'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u', + 'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a', + 'shchimi', 'shchymi', 'shchego', 'shchogo', + 'shchemu', 'shchomu', 'shchikh', 'shchykh', + 'shchui^u', 'shchai^a', 'shchoi^u', + 'shchei^u', 'ivshimi', 'ivshymi', + 'ivshego', 'ivshogo', 'ivshemu', 'ivshomu', + 'ivshikh', 'ivshykh', 'ivshui^u', + 'ivshai^a', 'ivshoi^u', 'ivshei^u', + 'yvshimi', 'yvshymi', 'yvshego', 'yvshogo', + 'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh', + 'yvshui^u', 'yvshai^a', 'yvshoi^u', + 'yvshei^u', 'vshi^ui^u', 'vshi^ai^a', + 'shchee', 'shchie', 'shchye', 'shchoe', + 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', + 'shchem', 'shchim', 'shchym', 'shchom', + 'ivshee', 'ivshie', 'ivshye', 'ivshoe', + 'ivshei`', 'ivshii`', 'ivshyi`', + 'ivshoi`', 'ivshem', 'ivshim', 'ivshym', + 'ivshom', 'yvshee', 'yvshie', 'yvshye', + 'yvshoe', 'yvshei`', 'yvshii`', + 'yvshyi`', 'yvshoi`', 'yvshem', + 'yvshim', 'yvshym', 'yvshom', 'vshimi', + 'vshymi', 'vshego', 'vshogo', 'vshemu', + 'vshomu', 'vshikh', 'vshykh', 'vshui^u', + 'vshai^a', 'vshoi^u', 'vshei^u', + 'emi^ui^u', 
'emi^ai^a', 'nni^ui^u', + 'nni^ai^a', 'vshee', + 'vshie', 'vshye', 'vshoe', 'vshei`', + 'vshii`', 'vshyi`', 'vshoi`', + 'vshem', 'vshim', 'vshym', 'vshom', + 'emimi', 'emymi', 'emego', 'emogo', + 'ememu', 'emomu', 'emikh', 'emykh', + 'emui^u', 'emai^a', 'emoi^u', 'emei^u', + 'nnimi', 'nnymi', 'nnego', 'nnogo', + 'nnemu', 'nnomu', 'nnikh', 'nnykh', + 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', + 'emee', 'emie', 'emye', 'emoe', + 'emei`', 'emii`', 'emyi`', + 'emoi`', 'emem', 'emim', 'emym', + 'emom', 'nnee', 'nnie', 'nnye', 'nnoe', + 'nnei`', 'nnii`', 'nnyi`', + 'nnoi`', 'nnem', 'nnim', 'nnym', + 'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi', + 'ego', 'ogo', 'emu', 'omu', 'ikh', + 'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u', + 'ee', 'ie', 'ye', 'oe', 'ei`', + 'ii`', 'yi`', 'oi`', 'em', + 'im', 'ym', 'om') + __reflexive_suffixes = ("si^a", "s'") + __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut', + "ish'", 'ete', 'i`te', 'i^ut', 'nno', + 'ila', 'yla', 'ena', 'ite', 'ili', 'yli', + 'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny', + "it'", "yt'", 'ui^u', 'la', 'na', 'li', + 'em', 'lo', 'no', 'et', 'ny', "t'", + 'ei`', 'ui`', 'il', 'yl', 'im', + 'ym', 'en', 'it', 'yt', 'i^u', 'i`', + 'l', 'n') + __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh', + 'ami', 'iei`', 'i^am', 'iem', 'akh', + 'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov', + 'ie', "'e", 'ei', 'ii', 'ei`', + 'oi`', 'ii`', 'em', 'am', 'om', + 'i^u', 'i^a', 'a', 'e', 'i', 'i`', + 'o', 'u', 'y', "'") + __superlative_suffixes = ("ei`she", "ei`sh") + __derivational_suffixes = ("ost'", "ost") + + def stem(self, word): + """ + Stem a Russian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + chr_exceeded = False + for i in range(len(word)): + if ord(word[i]) > 255: + chr_exceeded = True + break + + if chr_exceeded: + word = self.__cyrillic_to_roman(word) + + step1_success = False + adjectival_removed = False + verb_removed = False + undouble_success = False + superlative_removed = False + + rv, r2 = self.__regions_russian(word) + + # Step 1 + for suffix in self.__perfective_gerund_suffixes: + if rv.endswith(suffix): + if suffix in ("v", "vshi", "vshis'"): + if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or + rv[-len(suffix) - 1:-len(suffix)] == "a"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + step1_success = True + break + else: + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + step1_success = True + break + + if not step1_success: + for suffix in self.__reflexive_suffixes: + if rv.endswith(suffix): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + for suffix in self.__adjectival_suffixes: + if rv.endswith(suffix): + if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a', + 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u', + 'i^ushchei^u', 'i^ushchimi', 'i^ushchymi', + 'i^ushchego', 'i^ushchogo', 'i^ushchemu', + 'i^ushchomu', 'i^ushchikh', 'i^ushchykh', + 'shchi^ui^u', 'shchi^ai^a', 'i^ushchee', + 'i^ushchie', 'i^ushchye', 'i^ushchoe', + 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`', + 'i^ushchoi`', 'i^ushchem', 'i^ushchim', + 'i^ushchym', 'i^ushchom', 'vshi^ui^u', + 'vshi^ai^a', 'shchui^u', 'shchai^a', + 'shchoi^u', 'shchei^u', 'emi^ui^u', + 'emi^ai^a', 'nni^ui^u', 'nni^ai^a', + 'shchimi', 'shchymi', 'shchego', 'shchogo', + 'shchemu', 'shchomu', 'shchikh', 'shchykh', + 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u', + 'shchee', 'shchie', 'shchye', 
'shchoe', + 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', + 'shchem', 'shchim', 'shchym', 'shchom', + 'vshimi', 'vshymi', 'vshego', 'vshogo', + 'vshemu', 'vshomu', 'vshikh', 'vshykh', + 'emui^u', 'emai^a', 'emoi^u', 'emei^u', + 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', + 'vshee', 'vshie', 'vshye', 'vshoe', + 'vshei`', 'vshii`', 'vshyi`', 'vshoi`', + 'vshem', 'vshim', 'vshym', 'vshom', + 'emimi', 'emymi', 'emego', 'emogo', + 'ememu', 'emomu', 'emikh', 'emykh', + 'nnimi', 'nnymi', 'nnego', 'nnogo', + 'nnemu', 'nnomu', 'nnikh', 'nnykh', + 'emee', 'emie', 'emye', 'emoe', 'emei`', + 'emii`', 'emyi`', 'emoi`', 'emem', 'emim', + 'emym', 'emom', 'nnee', 'nnie', 'nnye', + 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`', + 'nnem', 'nnim', 'nnym', 'nnom'): + if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or + rv[-len(suffix) - 1:-len(suffix)] == "a"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + adjectival_removed = True + break + else: + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + adjectival_removed = True + break + + if not adjectival_removed: + for suffix in self.__verb_suffixes: + if rv.endswith(suffix): + if suffix in ("la", "na", "ete", "i`te", "li", + "i`", "l", "em", "n", "lo", "no", + "et", "i^ut", "ny", "t'", "esh'", + "nno"): + if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or + rv[-len(suffix) - 1:-len(suffix)] == "a"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + verb_removed = True + break + else: + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + verb_removed = True + break + + if not adjectival_removed and not verb_removed: + for suffix in self.__noun_suffixes: + if rv.endswith(suffix): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # Step 2 + if rv.endswith("i"): + word = word[:-1] + r2 = r2[:-1] + + # Step 3 + for suffix in self.__derivational_suffixes: + if r2.endswith(suffix): + word = word[:-len(suffix)] + break + + # Step 4 + if word.endswith("nn"): + word = word[:-1] + undouble_success = True + + if not undouble_success: + for suffix in self.__superlative_suffixes: + if word.endswith(suffix): + word = word[:-len(suffix)] + superlative_removed = True + break + if word.endswith("nn"): + word = word[:-1] + + if not undouble_success and not superlative_removed: + if word.endswith("'"): + word = word[:-1] + + if chr_exceeded: + word = self.__roman_to_cyrillic(word) + return word + + def __regions_russian(self, word): + """ + Return the regions RV and R2 which are used by the Russian stemmer. + + In any word, RV is the region after the first vowel, + or the end of the word if it contains no vowel. + + R2 is the region after the first non-vowel following + a vowel in R1, or the end of the word if there is no such non-vowel. + + R1 is the region after the first non-vowel following a vowel, + or the end of the word if there is no such non-vowel. + + :param word: The Russian word whose regions RV and R2 are determined. + :type word: str or unicode + :return: the regions RV and R2 for the respective Russian word. + :rtype: tuple + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! 
+ + """ + r1 = "" + r2 = "" + rv = "" + + vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y") + word = (word.replace("i^a", "A") + .replace("i^u", "U") + .replace("e`", "E")) + + for i in range(1, len(word)): + if word[i] not in vowels and word[i - 1] in vowels: + r1 = word[i + 1:] + break + + for i in range(1, len(r1)): + if r1[i] not in vowels and r1[i - 1] in vowels: + r2 = r1[i + 1:] + break + + for i in range(len(word)): + if word[i] in vowels: + rv = word[i + 1:] + break + + r2 = (r2.replace("A", "i^a") + .replace("U", "i^u") + .replace("E", "e`")) + rv = (rv.replace("A", "i^a") + .replace("U", "i^u") + .replace("E", "e`")) + return (rv, r2) + + def __cyrillic_to_roman(self, word): + """ + Transliterate a Russian word into the Roman alphabet. + + A Russian word whose letters consist of the Cyrillic + alphabet are transliterated into the Roman alphabet + in order to ease the forthcoming stemming process. + + :param word: The word that is transliterated. + :type word: unicode + :return: the transliterated word. + :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! + + """ + word = (word.replace(u("\u0410"), "a").replace(u("\u0430"), "a") + .replace(u("\u0411"), "b").replace(u("\u0431"), "b") + .replace(u("\u0412"), "v").replace(u("\u0432"), "v") + .replace(u("\u0413"), "g").replace(u("\u0433"), "g") + .replace(u("\u0414"), "d").replace(u("\u0434"), "d") + .replace(u("\u0415"), "e").replace(u("\u0435"), "e") + .replace(u("\u0401"), "e").replace(u("\u0451"), "e") + .replace(u("\u0416"), "zh").replace(u("\u0436"), "zh") + .replace(u("\u0417"), "z").replace(u("\u0437"), "z") + .replace(u("\u0418"), "i").replace(u("\u0438"), "i") + .replace(u("\u0419"), "i`").replace(u("\u0439"), "i`") + .replace(u("\u041A"), "k").replace(u("\u043A"), "k") + .replace(u("\u041B"), "l").replace(u("\u043B"), "l") + .replace(u("\u041C"), "m").replace(u("\u043C"), "m") + .replace(u("\u041D"), "n").replace(u("\u043D"), "n") + .replace(u("\u041E"), "o").replace(u("\u043E"), "o") + .replace(u("\u041F"), "p").replace(u("\u043F"), "p") + .replace(u("\u0420"), "r").replace(u("\u0440"), "r") + .replace(u("\u0421"), "s").replace(u("\u0441"), "s") + .replace(u("\u0422"), "t").replace(u("\u0442"), "t") + .replace(u("\u0423"), "u").replace(u("\u0443"), "u") + .replace(u("\u0424"), "f").replace(u("\u0444"), "f") + .replace(u("\u0425"), "kh").replace(u("\u0445"), "kh") + .replace(u("\u0426"), "t^s").replace(u("\u0446"), "t^s") + .replace(u("\u0427"), "ch").replace(u("\u0447"), "ch") + .replace(u("\u0428"), "sh").replace(u("\u0448"), "sh") + .replace(u("\u0429"), "shch").replace(u("\u0449"), "shch") + .replace(u("\u042A"), "''").replace(u("\u044A"), "''") + .replace(u("\u042B"), "y").replace(u("\u044B"), "y") + .replace(u("\u042C"), "'").replace(u("\u044C"), "'") + .replace(u("\u042D"), "e`").replace(u("\u044D"), "e`") + .replace(u("\u042E"), "i^u").replace(u("\u044E"), "i^u") + .replace(u("\u042F"), "i^a").replace(u("\u044F"), "i^a")) + return word + + def __roman_to_cyrillic(self, word): + """ + Transliterate a Russian word back into the Cyrillic alphabet. + + A Russian word formerly transliterated into the Roman alphabet + in order to ease the stemming process, is transliterated back + into the Cyrillic alphabet, its original form. + + :param word: The word that is transliterated. + :type word: str or unicode + :return: word, the transliterated word. 
+ :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! + + """ + word = (word.replace("i^u", u("\u044E")).replace("i^a", u("\u044F")) + .replace("shch", u("\u0449")).replace("kh", u("\u0445")) + .replace("t^s", u("\u0446")).replace("ch", u("\u0447")) + .replace("e`", u("\u044D")).replace("i`", u("\u0439")) + .replace("sh", u("\u0448")).replace("k", u("\u043A")) + .replace("e", u("\u0435")).replace("zh", u("\u0436")) + .replace("a", u("\u0430")).replace("b", u("\u0431")) + .replace("v", u("\u0432")).replace("g", u("\u0433")) + .replace("d", u("\u0434")).replace("e", u("\u0435")) + .replace("z", u("\u0437")).replace("i", u("\u0438")) + .replace("l", u("\u043B")).replace("m", u("\u043C")) + .replace("n", u("\u043D")).replace("o", u("\u043E")) + .replace("p", u("\u043F")).replace("r", u("\u0440")) + .replace("s", u("\u0441")).replace("t", u("\u0442")) + .replace("u", u("\u0443")).replace("f", u("\u0444")) + .replace("''", u("\u044A")).replace("y", u("\u044B")) + .replace("'", u("\u044C"))) + return word diff --git a/src/whoosh/lang/snowball/spanish.py b/src/whoosh/lang/snowball/spanish.py new file mode 100644 index 0000000..8a3c5fb --- /dev/null +++ b/src/whoosh/lang/snowball/spanish.py @@ -0,0 +1,248 @@ +from .bases import _StandardStemmer + +from whoosh.compat import u + + +class SpanishStemmer(_StandardStemmer): + + """ + The Spanish Snowball stemmer. + + :cvar __vowels: The Spanish vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. + :type __step2a_suffixes: tuple + :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. + :type __step2b_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
+ :type __step3_suffixes: tuple + :note: A detailed description of the Spanish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/spanish/stemmer.html + + """ + + __vowels = u("aeiou\xE1\xE9\xED\xF3\xFA\xFC") + __step0_suffixes = ("selas", "selos", "sela", "selo", "las", + "les", "los", "nos", "me", "se", "la", "le", + "lo") + __step1_suffixes = ('amientos', 'imientos', 'amiento', 'imiento', + 'aciones', 'uciones', 'adoras', 'adores', + 'ancias', u('log\xEDas'), 'encias', 'amente', + 'idades', 'anzas', 'ismos', 'ables', 'ibles', + 'istas', 'adora', u('aci\xF3n'), 'antes', + 'ancia', u('log\xEDa'), u('uci\xf3n'), 'encia', + 'mente', 'anza', 'icos', 'icas', 'ismo', + 'able', 'ible', 'ista', 'osos', 'osas', + 'ador', 'ante', 'idad', 'ivas', 'ivos', + 'ico', + 'ica', 'oso', 'osa', 'iva', 'ivo') + __step2a_suffixes = ('yeron', 'yendo', 'yamos', 'yais', 'yan', + 'yen', 'yas', 'yes', 'ya', 'ye', 'yo', + u('y\xF3')) + __step2b_suffixes = (u('ar\xEDamos'), u('er\xEDamos'), u('ir\xEDamos'), + u('i\xE9ramos'), u('i\xE9semos'), u('ar\xEDais'), + 'aremos', u('er\xEDais'), 'eremos', + u('ir\xEDais'), 'iremos', 'ierais', 'ieseis', + 'asteis', 'isteis', u('\xE1bamos'), + u('\xE1ramos'), u('\xE1semos'), u('ar\xEDan'), + u('ar\xEDas'), u('ar\xE9is'), u('er\xEDan'), + u('er\xEDas'), u('er\xE9is'), u('ir\xEDan'), + u('ir\xEDas'), u('ir\xE9is'), + 'ieran', 'iesen', 'ieron', 'iendo', 'ieras', + 'ieses', 'abais', 'arais', 'aseis', + u('\xE9amos'), u('ar\xE1n'), u('ar\xE1s'), + u('ar\xEDa'), u('er\xE1n'), u('er\xE1s'), + u('er\xEDa'), u('ir\xE1n'), u('ir\xE1s'), + u('ir\xEDa'), 'iera', 'iese', 'aste', 'iste', + 'aban', 'aran', 'asen', 'aron', 'ando', + 'abas', 'adas', 'idas', 'aras', 'ases', + u('\xEDais'), 'ados', 'idos', 'amos', 'imos', + 'emos', u('ar\xE1'), u('ar\xE9'), u('er\xE1'), + u('er\xE9'), u('ir\xE1'), u('ir\xE9'), 'aba', + 'ada', 'ida', 'ara', 'ase', u('\xEDan'), + 'ado', 'ido', u('\xEDas'), u('\xE1is'), + u('\xE9is'), u('\xEDa'), 'ad', 'ed', 'id', + 'an', u('i\xF3'), 'ar', 'er', 'ir', 'as', + u('\xEDs'), 'en', 'es') + __step3_suffixes = ("os", "a", "e", "o", u("\xE1"), + u("\xE9"), u("\xED"), u("\xF3")) + + def stem(self, word): + """ + Stem a Spanish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + step1_success = False + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Attached pronoun + for suffix in self.__step0_suffixes: + if word.endswith(suffix): + if rv.endswith(suffix): + if rv[:-len(suffix)].endswith((u("i\xE9ndo"), + u("\xE1ndo"), + u("\xE1r"), u("\xE9r"), + u("\xEDr"))): + word = (word[:-len(suffix)].replace(u("\xE1"), "a") + .replace(u("\xE9"), "e") + .replace(u("\xED"), "i")) + r1 = (r1[:-len(suffix)].replace(u("\xE1"), "a") + .replace(u("\xE9"), "e") + .replace(u("\xED"), "i")) + r2 = (r2[:-len(suffix)].replace(u("\xE1"), "a") + .replace(u("\xE9"), "e") + .replace(u("\xED"), "i")) + rv = (rv[:-len(suffix)].replace(u("\xE1"), "a") + .replace(u("\xE9"), "e") + .replace(u("\xED"), "i")) + + elif rv[:-len(suffix)].endswith(("ando", "iendo", + "ar", "er", "ir")): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + elif (rv[:-len(suffix)].endswith("yendo") and + word[:-len(suffix)].endswith("uyendo")): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic", "ad")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(suffix): + step1_success = True + if suffix in ("adora", "ador", u("aci\xF3n"), "adoras", + "adores", "aciones", "ante", "antes", + "ancia", "ancias"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + + elif suffix in (u("log\xEDa"), u("log\xEDas")): + word = word.replace(suffix, "log") + rv = rv.replace(suffix, "log") + + elif suffix in (u("uci\xF3n"), "uciones"): + word = word.replace(suffix, "u") + rv = rv.replace(suffix, "u") + + elif suffix in ("encia", "encias"): + word = word.replace(suffix, "ente") + rv = rv.replace(suffix, "ente") + + elif suffix == "mente": + word = word[:-5] + r2 = r2[:-5] + rv = rv[:-5] + + if r2.endswith(("ante", "able", "ible")): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("idad", "idades"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + for pre_suff in ("abil", "ic", "iv"): + if r2.endswith(pre_suff): + word = word[:-len(pre_suff)] + rv = rv[:-len(pre_suff)] + + elif suffix in ("ivo", "iva", "ivos", "ivas"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 2a: Verb suffixes beginning 'y' + if not step1_success: + for suffix in self.__step2a_suffixes: + if (rv.endswith(suffix) and + word[-len(suffix) - 1:-len(suffix)] == "u"): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 2b: Other verb suffixes + for suffix in self.__step2b_suffixes: + if rv.endswith(suffix): + if suffix in ("en", "es", u("\xE9is"), "emos"): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + + if word.endswith("gu"): + word = word[:-1] + + if rv.endswith("gu"): + rv = rv[:-1] + else: + word = word[:-len(suffix)] + rv = 
rv[:-len(suffix)] + break + + # STEP 3: Residual suffix + for suffix in self.__step3_suffixes: + if rv.endswith(suffix): + if suffix in ("e", u("\xE9")): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + + if word[-2:] == "gu" and rv[-1] == "u": + word = word[:-1] + else: + word = word[:-len(suffix)] + break + + word = (word.replace(u("\xE1"), "a").replace(u("\xE9"), "e") + .replace(u("\xED"), "i").replace(u("\xF3"), "o") + .replace(u("\xFA"), "u")) + return word diff --git a/src/whoosh/lang/snowball/swedish.py b/src/whoosh/lang/snowball/swedish.py new file mode 100644 index 0000000..037e273 --- /dev/null +++ b/src/whoosh/lang/snowball/swedish.py @@ -0,0 +1,80 @@ +from .bases import _ScandinavianStemmer + +from whoosh.compat import u + + +class SwedishStemmer(_ScandinavianStemmer): + + """ + The Swedish Snowball stemmer. + + :cvar __vowels: The Swedish vowels. + :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Swedish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/swedish/stemmer.html + """ + + __vowels = u("aeiouy\xE4\xE5\xF6") + __s_ending = "bcdfghjklmnoprtvy" + __step1_suffixes = ("heterna", "hetens", "heter", "heten", + "anden", "arnas", "ernas", "ornas", "andes", + "andet", "arens", "arna", "erna", "orna", + "ande", "arne", "aste", "aren", "ades", + "erns", "ade", "are", "ern", "ens", "het", + "ast", "ad", "en", "ar", "er", "or", "as", + "es", "at", "a", "e", "s") + __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") + __step3_suffixes = ("fullt", u("l\xF6st"), "els", "lig", "ig") + + def stem(self, word): + """ + Stem a Swedish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + r1 = self._r1_scandinavian(word, self.__vowels) + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix in ("els", "lig", "ig"): + word = word[:-len(suffix)] + elif suffix in ("fullt", u("l\xF6st")): + word = word[:-1] + break + + return word diff --git a/src/whoosh/lang/stopwords.py b/src/whoosh/lang/stopwords.py new file mode 100644 index 0000000..8fc1703 --- /dev/null +++ b/src/whoosh/lang/stopwords.py @@ -0,0 +1,285 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +# Stopwords Corpus +# +# This module contains lists of stop words for several languages. These +# are high-frequency grammatical words which are usually ignored in text +# retrieval applications. 
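+#
+# The lists are exposed below as the module-level ``stoplists`` dict,
+# keyed by two-letter language code, each value being a frozenset of
+# words.  A minimal usage sketch:
+#
+#     from whoosh.lang.stopwords import stoplists
+#     "the" in stoplists["en"]    # -> True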
+# +# They were obtained from: +# anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ + + +# ===== +# This module was generated from the original files using the following script + +#import os.path +#import textwrap +# +#names = os.listdir("stopwords") +#for name in names: +# f = open("stopwords/" + name) +# wordls = [line.strip() for line in f] +# words = " ".join(wordls) +# print '"%s": frozenset(u"""' % name +# print textwrap.fill(words, 72) +# print '""".split())' +# print + + +stoplists = { + "da": frozenset(""" + og i jeg det at en den til er som på de med han af for ikke der var mig + sig men et har om vi min havde ham hun nu over da fra du ud sin dem os + op man hans hvor eller hvad skal selv her alle vil blev kunne ind når + være dog noget ville jo deres efter ned skulle denne end dette mit + også under have dig anden hende mine alt meget sit sine vor mod disse + hvis din nogle hos blive mange ad bliver hendes været thi jer sådan + """.split()), + + "nl": frozenset(""" + de en van ik te dat die in een hij het niet zijn is was op aan met als + voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich + bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u + want nog zal me zij nu ge geen omdat iets worden toch al waren veel meer + doen toen moet ben zonder kan hun dus alles onder ja eens hier wie werd + altijd doch wordt wezen kunnen ons zelf tegen na reeds wil kon niets uw + iemand geweest andere + """.split()), + + "en": frozenset(""" + i me my myself we our ours ourselves you your yours yourself yourselves + he him his himself she her hers herself it its itself they them their + theirs themselves what which who whom this that these those am is are + was were be been being have has had having do does did doing a an the + and but if or because as until while of at by for with about against + between into through during before after above below to from up down in + out on off over under again further then once here there when where why + how all any both each few more most other some such no nor not only own + same so than too very s t can will just don should now + """.split()), + + "fi": frozenset(""" + olla olen olet on olemme olette ovat ole oli olisi olisit olisin + olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet + en et ei emme ette eivät minä minun minut minua minussa minusta minuun + minulla minulta minulle sinä sinun sinut sinua sinussa sinusta sinuun + sinulla sinulta sinulle hän hänen hänet häntä hänessä hänestä + häneen hänellä häneltä hänelle me meidän meidät meitä meissä + meistä meihin meillä meiltä meille te teidän teidät teitä teissä + teistä teihin teillä teiltä teille he heidän heidät heitä heissä + heistä heihin heillä heiltä heille tämä tämän tätä tässä + tästä tähän tallä tältä tälle tänä täksi tuo tuon tuotä + tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi se sen sitä + siinä siitä siihen sillä siltä sille sinä siksi nämä näiden + näitä näissä näistä näihin näillä näiltä näille näinä + näiksi nuo noiden noita noissa noista noihin noilla noilta noille noina + noiksi ne niiden niitä niissä niistä niihin niillä niiltä niille + niinä niiksi kuka kenen kenet ketä kenessä kenestä keneen kenellä + keneltä kenelle kenenä keneksi ketkä keiden ketkä keitä keissä + keistä keihin keillä keiltä keille keinä keiksi mikä minkä minkä + mitä missä mistä mihin millä miltä mille minä miksi mitkä joka + jonka jota jossa josta johon jolla jolta jolle jona joksi jotka joiden + joita joissa joista 
joihin joilla joilta joille joina joiksi että ja + jos koska kuin mutta niin sekä sillä tai vaan vai vaikka kanssa mukaan + noin poikki yli kun niin nyt itse + """.split()), + + "fr": frozenset(""" + au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma + mais me même mes moi mon ne nos notre nous on ou par pas pour qu que + qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l + à m n s t y été étée étées étés étant étante étants étantes + suis es est sommes êtes sont serai seras sera serons serez seront + serais serait serions seriez seraient étais était étions étiez + étaient fus fut fûmes fûtes furent sois soit soyons soyez soient + fusse fusses fût fussions fussiez fussent ayant ayante ayantes ayants + eu eue eues eus ai as avons avez ont aurai auras aura aurons aurez + auront aurais aurait aurions auriez auraient avais avait avions aviez + avaient eut eûmes eûtes eurent aie aies ait ayons ayez aient eusse + eusses eût eussions eussiez eussent + """.split()), + + "de": frozenset(""" + aber alle allem allen aller alles als also am an ander andere anderem + anderen anderer anderes anderm andern anderr anders auch auf aus bei bin + bis bist da damit dann der den des dem die das daß derselbe derselben + denselben desselben demselben dieselbe dieselben dasselbe dazu dein + deine deinem deinen deiner deines denn derer dessen dich dir du dies + diese diesem diesen dieser dieses doch dort durch ein eine einem einen + einer eines einig einige einigem einigen einiger einiges einmal er ihn + ihm es etwas euer eure eurem euren eurer eures für gegen gewesen hab + habe haben hat hatte hatten hier hin hinter ich mich mir ihr ihre ihrem + ihren ihrer ihres euch im in indem ins ist jede jedem jeden jeder jedes + jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner + keines können könnte machen man manche manchem manchen mancher manches + mein meine meinem meinen meiner meines mit muss musste nach nicht nichts + noch nun nur ob oder ohne sehr sein seine seinem seinen seiner seines + selbst sich sie ihnen sind so solche solchem solchen solcher solches + soll sollte sondern sonst über um und uns unse unsem unsen unser unses + unter viel vom von vor während war waren warst was weg weil weiter + welche welchem welchen welcher welches wenn werde werden wie wieder will + wir wird wirst wo wollen wollte würde würden zu zum zur zwar zwischen + """.split()), + + "hu": frozenset(""" + a ahogy ahol aki akik akkor alatt által általában amely amelyek + amelyekben amelyeket amelyet amelynek ami amit amolyan amíg amikor át + abban ahhoz annak arra arról az azok azon azt azzal azért aztán + azután azonban bár be belül benne cikk cikkek cikkeket csak de e + eddig egész egy egyes egyetlen egyéb egyik egyre ekkor el elég ellen + elõ elõször elõtt elsõ én éppen ebben ehhez emilyen ennek erre ez + ezt ezek ezen ezzel ezért és fel felé hanem hiszen hogy hogyan igen + így illetve ill. 
ill ilyen ilyenkor ison ismét itt jó jól jobban + kell kellett keresztül keressünk ki kívül között közül legalább + lehet lehetett legyen lenne lenni lesz lett maga magát majd majd már + más másik meg még mellett mert mely melyek mi mit míg miért milyen + mikor minden mindent mindenki mindig mint mintha mivel most nagy nagyobb + nagyon ne néha nekem neki nem néhány nélkül nincs olyan ott össze + õ õk õket pedig persze rá s saját sem semmi sok sokat sokkal + számára szemben szerint szinte talán tehát teljes tovább továbbá + több úgy ugyanis új újabb újra után utána utolsó vagy vagyis + valaki valami valamint való vagyok van vannak volt voltam voltak + voltunk vissza vele viszont volna + """.split()), + + "it": frozenset(""" + ad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli + dall dagl dalla dalle di del dello dei degli dell degl della delle in + nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull + sugl sulla sulle per tra contro io tu lui lei noi voi loro mio mia miei + mie tuo tua tuoi tue suo sua suoi sue nostro nostra nostri nostre vostro + vostra vostri vostre mi ti ci vi lo la li le gli ne il un uno una ma ed + se perché anche come dov dove che chi cui non più quale quanto quanti + quanta quante quello quelli quella quelle questo questi questa queste si + tutto tutti a c e i l o ho hai ha abbiamo avete hanno abbia abbiate + abbiano avrò avrai avrà avremo avrete avranno avrei avresti avrebbe + avremmo avreste avrebbero avevo avevi aveva avevamo avevate avevano ebbi + avesti ebbe avemmo aveste ebbero avessi avesse avessimo avessero avendo + avuto avuta avuti avute sono sei è siamo siete sia siate siano sarò + sarai sarà saremo sarete saranno sarei saresti sarebbe saremmo sareste + sarebbero ero eri era eravamo eravate erano fui fosti fu fummo foste + furono fossi fosse fossimo fossero essendo faccio fai facciamo fanno + faccia facciate facciano farò farai farà faremo farete faranno farei + faresti farebbe faremmo fareste farebbero facevo facevi faceva facevamo + facevate facevano feci facesti fece facemmo faceste fecero facessi + facesse facessimo facessero facendo sto stai sta stiamo stanno stia + stiate stiano starò starai starà staremo starete staranno starei + staresti starebbe staremmo stareste starebbero stavo stavi stava stavamo + stavate stavano stetti stesti stette stemmo steste stettero stessi + stesse stessimo stessero stando + """.split()), + + "no": frozenset(""" + og i jeg det at en et den til er som på de med han av ikke ikkje der + så var meg seg men ett har om vi min mitt ha hadde hun nå over da ved + fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjøl + her alle vil bli ble blei blitt kunne inn når være kom noen noe ville + dere som deres kun ja etter ned skulle denne for deg si sine sitt mot å + meget hvorfor dette disse uten hvordan ingen din ditt blir samme hvilken + hvilke sånn inni mellom vår hver hvem vors hvis både bare enn fordi + før mange også slik vært være båe begge siden dykk dykkar dei deira + deires deim di då eg ein eit eitt elles honom hjå ho hoe henne hennar + hennes hoss hossen ikkje ingi inkje korleis korso kva kvar kvarhelst + kven kvi kvifor me medan mi mine mykje no nokon noka nokor noko nokre si + sia sidan so somt somme um upp vere vore verte vort varte vart + """.split()), + + "pt": frozenset(""" + de a o que e do da em um para com não uma os no se na por mais as dos + como mas ao ele das à seu sua ou quando muito nos já eu também só + pelo pela até isso ela entre depois sem mesmo aos 
seus quem nas me esse + eles você essa num nem suas meu às minha numa pelos elas qual nós lhe + deles essas esses pelas este dele tu te vocês vos lhes meus minhas teu + tua teus tuas nosso nossa nossos nossas dela delas esta estes estas + aquele aquela aqueles aquelas isto aquilo estou está estamos estão + estive esteve estivemos estiveram estava estávamos estavam estivera + estivéramos esteja estejamos estejam estivesse estivéssemos estivessem + estiver estivermos estiverem hei há havemos hão houve houvemos + houveram houvera houvéramos haja hajamos hajam houvesse houvéssemos + houvessem houver houvermos houverem houverei houverá houveremos + houverão houveria houveríamos houveriam sou somos são era éramos + eram fui foi fomos foram fora fôramos seja sejamos sejam fosse + fôssemos fossem for formos forem serei será seremos serão seria + seríamos seriam tenho tem temos tém tinha tínhamos tinham tive teve + tivemos tiveram tivera tivéramos tenha tenhamos tenham tivesse + tivéssemos tivessem tiver tivermos tiverem terei terá teremos terão + teria teríamos teriam + """.split()), + + "ru": frozenset(""" + и в во не что он на я с со как а то все она + так его но да ты к у же вы за бы по только + ее мне было вот от меня еще нет о из ему + теперь когда даже ну вдруг ли если уже + или ни быть был него до вас нибудь опять + уж вам ведь там потом себя ничего ей + может они тут где есть надо ней для мы + тебя их чем была сам чтоб без будто чего + раз тоже себе под будет ж тогда кто этот + того потому этого какой совсем ним + здесь этом один почти мой тем чтобы нее + сейчас были куда зачем всех никогда + можно при наконец два об другой хоть + после над больше тот через эти нас про + всего них какая много разве три эту моя + впрочем хорошо свою этой перед иногда + лучше чуть том нельзя такой им более + всегда конечно всю между + """.split()), + + "es": frozenset(""" + de la que el en y a los del se las por un para con no una su al lo como + más pero sus le ya o este sí porque esta entre cuando muy sin sobre + también me hasta hay donde quien desde todo nos durante todos uno les + ni contra otros ese eso ante ellos e esto mí antes algunos qué unos yo + otro otras otra él tanto esa estos mucho quienes nada muchos cual poco + ella estar estas algunas algo nosotros mi mis tú te ti tu tus ellas + nosotras vosostros vosostras os mío mía míos mías tuyo tuya tuyos + tuyas suyo suya suyos suyas nuestro nuestra nuestros nuestras vuestro + vuestra vuestros vuestras esos esas estoy estás está estamos estáis + están esté estés estemos estéis estén estaré estarás estará + estaremos estaréis estarán estaría estarías estaríamos estaríais + estarían estaba estabas estábamos estabais estaban estuve estuviste + estuvo estuvimos estuvisteis estuvieron estuviera estuvieras + estuviéramos estuvierais estuvieran estuviese estuvieses estuviésemos + estuvieseis estuviesen estando estado estada estados estadas estad he + has ha hemos habéis han haya hayas hayamos hayáis hayan habré habrás + habrá habremos habréis habrán habría habrías habríamos habríais + habrían había habías habíamos habíais habían hube hubiste hubo + hubimos hubisteis hubieron hubiera hubieras hubiéramos hubierais + hubieran hubiese hubieses hubiésemos hubieseis hubiesen habiendo habido + habida habidos habidas soy eres es somos sois son sea seas seamos seáis + sean seré serás será seremos seréis serán sería serías seríamos + seríais serían era eras éramos erais eran fui fuiste fue fuimos + fuisteis fueron fuera fueras fuéramos fuerais fueran fuese fueses + 
fuésemos fueseis fuesen sintiendo sentido sentida sentidos sentidas + siente sentid tengo tienes tiene tenemos tenéis tienen tenga tengas + tengamos tengáis tengan tendré tendrás tendrá tendremos tendréis + tendrán tendría tendrías tendríamos tendríais tendrían tenía + tenías teníamos teníais tenían tuve tuviste tuvo tuvimos tuvisteis + tuvieron tuviera tuvieras tuviéramos tuvierais tuvieran tuviese + tuvieses tuviésemos tuvieseis tuviesen teniendo tenido tenida tenidos + tenidas tened + """.split()), + + "sv": frozenset(""" + och det att i en jag hon som han på den med var sig för så till är + men ett om hade de av icke mig du henne då sin nu har inte hans honom + skulle hennes där min man ej vid kunde något från ut när efter upp + vi dem vara vad över än dig kan sina här ha mot alla under någon + eller allt mycket sedan ju denna själv detta åt utan varit hur ingen + mitt ni bli blev oss din dessa några deras blir mina samma vilken er + sådan vår blivit dess inom mellan sådant varför varje vilka ditt vem + vilket sitta sådana vart dina vars vårt våra ert era vilkas + """.split()), + + "tr": frozenset(""" + acaba ama aslında az bazı belki biri birkaç birşey biz bu çok + çünkü da daha de defa diye eğer en gibi hem hep hepsi her hiç için + ile ise kez ki kim mı mu mü nasıl ne neden nerde nerede nereye niçin + niye o sanki şey siz şu tüm ve veya ya yani + """.split()), +} diff --git a/src/whoosh/lang/wordnet.py b/src/whoosh/lang/wordnet.py new file mode 100644 index 0000000..05016e8 --- /dev/null +++ b/src/whoosh/lang/wordnet.py @@ -0,0 +1,242 @@ +# Copyright 2009 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +"""This module contains low-level functions and a high-level class for parsing +the prolog file "wn_s.pl" from the WordNet prolog download +into an object suitable for looking up synonyms and performing query expansion. 
+ +http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz +""" + +from collections import defaultdict + +from whoosh.compat import iterkeys, text_type +from whoosh.fields import Schema, ID, STORED + + +def parse_file(f): + """Parses the WordNet wn_s.pl prolog file and returns two dictionaries: + word2nums and num2words. + """ + + word2nums = defaultdict(list) + num2words = defaultdict(list) + + for line in f: + if not line.startswith("s("): + continue + + line = line[2:] + num = int(line[:line.find(",")]) + qt = line.find("'") + line = line[qt + 1:] + qt = line.find("'") + word = line[:qt].lower() + + if not word.isalpha(): + continue + + word2nums[word].append(num) + num2words[num].append(word) + + return word2nums, num2words + + +def make_index(storage, indexname, word2nums, num2words): + """Creates a Whoosh index in the given storage object containing + synonyms taken from word2nums and num2words. Returns the Index + object. + """ + + schema = Schema(word=ID, syns=STORED) + ix = storage.create_index(schema, indexname=indexname) + w = ix.writer() + for word in iterkeys(word2nums): + syns = synonyms(word2nums, num2words, word) + w.add_document(word=text_type(word), syns=syns) + w.commit() + return ix + + +def synonyms(word2nums, num2words, word): + """Uses the word2nums and num2words dicts to look up synonyms + for the given word. Returns a list of synonym strings. + """ + + keys = word2nums[word] + syns = set() + for key in keys: + syns = syns.union(num2words[key]) + + if word in syns: + syns.remove(word) + return sorted(syns) + + +class Thesaurus(object): + """Represents the WordNet synonym database, either loaded into memory + from the wn_s.pl Prolog file, or stored on disk in a Whoosh index. + + This class allows you to parse the prolog file "wn_s.pl" from the WordNet prolog + download into an object suitable for looking up synonyms and performing query + expansion. + + http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz + + To load a Thesaurus object from the wn_s.pl file... + + >>> t = Thesaurus.from_filename("wn_s.pl") + + To save the in-memory Thesaurus to a Whoosh index... + + >>> from whoosh.filedb.filestore import FileStorage + >>> fs = FileStorage("index") + >>> t.to_storage(fs) + + To load a Thesaurus object from a Whoosh index... + + >>> t = Thesaurus.from_storage(fs) + + The Thesaurus object is thus usable in two ways: + + * Parse the wn_s.pl file into memory (Thesaurus.from_*) and then look up + synonyms in memory. This has a startup cost for parsing the file, and uses + quite a bit of memory to store two large dictionaries, however synonym + look-ups are very fast. + + * Parse the wn_s.pl file into memory (Thesaurus.from_filename) then save it to + an index (to_storage). From then on, open the thesaurus from the saved + index (Thesaurus.from_storage). This has a large cost for storing the index, + but after that it is faster to open the Thesaurus (than re-parsing the file) + but slightly slower to look up synonyms. + + Here are timings for various tasks on my (fast) Windows machine, which might + give an idea of relative costs for in-memory vs. on-disk. + + ================================================ ================ + Task Approx. 
time (s) + ================================================ ================ + Parsing the wn_s.pl file 1.045 + Saving to an on-disk index 13.084 + Loading from an on-disk index 0.082 + Look up synonyms for "light" (in memory) 0.0011 + Look up synonyms for "light" (loaded from disk) 0.0028 + ================================================ ================ + + Basically, if you can afford spending the memory necessary to parse the + Thesaurus and then cache it, it's faster. Otherwise, use an on-disk index. + """ + + def __init__(self): + self.w2n = None + self.n2w = None + self.searcher = None + + @classmethod + def from_file(cls, fileobj): + """Creates a Thesaurus object from the given file-like object, which should + contain the WordNet wn_s.pl file. + + >>> f = open("wn_s.pl") + >>> t = Thesaurus.from_file(f) + >>> t.synonyms("hail") + ['acclaim', 'come', 'herald'] + """ + + thes = cls() + thes.w2n, thes.n2w = parse_file(fileobj) + return thes + + @classmethod + def from_filename(cls, filename): + """Creates a Thesaurus object from the given filename, which should + contain the WordNet wn_s.pl file. + + >>> t = Thesaurus.from_filename("wn_s.pl") + >>> t.synonyms("hail") + ['acclaim', 'come', 'herald'] + """ + + f = open(filename, "rb") + try: + return cls.from_file(f) + finally: + f.close() + + @classmethod + def from_storage(cls, storage, indexname="THES"): + """Creates a Thesaurus object from the given storage object, + which should contain an index created by Thesaurus.to_storage(). + + >>> from whoosh.filedb.filestore import FileStorage + >>> fs = FileStorage("index") + >>> t = Thesaurus.from_storage(fs) + >>> t.synonyms("hail") + ['acclaim', 'come', 'herald'] + + :param storage: A :class:`whoosh.store.Storage` object from + which to load the index. + :param indexname: A name for the index. This allows you to + store multiple indexes in the same storage object. + """ + + thes = cls() + index = storage.open_index(indexname=indexname) + thes.searcher = index.searcher() + return thes + + def to_storage(self, storage, indexname="THES"): + """Creates am index in the given storage object from the + synonyms loaded from a WordNet file. + + >>> from whoosh.filedb.filestore import FileStorage + >>> fs = FileStorage("index") + >>> t = Thesaurus.from_filename("wn_s.pl") + >>> t.to_storage(fs) + + :param storage: A :class:`whoosh.store.Storage` object in + which to save the index. + :param indexname: A name for the index. This allows you to + store multiple indexes in the same storage object. + """ + + if not self.w2n or not self.n2w: + raise Exception("No synonyms loaded") + make_index(storage, indexname, self.w2n, self.n2w) + + def synonyms(self, word): + """Returns a list of synonyms for the given word. + + >>> thesaurus.synonyms("hail") + ['acclaim', 'come', 'herald'] + """ + + word = word.lower() + if self.searcher: + return self.searcher.document(word=word)["syns"] + else: + return synonyms(self.w2n, self.n2w, word) diff --git a/src/whoosh/legacy.py b/src/whoosh/legacy.py new file mode 100644 index 0000000..cc7e2e7 --- /dev/null +++ b/src/whoosh/legacy.py @@ -0,0 +1,77 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module contains code for maintaining backwards compatibility with old +index formats. +""" + +from whoosh.util.loading import RenamingUnpickler + + +def load_110_toc(stream, gen, schema, version): + # Between version -110 and version -111, I reorganized the modules and + # changed the implementation of the NUMERIC field, so we have to change the + # classes the unpickler tries to load if we need to read an old schema + + # Read the length of the pickled schema + picklen = stream.read_varint() + if schema: + # If the user passed us a schema, use it and skip the one on disk + stream.seek(picklen, 1) + else: + # Remap the old classes and functions to their moved versions as we + # unpickle the schema + scuts = {"wf": "whoosh.fields", + "wsn": "whoosh.support.numeric", + "wcw2": "whoosh.codec.whoosh2"} + objmap = {"%(wf)s.NUMERIC": "%(wcw2)s.OLD_NUMERIC", + "%(wf)s.DATETIME": "%(wcw2)s.OLD_DATETIME", + "%(wsn)s.int_to_text": "%(wcw2)s.int_to_text", + "%(wsn)s.text_to_int": "%(wcw2)s.text_to_int", + "%(wsn)s.long_to_text": "%(wcw2)s.long_to_text", + "%(wsn)s.text_to_long": "%(wcw2)s.text_to_long", + "%(wsn)s.float_to_text": "%(wcw2)s.float_to_text", + "%(wsn)s.text_to_float": "%(wcw2)s.text_to_float", } + ru = RenamingUnpickler(stream, objmap, shortcuts=scuts) + schema = ru.load() + # Read the generation number + index_gen = stream.read_int() + assert gen == index_gen + # Unused number + _ = stream.read_int() + # Unpickle the list of segment objects + segments = stream.read_pickle() + return schema, segments + + +# Map TOC version numbers to functions to load that version +toc_loaders = {-110: load_110_toc} + + +# Map segment class names to functions to load the segment +segment_loaders = {} diff --git a/src/whoosh/matching/__init__.py b/src/whoosh/matching/__init__.py new file mode 100644 index 0000000..3f826b9 --- /dev/null +++ b/src/whoosh/matching/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.matching.mcore import * +from whoosh.matching.binary import * +from whoosh.matching.wrappers import * +from whoosh.matching.combo import * diff --git a/src/whoosh/matching/binary.py b/src/whoosh/matching/binary.py new file mode 100644 index 0000000..b752295 --- /dev/null +++ b/src/whoosh/matching/binary.py @@ -0,0 +1,803 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.matching import mcore + + +class BiMatcher(mcore.Matcher): + """Base class for matchers that combine the results of two sub-matchers in + some way. 
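+
+    A minimal (hypothetical) subclass only needs to say how ``a`` and ``b``
+    are combined; for example, a matcher that stays active while either side
+    is active might begin:
+
+    >>> class EitherMatcher(BiMatcher):
+    ...     def is_active(self):
+    ...         return self.a.is_active() or self.b.is_active()
+
+    The concrete combiners defined below (:class:`UnionMatcher`,
+    :class:`IntersectionMatcher`, :class:`AndNotMatcher`,
+    :class:`AndMaybeMatcher`) follow this pattern.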
+ """ + + def __init__(self, a, b): + super(BiMatcher, self).__init__() + self.a = a + self.b = b + + def reset(self): + self.a.reset() + self.b.reset() + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, self.a, self.b) + + def children(self): + return [self.a, self.b] + + def copy(self): + return self.__class__(self.a.copy(), self.b.copy()) + + def depth(self): + return 1 + max(self.a.depth(), self.b.depth()) + + def skip_to(self, id): + if not self.is_active(): + raise mcore.ReadTooFar + ra = self.a.skip_to(id) + rb = self.b.skip_to(id) + return ra or rb + + def supports_block_quality(self): + return (self.a.supports_block_quality() + and self.b.supports_block_quality()) + + def supports(self, astype): + return self.a.supports(astype) and self.b.supports(astype) + + +class AdditiveBiMatcher(BiMatcher): + """Base class for binary matchers where the scores of the sub-matchers are + added together. + """ + + def max_quality(self): + q = 0.0 + if self.a.is_active(): + q += self.a.max_quality() + if self.b.is_active(): + q += self.b.max_quality() + return q + + def block_quality(self): + bq = 0.0 + if self.a.is_active(): + bq += self.a.block_quality() + if self.b.is_active(): + bq += self.b.block_quality() + return bq + + def weight(self): + return (self.a.weight() + self.b.weight()) + + def score(self): + return (self.a.score() + self.b.score()) + + def __eq__(self, other): + return self.__class__ is type(other) + + def __lt__(self, other): + return type(other) is self.__class__ + + def __ne__(self, other): + return not self.__eq__(other) + + def __gt__(self, other): + return not (self.__lt__(other) or self.__eq__(other)) + + def __le__(self, other): + return self.__eq__(other) or self.__lt__(other) + + def __ge__(self, other): + return self.__eq__(other) or self.__gt__(other) + + +class UnionMatcher(AdditiveBiMatcher): + """Matches the union (OR) of the postings in the two sub-matchers. 
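+
+    A rough usage sketch (assumes ``searcher`` is an open
+    :class:`whoosh.searching.Searcher` and that the field and terms below
+    exist in the index; the names are purely illustrative):
+
+    >>> from whoosh.query import Term
+    >>> a = Term("content", "apple").matcher(searcher)
+    >>> b = Term("content", "banana").matcher(searcher)
+    >>> um = UnionMatcher(a, b)
+    >>> while um.is_active():
+    ...     print(um.id(), um.score())
+    ...     um.next()
+
+    In normal use you do not build this directly; for example an
+    :class:`whoosh.query.Or` query's ``matcher()`` method returns a union
+    matcher (or an optimized equivalent) for you.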
+ """ + + _id = None + + def replace(self, minquality=0): + a = self.a + b = self.b + a_active = a.is_active() + b_active = b.is_active() + + # If neither sub-matcher on its own has a high enough max quality to + # contribute, convert to an intersection matcher + if minquality and a_active and b_active: + a_max = a.max_quality() + b_max = b.max_quality() + if a_max < minquality and b_max < minquality: + return IntersectionMatcher(a, b).replace(minquality) + elif a_max < minquality: + return AndMaybeMatcher(b, a) + elif b_max < minquality: + return AndMaybeMatcher(a, b) + + # If one or both of the sub-matchers are inactive, convert + if not (a_active or b_active): + return mcore.NullMatcher() + elif not a_active: + return b.replace(minquality) + elif not b_active: + return a.replace(minquality) + + a = a.replace(minquality - b.max_quality() if minquality else 0) + b = b.replace(minquality - a.max_quality() if minquality else 0) + # If one of the sub-matchers changed, return a new union + if a is not self.a or b is not self.b: + return self.__class__(a, b) + else: + self._id = None + return self + + def is_active(self): + return self.a.is_active() or self.b.is_active() + + def skip_to(self, id): + self._id = None + ra = rb = False + + if self.a.is_active(): + ra = self.a.skip_to(id) + if self.b.is_active(): + rb = self.b.skip_to(id) + + return ra or rb + + def id(self): + _id = self._id + if _id is not None: + return _id + + a = self.a + b = self.b + if not a.is_active(): + _id = b.id() + elif not b.is_active(): + _id = a.id() + else: + _id = min(a.id(), b.id()) + self._id = _id + return _id + + # Using sets is faster in most cases, but could potentially use a lot of + # memory. Comment out this method override to not use sets. + #def all_ids(self): + # return iter(sorted(set(self.a.all_ids()) | set(self.b.all_ids()))) + + def next(self): + self._id = None + + a = self.a + b = self.b + a_active = a.is_active() + b_active = b.is_active() + + # Shortcut when one matcher is inactive + if not (a_active or b_active): + raise mcore.ReadTooFar + elif not a_active: + return b.next() + elif not b_active: + return a.next() + + a_id = a.id() + b_id = b.id() + ar = br = None + + # After all that, here's the actual implementation + if a_id <= b_id: + ar = a.next() + if b_id <= a_id: + br = b.next() + return ar or br + + def spans(self): + if not self.a.is_active(): + return self.b.spans() + if not self.b.is_active(): + return self.a.spans() + + id_a = self.a.id() + id_b = self.b.id() + if id_a < id_b: + return self.a.spans() + elif id_b < id_a: + return self.b.spans() + else: + return sorted(set(self.a.spans()) | set(self.b.spans())) + + def weight(self): + a = self.a + b = self.b + + if not a.is_active(): + return b.weight() + if not b.is_active(): + return a.weight() + + id_a = a.id() + id_b = b.id() + if id_a < id_b: + return a.weight() + elif id_b < id_a: + return b.weight() + else: + return (a.weight() + b.weight()) + + def score(self): + a = self.a + b = self.b + + if not a.is_active(): + return b.score() + if not b.is_active(): + return a.score() + + id_a = a.id() + id_b = b.id() + if id_a < id_b: + return a.score() + elif id_b < id_a: + return b.score() + else: + return (a.score() + b.score()) + + def skip_to_quality(self, minquality): + self._id = None + + a = self.a + b = self.b + if not (a.is_active() or b.is_active()): + raise mcore.ReadTooFar + + # Short circuit if one matcher is inactive + if not a.is_active(): + return b.skip_to_quality(minquality) + elif not b.is_active(): + return 
a.skip_to_quality(minquality) + + skipped = 0 + aq = a.block_quality() + bq = b.block_quality() + while a.is_active() and b.is_active() and aq + bq <= minquality: + if aq < bq: + skipped += a.skip_to_quality(minquality - bq) + aq = a.block_quality() + else: + skipped += b.skip_to_quality(minquality - aq) + bq = b.block_quality() + + return skipped + + +class DisjunctionMaxMatcher(UnionMatcher): + """Matches the union (OR) of two sub-matchers. Where both sub-matchers + match the same posting, returns the weight/score of the higher-scoring + posting. + """ + + # TODO: this class inherits from AdditiveBiMatcher (through UnionMatcher) + # but it does not add the scores of the sub-matchers together (it + # overrides all methods that perform addition). Need to clean up the + # inheritance. + + def __init__(self, a, b, tiebreak=0.0): + super(DisjunctionMaxMatcher, self).__init__(a, b) + self.tiebreak = tiebreak + + def copy(self): + return self.__class__(self.a.copy(), self.b.copy(), + tiebreak=self.tiebreak) + + def replace(self, minquality=0): + a = self.a + b = self.b + a_active = a.is_active() + b_active = b.is_active() + + # DisMax takes the max of the sub-matcher qualities instead of adding + # them, so we need special logic here + if minquality and a_active and b_active: + a_max = a.max_quality() + b_max = b.max_quality() + + if a_max < minquality and b_max < minquality: + # If neither sub-matcher has a high enough max quality to + # contribute, return an inactive matcher + return mcore.NullMatcher() + elif b_max < minquality: + # If the b matcher can't contribute, return a + return a.replace(minquality) + elif a_max < minquality: + # If the a matcher can't contribute, return b + return b.replace(minquality) + + if not (a_active or b_active): + return mcore.NullMatcher() + elif not a_active: + return b.replace(minquality) + elif not b_active: + return a.replace(minquality) + + # We CAN pass the minquality down here, since we don't add the two + # scores together + a = a.replace(minquality) + b = b.replace(minquality) + a_active = a.is_active() + b_active = b.is_active() + # It's kind of tedious to check for inactive sub-matchers all over + # again here after we replace them, but it's probably better than + # returning a replacement with an inactive sub-matcher + if not (a_active and b_active): + return mcore.NullMatcher() + elif not a_active: + return b + elif not b_active: + return a + elif a is not self.a or b is not self.b: + # If one of the sub-matchers changed, return a new DisMax + return self.__class__(a, b) + else: + return self + + def score(self): + if not self.a.is_active(): + return self.b.score() + elif not self.b.is_active(): + return self.a.score() + else: + return max(self.a.score(), self.b.score()) + + def max_quality(self): + return max(self.a.max_quality(), self.b.max_quality()) + + def block_quality(self): + return max(self.a.block_quality(), self.b.block_quality()) + + def skip_to_quality(self, minquality): + a = self.a + b = self.b + + # Short circuit if one matcher is inactive + if not a.is_active(): + sk = b.skip_to_quality(minquality) + return sk + elif not b.is_active(): + return a.skip_to_quality(minquality) + + skipped = 0 + aq = a.block_quality() + bq = b.block_quality() + while a.is_active() and b.is_active() and max(aq, bq) <= minquality: + if aq <= minquality: + skipped += a.skip_to_quality(minquality) + aq = a.block_quality() + if bq <= minquality: + skipped += b.skip_to_quality(minquality) + bq = b.block_quality() + return skipped + + +class 
IntersectionMatcher(AdditiveBiMatcher): + """Matches the intersection (AND) of the postings in the two sub-matchers. + """ + + def __init__(self, a, b): + super(IntersectionMatcher, self).__init__(a, b) + self._find_first() + + def reset(self): + self.a.reset() + self.b.reset() + self._find_first() + + def _find_first(self): + if (self.a.is_active() + and self.b.is_active() + and self.a.id() != self.b.id()): + self._find_next() + + def replace(self, minquality=0): + a = self.a + b = self.b + a_active = a.is_active() + b_active = b.is_active() + + if not (a_active and b_active): + # Intersection matcher requires that both sub-matchers be active + return mcore.NullMatcher() + + if minquality: + a_max = a.max_quality() + b_max = b.max_quality() + if a_max + b_max < minquality: + # If the combined quality of the sub-matchers can't contribute, + # return an inactive matcher + return mcore.NullMatcher() + # Require that the replacements be able to contribute results + # higher than the minquality + a_min = minquality - b_max + b_min = minquality - a_max + else: + a_min = b_min = 0 + + a = a.replace(a_min) + b = b.replace(b_min) + a_active = a.is_active() + b_active = b.is_active() + if not (a_active or b_active): + return mcore.NullMatcher() + elif not a_active: + return b + elif not b_active: + return a + elif a is not self.a or b is not self.b: + return self.__class__(a, b) + else: + return self + + def is_active(self): + return self.a.is_active() and self.b.is_active() + + def _find_next(self): + a = self.a + b = self.b + a_id = a.id() + b_id = b.id() + assert a_id != b_id + r = False + + while a.is_active() and b.is_active() and a_id != b_id: + if a_id < b_id: + ra = a.skip_to(b_id) + if not a.is_active(): + return + r = r or ra + a_id = a.id() + else: + rb = b.skip_to(a_id) + if not b.is_active(): + return + r = r or rb + b_id = b.id() + return r + + def id(self): + return self.a.id() + + # Using sets is faster in some cases, but could potentially use a lot of + # memory + def all_ids(self): + return iter(sorted(set(self.a.all_ids()) & set(self.b.all_ids()))) + + def skip_to(self, id): + if not self.is_active(): + raise mcore.ReadTooFar + ra = self.a.skip_to(id) + rb = self.b.skip_to(id) + if self.is_active(): + rn = False + if self.a.id() != self.b.id(): + rn = self._find_next() + return ra or rb or rn + + def skip_to_quality(self, minquality): + a = self.a + b = self.b + minquality = minquality + + skipped = 0 + aq = a.block_quality() + bq = b.block_quality() + while a.is_active() and b.is_active() and aq + bq <= minquality: + if aq < bq: + # If the block quality of A is less than B, skip A ahead until + # it can contribute at least the balance of the required min + # quality when added to B + sk = a.skip_to_quality(minquality - bq) + skipped += sk + if not sk and a.is_active(): + # The matcher couldn't skip ahead for some reason, so just + # advance and try again + a.next() + else: + # And vice-versa + sk = b.skip_to_quality(minquality - aq) + skipped += sk + if not sk and b.is_active(): + b.next() + + if not a.is_active() or not b.is_active(): + # One of the matchers is exhausted + break + if a.id() != b.id(): + # We want to always leave in a state where the matchers are at + # the same document, so call _find_next() to sync them + self._find_next() + + # Get the block qualities at the new matcher positions + aq = a.block_quality() + bq = b.block_quality() + return skipped + + def next(self): + if not self.is_active(): + raise mcore.ReadTooFar + + # We must assume that the ids are equal 
whenever next() is called (they + # should have been made equal by _find_next), so advance them both + ar = self.a.next() + if self.is_active(): + nr = self._find_next() + return ar or nr + + def spans(self): + return sorted(set(self.a.spans()) | set(self.b.spans())) + + +class AndNotMatcher(BiMatcher): + """Matches the postings in the first sub-matcher that are NOT present in + the second sub-matcher. + """ + + def __init__(self, a, b): + super(AndNotMatcher, self).__init__(a, b) + self._find_first() + + def reset(self): + self.a.reset() + self.b.reset() + self._find_first() + + def _find_first(self): + if (self.a.is_active() + and self.b.is_active() + and self.a.id() == self.b.id()): + self._find_next() + + def is_active(self): + return self.a.is_active() + + def _find_next(self): + pos = self.a + neg = self.b + if not neg.is_active(): + return + pos_id = pos.id() + r = False + + if neg.id() < pos_id: + neg.skip_to(pos_id) + + while pos.is_active() and neg.is_active() and pos_id == neg.id(): + nr = pos.next() + if not pos.is_active(): + break + + r = r or nr + pos_id = pos.id() + neg.skip_to(pos_id) + + return r + + def supports_block_quality(self): + return self.a.supports_block_quality() + + def replace(self, minquality=0): + if not self.a.is_active(): + # The a matcher is required, so if it's inactive, return an + # inactive matcher + return mcore.NullMatcher() + elif (minquality + and self.a.max_quality() < minquality): + # If the quality of the required matcher isn't high enough to + # contribute, return an inactive matcher + return mcore.NullMatcher() + elif not self.b.is_active(): + # If the prohibited matcher is inactive, convert to just the + # required matcher + return self.a.replace(minquality) + + a = self.a.replace(minquality) + b = self.b.replace() + if a is not self.a or b is not self.b: + # If one of the sub-matchers was replaced, return a new AndNot + return self.__class__(a, b) + else: + return self + + def max_quality(self): + return self.a.max_quality() + + def block_quality(self): + return self.a.block_quality() + + def skip_to_quality(self, minquality): + skipped = self.a.skip_to_quality(minquality) + self._find_next() + return skipped + + def id(self): + return self.a.id() + + def next(self): + if not self.a.is_active(): + raise mcore.ReadTooFar + ar = self.a.next() + nr = False + if self.a.is_active() and self.b.is_active(): + nr = self._find_next() + return ar or nr + + def skip_to(self, id): + if not self.a.is_active(): + raise mcore.ReadTooFar + if id < self.a.id(): + return + + self.a.skip_to(id) + if self.b.is_active(): + self.b.skip_to(id) + self._find_next() + + def weight(self): + return self.a.weight() + + def score(self): + return self.a.score() + + def supports(self, astype): + return self.a.supports(astype) + + def value(self): + return self.a.value() + + def value_as(self, astype): + return self.a.value_as(astype) + + +class AndMaybeMatcher(AdditiveBiMatcher): + """Matches postings in the first sub-matcher, and if the same posting is + in the second sub-matcher, adds their scores. 
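+
+    A rough sketch (``searcher`` and the terms are assumed to exist; the
+    names are purely illustrative):
+
+    >>> from whoosh.query import Term
+    >>> required = Term("content", "apple").matcher(searcher)
+    >>> optional = Term("content", "pie").matcher(searcher)
+    >>> am = AndMaybeMatcher(required, optional)
+
+    ``am`` iterates exactly the documents matched by ``required``; where a
+    document also matches ``optional``, ``score()`` is the sum of the two
+    scores, otherwise it is just the required matcher's score. This is the
+    matcher used by the :class:`whoosh.query.AndMaybe` query.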
+ """ + + def __init__(self, a, b): + AdditiveBiMatcher.__init__(self, a, b) + self._first_b() + + def reset(self): + self.a.reset() + self.b.reset() + self._first_b() + + def _first_b(self): + a = self.a + b = self.b + if a.is_active() and b.is_active() and a.id() != b.id(): + b.skip_to(a.id()) + + def is_active(self): + return self.a.is_active() + + def id(self): + return self.a.id() + + def next(self): + if not self.a.is_active(): + raise mcore.ReadTooFar + + ar = self.a.next() + br = False + if self.a.is_active() and self.b.is_active(): + br = self.b.skip_to(self.a.id()) + return ar or br + + def skip_to(self, id): + if not self.a.is_active(): + raise mcore.ReadTooFar + + ra = self.a.skip_to(id) + rb = False + if self.a.is_active() and self.b.is_active(): + rb = self.b.skip_to(id) + return ra or rb + + def replace(self, minquality=0): + a = self.a + b = self.b + a_active = a.is_active() + b_active = b.is_active() + + if not a_active: + return mcore.NullMatcher() + elif minquality and b_active: + if a.max_quality() + b.max_quality() < minquality: + # If the combined max quality of the sub-matchers isn't high + # enough to possibly contribute, return an inactive matcher + return mcore.NullMatcher() + elif a.max_quality() < minquality: + # If the max quality of the main sub-matcher isn't high enough + # to ever contribute without the optional sub- matcher, change + # into an IntersectionMatcher + return IntersectionMatcher(self.a, self.b) + elif not b_active: + return a.replace(minquality) + + new_a = a.replace(minquality - b.max_quality()) + new_b = b.replace(minquality - a.max_quality()) + if new_a is not a or new_b is not b: + # If one of the sub-matchers changed, return a new AndMaybe + return self.__class__(new_a, new_b) + else: + return self + + def skip_to_quality(self, minquality): + a = self.a + b = self.b + minquality = minquality + + if not a.is_active(): + raise mcore.ReadTooFar + if not b.is_active(): + return a.skip_to_quality(minquality) + + skipped = 0 + aq = a.block_quality() + bq = b.block_quality() + while a.is_active() and b.is_active() and aq + bq <= minquality: + if aq < bq: + skipped += a.skip_to_quality(minquality - bq) + aq = a.block_quality() + else: + skipped += b.skip_to_quality(minquality - aq) + bq = b.block_quality() + + return skipped + + def weight(self): + if self.a.id() == self.b.id(): + return self.a.weight() + self.b.weight() + else: + return self.a.weight() + + def score(self): + if self.b.is_active() and self.a.id() == self.b.id(): + return self.a.score() + self.b.score() + else: + return self.a.score() + + def supports(self, astype): + return self.a.supports(astype) + + def value(self): + return self.a.value() + + def value_as(self, astype): + return self.a.value_as(astype) diff --git a/src/whoosh/matching/combo.py b/src/whoosh/matching/combo.py new file mode 100644 index 0000000..7c47df9 --- /dev/null +++ b/src/whoosh/matching/combo.py @@ -0,0 +1,312 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import division +from array import array + +from whoosh.compat import xrange +from whoosh.matching import mcore + + +class CombinationMatcher(mcore.Matcher): + def __init__(self, submatchers, boost=1.0): + self._submatchers = submatchers + self._boost = boost + + def supports_block_quality(self): + return all(m.supports_block_quality() for m in self._submatchers) + + def max_quality(self): + return max(m.max_quality() for m in self._submatchers + if m.is_active()) * self._boost + + def supports(self, astype): + return all(m.supports(astype) for m in self._submatchers) + + def children(self): + return iter(self._submatchers) + + def score(self): + return sum(m.score() for m in self._submatchers) * self._boost + + +class PreloadedUnionMatcher(CombinationMatcher): + """Instead of marching the sub-matchers along in parallel, this + matcher pre-reads the scores for EVERY MATCHING DOCUMENT, trading memory + for speed. + + This is faster than the implementation using a binary tree of + :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just + because of less overhead), but it doesn't allow getting information about + the "current" document other than the score, because there isn't really a + current document, just an array of scores. 
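+
+    A construction sketch (``matchers`` is assumed to be a list of active
+    sub-matchers and ``searcher`` an open searcher; the names are
+    illustrative):
+
+    >>> pum = PreloadedUnionMatcher(matchers, searcher.doc_count_all())
+
+    All sub-matchers are consumed up front in ``__init__``, so memory use is
+    proportional to the range of matching document numbers.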
+ """ + + def __init__(self, submatchers, doccount, boost=1.0, scored=True): + CombinationMatcher.__init__(self, submatchers, boost=boost) + + self._doccount = doccount + + a = array("d") + active = [subm for subm in self._submatchers if subm.is_active()] + if active: + offset = self._docnum = min(m.id() for m in active) + for m in active: + while m.is_active(): + if scored: + score = m.score() * boost + else: + score = boost + + docnum = m.id() + place = docnum - offset + if len(a) <= place: + a.extend(0 for _ in xrange(place - len(a) + 1)) + a[place] += score + m.next() + self._a = a + self._offset = offset + else: + self._docnum = 0 + self._offset = 0 + self._a = a + + def is_active(self): + return self._docnum - self._offset < len(self._a) + + def id(self): + return self._docnum + + def score(self): + return self._a[self._docnum - self._offset] + + def next(self): + a = self._a + offset = self._offset + place = self._docnum - offset + + place += 1 + while place < len(a) and a[place] == 0: + place += 1 + self._docnum = place + offset + + def max_quality(self): + return max(self._a[self._docnum - self._offset:]) + + def block_quality(self): + return self.max_quality() + + def skip_to(self, docnum): + if docnum < self._docnum: + return + + self._docnum = docnum + i = docnum - self._offset + if i < len(self._a) and self._a[i] == 0: + self.next() + + def skip_to_quality(self, minquality): + a = self._a + offset = self._offset + place = self._docnum - offset + + skipped = 0 + while place < len(a) and a[place] <= minquality: + place += 1 + skipped = 1 + + self._docnum = place + offset + return skipped + + def supports(self, astype): + # This matcher doesn't support any posting values + return False + + def all_ids(self): + a = self._a + offset = self._offset + place = self._docnum - offset + + while place < len(a): + if a[place] > 0: + yield place + offset + place += 1 + + +class ArrayUnionMatcher(CombinationMatcher): + """Instead of marching the sub-matchers along in parallel, this matcher + pre-reads the scores for a large block of documents at a time from each + matcher, accumulating the scores in an array. + + This is faster than the implementation using a binary tree of + :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just + because of less overhead), but it doesn't allow getting information about + the "current" document other than the score, because there isn't really a + current document, just an array of scores. 
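+
+    Compared to :class:`PreloadedUnionMatcher`, only ``partsize`` score slots
+    are buffered at a time. A construction sketch (``matchers`` and
+    ``searcher`` as in the sketch above; the names are illustrative):
+
+    >>> aum = ArrayUnionMatcher(matchers, searcher.doc_count_all(),
+    ...                         partsize=4096)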
+ """ + + def __init__(self, submatchers, doccount, boost=1.0, scored=True, + partsize=2048): + CombinationMatcher.__init__(self, submatchers, boost=boost) + self._scored = scored + self._doccount = doccount + + if not partsize: + partsize = doccount + self._partsize = partsize + + self._a = array("d", (0 for _ in xrange(self._partsize))) + self._docnum = self._min_id() + self._read_part() + + def __repr__(self): + return ("%s(%r, boost=%f, scored=%r, partsize=%d)" + % (self.__class__.__name__, self._submatchers, self._boost, + self._scored, self._partsize)) + + def _min_id(self): + active = [subm for subm in self._submatchers if subm.is_active()] + if active: + return min(subm.id() for subm in active) + else: + return self._doccount + + def _read_part(self): + scored = self._scored + boost = self._boost + limit = min(self._docnum + self._partsize, self._doccount) + offset = self._docnum + a = self._a + + # Clear the array + for i in xrange(self._partsize): + a[i] = 0 + + # Add the scores from the submatchers into the array + for m in self._submatchers: + while m.is_active() and m.id() < limit: + i = m.id() - offset + if scored: + a[i] += m.score() * boost + else: + a[i] = 1 + m.next() + + self._offset = offset + self._limit = limit + + def _find_next(self): + a = self._a + docnum = self._docnum + offset = self._offset + limit = self._limit + + while docnum < limit: + if a[docnum - offset] > 0: + break + docnum += 1 + + if docnum == limit: + self._docnum = self._min_id() + self._read_part() + else: + self._docnum = docnum + + def supports(self, astype): + # This matcher doesn't support any posting values + return False + + def is_active(self): + return self._docnum < self._doccount + + def max_quality(self): + return max(m.max_quality() for m in self._submatchers) + + def block_quality(self): + return max(self._a) + + def skip_to(self, docnum): + if docnum < self._offset: + # We've already passed it + return + elif docnum < self._limit: + # It's in the current part + self._docnum = docnum + self._find_next() + return + + # Advance all active submatchers + submatchers = self._submatchers + active = False + for subm in submatchers: + if subm.is_active(): + subm.skip_to(docnum) + + if any(subm.is_active() for subm in submatchers): + # Rebuffer + self._docnum = self._min_id() + self._read_part() + else: + self._docnum = self._doccount + + def skip_to_quality(self, minquality): + skipped = 0 + while self.is_active() and self.block_quality() <= minquality: + skipped += 1 + self._docnum = self._limit + self._read_part() + if self.is_active(): + self._find_next() + return skipped + + def id(self): + return self._docnum + + def all_ids(self): + doccount = self._doccount + docnum = self._docnum + offset = self._offset + limit = self._limit + + a = self._a + while docnum < doccount: + if a[docnum - offset] > 0: + yield docnum + + docnum += 1 + if docnum == limit: + self._docnum = docnum + self._read_part() + offset = self._offset + limit = self._limit + + def next(self): + self._docnum += 1 + return self._find_next() + + def score(self): + return self._a[self._docnum - self._offset] diff --git a/src/whoosh/matching/mcore.py b/src/whoosh/matching/mcore.py new file mode 100644 index 0000000..0b61c7d --- /dev/null +++ b/src/whoosh/matching/mcore.py @@ -0,0 +1,622 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module contains "matcher" classes. Matchers deal with posting lists. The +most basic matcher, which reads the list of postings for a term, will be +provided by the backend implementation (for example, +:class:`whoosh.filedb.filepostings.FilePostingReader`). The classes in this +module provide additional functionality, such as combining the results of two +matchers, or modifying the results of a matcher. + +You do not need to deal with the classes in this module unless you need to +write your own Matcher implementation to provide some new functionality. These +classes are not instantiated by the user. They are usually created by a +:class:`~whoosh.query.Query` object's :meth:`~whoosh.query.Query.matcher()` +method, which returns the appropriate matcher to implement the query (for +example, the :class:`~whoosh.query.Or` query's +:meth:`~whoosh.query.Or.matcher()` method returns a +:py:class:`~whoosh.matching.UnionMatcher` object). + +Certain backends support "quality" optimizations. These backends have the +ability to skip ahead if it knows the current block of postings can't +contribute to the top N documents. If the matcher tree and backend support +these optimizations, the matcher's :meth:`Matcher.supports_block_quality()` +method will return ``True``. +""" + +import sys +from itertools import repeat + +from whoosh.compat import izip, xrange +from whoosh.compat import abstractmethod + + +# Exceptions + +class ReadTooFar(Exception): + """Raised when :meth:`~whoosh.matching.Matcher.next()` or + :meth:`~whoosh.matching.Matcher.skip_to()` are called on an inactive + matcher. + """ + + +class NoQualityAvailable(Exception): + """Raised when quality methods are called on a matcher that does not + support block quality optimizations. + """ + + +# Classes + +class Matcher(object): + """Base class for all matchers. + """ + + @abstractmethod + def is_active(self): + """Returns True if this matcher is still "active", that is, it has not + yet reached the end of the posting list. + """ + + raise NotImplementedError + + @abstractmethod + def reset(self): + """Returns to the start of the posting list. 
+ + Note that reset() may not do what you expect after you call + :meth:`Matcher.replace()`, since this can mean calling reset() not on + the original matcher, but on an optimized replacement. + """ + + raise NotImplementedError + + def term(self): + """Returns a ``("fieldname", "termtext")`` tuple for the term this + matcher matches, or None if this matcher is not a term matcher. + """ + + return None + + def term_matchers(self): + """Returns an iterator of term matchers in this tree. + """ + + if self.term() is not None: + yield self + else: + for cm in self.children(): + for m in cm.term_matchers(): + yield m + + def matching_terms(self, id=None): + """Returns an iterator of ``("fieldname", "termtext")`` tuples for the + **currently matching** term matchers in this tree. + """ + + if not self.is_active(): + return + + if id is None: + id = self.id() + elif id != self.id(): + return + + t = self.term() + if t is None: + for c in self.children(): + for t in c.matching_terms(id): + yield t + else: + yield t + + def is_leaf(self): + return not bool(self.children()) + + def children(self): + """Returns an (possibly empty) list of the submatchers of this + matcher. + """ + + return [] + + def replace(self, minquality=0): + """Returns a possibly-simplified version of this matcher. For example, + if one of the children of a UnionMatcher is no longer active, calling + this method on the UnionMatcher will return the other child. + """ + + return self + + @abstractmethod + def copy(self): + """Returns a copy of this matcher. + """ + + raise NotImplementedError + + def depth(self): + """Returns the depth of the tree under this matcher, or 0 if this + matcher does not have any children. + """ + + return 0 + + def supports_block_quality(self): + """Returns True if this matcher supports the use of ``quality`` and + ``block_quality``. + """ + + return False + + def max_quality(self): + """Returns the maximum possible quality measurement for this matcher, + according to the current weighting algorithm. Raises + ``NoQualityAvailable`` if the matcher or weighting do not support + quality measurements. + """ + + raise NoQualityAvailable(self.__class__) + + def block_quality(self): + """Returns a quality measurement of the current block of postings, + according to the current weighting algorithm. Raises + ``NoQualityAvailable`` if the matcher or weighting do not support + quality measurements. + """ + + raise NoQualityAvailable(self.__class__) + + @abstractmethod + def id(self): + """Returns the ID of the current posting. + """ + + raise NotImplementedError + + def all_ids(self): + """Returns a generator of all IDs in the matcher. + + What this method returns for a matcher that has already read some + postings (whether it only yields the remaining postings or all postings + from the beginning) is undefined, so it's best to only use this method + on fresh matchers. + """ + + i = 0 + m = self + while m.is_active(): + yield m.id() + m.next() + i += 1 + if i == 10: + m = m.replace() + i = 0 + + def all_items(self): + """Returns a generator of all (ID, encoded value) pairs in the matcher. + + What this method returns for a matcher that has already read some + postings (whether it only yields the remaining postings or all postings + from the beginning) is undefined, so it's best to only use this method + on fresh matchers. 
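The protocol sketched above (``is_active()``/``id()``/``next()``, with periodic ``replace()`` calls) is all that is needed to walk a posting list by hand. A rough sketch, assuming an existing index in an "indexdir" directory with a TEXT field named "content" (both names are hypothetical):

    from whoosh import index
    from whoosh.query import Term

    ix = index.open_dir("indexdir")      # assumed existing index
    with ix.searcher() as searcher:
        m = Term("content", u"whoosh").matcher(searcher)
        while m.is_active():
            print(m.id())                # document number of the current posting
            m.next()
            m = m.replace()              # let the matcher swap in a simpler equivalent
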
+ """ + + i = 0 + m = self + while self.is_active(): + yield (m.id(), m.value()) + m.next() + i += 1 + if i == 10: + m = m.replace() + i = 0 + + def items_as(self, astype): + """Returns a generator of all (ID, decoded value) pairs in the matcher. + + What this method returns for a matcher that has already read some + postings (whether it only yields the remaining postings or all postings + from the beginning) is undefined, so it's best to only use this method + on fresh matchers. + """ + + while self.is_active(): + yield (self.id(), self.value_as(astype)) + self.next() + + @abstractmethod + def value(self): + """Returns the encoded value of the current posting. + """ + + raise NotImplementedError + + @abstractmethod + def supports(self, astype): + """Returns True if the field's format supports the named data type, + for example 'frequency' or 'characters'. + """ + + raise NotImplementedError("supports not implemented in %s" + % self.__class__) + + @abstractmethod + def value_as(self, astype): + """Returns the value(s) of the current posting as the given type. + """ + + raise NotImplementedError("value_as not implemented in %s" + % self.__class__) + + def spans(self): + """Returns a list of :class:`~whoosh.query.spans.Span` objects for the + matches in this document. Raises an exception if the field being + searched does not store positions. + """ + + from whoosh.query.spans import Span + + if self.supports("characters"): + return [Span(pos, startchar=startchar, endchar=endchar) + for pos, startchar, endchar in self.value_as("characters")] + elif self.supports("positions"): + return [Span(pos) for pos in self.value_as("positions")] + else: + raise Exception("Field does not support spans") + + def skip_to(self, id): + """Moves this matcher to the first posting with an ID equal to or + greater than the given ID. + """ + + while self.is_active() and self.id() < id: + self.next() + + def skip_to_quality(self, minquality): + """Moves this matcher to the next block with greater than the given + minimum quality value. + """ + + raise NotImplementedError(self.__class__.__name__) + + @abstractmethod + def next(self): + """Moves this matcher to the next posting. + """ + + raise NotImplementedError(self.__class__.__name__) + + def weight(self): + """Returns the weight of the current posting. + """ + + return self.value_as("weight") + + @abstractmethod + def score(self): + """Returns the score of the current posting. + """ + + raise NotImplementedError(self.__class__.__name__) + + def __eq__(self, other): + return self.__class__ is type(other) + + def __lt__(self, other): + return type(other) is self.__class__ + + def __ne__(self, other): + return not self.__eq__(other) + + def __gt__(self, other): + return not (self.__lt__(other) or self.__eq__(other)) + + def __le__(self, other): + return self.__eq__(other) or self.__lt__(other) + + def __ge__(self, other): + return self.__eq__(other) or self.__gt__(other) + + +# Simple intermediate classes + +class ConstantScoreMatcher(Matcher): + def __init__(self, score=1.0): + self._score = score + + def supports_block_quality(self): + return True + + def max_quality(self): + return self._score + + def block_quality(self): + return self._score + + def skip_to_quality(self, minquality): + if minquality >= self._score: + self.go_inactive() + + def score(self): + return self._score + + +# Null matcher + +class NullMatcherClass(Matcher): + """Matcher with no postings which is never active. 
+ """ + + def __call__(self): + return self + + def __repr__(self): + return "" + + def supports_block_quality(self): + return True + + def max_quality(self): + return 0 + + def block_quality(self): + return 0 + + def skip_to_quality(self, minquality): + return 0 + + def is_active(self): + return False + + def reset(self): + pass + + def all_ids(self): + return [] + + def copy(self): + return self + + +# Singleton instance +NullMatcher = NullMatcherClass() + + +class ListMatcher(Matcher): + """Synthetic matcher backed by a list of IDs. + """ + + def __init__(self, ids, weights=None, values=None, format=None, + scorer=None, position=0, all_weights=None, term=None, + terminfo=None): + """ + :param ids: a list of doc IDs. + :param weights: a list of weights corresponding to the list of IDs. + If this argument is not supplied, a list of 1.0 values is used. + :param values: a list of encoded values corresponding to the list of + IDs. + :param format: a :class:`whoosh.formats.Format` object representing the + format of the field. + :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring + the postings. + :param term: a ``("fieldname", "text")`` tuple, or None if this is not + a term matcher. + """ + + self._ids = ids + self._weights = weights + self._all_weights = all_weights + self._values = values + self._i = position + self._format = format + self._scorer = scorer + self._term = term + self._terminfo = terminfo + + def __repr__(self): + return "<%s>" % self.__class__.__name__ + + def is_active(self): + return self._i < len(self._ids) + + def reset(self): + self._i = 0 + + def skip_to(self, id): + if not self.is_active(): + raise ReadTooFar + if id < self.id(): + return + + while self._i < len(self._ids) and self._ids[self._i] < id: + self._i += 1 + + def term(self): + return self._term + + def copy(self): + return self.__class__(self._ids, self._weights, self._values, + self._format, self._scorer, self._i, + self._all_weights) + + def replace(self, minquality=0): + if not self.is_active(): + return NullMatcher() + elif minquality and self.max_quality() < minquality: + return NullMatcher() + else: + return self + + def supports_block_quality(self): + return (self._scorer is not None + and self._scorer.supports_block_quality()) + + def max_quality(self): + # This matcher treats all postings in the list as one "block", so the + # block quality is the same as the quality of the entire list + if self._scorer: + return self._scorer.block_quality(self) + else: + return self.block_max_weight() + + def block_quality(self): + return self._scorer.block_quality(self) + + def skip_to_quality(self, minquality): + while self._i < len(self._ids) and self.block_quality() <= minquality: + self._i += 1 + return 0 + + def id(self): + return self._ids[self._i] + + def all_ids(self): + return iter(self._ids) + + def all_items(self): + values = self._values + if values is None: + values = repeat('') + + return izip(self._ids, values) + + def value(self): + if self._values: + v = self._values[self._i] + + if isinstance(v, list): + # This object supports "values" that are actually lists of + # value strings. This is to support combining the results of + # several different matchers into a single ListMatcher (see the + # TOO_MANY_CLAUSES functionality of MultiTerm). We combine the + # values here instead of combining them first and then making + # the ListMatcher to avoid wasting time combining values if the + # consumer never asks for them. 
+ assert len(v) > 0 + if len(v) == 1: + v = v[0] + else: + v = self._format.combine(v) + # Replace the list with the computed value string + self._values[self._i] = v + + return v + else: + return '' + + def value_as(self, astype): + decoder = self._format.decoder(astype) + return decoder(self.value()) + + def supports(self, astype): + return self._format.supports(astype) + + def next(self): + self._i += 1 + + def weight(self): + if self._all_weights: + return self._all_weights + elif self._weights: + return self._weights[self._i] + else: + return 1.0 + + def block_min_length(self): + return self._terminfo.min_length() + + def block_max_length(self): + return self._terminfo.max_length() + + def block_max_weight(self): + if self._all_weights: + return self._all_weights + elif self._weights: + return max(self._weights) + elif self._terminfo is not None: + return self._terminfo.max_weight() + else: + return 1.0 + + def score(self): + if self._scorer: + return self._scorer.score(self) + else: + return self.weight() + + +# Term/vector leaf posting matcher middleware + +class LeafMatcher(Matcher): + # Subclasses need to set + # self.scorer -- a Scorer object or None + # self.format -- Format object for the posting values + + def __repr__(self): + return "%s(%r, %s)" % (self.__class__.__name__, self.term(), + self.is_active()) + + def term(self): + return self._term + + def items_as(self, astype): + decoder = self.format.decoder(astype) + for id, value in self.all_items(): + yield (id, decoder(value)) + + def supports(self, astype): + return self.format.supports(astype) + + def value_as(self, astype): + decoder = self.format.decoder(astype) + return decoder(self.value()) + + def spans(self): + from whoosh.query.spans import Span + + if self.supports("characters"): + return [Span(pos, startchar=startchar, endchar=endchar) + for pos, startchar, endchar in self.value_as("characters")] + elif self.supports("positions"): + return [Span(pos) for pos in self.value_as("positions")] + else: + raise Exception("Field does not support positions (%r)" + % self.term()) + + def supports_block_quality(self): + return self.scorer and self.scorer.supports_block_quality() + + def max_quality(self): + return self.scorer.max_quality() + + def block_quality(self): + return self.scorer.block_quality(self) + + def score(self): + return self.scorer.score(self) diff --git a/src/whoosh/matching/wrappers.py b/src/whoosh/matching/wrappers.py new file mode 100644 index 0000000..0fadf08 --- /dev/null +++ b/src/whoosh/matching/wrappers.py @@ -0,0 +1,572 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import division + +from whoosh.compat import xrange +from whoosh.matching import mcore + + +class WrappingMatcher(mcore.Matcher): + """Base class for matchers that wrap sub-matchers. + """ + + def __init__(self, child, boost=1.0): + self.child = child + self.boost = boost + + def __repr__(self): + return "%s(%r, boost=%s)" % (self.__class__.__name__, self.child, + self.boost) + + def copy(self): + kwargs = {} + if hasattr(self, "boost"): + kwargs["boost"] = self.boost + return self.__class__(self.child.copy(), **kwargs) + + def depth(self): + return 1 + self.child.depth() + + def _replacement(self, newchild): + return self.__class__(newchild, boost=self.boost) + + def replace(self, minquality=0): + # Replace the child matcher + r = self.child.replace(minquality) + if r is not self.child: + # If the child changed, return a new wrapper on the new child + return self._replacement(r) + else: + return self + + def id(self): + return self.child.id() + + def all_ids(self): + return self.child.all_ids() + + def is_active(self): + return self.child.is_active() + + def reset(self): + self.child.reset() + + def children(self): + return [self.child] + + def supports(self, astype): + return self.child.supports(astype) + + def value(self): + return self.child.value() + + def value_as(self, astype): + return self.child.value_as(astype) + + def spans(self): + return self.child.spans() + + def skip_to(self, id): + return self.child.skip_to(id) + + def next(self): + self.child.next() + + def supports_block_quality(self): + return self.child.supports_block_quality() + + def skip_to_quality(self, minquality): + return self.child.skip_to_quality(minquality / self.boost) + + def max_quality(self): + return self.child.max_quality() * self.boost + + def block_quality(self): + return self.child.block_quality() * self.boost + + def weight(self): + return self.child.weight() * self.boost + + def score(self): + return self.child.score() * self.boost + + +class MultiMatcher(mcore.Matcher): + """Serializes the results of a list of sub-matchers. + """ + + def __init__(self, matchers, idoffsets, scorer=None, current=0): + """ + :param matchers: a list of Matcher objects. + :param idoffsets: a list of offsets corresponding to items in the + ``matchers`` list. 
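MultiMatcher walks its sub-matchers in order and adds the corresponding offset to each document number, which is how per-segment matchers are presented as a single matcher. A small sketch using ListMatcher, with a hypothetical offset of 100 for the second "segment":

    from whoosh.matching import ListMatcher, MultiMatcher

    a = ListMatcher([0, 2, 5])            # postings from the first segment
    b = ListMatcher([1, 3])               # postings from the second segment
    mm = MultiMatcher([a, b], [0, 100])   # second segment's doc numbers start at 100
    print(list(mm.all_ids()))             # [0, 2, 5, 101, 103]
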
+ """ + + self.matchers = matchers + self.offsets = idoffsets + self.scorer = scorer + self.current = current + self._next_matcher() + + def __repr__(self): + return "%s(%r, %r, current=%s)" % (self.__class__.__name__, + self.matchers, self.offsets, + self.current) + + def is_active(self): + return self.current < len(self.matchers) + + def reset(self): + for mr in self.matchers: + mr.reset() + self.current = 0 + + def children(self): + return [self.matchers[self.current]] + + def _next_matcher(self): + matchers = self.matchers + while (self.current < len(matchers) + and not matchers[self.current].is_active()): + self.current += 1 + + def copy(self): + return self.__class__([mr.copy() for mr in self.matchers], + self.offsets, current=self.current) + + def depth(self): + if self.is_active(): + return 1 + max(mr.depth() for mr in self.matchers[self.current:]) + else: + return 0 + + def replace(self, minquality=0): + m = self + if minquality: + # Skip sub-matchers that don't have a high enough max quality to + # contribute + while (m.is_active() + and m.matchers[m.current].max_quality() < minquality): + m = self.__class__(self.matchers, self.offsets, self.scorer, + m.current + 1) + m._next_matcher() + + if not m.is_active(): + return mcore.NullMatcher() + + # TODO: Possible optimization: if the last matcher is current, replace + # this with the last matcher, but wrap it with a matcher that adds the + # offset. Have to check whether that's actually faster, though. + return m + + def id(self): + current = self.current + return self.matchers[current].id() + self.offsets[current] + + def all_ids(self): + offsets = self.offsets + for i, mr in enumerate(self.matchers): + for id in mr.all_ids(): + yield id + offsets[i] + + def spans(self): + return self.matchers[self.current].spans() + + def supports(self, astype): + return self.matchers[self.current].supports(astype) + + def value(self): + return self.matchers[self.current].value() + + def value_as(self, astype): + return self.matchers[self.current].value_as(astype) + + def next(self): + if not self.is_active(): + raise mcore.ReadTooFar + + self.matchers[self.current].next() + if not self.matchers[self.current].is_active(): + self._next_matcher() + + def skip_to(self, id): + if not self.is_active(): + raise mcore.ReadTooFar + if id <= self.id(): + return + + matchers = self.matchers + offsets = self.offsets + r = False + + while self.current < len(matchers) and id > self.id(): + mr = matchers[self.current] + sr = mr.skip_to(id - offsets[self.current]) + r = sr or r + if mr.is_active(): + break + + self._next_matcher() + + return r + + def supports_block_quality(self): + return all(mr.supports_block_quality() for mr + in self.matchers[self.current:]) + + def max_quality(self): + return max(m.max_quality() for m in self.matchers[self.current:]) + + def block_quality(self): + return self.matchers[self.current].block_quality() + + def weight(self): + return self.matchers[self.current].weight() + + def score(self): + return self.scorer.score(self) + + +def ExcludeMatcher(child, excluded, boost=1.0): + return FilterMatcher(child, excluded, exclude=True, boost=boost) + + +class FilterMatcher(WrappingMatcher): + """Filters the postings from the wrapped based on whether the IDs are + present in or absent from a set. + """ + + def __init__(self, child, ids, exclude=False, boost=1.0): + """ + :param child: the child matcher. + :param ids: a set of IDs to filter by. 
+ :param exclude: by default, only IDs from the wrapped matcher that are + **in** the set are used. If this argument is True, only IDs from + the wrapped matcher that are **not in** the set are used. + """ + + super(FilterMatcher, self).__init__(child) + self._ids = ids + self._exclude = exclude + self.boost = boost + self._find_next() + + def __repr__(self): + return "%s(%r, %r, %r, boost=%s)" % (self.__class__.__name__, + self.child, self._ids, + self._exclude, self.boost) + + def reset(self): + self.child.reset() + self._find_next() + + def copy(self): + return self.__class__(self.child.copy(), self._ids, self._exclude, + boost=self.boost) + + def _replacement(self, newchild): + return self.__class__(newchild, self._ids, exclude=self._exclude, + boost=self.boost) + + def _find_next(self): + child = self.child + ids = self._ids + r = False + + if self._exclude: + while child.is_active() and child.id() in ids: + r = child.next() or r + else: + while child.is_active() and child.id() not in ids: + r = child.next() or r + return r + + def next(self): + self.child.next() + self._find_next() + + def skip_to(self, id): + self.child.skip_to(id) + self._find_next() + + def all_ids(self): + ids = self._ids + if self._exclude: + return (id for id in self.child.all_ids() if id not in ids) + else: + return (id for id in self.child.all_ids() if id in ids) + + def all_items(self): + ids = self._ids + if self._exclude: + return (item for item in self.child.all_items() + if item[0] not in ids) + else: + return (item for item in self.child.all_items() if item[0] in ids) + + +class InverseMatcher(WrappingMatcher): + """Synthetic matcher, generates postings that are NOT present in the + wrapped matcher. + """ + + def __init__(self, child, limit, missing=None, weight=1.0, id=0): + super(InverseMatcher, self).__init__(child) + self.limit = limit + self._weight = weight + self.missing = missing or (lambda id: False) + self._id = id + self._find_next() + + def copy(self): + return self.__class__(self.child.copy(), self.limit, + weight=self._weight, missing=self.missing, + id=self._id) + + def _replacement(self, newchild): + return self.__class__(newchild, self.limit, missing=self.missing, + weight=self._weight, id=self._id) + + def is_active(self): + return self._id < self.limit + + def reset(self): + self.child.reset() + self._id = 0 + self._find_next() + + def supports_block_quality(self): + return False + + def _find_next(self): + child = self.child + missing = self.missing + + # If the current docnum isn't missing and the child matcher is + # exhausted (so we don't have to worry about skipping its matches), we + # don't have to do anything + if not child.is_active() and not missing(self._id): + return + + # Skip missing documents + while self._id < self.limit and missing(self._id): + self._id += 1 + + # Catch the child matcher up to where this matcher is + if child.is_active() and child.id() < self._id: + child.skip_to(self._id) + + # While self._id is missing or is in the child matcher, increase it + while child.is_active() and self._id < self.limit: + if missing(self._id): + self._id += 1 + continue + + if self._id == child.id(): + self._id += 1 + child.next() + continue + + break + + def id(self): + return self._id + + def all_ids(self): + return mcore.Matcher.all_ids(self) + + def next(self): + if self._id >= self.limit: + raise mcore.ReadTooFar + self._id += 1 + self._find_next() + + def skip_to(self, id): + if self._id >= self.limit: + raise mcore.ReadTooFar + if id < self._id: + return + self._id = 
id + self._find_next() + + def weight(self): + return self._weight + + def score(self): + return self._weight + + +class RequireMatcher(WrappingMatcher): + """Matches postings that are in both sub-matchers, but only uses scores + from the first. + """ + + def __init__(self, a, b): + from whoosh.matching.binary import IntersectionMatcher + + self.a = a + self.b = b + WrappingMatcher.__init__(self, IntersectionMatcher(a, b)) + + def copy(self): + return self.__class__(self.a.copy(), self.b.copy()) + + def supports_block_quality(self): + return self.a.supports_block_quality() + + def replace(self, minquality=0): + if not self.child.is_active(): + # If one of the sub-matchers is inactive, go inactive + return mcore.NullMatcher() + elif minquality and self.a.max_quality() < minquality: + # If the required matcher doesn't have a high enough max quality + # to possibly contribute, return an inactive matcher + return mcore.NullMatcher() + + new_a = self.a.replace(minquality) + new_b = self.b.replace() + if not new_a.is_active(): + return mcore.NullMatcher() + elif new_a is not self.a or new_b is not self.b: + # If one of the sub-matchers changed, return a new Require + return self.__class__(new_a, self.b) + else: + return self + + def max_quality(self): + return self.a.max_quality() + + def block_quality(self): + return self.a.block_quality() + + def skip_to_quality(self, minquality): + skipped = self.a.skip_to_quality(minquality) + self.child._find_next() + return skipped + + def weight(self): + return self.a.weight() + + def score(self): + return self.a.score() + + def supports(self, astype): + return self.a.supports(astype) + + def value(self): + return self.a.value() + + def value_as(self, astype): + return self.a.value_as(astype) + + +class ConstantScoreWrapperMatcher(WrappingMatcher): + def __init__(self, child, score=1.0): + WrappingMatcher.__init__(self, child) + self._score = score + + def copy(self): + return self.__class__(self.child.copy(), score=self._score) + + def _replacement(self, newchild): + return self.__class__(newchild, score=self._score) + + def max_quality(self): + return self._score + + def block_quality(self): + return self._score + + def score(self): + return self._score + + +class SingleTermMatcher(WrappingMatcher): + """Makes a tree of matchers act as if they were a matcher for a single + term for the purposes of "what terms are matching?" questions. + """ + + def __init__(self, child, term): + WrappingMatcher.__init__(self, child) + self._term = term + + def term(self): + return self._term + + def replace(self, minquality=0): + return self + + +class CoordMatcher(WrappingMatcher): + """Modifies the computed score to penalize documents that don't match all + terms in the matcher tree. + + Because this matcher modifies the score, it may give unexpected results + when compared to another matcher returning the unmodified score. + """ + + def __init__(self, child, scale=1.0): + WrappingMatcher.__init__(self, child) + self._termcount = len(list(child.term_matchers())) + self._maxqual = child.max_quality() + self._scale = scale + + def _replacement(self, newchild): + return self.__class__(newchild, scale=self._scale) + + def _sqr(self, score, matching): + # This is the "SQR" (Short Query Ranking) function used by Apple's old + # V-twin search library, described in the paper "V-Twin: A Lightweight + # Engine for Interactive Use". 
+ # + # http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.56.1916 + + # score - document score using the current weighting function + # matching - number of matching terms in the current document + termcount = self._termcount # Number of terms in this tree + scale = self._scale # Scaling factor + + sqr = ((score + ((matching - 1) / (termcount - scale) ** 2)) + * ((termcount - 1) / termcount)) + return sqr + + def max_quality(self): + return self._sqr(self.child.max_quality(), self._termcount) + + def block_quality(self): + return self._sqr(self.child.block_quality(), self._termcount) + + def score(self): + child = self.child + + score = child.score() + matching = 0 + for _ in child.matching_terms(child.id()): + matching += 1 + + return self._sqr(score, matching) diff --git a/src/whoosh/multiproc.py b/src/whoosh/multiproc.py new file mode 100644 index 0000000..54109a8 --- /dev/null +++ b/src/whoosh/multiproc.py @@ -0,0 +1,381 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import with_statement +import os +from multiprocessing import Process, Queue, cpu_count + +from whoosh.compat import xrange, iteritems, pickle +from whoosh.codec import base +from whoosh.writing import PostingPool, SegmentWriter +from whoosh.externalsort import imerge +from whoosh.util import random_name + + +def finish_subsegment(writer, k=64): + # Tell the pool to finish up the current file + writer.pool.save() + # Tell the pool to merge any and all runs in the pool until there + # is only one run remaining. "k" is an optional parameter passed + # from the parent which sets the maximum number of files to open + # while reducing. 
+ writer.pool.reduce_to(1, k) + + # The filename of the single remaining run + runname = writer.pool.runs[0] + # The indexed field names + fieldnames = writer.pool.fieldnames + # The segment object (parent can use this to re-open the files created + # by the sub-writer) + segment = writer._partial_segment() + + return runname, fieldnames, segment + + +# Multiprocessing Writer + +class SubWriterTask(Process): + # This is a Process object that takes "jobs" off a job Queue, processes + # them, and when it's done, puts a summary of its work on a results Queue + + def __init__(self, storage, indexname, jobqueue, resultqueue, kwargs, + multisegment): + Process.__init__(self) + self.storage = storage + self.indexname = indexname + self.jobqueue = jobqueue + self.resultqueue = resultqueue + self.kwargs = kwargs + self.multisegment = multisegment + self.running = True + + def run(self): + # This is the main loop of the process. OK, so the way this works is + # kind of brittle and stupid, but I had to figure out how to use the + # multiprocessing module, work around bugs, and address performance + # issues, so there is at least some reasoning behind some of this + + # The "parent" task farms individual documents out to the subtasks for + # indexing. You could pickle the actual documents and put them in the + # queue, but that is not very performant. Instead, we assume the tasks + # share a filesystem and use that to pass the information around. The + # parent task writes a certain number of documents to a file, then puts + # the filename on the "job queue". A subtask gets the filename off the + # queue and reads through the file processing the documents. + + jobqueue = self.jobqueue + resultqueue = self.resultqueue + multisegment = self.multisegment + + # Open a placeholder object representing the index + ix = self.storage.open_index(self.indexname) + # Open a writer for the index. The _lk=False parameter means to not try + # to lock the index (the parent object that started me takes care of + # locking the index) + writer = self.writer = SegmentWriter(ix, _lk=False, **self.kwargs) + + # If the parent task calls cancel() on me, it will set self.running to + # False, so I'll notice the next time through the loop + while self.running: + # Take an object off the job queue + jobinfo = jobqueue.get() + # If the object is None, it means the parent task wants me to + # finish up + if jobinfo is None: + break + # The object from the queue is a tuple of (filename, + # number_of_docs_in_file). Pass those two pieces of information as + # arguments to _process_file(). + self._process_file(*jobinfo) + + if not self.running: + # I was cancelled, so I'll cancel my underlying writer + writer.cancel() + else: + if multisegment: + # Actually finish the segment and return it with no run + runname = None + fieldnames = writer.pool.fieldnames + segment = writer._finalize_segment() + else: + # Merge all runs in the writer's pool into one run, close the + # segment, and return the run name and the segment + k = self.kwargs.get("k", 64) + runname, fieldnames, segment = finish_subsegment(writer, k) + + # Put the results (the run filename and the segment object) on the + # result queue + resultqueue.put((runname, fieldnames, segment), timeout=5) + + def _process_file(self, filename, doc_count): + # This method processes a "job file" written out by the parent task. A + # job file is a series of pickled (code, arguments) tuples. 
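For orientation, this machinery is what backs the ``procs`` argument to ``Index.writer()``. A rough usage sketch (the schema, field names, and document count are made up):

    import os
    from whoosh import index
    from whoosh.fields import Schema, TEXT, ID

    schema = Schema(path=ID(stored=True), content=TEXT)
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)

    # procs > 1 selects MpWriter; multisegment=True keeps one segment per
    # process instead of merging the sub-writers' output into a single segment
    writer = ix.writer(procs=4, batchsize=256, multisegment=True)
    for i in range(1000):
        writer.add_document(path=u"/doc/%d" % i, content=u"document number %d" % i)
    writer.commit()
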
Currently + # the only command codes is 0=add_document + + writer = self.writer + tempstorage = writer.temp_storage() + + load = pickle.load + with tempstorage.open_file(filename).raw_file() as f: + for _ in xrange(doc_count): + # Load the next pickled tuple from the file + code, args = load(f) + assert code == 0 + writer.add_document(**args) + # Remove the job file + tempstorage.delete_file(filename) + + def cancel(self): + self.running = False + + +class MpWriter(SegmentWriter): + def __init__(self, ix, procs=None, batchsize=100, subargs=None, + multisegment=False, **kwargs): + # This is the "main" writer that will aggregate the results created by + # the sub-tasks + SegmentWriter.__init__(self, ix, **kwargs) + + self.procs = procs or cpu_count() + # The maximum number of documents in each job file submitted to the + # sub-tasks + self.batchsize = batchsize + # You can use keyword arguments or the "subargs" argument to pass + # keyword arguments to the sub-writers + self.subargs = subargs if subargs else kwargs + # If multisegment is True, don't merge the segments created by the + # sub-writers, just add them directly to the TOC + self.multisegment = multisegment + + # A list to hold the sub-task Process objects + self.tasks = [] + # A queue to pass the filenames of job files to the sub-tasks + self.jobqueue = Queue(self.procs * 4) + # A queue to get back the final results of the sub-tasks + self.resultqueue = Queue() + # A buffer for documents before they are flushed to a job file + self.docbuffer = [] + + self._grouping = 0 + self._added_sub = False + + def _new_task(self): + task = SubWriterTask(self.storage, self.indexname, + self.jobqueue, self.resultqueue, self.subargs, + self.multisegment) + self.tasks.append(task) + task.start() + return task + + def _enqueue(self): + # Flush the documents stored in self.docbuffer to a file and put the + # filename on the job queue + docbuffer = self.docbuffer + dump = pickle.dump + length = len(docbuffer) + + filename = "%s.doclist" % random_name() + with self.temp_storage().create_file(filename).raw_file() as f: + for item in docbuffer: + dump(item, f, -1) + + if len(self.tasks) < self.procs: + self._new_task() + jobinfo = (filename, length) + self.jobqueue.put(jobinfo) + self.docbuffer = [] + + def cancel(self): + try: + for task in self.tasks: + task.cancel() + finally: + SegmentWriter.cancel(self) + + def start_group(self): + self._grouping += 1 + + def end_group(self): + if not self._grouping: + raise Exception("Unbalanced end_group") + self._grouping -= 1 + + def add_document(self, **fields): + # Add the document to the docbuffer + self.docbuffer.append((0, fields)) + # If the buffer is full, flush it to the job queue + if not self._grouping and len(self.docbuffer) >= self.batchsize: + self._enqueue() + self._added_sub = True + + def _read_and_renumber_run(self, path, offset): + # Note that SortingPool._read_run() automatically deletes the run file + # when it's finished + + gen = self.pool._read_run(path) + # If offset is 0, just return the items unchanged + if not offset: + return gen + else: + # Otherwise, add the offset to each docnum + return ((fname, text, docnum + offset, weight, value) + for fname, text, docnum, weight, value in gen) + + def commit(self, mergetype=None, optimize=None, merge=None): + if self._added_sub: + # If documents have been added to sub-writers, use the parallel + # merge commit code + self._commit(mergetype, optimize, merge) + else: + # Otherwise, just do a regular-old commit + SegmentWriter.commit(self, 
mergetype=mergetype, optimize=optimize, + merge=merge) + + def _commit(self, mergetype, optimize, merge): + # Index the remaining documents in the doc buffer + if self.docbuffer: + self._enqueue() + # Tell the tasks to finish + for task in self.tasks: + self.jobqueue.put(None) + + # Merge existing segments + finalsegments = self._merge_segments(mergetype, optimize, merge) + + # Wait for the subtasks to finish + for task in self.tasks: + task.join() + + # Pull a (run_file_name, fieldnames, segment) tuple off the result + # queue for each sub-task, representing the final results of the task + results = [] + for task in self.tasks: + results.append(self.resultqueue.get(timeout=5)) + + if self.multisegment: + # If we're not merging the segments, we don't care about the runname + # and fieldnames in the results... just pull out the segments and + # add them to the list of final segments + finalsegments += [s for _, _, s in results] + if self._added: + finalsegments.append(self._finalize_segment()) + else: + self._close_segment() + assert self.perdocwriter.is_closed + else: + # Merge the posting sources from the sub-writers and my + # postings into this writer + self._merge_subsegments(results, mergetype) + self._close_segment() + self._assemble_segment() + finalsegments.append(self.get_segment()) + assert self.perdocwriter.is_closed + + self._commit_toc(finalsegments) + self._finish() + + def _merge_subsegments(self, results, mergetype): + schema = self.schema + schemanames = set(schema.names()) + storage = self.storage + codec = self.codec + sources = [] + + # If information was added to this writer the conventional (e.g. + # through add_reader or merging segments), add it as an extra source + if self._added: + sources.append(self.pool.iter_postings()) + + pdrs = [] + for runname, fieldnames, segment in results: + fieldnames = set(fieldnames) | schemanames + pdr = codec.per_document_reader(storage, segment) + pdrs.append(pdr) + basedoc = self.docnum + docmap = self.write_per_doc(fieldnames, pdr) + assert docmap is None + + items = self._read_and_renumber_run(runname, basedoc) + sources.append(items) + + # Create a MultiLengths object combining the length files from the + # subtask segments + self.perdocwriter.close() + pdrs.insert(0, self.per_document_reader()) + mpdr = base.MultiPerDocumentReader(pdrs) + + try: + # Merge the iterators into the field writer + self.fieldwriter.add_postings(schema, mpdr, imerge(sources)) + finally: + mpdr.close() + self._added = True + + +class SerialMpWriter(MpWriter): + # A non-parallel version of the MpWriter for testing purposes + + def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs): + SegmentWriter.__init__(self, ix, **kwargs) + + self.procs = procs or cpu_count() + self.batchsize = batchsize + self.subargs = subargs if subargs else kwargs + self.tasks = [SegmentWriter(ix, _lk=False, **self.subargs) + for _ in xrange(self.procs)] + self.pointer = 0 + self._added_sub = False + + def add_document(self, **fields): + self.tasks[self.pointer].add_document(**fields) + self.pointer = (self.pointer + 1) % len(self.tasks) + self._added_sub = True + + def _commit(self, mergetype, optimize, merge): + # Pull a (run_file_name, segment) tuple off the result queue for each + # sub-task, representing the final results of the task + + # Merge existing segments + finalsegments = self._merge_segments(mergetype, optimize, merge) + results = [] + for writer in self.tasks: + results.append(finish_subsegment(writer)) + + self._merge_subsegments(results, 
mergetype) + self._close_segment() + self._assemble_segment() + finalsegments.append(self.get_segment()) + + self._commit_toc(finalsegments) + self._finish() + + +# For compatibility with old multiproc module +class MultiSegmentWriter(MpWriter): + def __init__(self, *args, **kwargs): + MpWriter.__init__(self, *args, **kwargs) + self.multisegment = True diff --git a/src/whoosh/qparser/__init__.py b/src/whoosh/qparser/__init__.py new file mode 100644 index 0000000..a61f905 --- /dev/null +++ b/src/whoosh/qparser/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.qparser.default import * +from whoosh.qparser.plugins import * +from whoosh.qparser.syntax import * diff --git a/src/whoosh/qparser/common.py b/src/whoosh/qparser/common.py new file mode 100644 index 0000000..39e7087 --- /dev/null +++ b/src/whoosh/qparser/common.py @@ -0,0 +1,65 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module contains common utility objects/functions for the other query +parser modules. +""" + +import sys + +from whoosh.compat import string_type + + +class QueryParserError(Exception): + def __init__(self, cause, msg=None): + super(QueryParserError, self).__init__(str(cause)) + self.cause = cause + + +def get_single_text(field, text, **kwargs): + """Returns the first token from an analyzer's output. + """ + + for t in field.process_text(text, mode="query", **kwargs): + return t + + +def attach(q, stxnode): + if q: + try: + q.startchar = stxnode.startchar + q.endchar = stxnode.endchar + except AttributeError: + raise AttributeError("Can't set attribute on %s" + % q.__class__.__name__) + return q + + +def print_debug(level, msg, out=sys.stderr): + if level: + out.write("%s%s\n" % (" " * (level - 1), msg)) diff --git a/src/whoosh/qparser/dateparse.py b/src/whoosh/qparser/dateparse.py new file mode 100644 index 0000000..1c4d511 --- /dev/null +++ b/src/whoosh/qparser/dateparse.py @@ -0,0 +1,922 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
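The classes that follow build up the free-form date parser behind the query parser's date plugin. For orientation, the typical end-user entry point looks roughly like this (assuming an index with a DATETIME field named "date" and a TEXT field named "content"):

    from whoosh import index
    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin

    ix = index.open_dir("indexdir")            # assumed existing index
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(DateParserPlugin())
    q = parser.parse(u"date:'last tuesday'")
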
+ +import re +import sys +from datetime import datetime, timedelta + +from whoosh.compat import string_type, iteritems +from whoosh.qparser import plugins, syntax +from whoosh.qparser.taggers import Tagger +from whoosh.support.relativedelta import relativedelta +from whoosh.util.text import rcompile +from whoosh.util.times import adatetime, timespan +from whoosh.util.times import fill_in, is_void, relative_days +from whoosh.util.times import TimeError + + +class DateParseError(Exception): + "Represents an error in parsing date text." + + +# Utility functions + +def print_debug(level, msg, *args): + if level > 0: + print((" " * (level - 1)) + (msg % args)) + + +# Parser element objects + +class Props(object): + """A dumb little object that just puts copies a dictionary into attibutes + so I can use dot syntax instead of square bracket string item lookup and + save a little bit of typing. Used by :class:`Regex`. + """ + + def __init__(self, **args): + self.__dict__ = args + + def __repr__(self): + return repr(self.__dict__) + + def get(self, key, default=None): + return self.__dict__.get(key, default) + + +class ParserBase(object): + """Base class for date parser elements. + """ + + def to_parser(self, e): + if isinstance(e, string_type): + return Regex(e) + else: + return e + + def parse(self, text, dt, pos=0, debug=-9999): + raise NotImplementedError + + def date_from(self, text, dt=None, pos=0, debug=-9999): + if dt is None: + dt = datetime.now() + + d, pos = self.parse(text, dt, pos, debug + 1) + return d + + +class MultiBase(ParserBase): + """Base class for date parser elements such as Sequence and Bag that + have sub-elements. + """ + + def __init__(self, elements, name=None): + """ + :param elements: the sub-elements to match. + :param name: a name for this element (for debugging purposes only). + """ + + self.elements = [self.to_parser(e) for e in elements] + self.name = name + + def __repr__(self): + return "%s<%s>%r" % (self.__class__.__name__, self.name or '', + self.elements) + + +class Sequence(MultiBase): + """Merges the dates parsed by a sequence of sub-elements. + """ + + def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", name=None, + progressive=False): + """ + :param elements: the sequence of sub-elements to parse. + :param sep: a separator regular expression to match between elements, + or None to not have separators. + :param name: a name for this element (for debugging purposes only). + :param progressive: if True, elements after the first do not need to + match. That is, for elements (a, b, c) and progressive=True, the + sequence matches like ``a[b[c]]``. 
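Sequence and the other MultiBase subclasses below act as small parser combinators over adatetime objects. A toy sketch, using the Regex element class defined later in this module (the patterns are simplified, not the library's real locale setup):

    from datetime import datetime
    from whoosh.qparser.dateparse import Regex, Sequence

    year = Regex("(?P<year>[0-9]{4})")
    month = Regex("(?P<month>[0-9]{1,2})")
    day = Regex("(?P<day>[0-9]{1,2})")
    ymd = Sequence((year, month, day), name="ymd")

    # Fills in an adatetime; fields not mentioned stay None
    d = ymd.date_from(u"2012 12 25", datetime.now())
    print("%s %s %s" % (d.year, d.month, d.day))    # 2012 12 25
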
+ """ + + super(Sequence, self).__init__(elements, name) + self.sep_pattern = sep + if sep: + self.sep_expr = rcompile(sep, re.IGNORECASE) + else: + self.sep_expr = None + self.progressive = progressive + + def parse(self, text, dt, pos=0, debug=-9999): + d = adatetime() + first = True + foundall = False + failed = False + + print_debug(debug, "Seq %s sep=%r text=%r", self.name, + self.sep_pattern, text[pos:]) + for e in self.elements: + print_debug(debug, "Seq %s text=%r", self.name, text[pos:]) + if self.sep_expr and not first: + print_debug(debug, "Seq %s looking for sep", self.name) + m = self.sep_expr.match(text, pos) + if m: + pos = m.end() + else: + print_debug(debug, "Seq %s didn't find sep", self.name) + break + + print_debug(debug, "Seq %s trying=%r at=%s", self.name, e, pos) + + try: + at, newpos = e.parse(text, dt, pos=pos, debug=debug + 1) + except TimeError: + failed = True + break + + print_debug(debug, "Seq %s result=%r", self.name, at) + if not at: + break + pos = newpos + + print_debug(debug, "Seq %s adding=%r to=%r", self.name, at, d) + try: + d = fill_in(d, at) + except TimeError: + print_debug(debug, "Seq %s Error in fill_in", self.name) + failed = True + break + print_debug(debug, "Seq %s filled date=%r", self.name, d) + + first = False + else: + foundall = True + + if not failed and (foundall or (not first and self.progressive)): + print_debug(debug, "Seq %s final=%r", self.name, d) + return (d, pos) + else: + print_debug(debug, "Seq %s failed", self.name) + return (None, None) + + +class Combo(Sequence): + """Parses a sequence of elements in order and combines the dates parsed + by the sub-elements somehow. The default behavior is to accept two dates + from the sub-elements and turn them into a range. + """ + + def __init__(self, elements, fn=None, sep="(\\s+|\\s*,\\s*)", min=2, max=2, + name=None): + """ + :param elements: the sequence of sub-elements to parse. + :param fn: a function to run on all dates found. It should return a + datetime, adatetime, or timespan object. If this argument is None, + the default behavior accepts two dates and returns a timespan. + :param sep: a separator regular expression to match between elements, + or None to not have separators. + :param min: the minimum number of dates required from the sub-elements. + :param max: the maximum number of dates allowed from the sub-elements. + :param name: a name for this element (for debugging purposes only). 
+ """ + + super(Combo, self).__init__(elements, sep=sep, name=name) + self.fn = fn + self.min = min + self.max = max + + def parse(self, text, dt, pos=0, debug=-9999): + dates = [] + first = True + + print_debug(debug, "Combo %s sep=%r text=%r", self.name, + self.sep_pattern, text[pos:]) + for e in self.elements: + if self.sep_expr and not first: + print_debug(debug, "Combo %s looking for sep at %r", + self.name, text[pos:]) + m = self.sep_expr.match(text, pos) + if m: + pos = m.end() + else: + print_debug(debug, "Combo %s didn't find sep", self.name) + return (None, None) + + print_debug(debug, "Combo %s trying=%r", self.name, e) + try: + at, pos = e.parse(text, dt, pos, debug + 1) + except TimeError: + at, pos = None, None + + print_debug(debug, "Combo %s result=%r", self.name, at) + if at is None: + return (None, None) + + first = False + if is_void(at): + continue + if len(dates) == self.max: + print_debug(debug, "Combo %s length > %s", self.name, self.max) + return (None, None) + dates.append(at) + + print_debug(debug, "Combo %s dates=%r", self.name, dates) + if len(dates) < self.min: + print_debug(debug, "Combo %s length < %s", self.name, self.min) + return (None, None) + + return (self.dates_to_timespan(dates), pos) + + def dates_to_timespan(self, dates): + if self.fn: + return self.fn(dates) + elif len(dates) == 2: + return timespan(dates[0], dates[1]) + else: + raise DateParseError("Don't know what to do with %r" % (dates,)) + + +class Choice(MultiBase): + """Returns the date from the first of its sub-elements that matches. + """ + + def parse(self, text, dt, pos=0, debug=-9999): + print_debug(debug, "Choice %s text=%r", self.name, text[pos:]) + for e in self.elements: + print_debug(debug, "Choice %s trying=%r", self.name, e) + + try: + d, newpos = e.parse(text, dt, pos, debug + 1) + except TimeError: + d, newpos = None, None + if d: + print_debug(debug, "Choice %s matched", self.name) + return (d, newpos) + print_debug(debug, "Choice %s no match", self.name) + return (None, None) + + +class Bag(MultiBase): + """Parses its sub-elements in any order and merges the dates. + """ + + def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", onceper=True, + requireall=False, allof=None, anyof=None, name=None): + """ + :param elements: the sub-elements to parse. + :param sep: a separator regular expression to match between elements, + or None to not have separators. + :param onceper: only allow each element to match once. + :param requireall: if True, the sub-elements can match in any order, + but they must all match. + :param allof: a list of indexes into the list of elements. When this + argument is not None, this element matches only if all the + indicated sub-elements match. + :param allof: a list of indexes into the list of elements. When this + argument is not None, this element matches only if any of the + indicated sub-elements match. + :param name: a name for this element (for debugging purposes only). 
+ """ + + super(Bag, self).__init__(elements, name) + self.sep_expr = rcompile(sep, re.IGNORECASE) + self.onceper = onceper + self.requireall = requireall + self.allof = allof + self.anyof = anyof + + def parse(self, text, dt, pos=0, debug=-9999): + first = True + d = adatetime() + seen = [False] * len(self.elements) + + while True: + newpos = pos + print_debug(debug, "Bag %s text=%r", self.name, text[pos:]) + if not first: + print_debug(debug, "Bag %s looking for sep", self.name) + m = self.sep_expr.match(text, pos) + if m: + newpos = m.end() + else: + print_debug(debug, "Bag %s didn't find sep", self.name) + break + + for i, e in enumerate(self.elements): + print_debug(debug, "Bag %s trying=%r", self.name, e) + + try: + at, xpos = e.parse(text, dt, newpos, debug + 1) + except TimeError: + at, xpos = None, None + + print_debug(debug, "Bag %s result=%r", self.name, at) + if at: + if self.onceper and seen[i]: + return (None, None) + + d = fill_in(d, at) + newpos = xpos + seen[i] = True + break + else: + break + + pos = newpos + if self.onceper and all(seen): + break + + first = False + + if (not any(seen) + or (self.allof and not all(seen[pos] for pos in self.allof)) + or (self.anyof and not any(seen[pos] for pos in self.anyof)) + or (self.requireall and not all(seen))): + return (None, None) + + print_debug(debug, "Bag %s final=%r", self.name, d) + return (d, pos) + + +class Optional(ParserBase): + """Wraps a sub-element to indicate that the sub-element is optional. + """ + + def __init__(self, element): + self.element = self.to_parser(element) + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.element) + + def parse(self, text, dt, pos=0, debug=-9999): + try: + d, pos = self.element.parse(text, dt, pos, debug + 1) + except TimeError: + d, pos = None, None + + if d: + return (d, pos) + else: + return (adatetime(), pos) + + +class ToEnd(ParserBase): + """Wraps a sub-element and requires that the end of the sub-element's match + be the end of the text. + """ + + def __init__(self, element): + self.element = element + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.element) + + def parse(self, text, dt, pos=0, debug=-9999): + try: + d, pos = self.element.parse(text, dt, pos, debug + 1) + except TimeError: + d, pos = None, None + + if d and pos == len(text): + return (d, pos) + else: + return (None, None) + + +class Regex(ParserBase): + """Matches a regular expression and maps named groups in the pattern to + datetime attributes using a function or overridden method. + + There are two points at which you can customize the behavior of this class, + either by supplying functions to the initializer or overriding methods. + + * The ``modify`` function or ``modify_props`` method takes a ``Props`` + object containing the named groups and modifies its values (in place). + * The ``fn`` function or ``props_to_date`` method takes a ``Props`` object + and the base datetime and returns an adatetime/datetime. 
+ """ + + fn = None + modify = None + + def __init__(self, pattern, fn=None, modify=None): + self.pattern = pattern + self.expr = rcompile(pattern, re.IGNORECASE) + self.fn = fn + self.modify = modify + + def __repr__(self): + return "<%r>" % (self.pattern,) + + def parse(self, text, dt, pos=0, debug=-9999): + m = self.expr.match(text, pos) + if not m: + return (None, None) + + props = self.extract(m) + self.modify_props(props) + + try: + d = self.props_to_date(props, dt) + except TimeError: + d = None + + if d: + return (d, m.end()) + else: + return (None, None) + + def extract(self, match): + d = match.groupdict() + for key, value in iteritems(d): + try: + value = int(value) + d[key] = value + except (ValueError, TypeError): + pass + return Props(**d) + + def modify_props(self, props): + if self.modify: + self.modify(props) + + def props_to_date(self, props, dt): + if self.fn: + return self.fn(props, dt) + else: + args = {} + for key in adatetime.units: + args[key] = props.get(key) + return adatetime(**args) + + +class Month(Regex): + def __init__(self, *patterns): + self.patterns = patterns + self.exprs = [rcompile(pat, re.IGNORECASE) for pat in self.patterns] + + self.pattern = ("(?P" + + "|".join("(%s)" % pat for pat in self.patterns) + + ")") + self.expr = rcompile(self.pattern, re.IGNORECASE) + + def modify_props(self, p): + text = p.month + for i, expr in enumerate(self.exprs): + m = expr.match(text) + if m: + p.month = i + 1 + break + + +class PlusMinus(Regex): + def __init__(self, years, months, weeks, days, hours, minutes, seconds): + rel_years = "((?P[0-9]+) *(%s))?" % years + rel_months = "((?P[0-9]+) *(%s))?" % months + rel_weeks = "((?P[0-9]+) *(%s))?" % weeks + rel_days = "((?P[0-9]+) *(%s))?" % days + rel_hours = "((?P[0-9]+) *(%s))?" % hours + rel_mins = "((?P[0-9]+) *(%s))?" % minutes + rel_secs = "((?P[0-9]+) *(%s))?" % seconds + + self.pattern = ("(?P
<dir>
[+-]) *%s *%s *%s *%s *%s *%s *%s(?=(\\W|$))" + % (rel_years, rel_months, rel_weeks, rel_days, + rel_hours, rel_mins, rel_secs)) + self.expr = rcompile(self.pattern, re.IGNORECASE) + + def props_to_date(self, p, dt): + if p.dir == "-": + dir = -1 + else: + dir = 1 + + delta = relativedelta(years=(p.get("years") or 0) * dir, + months=(p.get("months") or 0) * dir, + weeks=(p.get("weeks") or 0) * dir, + days=(p.get("days") or 0) * dir, + hours=(p.get("hours") or 0) * dir, + minutes=(p.get("mins") or 0) * dir, + seconds=(p.get("secs") or 0) * dir) + return dt + delta + + +class Daynames(Regex): + def __init__(self, next, last, daynames): + self.next_pattern = next + self.last_pattern = last + self._dayname_exprs = tuple(rcompile(pat, re.IGNORECASE) + for pat in daynames) + dn_pattern = "|".join(daynames) + self.pattern = ("(?P%s|%s) +(?P%s)(?=(\\W|$))" + % (next, last, dn_pattern)) + self.expr = rcompile(self.pattern, re.IGNORECASE) + + def props_to_date(self, p, dt): + if re.match(p.dir, self.last_pattern): + dir = -1 + else: + dir = 1 + + for daynum, expr in enumerate(self._dayname_exprs): + m = expr.match(p.day) + if m: + break + current_daynum = dt.weekday() + days_delta = relative_days(current_daynum, daynum, dir) + + d = dt.date() + timedelta(days=days_delta) + return adatetime(year=d.year, month=d.month, day=d.day) + + +class Time12(Regex): + def __init__(self): + self.pattern = ("(?P[1-9]|10|11|12)(:(?P[0-5][0-9])" + "(:(?P[0-5][0-9])(\\.(?P[0-9]{1,5}))?)?)?" + "\\s*(?Pam|pm)(?=(\\W|$))") + self.expr = rcompile(self.pattern, re.IGNORECASE) + + def props_to_date(self, p, dt): + isam = p.ampm.lower().startswith("a") + + if p.hour == 12: + if isam: + hr = 0 + else: + hr = 12 + else: + hr = p.hour + if not isam: + hr += 12 + + return adatetime(hour=hr, minute=p.mins, second=p.secs, microsecond=p.usecs) + + +# Top-level parser classes + +class DateParser(object): + """Base class for locale-specific parser classes. + """ + + day = Regex("(?P([123][0-9])|[1-9])(?=(\\W|$))(?!=:)", + lambda p, dt: adatetime(day=p.day)) + year = Regex("(?P[0-9]{4})(?=(\\W|$))", + lambda p, dt: adatetime(year=p.year)) + time24 = Regex("(?P([0-1][0-9])|(2[0-3])):(?P[0-5][0-9])" + "(:(?P[0-5][0-9])(\\.(?P[0-9]{1,5}))?)?" 
+ "(?=(\\W|$))", + lambda p, dt: adatetime(hour=p.hour, minute=p.mins, + second=p.secs, microsecond=p.usecs)) + time12 = Time12() + + def __init__(self): + simple_year = "(?P[0-9]{4})" + simple_month = "(?P[0-1][0-9])" + simple_day = "(?P[0-3][0-9])" + simple_hour = "(?P([0-1][0-9])|(2[0-3]))" + simple_minute = "(?P[0-5][0-9])" + simple_second = "(?P[0-5][0-9])" + simple_usec = "(?P[0-9]{6})" + + tup = (simple_year, simple_month, simple_day, simple_hour, + simple_minute, simple_second, simple_usec) + simple_seq = Sequence(tup, sep="[- .:/]*", name="simple", + progressive=True) + self.simple = Sequence((simple_seq, "(?=(\\s|$))"), sep='') + + self.setup() + + def setup(self): + raise NotImplementedError + + # + + def get_parser(self): + return self.all + + def parse(self, text, dt, pos=0, debug=-9999): + parser = self.get_parser() + + d, newpos = parser.parse(text, dt, pos=pos, debug=debug) + if isinstance(d, (adatetime, timespan)): + d = d.disambiguated(dt) + + return (d, newpos) + + def date_from(self, text, basedate=None, pos=0, debug=-9999, toend=True): + if basedate is None: + basedate = datetime.utcnow() + + parser = self.get_parser() + if toend: + parser = ToEnd(parser) + + d = parser.date_from(text, basedate, pos=pos, debug=debug) + if isinstance(d, (adatetime, timespan)): + d = d.disambiguated(basedate) + return d + + +class English(DateParser): + day = Regex("(?P([123][0-9])|[1-9])(st|nd|rd|th)?(?=(\\W|$))", + lambda p, dt: adatetime(day=p.day)) + + def setup(self): + self.plusdate = PlusMinus("years|year|yrs|yr|ys|y", + "months|month|mons|mon|mos|mo", + "weeks|week|wks|wk|ws|w", + "days|day|dys|dy|ds|d", + "hours|hour|hrs|hr|hs|h", + "minutes|minute|mins|min|ms|m", + "seconds|second|secs|sec|s") + + self.dayname = Daynames("next", "last", + ("monday|mon|mo", "tuesday|tues|tue|tu", + "wednesday|wed|we", "thursday|thur|thu|th", + "friday|fri|fr", "saturday|sat|sa", + "sunday|sun|su")) + + midnight_l = lambda p, dt: adatetime(hour=0, minute=0, second=0, + microsecond=0) + midnight = Regex("midnight", midnight_l) + + noon_l = lambda p, dt: adatetime(hour=12, minute=0, second=0, + microsecond=0) + noon = Regex("noon", noon_l) + + now = Regex("now", lambda p, dt: dt) + + self.time = Choice((self.time12, self.time24, midnight, noon, now), + name="time") + + def tomorrow_to_date(p, dt): + d = dt.date() + timedelta(days=+1) + return adatetime(year=d.year, month=d.month, day=d.day) + tomorrow = Regex("tomorrow", tomorrow_to_date) + + def yesterday_to_date(p, dt): + d = dt.date() + timedelta(days=-1) + return adatetime(year=d.year, month=d.month, day=d.day) + yesterday = Regex("yesterday", yesterday_to_date) + + thisyear = Regex("this year", lambda p, dt: adatetime(year=dt.year)) + thismonth = Regex("this month", + lambda p, dt: adatetime(year=dt.year, + month=dt.month)) + today = Regex("today", + lambda p, dt: adatetime(year=dt.year, month=dt.month, + day=dt.day)) + + self.month = Month("january|jan", "february|febuary|feb", "march|mar", + "april|apr", "may", "june|jun", "july|jul", + "august|aug", "september|sept|sep", "october|oct", + "november|nov", "december|dec") + + # If you specify a day number you must also specify a month... 
this + # Choice captures that constraint + + self.dmy = Choice((Sequence((self.day, self.month, self.year), + name="dmy"), + Sequence((self.month, self.day, self.year), + name="mdy"), + Sequence((self.year, self.month, self.day), + name="ymd"), + Sequence((self.year, self.day, self.month), + name="ydm"), + Sequence((self.day, self.month), name="dm"), + Sequence((self.month, self.day), name="md"), + Sequence((self.month, self.year), name="my"), + self.month, self.year, self.dayname, tomorrow, + yesterday, thisyear, thismonth, today, now, + ), name="date") + + self.datetime = Bag((self.time, self.dmy), name="datetime") + self.bundle = Choice((self.plusdate, self.datetime, self.simple), + name="bundle") + self.torange = Combo((self.bundle, "to", self.bundle), name="torange") + + self.all = Choice((self.torange, self.bundle), name="all") + + +# QueryParser plugin + +class DateParserPlugin(plugins.Plugin): + """Adds more powerful parsing of DATETIME fields. + + >>> parser.add_plugin(DateParserPlugin()) + >>> parser.parse(u"date:'last tuesday'") + """ + + def __init__(self, basedate=None, dateparser=None, callback=None, + free=False, free_expr="([A-Za-z][A-Za-z_0-9]*):([^^]+)"): + """ + :param basedate: a datetime object representing the current time + against which to measure relative dates. If you do not supply this + argument, the plugin uses ``datetime.utcnow()``. + :param dateparser: an instance of + :class:`whoosh.qparser.dateparse.DateParser`. If you do not supply + this argument, the plugin automatically uses + :class:`whoosh.qparser.dateparse.English`. + :param callback: a callback function for parsing errors. This allows + you to provide feedback to the user about problems parsing dates. + :param remove: if True, unparseable dates are removed from the token + stream instead of being replaced with ErrorToken. + :param free: if True, this plugin will install a filter early in the + parsing process and try to find undelimited dates such as + ``date:last tuesday``. Note that allowing this could result in + normal query words accidentally being parsed as dates sometimes. 
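+
+        A short usage sketch (the schema and field names here are
+        assumptions for illustration):
+
+        >>> from whoosh import fields, qparser
+        >>> from whoosh.qparser.dateparse import DateParserPlugin
+        >>> schema = fields.Schema(content=fields.TEXT, date=fields.DATETIME)
+        >>> qp = qparser.QueryParser("content", schema)
+        >>> qp.add_plugin(DateParserPlugin(free=True))
+        >>> q = qp.parse(u"date:yesterday rendering")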
+ """ + + self.basedate = basedate + if dateparser is None: + dateparser = English() + self.dateparser = dateparser + self.callback = callback + self.free = free + self.freeexpr = free_expr + + def taggers(self, parser): + if self.free: + # If we're tokenizing, we have to go before the FieldsPlugin + return [(DateTagger(self, self.freeexpr), -1)] + else: + return () + + def filters(self, parser): + # Run the filter after the FieldsPlugin assigns field names + return [(self.do_dates, 110)] + + def errorize(self, message, node): + if self.callback: + self.callback(message) + return syntax.ErrorNode(message, node) + + def text_to_dt(self, node): + text = node.text + try: + dt = self.dateparser.date_from(text, self.basedate) + if dt is None: + return self.errorize(text, node) + else: + n = DateTimeNode(node.fieldname, dt, node.boost) + except DateParseError: + e = sys.exc_info()[1] + n = self.errorize(e, node) + n.startchar = node.startchar + n.endchar = node.endchar + return n + + def range_to_dt(self, node): + start = end = None + dp = self.dateparser.get_parser() + + if node.start: + start = dp.date_from(node.start, self.basedate) + if start is None: + return self.errorize(node.start, node) + if node.end: + end = dp.date_from(node.end, self.basedate) + if end is None: + return self.errorize(node.end, node) + + if start and end: + ts = timespan(start, end).disambiguated(self.basedate) + start, end = ts.start, ts.end + elif start: + start = start.disambiguated(self.basedate) + if isinstance(start, timespan): + start = start.start + elif end: + end = end.disambiguated(self.basedate) + if isinstance(end, timespan): + end = end.end + drn = DateRangeNode(node.fieldname, start, end, boost=node.boost) + drn.startchar = node.startchar + drn.endchar = node.endchar + return drn + + def do_dates(self, parser, group): + schema = parser.schema + if not schema: + return group + + from whoosh.fields import DATETIME + datefields = frozenset(fieldname for fieldname, field + in parser.schema.items() + if isinstance(field, DATETIME)) + + for i, node in enumerate(group): + if node.has_fieldname: + fname = node.fieldname or parser.fieldname + else: + fname = None + + if isinstance(node, syntax.GroupNode): + group[i] = self.do_dates(parser, node) + elif fname in datefields: + if node.has_text: + group[i] = self.text_to_dt(node) + elif isinstance(node, syntax.RangeNode): + group[i] = self.range_to_dt(node) + return group + + +class DateTimeNode(syntax.SyntaxNode): + has_fieldname = True + has_boost = True + + def __init__(self, fieldname, dt, boost=1.0): + self.fieldname = fieldname + self.dt = dt + self.boost = 1.0 + + def r(self): + return repr(self.dt) + + def query(self, parser): + from whoosh import query + + fieldname = self.fieldname or parser.fieldname + field = parser.schema[fieldname] + dt = self.dt + if isinstance(self.dt, datetime): + btext = field.to_bytes(dt) + return query.Term(fieldname, btext, boost=self.boost) + elif isinstance(self.dt, timespan): + return query.DateRange(fieldname, dt.start, dt.end, + boost=self.boost) + else: + raise Exception("Unknown time object: %r" % dt) + + +class DateRangeNode(syntax.SyntaxNode): + has_fieldname = True + has_boost = True + + def __init__(self, fieldname, start, end, boost=1.0): + self.fieldname = fieldname + self.start = start + self.end = end + self.boost = 1.0 + + def r(self): + return "%r-%r" % (self.start, self.end) + + def query(self, parser): + from whoosh import query + + fieldname = self.fieldname or parser.fieldname + return 
query.DateRange(fieldname, self.start, self.end, + boost=self.boost) + + +class DateTagger(Tagger): + def __init__(self, plugin, expr): + self.plugin = plugin + self.expr = rcompile(expr, re.IGNORECASE) + + def match(self, parser, text, pos): + from whoosh.fields import DATETIME + + match = self.expr.match(text, pos) + if match: + fieldname = match.group(1) + dtext = match.group(2) + + if parser.schema and fieldname in parser.schema: + field = parser.schema[fieldname] + if isinstance(field, DATETIME): + plugin = self.plugin + dateparser = plugin.dateparser + basedate = plugin.basedate + + d, newpos = dateparser.parse(dtext, basedate) + if d: + node = DateTimeNode(fieldname, d) + node.startchar = match.start() + node.endchar = newpos + match.start(2) + return node diff --git a/src/whoosh/qparser/default.py b/src/whoosh/qparser/default.py new file mode 100644 index 0000000..7b4dfee --- /dev/null +++ b/src/whoosh/qparser/default.py @@ -0,0 +1,439 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +import sys + +from whoosh import query +from whoosh.compat import text_type +from whoosh.qparser import syntax +from whoosh.qparser.common import print_debug, QueryParserError + + +# Query parser object + +class QueryParser(object): + """A hand-written query parser built on modular plug-ins. The default + configuration implements a powerful fielded query language similar to + Lucene's. + + You can use the ``plugins`` argument when creating the object to override + the default list of plug-ins, and/or use ``add_plugin()`` and/or + ``remove_plugin_class()`` to change the plug-ins included in the parser. 
+ + >>> from whoosh import qparser + >>> parser = qparser.QueryParser("content", schema) + >>> parser.remove_plugin_class(qparser.WildcardPlugin) + >>> parser.add_plugin(qparser.PrefixPlugin()) + >>> parser.parse(u"hello there") + And([Term("content", u"hello"), Term("content", u"there")]) + """ + + def __init__(self, fieldname, schema, plugins=None, termclass=query.Term, + phraseclass=query.Phrase, group=syntax.AndGroup): + """ + :param fieldname: the default field -- the parser uses this as the + field for any terms without an explicit field. + :param schema: a :class:`whoosh.fields.Schema` object to use when + parsing. The appropriate fields in the schema will be used to + tokenize terms/phrases before they are turned into query objects. + You can specify None for the schema to create a parser that does + not analyze the text of the query, usually for testing purposes. + :param plugins: a list of plugins to use. WhitespacePlugin is + automatically included, do not put it in this list. This overrides + the default list of plugins. Classes in the list will be + automatically instantiated. + :param termclass: the query class to use for individual search terms. + The default is :class:`whoosh.query.Term`. + :param phraseclass: the query class to use for phrases. The default + is :class:`whoosh.query.Phrase`. + :param group: the default grouping. ``AndGroup`` makes terms required + by default. ``OrGroup`` makes terms optional by default. + """ + + self.fieldname = fieldname + self.schema = schema + self.termclass = termclass + self.phraseclass = phraseclass + self.group = group + self.plugins = [] + + if plugins is None: + plugins = self.default_set() + self._add_ws_plugin() + self.add_plugins(plugins) + + def default_set(self): + """Returns the default list of plugins to use. + """ + + from whoosh.qparser import plugins + + return [plugins.WhitespacePlugin(), + plugins.SingleQuotePlugin(), + plugins.FieldsPlugin(), + plugins.WildcardPlugin(), + plugins.PhrasePlugin(), + plugins.RangePlugin(), + plugins.GroupPlugin(), + plugins.OperatorsPlugin(), + plugins.BoostPlugin(), + plugins.EveryPlugin(), + ] + + def add_plugins(self, pins): + """Adds the given list of plugins to the list of plugins in this + parser. + """ + + for pin in pins: + self.add_plugin(pin) + + def add_plugin(self, pin): + """Adds the given plugin to the list of plugins in this parser. + """ + + if isinstance(pin, type): + pin = pin() + self.plugins.append(pin) + + def _add_ws_plugin(self): + from whoosh.qparser.plugins import WhitespacePlugin + self.add_plugin(WhitespacePlugin()) + + def remove_plugin(self, pi): + """Removes the given plugin object from the list of plugins in this + parser. + """ + + self.plugins.remove(pi) + + def remove_plugin_class(self, cls): + """Removes any plugins of the given class from this parser. + """ + + self.plugins = [pi for pi in self.plugins if not isinstance(pi, cls)] + + def replace_plugin(self, plugin): + """Removes any plugins of the class of the given plugin and then adds + it. This is a convenience method to keep from having to call + ``remove_plugin_class`` followed by ``add_plugin`` each time you want + to reconfigure a default plugin. + + >>> qp = qparser.QueryParser("content", schema) + >>> qp.replace_plugin(qparser.NotPlugin("(^| )-")) + """ + + self.remove_plugin_class(plugin.__class__) + self.add_plugin(plugin) + + def _priorized(self, methodname): + # methodname is "taggers" or "filters". Returns a priorized list of + # tagger objects or filter functions. 
+ items_and_priorities = [] + for plugin in self.plugins: + # Call either .taggers() or .filters() on the plugin + method = getattr(plugin, methodname) + for item in method(self): + items_and_priorities.append(item) + # Sort the list by priority (lower priority runs first) + items_and_priorities.sort(key=lambda x: x[1]) + # Return the sorted list without the priorities + return [item for item, _ in items_and_priorities] + + def multitoken_query(self, spec, texts, fieldname, termclass, boost): + """Returns a query for multiple texts. This method implements the + intention specified in the field's ``multitoken_query`` attribute, + which specifies what to do when strings that look like single terms + to the parser turn out to yield multiple tokens when analyzed. + + :param spec: a string describing how to join the text strings into a + query. This is usually the value of the field's + ``multitoken_query`` attribute. + :param texts: a list of token strings. + :param fieldname: the name of the field. + :param termclass: the query class to use for single terms. + :param boost: the original term's boost in the query string, should be + applied to the returned query object. + """ + + spec = spec.lower() + if spec == "first": + # Throw away all but the first token + return termclass(fieldname, texts[0], boost=boost) + elif spec == "phrase": + # Turn the token into a phrase + return self.phraseclass(fieldname, texts, boost=boost) + else: + if spec == "default": + qclass = self.group.qclass + elif spec == "and": + qclass = query.And + elif spec == "or": + qclass = query.Or + else: + raise QueryParserError("Unknown multitoken_query value %r" + % spec) + return qclass([termclass(fieldname, t, boost=boost) + for t in texts]) + + def term_query(self, fieldname, text, termclass, boost=1.0, tokenize=True, + removestops=True): + """Returns the appropriate query object for a single term in the query + string. + """ + + if self.schema and fieldname in self.schema: + field = self.schema[fieldname] + + # If this field type wants to parse queries itself, let it do so + # and return early + if field.self_parsing(): + try: + q = field.parse_query(fieldname, text, boost=boost) + return q + except: + e = sys.exc_info()[1] + return query.error_query(e) + + # Otherwise, ask the field to process the text into a list of + # tokenized strings + texts = list(field.process_text(text, mode="query", + tokenize=tokenize, + removestops=removestops)) + + # If the analyzer returned more than one token, use the field's + # multitoken_query attribute to decide what query class, if any, to + # use to put the tokens together + if len(texts) > 1: + return self.multitoken_query(field.multitoken_query, texts, + fieldname, termclass, boost) + + # It's possible field.process_text() will return an empty list (for + # example, on a stop word) + if not texts: + return None + text = texts[0] + + return termclass(fieldname, text, boost=boost) + + def taggers(self): + """Returns a priorized list of tagger objects provided by the parser's + currently configured plugins. + """ + + return self._priorized("taggers") + + def filters(self): + """Returns a priorized list of filter functions provided by the + parser's currently configured plugins. + """ + + return self._priorized("filters") + + def tag(self, text, pos=0, debug=False): + """Returns a group of syntax nodes corresponding to the given text, + created by matching the Taggers provided by the parser's plugins. + + :param text: the text to tag. 
+ :param pos: the position in the text to start tagging at. + """ + + # The list out output tags + stack = [] + # End position of the previous match + prev = pos + # Priorized list of taggers provided by the parser's plugins + taggers = self.taggers() + if debug: + print_debug(debug, "Taggers: %r" % taggers) + + # Define a function that will make a WordNode from the "interstitial" + # text between matches + def inter(startchar, endchar): + n = syntax.WordNode(text[startchar:endchar]) + n.startchar = startchar + n.endchar = endchar + return n + + while pos < len(text): + node = None + # Try each tagger to see if it matches at the current position + for tagger in taggers: + node = tagger.match(self, text, pos) + if node is not None: + if node.endchar <= pos: + raise Exception("Token %r did not move cursor forward." + " (%r, %s)" % (tagger, text, pos)) + if prev < pos: + tween = inter(prev, pos) + if debug: + print_debug(debug, "Tween: %r" % tween) + stack.append(tween) + + if debug: + print_debug(debug, "Tagger: %r at %s: %r" + % (tagger, pos, node)) + stack.append(node) + prev = pos = node.endchar + break + + if not node: + # No taggers matched, move forward + pos += 1 + + # If there's unmatched text left over on the end, put it in a WordNode + if prev < len(text): + stack.append(inter(prev, len(text))) + + # Wrap the list of nodes in a group node + group = self.group(stack) + if debug: + print_debug(debug, "Tagged group: %r" % group) + return group + + def filterize(self, nodes, debug=False): + """Takes a group of nodes and runs the filters provided by the parser's + plugins. + """ + + # Call each filter in the priorized list of plugin filters + if debug: + print_debug(debug, "Pre-filtered group: %r" % nodes) + for f in self.filters(): + if debug: + print_debug(debug, "..Applying: %r" % f) + nodes = f(self, nodes) + if debug: + print_debug(debug, "..Result: %r" % nodes) + if nodes is None: + raise Exception("Filter %r did not return anything" % f) + return nodes + + def process(self, text, pos=0, debug=False): + """Returns a group of syntax nodes corresponding to the given text, + tagged by the plugin Taggers and filtered by the plugin filters. + + :param text: the text to tag. + :param pos: the position in the text to start tagging at. + """ + + nodes = self.tag(text, pos=pos, debug=debug) + nodes = self.filterize(nodes, debug=debug) + return nodes + + def parse(self, text, normalize=True, debug=False): + """Parses the input string and returns a :class:`whoosh.query.Query` + object/tree. + + :param text: the unicode string to parse. + :param normalize: whether to call normalize() on the query object/tree + before returning it. This should be left on unless you're trying to + debug the parser output. + :rtype: :class:`whoosh.query.Query` + """ + + if not isinstance(text, text_type): + text = text.decode("latin1") + + nodes = self.process(text, debug=debug) + if debug: + print_debug(debug, "Syntax tree: %r" % nodes) + + q = nodes.query(self) + if not q: + q = query.NullQuery + if debug: + print_debug(debug, "Pre-normalized query: %r" % q) + + if normalize: + q = q.normalize() + if debug: + print_debug(debug, "Normalized query: %r" % q) + return q + + def parse_(self, text, normalize=True): + pass + + +# Premade parser configurations + +def MultifieldParser(fieldnames, schema, fieldboosts=None, **kwargs): + """Returns a QueryParser configured to search in multiple fields. 
+ + Instead of assigning unfielded clauses to a default field, this parser + transforms them into an OR clause that searches a list of fields. For + example, if the list of multi-fields is "f1", "f2" and the query string is + "hello there", the class will parse "(f1:hello OR f2:hello) (f1:there OR + f2:there)". This is very useful when you have two textual fields (e.g. + "title" and "content") you want to search by default. + + :param fieldnames: a list of field names to search. + :param fieldboosts: an optional dictionary mapping field names to boosts. + """ + + from whoosh.qparser.plugins import MultifieldPlugin + + p = QueryParser(None, schema, **kwargs) + mfp = MultifieldPlugin(fieldnames, fieldboosts=fieldboosts) + p.add_plugin(mfp) + return p + + +def SimpleParser(fieldname, schema, **kwargs): + """Returns a QueryParser configured to support only +, -, and phrase + syntax. + """ + + from whoosh.qparser import plugins, syntax + + pins = [plugins.WhitespacePlugin, + plugins.PlusMinusPlugin, + plugins.PhrasePlugin] + orgroup = syntax.OrGroup + return QueryParser(fieldname, schema, plugins=pins, group=orgroup, + **kwargs) + + +def DisMaxParser(fieldboosts, schema, tiebreak=0.0, **kwargs): + """Returns a QueryParser configured to support only +, -, and phrase + syntax, and which converts individual terms into DisjunctionMax queries + across a set of fields. + + :param fieldboosts: a dictionary mapping field names to boosts. + """ + + from whoosh.qparser import plugins, syntax + + mfp = plugins.MultifieldPlugin(list(fieldboosts.keys()), + fieldboosts=fieldboosts, + group=syntax.DisMaxGroup) + pins = [plugins.WhitespacePlugin, + plugins.PlusMinusPlugin, + plugins.PhrasePlugin, + mfp] + orgroup = syntax.OrGroup + return QueryParser(None, schema, plugins=pins, group=orgroup, **kwargs) diff --git a/src/whoosh/qparser/plugins.py b/src/whoosh/qparser/plugins.py new file mode 100644 index 0000000..9f1a7fa --- /dev/null +++ b/src/whoosh/qparser/plugins.py @@ -0,0 +1,1413 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
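+
+# Illustrative sketch: the plugin classes in this module are not used on
+# their own; they are attached to a QueryParser, which collects their
+# taggers and filters. The schema and query below are invented examples:
+#
+#     from whoosh import fields, qparser
+#     schema = fields.Schema(title=fields.TEXT, content=fields.TEXT)
+#     qp = qparser.QueryParser("content", schema)
+#     qp.add_plugin(qparser.FuzzyTermPlugin())   # enables the word~2 syntax
+#     q = qp.parse(u"title:whoosh colbert~2")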
+ +import copy + +from whoosh import query +from whoosh.compat import u +from whoosh.compat import iteritems, xrange +from whoosh.qparser import syntax +from whoosh.qparser.common import attach +from whoosh.qparser.taggers import RegexTagger, FnTagger +from whoosh.util.text import rcompile + + +class Plugin(object): + """Base class for parser plugins. + """ + + def taggers(self, parser): + """Should return a list of ``(Tagger, priority)`` tuples to add to the + syntax the parser understands. Lower priorities run first. + """ + + return () + + def filters(self, parser): + """Should return a list of ``(filter_function, priority)`` tuples to + add to parser. Lower priority numbers run first. + + Filter functions will be called with ``(parser, groupnode)`` and should + return a group node. + """ + + return () + + +class TaggingPlugin(RegexTagger): + """A plugin that also acts as a Tagger, to avoid having an extra Tagger + class for simple cases. + + A TaggingPlugin object should have a ``priority`` attribute and either a + ``nodetype`` attribute or a ``create()`` method. If the subclass doesn't + override ``create()``, the base class will call ``self.nodetype`` with the + Match object's named groups as keyword arguments. + """ + + priority = 0 + + def __init__(self, expr=None): + self.expr = rcompile(expr or self.expr) + + def taggers(self, parser): + return [(self, self.priority)] + + def filters(self, parser): + return () + + def create(self, parser, match): + # Groupdict keys can be unicode sometimes apparently? Convert them to + # str for use as keyword arguments. This should be Py3-safe. + kwargs = dict((str(k), v) for k, v in iteritems(match.groupdict())) + return self.nodetype(**kwargs) + + +class WhitespacePlugin(TaggingPlugin): + """Tags whitespace and removes it at priority 500. Depending on whether + your plugin's filter wants to see where whitespace was in the original + query, it should run with priority lower than 500 (before removal of + whitespace) or higher than 500 (after removal of whitespace). + """ + + nodetype = syntax.Whitespace + priority = 100 + + def __init__(self, expr=r"\s+"): + TaggingPlugin.__init__(self, expr) + + def filters(self, parser): + return [(self.remove_whitespace, 500)] + + def remove_whitespace(self, parser, group): + newgroup = group.empty_copy() + for node in group: + if isinstance(node, syntax.GroupNode): + newgroup.append(self.remove_whitespace(parser, node)) + elif not node.is_ws(): + newgroup.append(node) + return newgroup + + +class SingleQuotePlugin(TaggingPlugin): + """Adds the ability to specify single "terms" containing spaces by + enclosing them in single quotes. + """ + + expr = r"(^|(?<=\W))'(?P.*?)'(?=\s|\]|[)}]|$)" + nodetype = syntax.WordNode + + +class PrefixPlugin(TaggingPlugin): + """Adds the ability to specify prefix queries by ending a term with an + asterisk. + + This plugin is useful if you want the user to be able to create prefix but + not wildcard queries (for performance reasons). If you are including the + wildcard plugin, you should not include this plugin as well. 
+ + >>> qp = qparser.QueryParser("content", myschema) + >>> qp.remove_plugin_class(qparser.WildcardPlugin) + >>> qp.add_plugin(qparser.PrefixPlugin()) + >>> q = qp.parse("pre*") + """ + + class PrefixNode(syntax.TextNode): + qclass = query.Prefix + + def r(self): + return "%r*" % self.text + + expr = "(?P[^ \t\r\n*]+)[*](?= |$|\\))" + nodetype = PrefixNode + + +class WildcardPlugin(TaggingPlugin): + # \u055E = Armenian question mark + # \u061F = Arabic question mark + # \u1367 = Ethiopic question mark + qmarks = u("?\u055E\u061F\u1367") + expr = "(?P[*%s])" % qmarks + + def filters(self, parser): + # Run early, but definitely before multifield plugin + return [(self.do_wildcards, 50)] + + def do_wildcards(self, parser, group): + i = 0 + while i < len(group): + node = group[i] + if isinstance(node, self.WildcardNode): + if i < len(group) - 1 and group[i + 1].is_text(): + nextnode = group.pop(i + 1) + node.text += nextnode.text + if i > 0 and group[i - 1].is_text(): + prevnode = group.pop(i - 1) + node.text = prevnode.text + node.text + else: + i += 1 + else: + if isinstance(node, syntax.GroupNode): + self.do_wildcards(parser, node) + i += 1 + + for i in xrange(len(group)): + node = group[i] + if isinstance(node, self.WildcardNode): + text = node.text + if len(text) > 1 and not any(qm in text for qm in self.qmarks): + if text.find("*") == len(text) - 1: + newnode = PrefixPlugin.PrefixNode(text[:-1]) + newnode.startchar = node.startchar + newnode.endchar = node.endchar + group[i] = newnode + return group + + class WildcardNode(syntax.TextNode): + # Note that this node inherits tokenize = False from TextNode, + # so the text in this node will not be analyzed... just passed + # straight to the query + + qclass = query.Wildcard + + def r(self): + return "Wild %r" % self.text + + nodetype = WildcardNode + + +class RegexPlugin(TaggingPlugin): + """Adds the ability to specify regular expression term queries. + + The default syntax for a regular expression term is ``r"termexpr"``. + + >>> qp = qparser.QueryParser("content", myschema) + >>> qp.add_plugin(qparser.RegexPlugin()) + >>> q = qp.parse('foo title:r"bar+"') + """ + + class RegexNode(syntax.TextNode): + qclass = query.Regex + + def r(self): + return "Regex %r" % self.text + + expr = 'r"(?P[^"]*)"' + nodetype = RegexNode + + +class BoostPlugin(TaggingPlugin): + """Adds the ability to boost clauses of the query using the circumflex. + + >>> qp = qparser.QueryParser("content", myschema) + >>> q = qp.parse("hello there^2") + """ + + expr = "\\^(?P[0-9]*(\\.[0-9]+)?)($|(?=[ \t\r\n)]))" + + class BoostNode(syntax.SyntaxNode): + def __init__(self, original, boost): + self.original = original + self.boost = boost + + def r(self): + return "^ %s" % self.boost + + def create(self, parser, match): + # Override create so we can grab group 0 + original = match.group(0) + try: + boost = float(match.group("boost")) + except ValueError: + # The text after the ^ wasn't a valid number, so turn it into a + # word + node = syntax.WordNode(original) + else: + node = self.BoostNode(original, boost) + + return node + + def filters(self, parser): + return [(self.clean_boost, 0), (self.do_boost, 510)] + + def clean_boost(self, parser, group): + """This filter finds any BoostNodes in positions where they can't boost + the previous node (e.g. at the very beginning, after whitespace, or + after another BoostNode) and turns them into WordNodes. 
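+
+        For example (an illustrative case): in the query string
+        ``"^3 hello"`` the leading ``^3`` follows nothing that can take a
+        boost, so this filter converts it into an ordinary word node and it
+        is searched as the literal text "^3".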
+ """ + + bnode = self.BoostNode + for i, node in enumerate(group): + if isinstance(node, bnode): + if (not i or not group[i - 1].has_boost): + group[i] = syntax.to_word(node) + return group + + def do_boost(self, parser, group): + """This filter finds BoostNodes and applies the boost to the previous + node. + """ + + newgroup = group.empty_copy() + for node in group: + if isinstance(node, syntax.GroupNode): + node = self.do_boost(parser, node) + elif isinstance(node, self.BoostNode): + if (newgroup and newgroup[-1].has_boost): + # Apply the BoostNode's boost to the previous node + newgroup[-1].set_boost(node.boost) + # Skip adding the BoostNode to the new group + continue + else: + node = syntax.to_word(node) + newgroup.append(node) + return newgroup + + +class GroupPlugin(Plugin): + """Adds the ability to group clauses using parentheses. + """ + + # Marker nodes for open and close bracket + + class OpenBracket(syntax.SyntaxNode): + def r(self): + return "(" + + class CloseBracket(syntax.SyntaxNode): + def r(self): + return ")" + + def __init__(self, openexpr="[(]", closeexpr="[)]"): + self.openexpr = openexpr + self.closeexpr = closeexpr + + def taggers(self, parser): + return [(FnTagger(self.openexpr, self.OpenBracket, "openB"), 0), + (FnTagger(self.closeexpr, self.CloseBracket, "closeB"), 0)] + + def filters(self, parser): + return [(self.do_groups, 0)] + + def do_groups(self, parser, group): + """This filter finds open and close bracket markers in a flat group + and uses them to organize the nodes into a hierarchy. + """ + + ob, cb = self.OpenBracket, self.CloseBracket + # Group hierarchy stack + stack = [parser.group()] + for node in group: + if isinstance(node, ob): + # Open bracket: push a new level of hierarchy on the stack + stack.append(parser.group()) + elif isinstance(node, cb): + # Close bracket: pop the current level of hierarchy and append + # it to the previous level + if len(stack) > 1: + last = stack.pop() + stack[-1].append(last) + else: + # Anything else: add it to the current level of hierarchy + stack[-1].append(node) + + top = stack[0] + # If the parens were unbalanced (more opens than closes), just take + # whatever levels of hierarchy were left on the stack and tack them on + # the end of the top-level + if len(stack) > 1: + for ls in stack[1:]: + top.extend(ls) + + if len(top) == 1 and isinstance(top[0], syntax.GroupNode): + boost = top.boost + top = top[0] + top.boost = boost + + return top + + +class EveryPlugin(TaggingPlugin): + expr = "[*]:[*]" + priority = -1 + + def create(self, parser, match): + return self.EveryNode() + + class EveryNode(syntax.SyntaxNode): + def r(self): + return "*:*" + + def query(self, parser): + return query.Every() + + +class FieldsPlugin(TaggingPlugin): + """Adds the ability to specify the field of a clause. + """ + + class FieldnameTagger(RegexTagger): + def create(self, parser, match): + return syntax.FieldnameNode(match.group("text"), match.group(0)) + + def __init__(self, expr=r"(?P\w+|[*]):", remove_unknown=True): + """ + :param expr: the regular expression to use for tagging fields. + :param remove_unknown: if True, converts field specifications for + fields that aren't in the schema into regular text. 
+ """ + + self.expr = expr + self.removeunknown = remove_unknown + + def taggers(self, parser): + return [(self.FieldnameTagger(self.expr), 0)] + + def filters(self, parser): + return [(self.do_fieldnames, 100)] + + def do_fieldnames(self, parser, group): + """This filter finds FieldnameNodes in the tree and applies their + fieldname to the next node. + """ + + fnclass = syntax.FieldnameNode + + if self.removeunknown and parser.schema: + # Look for field nodes that aren't in the schema and convert them + # to text + schema = parser.schema + newgroup = group.empty_copy() + prev_field_node = None + + for node in group: + if isinstance(node, fnclass) and node.fieldname not in schema: + prev_field_node = node + continue + elif prev_field_node: + # If prev_field_node is not None, it contains a field node + # that appeared before this node but isn't in the schema, + # so we'll convert it to text here + if node.has_text: + node.text = prev_field_node.original + node.text + else: + newgroup.append(syntax.to_word(prev_field_node)) + prev_field_node = None + newgroup.append(node) + if prev_field_node: + newgroup.append(syntax.to_word(prev_field_node)) + group = newgroup + + newgroup = group.empty_copy() + # Iterate backwards through the stream, looking for field-able objects + # with field nodes in front of them + i = len(group) + while i > 0: + i -= 1 + node = group[i] + if isinstance(node, fnclass): + # If we see a fieldname node, it must not have been in front + # of something fieldable, since we would have already removed + # it (since we're iterating backwards), so convert it to text + node = syntax.to_word(node) + elif isinstance(node, syntax.GroupNode): + node = self.do_fieldnames(parser, node) + + if i > 0 and not node.is_ws() and isinstance(group[i - 1], + fnclass): + node.set_fieldname(group[i - 1].fieldname, override=False) + i -= 1 + + newgroup.append(node) + newgroup.reverse() + return newgroup + + +class FuzzyTermPlugin(TaggingPlugin): + """Adds syntax to the query parser to create "fuzzy" term queries, which + match any term within a certain "edit distance" (number of inserted, + deleted, or transposed characters) by appending a tilde (``~``) and an + optional maximum edit distance to a term. If you don't specify an explicit + maximum edit distance, the default is 1. + + >>> qp = qparser.QueryParser("content", myschema) + >>> qp.add_plugin(qparser.FuzzyTermPlugin()) + >>> q = qp.parse("Stephen~2 Colbert") + + For example, the following query creates a :class:`whoosh.query.FuzzyTerm` + query with a maximum edit distance of 1:: + + bob~ + + The following creates a fuzzy term query with a maximum edit distance of + 2:: + + bob~2 + + The maximum edit distance can only be a single digit. Note that edit + distances greater than 2 can take an extremely long time and are generally + not useful. + + You can specify a prefix length using ``~n/m``. For example, to allow a + maximum edit distance of 2 and require a prefix match of 3 characters:: + + johannson~2/3 + + To specify a prefix with the default edit distance:: + + johannson~/3 + """ + + expr = rcompile(""" + (?<=\\S) # Only match right after non-space + ~ # Initial tilde + (?P[0-9])? # Optional maxdist + (/ # Optional prefix slash + (?P[1-9][0-9]*) # prefix + )? 
# (end prefix group) + """, verbose=True) + + class FuzzinessNode(syntax.SyntaxNode): + def __init__(self, maxdist, prefixlength, original): + self.maxdist = maxdist + self.prefixlength = prefixlength + self.original = original + + def __repr__(self): + return "<~%d/%d>" % (self.maxdist, self.prefixlength) + + class FuzzyTermNode(syntax.TextNode): + qclass = query.FuzzyTerm + + def __init__(self, wordnode, maxdist, prefixlength): + self.fieldname = wordnode.fieldname + self.text = wordnode.text + self.boost = wordnode.boost + self.startchar = wordnode.startchar + self.endchar = wordnode.endchar + self.maxdist = maxdist + self.prefixlength = prefixlength + + def r(self): + return "%r ~%d/%d" % (self.text, self.maxdist, self.prefixlength) + + def query(self, parser): + # Use the superclass's query() method to create a FuzzyTerm query + # (it looks at self.qclass), just because it takes care of some + # extra checks and attributes + q = syntax.TextNode.query(self, parser) + # Set FuzzyTerm-specific attributes + q.maxdist = self.maxdist + q.prefixlength = self.prefixlength + return q + + def create(self, parser, match): + mdstr = match.group("maxdist") + maxdist = int(mdstr) if mdstr else 1 + + pstr = match.group("prefix") + prefixlength = int(pstr) if pstr else 0 + + return self.FuzzinessNode(maxdist, prefixlength, match.group(0)) + + def filters(self, parser): + return [(self.do_fuzzyterms, 0)] + + def do_fuzzyterms(self, parser, group): + newgroup = group.empty_copy() + i = 0 + while i < len(group): + node = group[i] + if i < len(group) - 1 and isinstance(node, syntax.WordNode): + nextnode = group[i + 1] + if isinstance(nextnode, self.FuzzinessNode): + node = self.FuzzyTermNode(node, nextnode.maxdist, + nextnode.prefixlength) + i += 1 + if isinstance(node, self.FuzzinessNode): + node = syntax.to_word(node) + if isinstance(node, syntax.GroupNode): + node = self.do_fuzzyterms(parser, node) + + newgroup.append(node) + i += 1 + return newgroup + + +class FunctionPlugin(TaggingPlugin): + """Adds an abitrary "function call" syntax to the query parser to allow + advanced and extensible query functionality. + + This is unfinished and experimental. + """ + + expr = rcompile(""" + [#](?P[A-Za-z_][A-Za-z0-9._]*) # function name + ( # optional args + \\[ # inside square brackets + (?P.*?) + \\] + )? + """, verbose=True) + + class FunctionNode(syntax.SyntaxNode): + has_fieldname = False + has_boost = True + merging = False + + def __init__(self, name, fn, args, kwargs): + self.name = name + self.fn = fn + self.args = args + self.kwargs = kwargs + self.nodes = [] + self.boost = None + + def __repr__(self): + return "#%s<%r>(%r)" % (self.name, self.args, self.nodes) + + def query(self, parser): + qs = [n.query(parser) for n in self.nodes] + kwargs = self.kwargs + if "boost" not in kwargs and self.boost is not None: + kwargs["boost"] = self.boost + # TODO: If this call raises an exception, return an error query + return self.fn(qs, *self.args, **self.kwargs) + + def __init__(self, fns): + """ + :param fns: a dictionary mapping names to functions that return a + query. 
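+
+        A minimal sketch (``qp`` is assumed to be an existing parser and
+        ``myterm`` is a hypothetical helper)::
+
+            from whoosh import query
+
+            def myterm(qs, word):
+                # qs is the list of sub-queries from any following group;
+                # extra bracketed arguments arrive as strings.
+                return query.Term("content", word)
+
+            qp.add_plugin(FunctionPlugin({"myterm": myterm}))
+            q = qp.parse(u"#myterm[hello]")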
+ """ + + self.fns = fns + + def create(self, parser, match): + name = match.group("name") + if name in self.fns: + fn = self.fns[name] + argstring = match.group("args") + if argstring: + args, kwargs = self._parse_args(argstring) + else: + args = () + kwargs = {} + return self.FunctionNode(name, fn, args, kwargs) + + def _parse_args(self, argstring): + args = [] + kwargs = {} + + parts = argstring.split(",") + for part in parts: + if "=" in part: + name, value = part.split("=", 1) + # Wrap with str() because Python 2.5 can't handle unicode kws + name = str(name.strip()) + else: + name = None + value = part + + value = value.strip() + if value.startswith("'") and value.endswith("'"): + value = value[1:-1] + + if name: + kwargs[name] = value + else: + args.append(value) + + return args, kwargs + + def filters(self, parser): + return [(self.do_functions, 600)] + + def do_functions(self, parser, group): + newgroup = group.empty_copy() + i = 0 + while i < len(group): + node = group[i] + if (isinstance(node, self.FunctionNode) + and i < len(group) - 1 + and isinstance(group[i + 1], syntax.GroupNode)): + nextnode = group[i + 1] + node.nodes = list(self.do_functions(parser, nextnode)) + + if nextnode.boost != 1: + node.set_boost(nextnode.boost) + + i += 1 + elif isinstance(node, syntax.GroupNode): + node = self.do_functions(parser, node) + + newgroup.append(node) + i += 1 + return newgroup + + +class PhrasePlugin(Plugin): + """Adds the ability to specify phrase queries inside double quotes. + """ + + # Didn't use TaggingPlugin because I need to add slop parsing at some + # point + + # Expression used to find words if a schema isn't available + wordexpr = rcompile(r'\S+') + + class PhraseNode(syntax.TextNode): + def __init__(self, text, textstartchar, slop=1): + syntax.TextNode.__init__(self, text) + self.textstartchar = textstartchar + self.slop = slop + + def r(self): + return "%s %r~%s" % (self.__class__.__name__, self.text, self.slop) + + def apply(self, fn): + return self.__class__(self.type, [fn(node) for node in self.nodes], + slop=self.slop, boost=self.boost) + + def query(self, parser): + text = self.text + fieldname = self.fieldname or parser.fieldname + + # We want to process the text of the phrase into "words" (tokens), + # and also record the startchar and endchar of each word + + sc = self.textstartchar + if parser.schema and fieldname in parser.schema: + field = parser.schema[fieldname] + if field.analyzer: + # We have a field with an analyzer, so use it to parse + # the phrase into tokens + tokens = field.tokenize(text, mode="query", chars=True) + words = [] + char_ranges = [] + for t in tokens: + words.append(t.text) + char_ranges.append((sc + t.startchar, sc + t.endchar)) + else: + # We have a field but it doesn't have a format object, + # for some reason (it's self-parsing?), so use process_text + # to get the texts (we won't know the start/end chars) + words = list(field.process_text(text, mode="query")) + char_ranges = [(None, None)] * len(words) + else: + # We're parsing without a schema, so just use the default + # regular expression to break the text into words + words = [] + char_ranges = [] + for match in PhrasePlugin.wordexpr.finditer(text): + words.append(match.group(0)) + char_ranges.append((sc + match.start(), sc + match.end())) + + qclass = parser.phraseclass + q = qclass(fieldname, words, slop=self.slop, boost=self.boost, + char_ranges=char_ranges) + return attach(q, self) + + class PhraseTagger(RegexTagger): + def create(self, parser, match): + text = 
match.group("text") + textstartchar = match.start("text") + slopstr = match.group("slop") + slop = int(slopstr) if slopstr else 1 + return PhrasePlugin.PhraseNode(text, textstartchar, slop) + + def __init__(self, expr='"(?P.*?)"(~(?P[1-9][0-9]*))?'): + self.expr = expr + + def taggers(self, parser): + return [(self.PhraseTagger(self.expr), 0)] + + +class SequencePlugin(Plugin): + """Adds the ability to group arbitrary queries inside double quotes to + produce a query matching the individual sub-queries in sequence. + + To enable this plugin, first remove the default PhrasePlugin, then add + this plugin:: + + qp = qparser.QueryParser("field", my_schema) + qp.remove_plugin_class(qparser.PhrasePlugin) + qp.add_plugin(qparser.SequencePlugin()) + + This enables parsing "phrases" such as:: + + "(jon OR john OR jonathan~1) smith*" + """ + + def __init__(self, expr='["](~(?P[1-9][0-9]*))?'): + """ + :param expr: a regular expression for the marker at the start and end + of a phrase. The default is the double-quotes character. + """ + + self.expr = expr + + class SequenceNode(syntax.GroupNode): + qclass = query.Sequence + + class QuoteNode(syntax.MarkerNode): + def __init__(self, slop=None): + self.slop = int(slop) if slop else 1 + + def taggers(self, parser): + return [(FnTagger(self.expr, self.QuoteNode, "quote"), 0)] + + def filters(self, parser): + return [(self.do_quotes, 550)] + + def do_quotes(self, parser, group): + # New group to copy nodes into + newgroup = group.empty_copy() + # Buffer for sequence nodes; when it's None, it means we're not in + # a sequence + seq = None + + # Start copying nodes from group to newgroup. When we find a quote + # node, start copying nodes into the buffer instead. When we find + # the next (end) quote, put the buffered nodes into a SequenceNode + # and add it to newgroup. + for node in group: + if isinstance(node, syntax.GroupNode): + # Recurse + node = self.do_quotes(parser, node) + + if isinstance(node, self.QuoteNode): + if seq is None: + # Start a new sequence + seq = [] + else: + # End the current sequence + sn = self.SequenceNode(seq, slop=node.slop) + newgroup.append(sn) + seq = None + elif seq is None: + # Not in a sequence, add directly + newgroup.append(node) + else: + # In a sequence, add it to the buffer + seq.append(node) + + # We can end up with buffered nodes if there was an unbalanced quote; + # just add the buffered nodes directly to newgroup + if seq is not None: + newgroup.extend(seq) + + return newgroup + + +class RangePlugin(Plugin): + """Adds the ability to specify term ranges. + """ + + expr = rcompile(r""" + (?P\{|\[) # Open paren + (?P + ('[^']*?'\s+) # single-quoted + | # or + ([^\]}]+?(?=[Tt][Oo])) # everything until "to" + )? + [Tt][Oo] # "to" + (?P + (\s+'[^']*?') # single-quoted + | # or + ([^\]}]+?) # everything until "]" or "}" + )? + (?P}|]) # Close paren + """, verbose=True) + + class RangeTagger(RegexTagger): + def __init__(self, expr, excl_start, excl_end): + self.expr = expr + self.excl_start = excl_start + self.excl_end = excl_end + + def create(self, parser, match): + start = match.group("start") + end = match.group("end") + if start: + # Strip the space before the "to" + start = start.rstrip() + # Strip single quotes + if start.startswith("'") and start.endswith("'"): + start = start[1:-1] + if end: + # Strip the space before the "to" + end = end.lstrip() + # Strip single quotes + if end.startswith("'") and end.endswith("'"): + end = end[1:-1] + # What kind of open and close brackets were used? 
+ startexcl = match.group("open") == self.excl_start + endexcl = match.group("close") == self.excl_end + + rn = syntax.RangeNode(start, end, startexcl, endexcl) + return rn + + def __init__(self, expr=None, excl_start="{", excl_end="}"): + self.expr = expr or self.expr + self.excl_start = excl_start + self.excl_end = excl_end + + def taggers(self, parser): + tagger = self.RangeTagger(self.expr, self.excl_start, self.excl_end) + return [(tagger, 1)] + + +class OperatorsPlugin(Plugin): + """By default, adds the AND, OR, ANDNOT, ANDMAYBE, and NOT operators to + the parser syntax. This plugin scans the token stream for subclasses of + :class:`Operator` and calls their :meth:`Operator.make_group` methods + to allow them to manipulate the stream. + + There are two levels of configuration available. + + The first level is to change the regular expressions of the default + operators, using the ``And``, ``Or``, ``AndNot``, ``AndMaybe``, and/or + ``Not`` keyword arguments. The keyword value can be a pattern string or + a compiled expression, or None to remove the operator:: + + qp = qparser.QueryParser("content", schema) + cp = qparser.OperatorsPlugin(And="&", Or="\\|", AndNot="&!", + AndMaybe="&~", Not=None) + qp.replace_plugin(cp) + + You can also specify a list of ``(OpTagger, priority)`` pairs as the first + argument to the initializer to use custom operators. See :ref:`custom-op` + for more information on this. + """ + + class OpTagger(RegexTagger): + def __init__(self, expr, grouptype, optype=syntax.InfixOperator, + leftassoc=True, memo=""): + RegexTagger.__init__(self, expr) + self.grouptype = grouptype + self.optype = optype + self.leftassoc = leftassoc + self.memo = memo + + def __repr__(self): + return "<%s %r (%s)>" % (self.__class__.__name__, + self.expr.pattern, self.memo) + + def create(self, parser, match): + return self.optype(match.group(0), self.grouptype, self.leftassoc) + + def __init__(self, ops=None, clean=False, + And=r"(?<=\s)AND(?=\s)", + Or=r"(?<=\s)OR(?=\s)", + AndNot=r"(?<=\s)ANDNOT(?=\s)", + AndMaybe=r"(?<=\s)ANDMAYBE(?=\s)", + Not=r"(^|(?<=(\s|[()])))NOT(?=\s)", + Require=r"(^|(?<=\s))REQUIRE(?=\s)"): + if ops: + ops = list(ops) + else: + ops = [] + + if not clean: + ot = self.OpTagger + if Not: + ops.append((ot(Not, syntax.NotGroup, syntax.PrefixOperator, + memo="not"), 0)) + if And: + ops.append((ot(And, syntax.AndGroup, memo="and"), 0)) + if Or: + ops.append((ot(Or, syntax.OrGroup, memo="or"), 0)) + if AndNot: + ops.append((ot(AndNot, syntax.AndNotGroup, + memo="anot"), -5)) + if AndMaybe: + ops.append((ot(AndMaybe, syntax.AndMaybeGroup, + memo="amaybe"), -5)) + if Require: + ops.append((ot(Require, syntax.RequireGroup, + memo="req"), 0)) + + self.ops = ops + + def taggers(self, parser): + return self.ops + + def filters(self, parser): + return [(self.do_operators, 600)] + + def do_operators(self, parser, group): + """This filter finds PrefixOperator, PostfixOperator, and InfixOperator + nodes in the tree and calls their logic to rearrange the nodes. + """ + + for tagger, _ in self.ops: + # Get the operators created by the configured taggers + optype = tagger.optype + gtype = tagger.grouptype + + # Left-associative infix operators are replaced left-to-right, and + # right-associative infix operators are replaced right-to-left. + # Most of the work is done in the different implementations of + # Operator.replace_self(). 
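+            # (Illustrative note.) With the default left-associative AND
+            # tagger, a flat node list like [a, AND, b, AND, c] collapses
+            # left to right into AndGroup([a, b, c]) through the successive
+            # replace_self() calls below.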
+ if tagger.leftassoc: + i = 0 + while i < len(group): + t = group[i] + if isinstance(t, optype) and t.grouptype is gtype: + i = t.replace_self(parser, group, i) + else: + i += 1 + else: + i = len(group) - 1 + while i >= 0: + t = group[i] + if isinstance(t, optype): + i = t.replace_self(parser, group, i) + i -= 1 + + # Descend into the groups and recursively call do_operators + for i, t in enumerate(group): + if isinstance(t, syntax.GroupNode): + group[i] = self.do_operators(parser, t) + + return group + + +# + +class PlusMinusPlugin(Plugin): + """Adds the ability to use + and - in a flat OR query to specify required + and prohibited terms. + + This is the basis for the parser configuration returned by + ``SimpleParser()``. + """ + + # Marker nodes for + and - + + class Plus(syntax.MarkerNode): + pass + + class Minus(syntax.MarkerNode): + pass + + def __init__(self, plusexpr="\\+", minusexpr="-"): + self.plusexpr = plusexpr + self.minusexpr = minusexpr + + def taggers(self, parser): + return [(FnTagger(self.plusexpr, self.Plus, "plus"), 0), + (FnTagger(self.minusexpr, self.Minus, "minus"), 0)] + + def filters(self, parser): + return [(self.do_plusminus, 510)] + + def do_plusminus(self, parser, group): + """This filter sorts nodes in a flat group into "required", "optional", + and "banned" subgroups based on the presence of plus and minus nodes. + """ + + required = syntax.AndGroup() + optional = syntax.OrGroup() + banned = syntax.OrGroup() + + # If the top-level group is an AndGroup we make everything "required" by default + if isinstance(group, syntax.AndGroup): + optional = syntax.AndGroup() + + # Which group to put the next node we see into + next = optional + for node in group: + if isinstance(node, self.Plus): + # +: put the next node in the required group + next = required + elif isinstance(node, self.Minus): + # -: put the next node in the banned group + next = banned + else: + # Anything else: put it in the appropriate group + next.append(node) + # Reset to putting things in the optional group by default + next = optional + + group = optional + if required: + group = syntax.AndMaybeGroup([required, group]) + if banned: + group = syntax.AndNotGroup([group, banned]) + return group + + +class GtLtPlugin(TaggingPlugin): + """Allows the user to use greater than/less than symbols to create range + queries:: + + a:>100 b:<=z c:>=-1.4 d:``, ``<``, ``>=``, ``<=``, ``=>``, and ``=<`` + after a field specifier. The field specifier is required. You cannot do the + following:: + + >100 + + This plugin requires the FieldsPlugin and RangePlugin to work. + """ + + class GtLtNode(syntax.SyntaxNode): + def __init__(self, rel): + self.rel = rel + + def __repr__(self): + return "(%s)" % self.rel + + expr = r"(?P(<=|>=|<|>|=<|=>))" + nodetype = GtLtNode + + def filters(self, parser): + # Run before the fields filter removes FilenameNodes at priority 100. + return [(self.do_gtlt, 99)] + + def do_gtlt(self, parser, group): + """This filter translate FieldnameNode/GtLtNode pairs into RangeNodes. + """ + + fname = syntax.FieldnameNode + newgroup = group.empty_copy() + i = 0 + lasti = len(group) - 1 + while i < len(group): + node = group[i] + # If this is a GtLtNode... + if isinstance(node, self.GtLtNode): + # If it's not the last node in the group... 
+ if i < lasti:
+ prevnode = newgroup[-1]
+ nextnode = group[i + 1]
+ # If previous was a fieldname and next node has text
+ if isinstance(prevnode, fname) and nextnode.has_text:
+ # Make the next node into a range based on the symbol
+ newgroup.append(self.make_range(nextnode, node.rel))
+ # Skip the next node
+ i += 1
+ else:
+ # If it's not a GtLtNode, add it to the filtered group
+ newgroup.append(node)
+ i += 1
+
+ return newgroup
+
+ def make_range(self, node, rel):
+ text = node.text
+ if rel == "<":
+ n = syntax.RangeNode(None, text, False, True)
+ elif rel == ">":
+ n = syntax.RangeNode(text, None, True, False)
+ elif rel == "<=" or rel == "=<":
+ n = syntax.RangeNode(None, text, False, False)
+ elif rel == ">=" or rel == "=>":
+ n = syntax.RangeNode(text, None, False, False)
+ return n.set_range(node.startchar, node.endchar)
+
+
+class MultifieldPlugin(Plugin):
+ """Converts any unfielded terms into OR clauses that search for the
+ term in a specified list of fields.
+
+ >>> qp = qparser.QueryParser(None, myschema)
+ >>> qp.add_plugin(qparser.MultifieldPlugin(["a", "b"]))
+ >>> qp.parse("alfa c:bravo")
+ And([Or([Term("a", "alfa"), Term("b", "alfa")]), Term("c", "bravo")])
+
+ This plugin is the basis for the ``MultifieldParser``.
+ """
+
+ def __init__(self, fieldnames, fieldboosts=None, group=syntax.OrGroup):
+ """
+ :param fieldnames: a list of fields to search.
+ :param fieldboosts: an optional dictionary mapping field names to
+ a boost to use for that field.
+ :param group: the group to use to relate the fielded terms to each
+ other.
+ """
+
+ self.fieldnames = fieldnames
+ self.boosts = fieldboosts or {}
+ self.group = group
+
+ def filters(self, parser):
+ # Run after the fields filter applies explicit fieldnames (at priority
+ # 100)
+ return [(self.do_multifield, 110)]
+
+ def do_multifield(self, parser, group):
+ for i, node in enumerate(group):
+ if isinstance(node, syntax.GroupNode):
+ # Recurse inside groups
+ group[i] = self.do_multifield(parser, node)
+ elif node.has_fieldname and node.fieldname is None:
+ # For an unfielded node, create a new group containing fielded
+ # versions of the node for each configured "multi" field.
+ newnodes = []
+ for fname in self.fieldnames:
+ newnode = copy.copy(node)
+ newnode.set_fieldname(fname)
+ newnode.set_boost(self.boosts.get(fname, 1.0))
+ newnodes.append(newnode)
+ group[i] = self.group(newnodes)
+ return group
+
+
+class FieldAliasPlugin(Plugin):
+ """Adds the ability to use "aliases" of fields in the query string.
+
+ This plugin is useful for allowing users of languages that can't be
+ represented in ASCII to use field names in their own language, and
+ have them translated into the "real" field names, which must be valid Python
+ identifiers.
+
+ >>> # Allow users to use 'body' or 'text' to refer to the 'content' field
+ >>> parser.add_plugin(FieldAliasPlugin({"content": ["body", "text"]}))
+ >>> parser.parse("text:hello")
+ Term("content", "hello")
+ """
+
+ def __init__(self, fieldmap):
+ self.fieldmap = fieldmap
+ self.reverse = {}
+ for key, values in iteritems(fieldmap):
+ for value in values:
+ self.reverse[value] = key
+
+ def filters(self, parser):
+ # Run before fields plugin at 100
+ return [(self.do_aliases, 90)]
+
+ def do_aliases(self, parser, group):
+ for i, node in enumerate(group):
+ if isinstance(node, syntax.GroupNode):
+ group[i] = self.do_aliases(parser, node)
+ elif node.has_fieldname and node.fieldname is not None:
+ fname = node.fieldname
+ if fname in self.reverse:
+ node.set_fieldname(self.reverse[fname], override=True)
+ return group
+
+
+class CopyFieldPlugin(Plugin):
+ """Looks for basic syntax nodes (terms, prefixes, wildcards, phrases, etc.)
+ occurring in a certain field and replaces them with a group (by default OR)
+ containing the original token and the token copied to a new field.
+
+ For example, the query::
+
+ hello name:matt
+
+ could be automatically converted by ``CopyFieldPlugin({"name": "author"})``
+ to::
+
+ hello (name:matt OR author:matt)
+
+ This is useful where one field was indexed with a differently-analyzed copy
+ of another, and you want the query to search both fields.
+
+ You can specify a different group type with the ``group`` keyword. You can
+ also specify ``group=None``, in which case the copied node is inserted
+ "inline" next to the original, instead of in a new group::
+
+ hello name:matt author:matt
+ """
+
+ def __init__(self, map, group=syntax.OrGroup, mirror=False):
+ """
+ :param map: a dictionary mapping names of fields to copy to the
+ names of the destination fields.
+ :param group: the type of group to create in place of the original
+ token. You can specify ``group=None`` to put the copied node
+ "inline" next to the original node instead of in a new group.
+ :param mirror: if True, the plugin copies both ways, so if the user
+ specifies a query in the 'toname' field, it will be copied to
+ the 'fromname' field.
+ """
+
+ self.map = map
+ self.group = group
+ if mirror:
+ # Add in reversed mappings
+ map.update(dict((v, k) for k, v in iteritems(map)))
+
+ def filters(self, parser):
+ # Run after the fieldname filter (100) but before multifield (110)
+ return [(self.do_copyfield, 109)]
+
+ def do_copyfield(self, parser, group):
+ map = self.map
+ newgroup = group.empty_copy()
+ for node in group:
+ if isinstance(node, syntax.GroupNode):
+ # Recurse into groups
+ node = self.do_copyfield(parser, node)
+ elif node.has_fieldname:
+ fname = node.fieldname or parser.fieldname
+ if fname in map:
+ newnode = copy.copy(node)
+ newnode.set_fieldname(map[fname], override=True)
+ if self.group is None:
+ newgroup.append(node)
+ newgroup.append(newnode)
+ else:
+ newgroup.append(self.group([node, newnode]))
+ continue
+ newgroup.append(node)
+ return newgroup
+
+
+class PseudoFieldPlugin(Plugin):
+ """This is an advanced plugin that lets you define "pseudo-fields" the user
+ can use in their queries. When the parser encounters one of these fields,
+ it runs a given function on the following node in the abstract syntax tree.
+
+ Unfortunately writing the transform function(s) requires knowledge of the
+ parser's abstract syntax tree classes.
A transform function takes a + :class:`whoosh.qparser.SyntaxNode` and returns a + :class:`~whoosh.qparser.SyntaxNode` (or None if the node should be removed + instead of transformed). + + Some things you can do in the transform function:: + + from whoosh import qparser + + def my_xform_fn(node): + # Is this a text node? + if node.has_text: + # Change the node's text + node.text = node.text + "foo" + + # Change the node into a prefix query + node = qparser.PrefixPlugin.PrefixNode(node.text) + + # Set the field the node should search in + node.set_fieldname("title") + + return node + else: + # If the pseudo-field wasn't applied to a text node (e.g. + # it preceded a group, as in ``pfield:(a OR b)`` ), remove the + # node. Alternatively you could just ``return node`` here to + # leave the non-text node intact. + return None + + In the following example, if the user types ``regex:foo.bar``, the function + transforms the text in the pseudo-field "regex" into a regular expression + query in the "content" field:: + + from whoosh import qparser + + def regex_maker(node): + if node.has_text: + node = qparser.RegexPlugin.RegexNode(node.text) + node.set_fieldname("content") + return node + + qp = qparser.QueryParser("content", myindex.schema) + qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker})) + q = qp.parse("alfa regex:br.vo") + + The name of the "pseudo" field can be the same as an actual field. Imagine + the schema has a field named ``reverse``, and you want the user to be able + to type ``reverse:foo`` and transform it to ``reverse:(foo OR oof)``:: + + def rev_text(node): + if node.has_text: + # Create a word node for the reversed text + revtext = node.text[::-1] # Reverse the text + rnode = qparser.WordNode(revtext) + + # Put the original node and the reversed node in an OrGroup + group = qparser.OrGroup([node, rnode]) + + # Need to set the fieldname here because the PseudoFieldPlugin + # removes the field name syntax + group.set_fieldname("reverse") + + return group + + qp = qparser.QueryParser("content", myindex.schema) + qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text})) + q = qp.parse("alfa reverse:bravo") + + Note that transforming the query like this can potentially really confuse + the spell checker! + + This plugin works as a filter, so it can only operate on the query after it + has been parsed into an abstract syntax tree. For parsing control (i.e. to + give a pseudo-field its own special syntax), you would need to write your + own parsing plugin. + """ + + def __init__(self, xform_map): + """ + :param xform_map: a dictionary mapping psuedo-field names to transform + functions. The function should take a + :class:`whoosh.qparser.SyntaxNode` as an argument, and return a + :class:`~whoosh.qparser.SyntaxNode`. If the function returns None, + the node will be removed from the query. 
+ """ + + self.xform_map = xform_map + + def filters(self, parser): + # Run before the fieldname filter (100) + return [(self.do_pseudofield, 99)] + + def do_pseudofield(self, parser, group): + xform_map = self.xform_map + + newgroup = group.empty_copy() + xform_next = None + for node in group: + if isinstance(node, syntax.GroupNode): + node = self.do_pseudofield(parser, node) + elif (isinstance(node, syntax.FieldnameNode) + and node.fieldname in xform_map): + xform_next = xform_map[node.fieldname] + continue + + if xform_next: + newnode = xform_next(node) + xform_next = None + if newnode is None: + continue + else: + newnode.set_range(node.startchar, node.endchar) + node = newnode + + newgroup.append(node) + + return newgroup diff --git a/src/whoosh/qparser/syntax.py b/src/whoosh/qparser/syntax.py new file mode 100644 index 0000000..b700fec --- /dev/null +++ b/src/whoosh/qparser/syntax.py @@ -0,0 +1,641 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +import sys, weakref + +from whoosh import query +from whoosh.qparser.common import get_single_text, QueryParserError, attach + + +class SyntaxNode(object): + """Base class for nodes that make up the abstract syntax tree (AST) of a + parsed user query string. The AST is an intermediate step, generated + from the query string, then converted into a :class:`whoosh.query.Query` + tree by calling the ``query()`` method on the nodes. + + Instances have the following required attributes: + + ``has_fieldname`` + True if this node has a ``fieldname`` attribute. + ``has_text`` + True if this node has a ``text`` attribute + ``has_boost`` + True if this node has a ``boost`` attribute. + ``startchar`` + The character position in the original text at which this node started. + ``endchar`` + The character position in the original text at which this node ended. 
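+
+ A rough usage sketch (``WordNode`` is a concrete subclass defined later in
+ this module; ``parser`` stands for any configured query parser)::
+
+ node = WordNode("alfa")
+ node.set_fieldname("title")
+ node.set_boost(2.0)
+ q = node.query(parser) # roughly Term("title", "alfa", boost=2.0)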
+ """ + + has_fieldname = False + has_text = False + has_boost = False + _parent = None + + def __repr__(self): + r = "<" + if self.has_fieldname: + r += "%r:" % self.fieldname + r += self.r() + if self.has_boost and self.boost != 1.0: + r += " ^%s" % self.boost + r += ">" + return r + + def r(self): + """Returns a basic representation of this node. The base class's + ``__repr__`` method calls this, then does the extra busy work of adding + fieldname and boost where appropriate. + """ + + return "%s %r" % (self.__class__.__name__, self.__dict__) + + def apply(self, fn): + return self + + def accept(self, fn): + def fn_wrapper(n): + return fn(n.apply(fn_wrapper)) + + return fn_wrapper(self) + + def query(self, parser): + """Returns a :class:`whoosh.query.Query` instance corresponding to this + syntax tree node. + """ + + raise NotImplementedError(self.__class__.__name__) + + def is_ws(self): + """Returns True if this node is ignorable whitespace. + """ + + return False + + def is_text(self): + return False + + def set_fieldname(self, name, override=False): + """Sets the fieldname associated with this node. If ``override`` is + False (the default), the fieldname will only be replaced if this node + does not already have a fieldname set. + + For nodes that don't have a fieldname, this is a no-op. + """ + + if not self.has_fieldname: + return + + if self.fieldname is None or override: + self.fieldname = name + return self + + def set_boost(self, boost): + """Sets the boost associated with this node. + + For nodes that don't have a boost, this is a no-op. + """ + + if not self.has_boost: + return + self.boost = boost + return self + + def set_range(self, startchar, endchar): + """Sets the character range associated with this node. + """ + + self.startchar = startchar + self.endchar = endchar + return self + + # Navigation methods + + def parent(self): + if self._parent: + return self._parent() + + def next_sibling(self): + p = self.parent() + if p: + return p.node_after(self) + + def prev_sibling(self): + p = self.parent() + if p: + return p.node_before(self) + + def bake(self, parent): + self._parent = weakref.ref(parent) + + +class MarkerNode(SyntaxNode): + """Base class for nodes that only exist to mark places in the tree. + """ + + def r(self): + return self.__class__.__name__ + + +class Whitespace(MarkerNode): + """Abstract syntax tree node for ignorable whitespace. + """ + + def r(self): + return " " + + def is_ws(self): + return True + + +class FieldnameNode(SyntaxNode): + """Abstract syntax tree node for field name assignments. + """ + + has_fieldname = True + + def __init__(self, fieldname, original): + self.fieldname = fieldname + self.original = original + + def __repr__(self): + return "<%r:>" % self.fieldname + + +class GroupNode(SyntaxNode): + """Base class for abstract syntax tree node types that group together + sub-nodes. + + Instances have the following attributes: + + ``merging`` + True if side-by-side instances of this group can be merged into a + single group. + ``qclass`` + If a subclass doesn't override ``query()``, the base class will simply + wrap this class around the queries returned by the subnodes. + + This class implements a number of list methods for operating on the + subnodes. 
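+
+ A rough sketch (``AndGroup`` and ``WordNode`` are concrete subclasses
+ defined later in this module)::
+
+ group = AndGroup([WordNode("alfa")])
+ group.append(WordNode("bravo"))
+ len(group) # 2
+ group.query(parser) # a query.And built from the sub-node queries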
+ """ + + has_boost = True + merging = True + qclass = None + + def __init__(self, nodes=None, boost=1.0, **kwargs): + self.nodes = nodes or [] + self.boost = boost + self.kwargs = kwargs + + def r(self): + return "%s %s" % (self.__class__.__name__, + ", ".join(repr(n) for n in self.nodes)) + + @property + def startchar(self): + if not self.nodes: + return None + return self.nodes[0].startchar + + @property + def endchar(self): + if not self.nodes: + return None + return self.nodes[-1].endchar + + def apply(self, fn): + return self.__class__(self.type, [fn(node) for node in self.nodes], + boost=self.boost, **self.kwargs) + + def query(self, parser): + subs = [] + for node in self.nodes: + subq = node.query(parser) + if subq is not None: + subs.append(subq) + + q = self.qclass(subs, boost=self.boost, **self.kwargs) + return attach(q, self) + + def empty_copy(self): + """Returns an empty copy of this group. + + This is used in the common pattern where a filter creates an new + group and then adds nodes from the input group to it if they meet + certain criteria, then returns the new group:: + + def remove_whitespace(parser, group): + newgroup = group.empty_copy() + for node in group: + if not node.is_ws(): + newgroup.append(node) + return newgroup + """ + + c = self.__class__(**self.kwargs) + if self.has_boost: + c.boost = self.boost + if self.has_fieldname: + c.fieldname = self.fieldname + if self.has_text: + c.text = self.text + return c + + def set_fieldname(self, name, override=False): + SyntaxNode.set_fieldname(self, name, override=override) + for node in self.nodes: + node.set_fieldname(name, override=override) + + def set_range(self, startchar, endchar): + for node in self.nodes: + node.set_range(startchar, endchar) + return self + + # List-like methods + + def __nonzero__(self): + return bool(self.nodes) + + __bool__ = __nonzero__ + + def __iter__(self): + return iter(self.nodes) + + def __len__(self): + return len(self.nodes) + + def __getitem__(self, n): + return self.nodes.__getitem__(n) + + def __setitem__(self, n, v): + self.nodes.__setitem__(n, v) + + def __delitem__(self, n): + self.nodes.__delitem__(n) + + def insert(self, n, v): + self.nodes.insert(n, v) + + def append(self, v): + self.nodes.append(v) + + def extend(self, vs): + self.nodes.extend(vs) + + def pop(self, *args, **kwargs): + return self.nodes.pop(*args, **kwargs) + + def reverse(self): + self.nodes.reverse() + + def index(self, v): + return self.nodes.index(v) + + # Navigation methods + + def bake(self, parent): + SyntaxNode.bake(self, parent) + for node in self.nodes: + node.bake(self) + + def node_before(self, n): + try: + i = self.nodes.index(n) + except ValueError: + return + if i > 0: + return self.nodes[i - 1] + + def node_after(self, n): + try: + i = self.nodes.index(n) + except ValueError: + return + if i < len(self.nodes) - 2: + return self.nodes[i + 1] + + +class BinaryGroup(GroupNode): + """Intermediate base class for group nodes that have two subnodes and + whose ``qclass`` initializer takes two arguments instead of a list. 
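+
+ For example (a sketch), ``AndNotGroup`` below sets
+ ``qclass = query.AndNot``, so::
+
+ group = AndNotGroup([WordNode("alfa"), WordNode("bravo")])
+ group.query(parser) # query.AndNot(query for "alfa", query for "bravo")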
+ """ + + merging = False + has_boost = False + + def query(self, parser): + assert len(self.nodes) == 2 + + qa = self.nodes[0].query(parser) + qb = self.nodes[1].query(parser) + if qa is None and qb is None: + q = query.NullQuery + elif qa is None: + q = qb + elif qb is None: + q = qa + else: + q = self.qclass(self.nodes[0].query(parser), + self.nodes[1].query(parser)) + + return attach(q, self) + + +class Wrapper(GroupNode): + """Intermediate base class for nodes that wrap a single sub-node. + """ + + merging = False + + def query(self, parser): + q = self.nodes[0].query(parser) + if q: + return attach(self.qclass(q), self) + + +class ErrorNode(SyntaxNode): + def __init__(self, message, node=None): + self.message = message + self.node = node + + def r(self): + return "ERR %r %r" % (self.node, self.message) + + @property + def startchar(self): + return self.node.startchar + + @property + def endchar(self): + return self.node.endchar + + def query(self, parser): + if self.node: + q = self.node.query(parser) + else: + q = query.NullQuery + + return attach(query.error_query(self.message, q), self) + + +class AndGroup(GroupNode): + qclass = query.And + + +class OrGroup(GroupNode): + qclass = query.Or + + @classmethod + def factory(cls, scale=1.0): + def maker(nodes=None, **kwargs): + return cls(nodes=nodes, scale=scale, **kwargs) + return maker + + +class DisMaxGroup(GroupNode): + qclass = query.DisjunctionMax + + +class OrderedGroup(GroupNode): + qclass = query.Ordered + + +class AndNotGroup(BinaryGroup): + qclass = query.AndNot + + +class AndMaybeGroup(BinaryGroup): + qclass = query.AndMaybe + + +class RequireGroup(BinaryGroup): + qclass = query.Require + + +class NotGroup(Wrapper): + qclass = query.Not + + +class RangeNode(SyntaxNode): + """Syntax node for range queries. + """ + + has_fieldname = True + + def __init__(self, start, end, startexcl, endexcl): + self.start = start + self.end = end + self.startexcl = startexcl + self.endexcl = endexcl + self.boost = 1.0 + self.fieldname = None + self.kwargs = {} + + def r(self): + b1 = "{" if self.startexcl else "[" + b2 = "}" if self.endexcl else "]" + return "%s%r %r%s" % (b1, self.start, self.end, b2) + + def query(self, parser): + fieldname = self.fieldname or parser.fieldname + start = self.start + end = self.end + + if parser.schema and fieldname in parser.schema: + field = parser.schema[fieldname] + if field.self_parsing(): + try: + q = field.parse_range(fieldname, start, end, + self.startexcl, self.endexcl, + boost=self.boost) + if q is not None: + return attach(q, self) + except QueryParserError: + e = sys.exc_info()[1] + return attach(query.error_query(e), self) + + if start: + start = get_single_text(field, start, tokenize=False, + removestops=False) + if end: + end = get_single_text(field, end, tokenize=False, + removestops=False) + + q = query.TermRange(fieldname, start, end, self.startexcl, + self.endexcl, boost=self.boost) + return attach(q, self) + + +class TextNode(SyntaxNode): + """Intermediate base class for basic nodes that search for text, such as + term queries, wildcards, prefixes, etc. + + Instances have the following attributes: + + ``qclass`` + If a subclass does not override ``query()``, the base class will use + this class to construct the query. 
+ ``tokenize`` + If True and the subclass does not override ``query()``, the node's text + will be tokenized before constructing the query + ``removestops`` + If True and the subclass does not override ``query()``, and the field's + analyzer has a stop word filter, stop words will be removed from the + text before constructing the query. + """ + + has_fieldname = True + has_text = True + has_boost = True + qclass = None + tokenize = False + removestops = False + + def __init__(self, text): + self.fieldname = None + self.text = text + self.boost = 1.0 + + def r(self): + return "%s %r" % (self.__class__.__name__, self.text) + + def is_text(self): + return True + + def query(self, parser): + fieldname = self.fieldname or parser.fieldname + termclass = self.qclass or parser.termclass + q = parser.term_query(fieldname, self.text, termclass, + boost=self.boost, tokenize=self.tokenize, + removestops=self.removestops) + return attach(q, self) + + +class WordNode(TextNode): + """Syntax node for term queries. + """ + + tokenize = True + removestops = True + + def r(self): + return repr(self.text) + + +# Operators + +class Operator(SyntaxNode): + """Base class for PrefixOperator, PostfixOperator, and InfixOperator. + + Operators work by moving the nodes they apply to (e.g. for prefix operator, + the previous node, for infix operator, the nodes on either side, etc.) into + a group node. The group provides the code for what to do with the nodes. + """ + + def __init__(self, text, grouptype, leftassoc=True): + """ + :param text: the text of the operator in the query string. + :param grouptype: the type of group to create in place of the operator + and the node(s) it operates on. + :param leftassoc: for infix opeators, whether the operator is left + associative. use ``leftassoc=False`` for right-associative infix + operators. + """ + + self.text = text + self.grouptype = grouptype + self.leftassoc = leftassoc + + def r(self): + return "OP %r" % self.text + + def replace_self(self, parser, group, position): + """Called with the parser, a group, and the position at which the + operator occurs in that group. Should return a group with the operator + replaced by whatever effect the operator has (e.g. for an infix op, + replace the op and the nodes on either side with a sub-group). + """ + + raise NotImplementedError + + +class PrefixOperator(Operator): + def replace_self(self, parser, group, position): + length = len(group) + del group[position] + if position < length - 1: + group[position] = self.grouptype([group[position]]) + return position + + +class PostfixOperator(Operator): + def replace_self(self, parser, group, position): + del group[position] + if position > 0: + group[position - 1] = self.grouptype([group[position - 1]]) + return position + + +class InfixOperator(Operator): + def replace_self(self, parser, group, position): + la = self.leftassoc + gtype = self.grouptype + merging = gtype.merging + + if position > 0 and position < len(group) - 1: + left = group[position - 1] + right = group[position + 1] + + # The first two clauses check whether the "strong" side is already + # a group of the type we are going to create. If it is, we just + # append the "weak" side to the "strong" side instead of creating + # a new group inside the existing one. This is necessary because + # we can quickly run into Python's recursion limit otherwise. 
+ if merging and la and isinstance(left, gtype): + left.append(right) + del group[position:position + 2] + elif merging and not la and isinstance(right, gtype): + right.insert(0, left) + del group[position - 1:position + 1] + return position - 1 + else: + # Replace the operator and the two surrounding objects + group[position - 1:position + 2] = [gtype([left, right])] + else: + del group[position] + + return position + + +# Functions + +def to_word(n): + node = WordNode(n.original) + node.startchar = n.startchar + node.endchar = n.endchar + return node diff --git a/src/whoosh/qparser/taggers.py b/src/whoosh/qparser/taggers.py new file mode 100644 index 0000000..e3b5bb5 --- /dev/null +++ b/src/whoosh/qparser/taggers.py @@ -0,0 +1,93 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.util.text import rcompile + + +# Tagger objects + +class Tagger(object): + """Base class for taggers, objects which match syntax in the query string + and translate it into a :class:`whoosh.qparser.syntax.SyntaxNode` object. + """ + + def match(self, parser, text, pos): + """This method should see if this tagger matches the query string at + the given position. If it matches, it should return + + :param parser: the :class:`whoosh.qparser.default.QueryParser` object. + :param text: the text being parsed. + :param pos: the position in the text at which the tagger should try to + match. + """ + + raise NotImplementedError + + +class RegexTagger(Tagger): + """Tagger class that uses regular expressions to match the query string. + Subclasses should override ``create()`` instead of ``match()``. + """ + + def __init__(self, expr): + self.expr = rcompile(expr) + + def match(self, parser, text, pos): + match = self.expr.match(text, pos) + if match: + node = self.create(parser, match) + if node is not None: + node = node.set_range(match.start(), match.end()) + return node + + def create(self, parser, match): + """When the regular expression matches, this method is called to + translate the regex match object into a syntax node. 
+ + :param parser: the :class:`whoosh.qparser.default.QueryParser` object. + :param match: the regex match object. + """ + + raise NotImplementedError + + +class FnTagger(RegexTagger): + """Tagger that takes a regular expression and a class or function, and for + matches calls the class/function with the regex match's named groups as + keyword arguments. + """ + + def __init__(self, expr, fn, memo=""): + RegexTagger.__init__(self, expr) + self.fn = fn + self.memo = memo + + def __repr__(self): + return "<%s %r (%s)>" % (self.__class__.__name__, self.expr, self.memo) + + def create(self, parser, match): + return self.fn(**match.groupdict()) diff --git a/src/whoosh/query/__init__.py b/src/whoosh/query/__init__.py new file mode 100644 index 0000000..97e34a4 --- /dev/null +++ b/src/whoosh/query/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.query.qcore import * +from whoosh.query.terms import * +from whoosh.query.compound import * +from whoosh.query.positional import * +from whoosh.query.ranges import * +from whoosh.query.wrappers import * +from whoosh.query.nested import * +from whoosh.query.qcolumns import * +from whoosh.query.spans import * diff --git a/src/whoosh/query/compound.py b/src/whoosh/query/compound.py new file mode 100644 index 0000000..b458cbc --- /dev/null +++ b/src/whoosh/query/compound.py @@ -0,0 +1,660 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import division + +from whoosh import matching +from whoosh.compat import text_type, u +from whoosh.compat import xrange +from whoosh.query import qcore +from whoosh.util import make_binary_tree, make_weighted_tree + + +class CompoundQuery(qcore.Query): + """Abstract base class for queries that combine or manipulate the results + of multiple sub-queries . + """ + + def __init__(self, subqueries, boost=1.0): + for subq in subqueries: + if not isinstance(subq, qcore.Query): + raise qcore.QueryError("%r is not a query" % subq) + self.subqueries = subqueries + self.boost = boost + + def __repr__(self): + r = "%s(%r" % (self.__class__.__name__, self.subqueries) + if hasattr(self, "boost") and self.boost != 1: + r += ", boost=%s" % self.boost + r += ")" + return r + + def __unicode__(self): + r = u("(") + r += self.JOINT.join([text_type(s) for s in self.subqueries]) + r += u(")") + return r + + __str__ = __unicode__ + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.subqueries == other.subqueries + and self.boost == other.boost) + + def __getitem__(self, i): + return self.subqueries.__getitem__(i) + + def __len__(self): + return len(self.subqueries) + + def __iter__(self): + return iter(self.subqueries) + + def __hash__(self): + h = hash(self.__class__.__name__) ^ hash(self.boost) + for q in self.subqueries: + h ^= hash(q) + return h + + def is_leaf(self): + return False + + def children(self): + return iter(self.subqueries) + + def apply(self, fn): + return self.__class__([fn(q) for q in self.subqueries], + boost=self.boost) + + def field(self): + if self.subqueries: + f = self.subqueries[0].field() + if all(q.field() == f for q in self.subqueries[1:]): + return f + + def estimate_size(self, ixreader): + est = sum(q.estimate_size(ixreader) for q in self.subqueries) + return min(est, ixreader.doc_count()) + + def estimate_min_size(self, ixreader): + from whoosh.query import Not + + subs = self.subqueries + qs = [(q, q.estimate_min_size(ixreader)) for q in subs + if not isinstance(q, Not)] + pos = [minsize for q, minsize in qs if minsize > 0] + if pos: + neg = [q.estimate_size(ixreader) for q in subs + if isinstance(q, Not)] + size = min(pos) - sum(neg) + if size > 0: + return size + return 0 + + def normalize(self): + from whoosh.query import Every, TermRange, NumericRange + + # Normalize subqueries and merge nested instances of this class + subqueries = [] + for s in self.subqueries: + s = s.normalize() + if isinstance(s, self.__class__): + subqueries += [ss.with_boost(ss.boost 
* s.boost) for ss in s] + else: + subqueries.append(s) + + # If every subquery is Null, this query is Null + if all(q is qcore.NullQuery for q in subqueries): + return qcore.NullQuery + + # If there's an unfielded Every inside, then this query is Every + if any((isinstance(q, Every) and q.fieldname is None) + for q in subqueries): + return Every() + + # Merge ranges and Everys + everyfields = set() + i = 0 + while i < len(subqueries): + q = subqueries[i] + f = q.field() + if f in everyfields: + subqueries.pop(i) + continue + + if isinstance(q, (TermRange, NumericRange)): + j = i + 1 + while j < len(subqueries): + if q.overlaps(subqueries[j]): + qq = subqueries.pop(j) + q = q.merge(qq, intersect=self.intersect_merge) + else: + j += 1 + q = subqueries[i] = q.normalize() + + if isinstance(q, Every): + everyfields.add(q.fieldname) + i += 1 + + # Eliminate duplicate queries + subqs = [] + seenqs = set() + for s in subqueries: + if not isinstance(s, Every) and s.field() in everyfields: + continue + if s in seenqs: + continue + seenqs.add(s) + subqs.append(s) + + # Remove NullQuerys + subqs = [q for q in subqs if q is not qcore.NullQuery] + + if not subqs: + return qcore.NullQuery + + if len(subqs) == 1: + sub = subqs[0] + sub_boost = getattr(sub, "boost", 1.0) + if not (self.boost == 1.0 and sub_boost == 1.0): + sub = sub.with_boost(sub_boost * self.boost) + return sub + + return self.__class__(subqs, boost=self.boost) + + def simplify(self, ixreader): + subs = self.subqueries + if subs: + q = self.__class__([subq.simplify(ixreader) for subq in subs], + boost=self.boost).normalize() + else: + q = qcore.NullQuery + return q + + def matcher(self, searcher, context=None): + # This method does a little sanity checking and then passes the info + # down to the _matcher() method which subclasses must implement + + subs = self.subqueries + if not subs: + return matching.NullMatcher() + + if len(subs) == 1: + m = subs[0].matcher(searcher, context) + else: + m = self._matcher(subs, searcher, context) + return m + + def _matcher(self, subs, searcher, context): + # Subclasses must implement this method + + raise NotImplementedError + + def _tree_matcher(self, subs, mcls, searcher, context, q_weight_fn, + **kwargs): + # q_weight_fn is a function which is called on each query and returns a + # "weight" value which is used to build a huffman-like matcher tree. If + # q_weight_fn is None, an order-preserving binary tree is used instead. + + # Create a matcher from the list of subqueries + subms = [q.matcher(searcher, context) for q in subs] + + if len(subms) == 1: + m = subms[0] + elif q_weight_fn is None: + m = make_binary_tree(mcls, subms, **kwargs) + else: + w_subms = [(q_weight_fn(q), m) for q, m in zip(subs, subms)] + m = make_weighted_tree(mcls, w_subms, **kwargs) + + # If this query had a boost, add a wrapping matcher to apply the boost + if self.boost != 1.0: + m = matching.WrappingMatcher(m, self.boost) + + return m + + +class And(CompoundQuery): + """Matches documents that match ALL of the subqueries. + + >>> And([Term("content", u"render"), + ... Term("content", u"shade"), + ... Not(Term("content", u"texture"))]) + >>> # You can also do this + >>> Term("content", u"render") & Term("content", u"shade") + """ + + # This is used by the superclass's __unicode__ method. 
+ JOINT = " AND " + intersect_merge = True + + def requires(self): + s = set() + for q in self.subqueries: + s |= q.requires() + return s + + def estimate_size(self, ixreader): + return min(q.estimate_size(ixreader) for q in self.subqueries) + + def _matcher(self, subs, searcher, context): + r = searcher.reader() + q_weight_fn = lambda q: 0 - q.estimate_size(r) + return self._tree_matcher(subs, matching.IntersectionMatcher, searcher, + context, q_weight_fn) + + +class Or(CompoundQuery): + """Matches documents that match ANY of the subqueries. + + >>> Or([Term("content", u"render"), + ... And([Term("content", u"shade"), Term("content", u"texture")]), + ... Not(Term("content", u"network"))]) + >>> # You can also do this + >>> Term("content", u"render") | Term("content", u"shade") + """ + + # This is used by the superclass's __unicode__ method. + JOINT = " OR " + intersect_merge = False + TOO_MANY_CLAUSES = 1024 + + # For debugging: set the array_type property to control matcher selection + AUTO_MATCHER = 0 # Use automatic heuristics to choose matcher + DEFAULT_MATCHER = 1 # Use a binary tree of UnionMatchers + SPLIT_MATCHER = 2 # Use a different strategy for short and long queries + ARRAY_MATCHER = 3 # Use a matcher that pre-loads docnums and scores + matcher_type = AUTO_MATCHER + + def __init__(self, subqueries, boost=1.0, minmatch=0, scale=None): + """ + :param subqueries: a list of :class:`Query` objects to search for. + :param boost: a boost factor to apply to the scores of all matching + documents. + :param minmatch: not yet implemented. + :param scale: a scaling factor for a "coordination bonus". If this + value is not None, it should be a floating point number greater + than 0 and less than 1. The scores of the matching documents are + boosted/penalized based on the number of query terms that matched + in the document. This number scales the effect of the bonuses. 
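+
+ An illustrative sketch (assumes a TEXT field named "content")::
+
+ q = Or([Term("content", u"render"), Term("content", u"shade")],
+ scale=0.9)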
+ """ + + CompoundQuery.__init__(self, subqueries, boost=boost) + self.minmatch = minmatch + self.scale = scale + + def __unicode__(self): + r = u("(") + r += (self.JOINT).join([text_type(s) for s in self.subqueries]) + r += u(")") + if self.minmatch: + r += u(">%s") % self.minmatch + return r + + __str__ = __unicode__ + + def normalize(self): + norm = CompoundQuery.normalize(self) + if norm.__class__ is self.__class__: + norm.minmatch = self.minmatch + norm.scale = self.scale + return norm + + def requires(self): + if len(self.subqueries) == 1: + return self.subqueries[0].requires() + else: + return set() + + def _matcher(self, subs, searcher, context): + needs_current = context.needs_current if context else True + weighting = context.weighting if context else None + matcher_type = self.matcher_type + + if matcher_type == self.AUTO_MATCHER: + dc = searcher.doc_count_all() + if (len(subs) < self.TOO_MANY_CLAUSES + and (needs_current + or self.scale + or len(subs) == 2 + or dc > 5000)): + # If the parent matcher needs the current match, or there's just + # two sub-matchers, use the standard binary tree of Unions + matcher_type = self.DEFAULT_MATCHER + else: + # For small indexes, or too many clauses, just preload all + # matches + matcher_type = self.ARRAY_MATCHER + + if matcher_type == self.DEFAULT_MATCHER: + # Implementation of Or that creates a binary tree of Union matchers + cls = DefaultOr + elif matcher_type == self.SPLIT_MATCHER: + # Hybrid of pre-loading small queries and a binary tree of union + # matchers for big queries + cls = SplitOr + elif matcher_type == self.ARRAY_MATCHER: + # Implementation that pre-loads docnums and scores into an array + cls = PreloadedOr + else: + raise ValueError("Unknown matcher_type %r" % self.matcher_type) + + return cls(subs, boost=self.boost, minmatch=self.minmatch, + scale=self.scale).matcher(searcher, context) + + +class DefaultOr(Or): + JOINT = " dOR " + + def _matcher(self, subs, searcher, context): + reader = searcher.reader() + q_weight_fn = lambda q: q.estimate_size(reader) + m = self._tree_matcher(subs, matching.UnionMatcher, searcher, context, + q_weight_fn) + + # If a scaling factor was given, wrap the matcher in a CoordMatcher to + # alter scores based on term coordination + if self.scale and any(m.term_matchers()): + m = matching.CoordMatcher(m, scale=self.scale) + + return m + + +class SplitOr(Or): + JOINT = " sOr " + SPLIT_DOC_LIMIT = 8000 + + def matcher(self, searcher, context=None): + from whoosh import collectors + + # Get the subqueries + subs = self.subqueries + if not subs: + return matching.NullMatcher() + elif len(subs) == 1: + return subs[0].matcher(searcher, context) + + # Sort the subqueries into "small" and "big" queries based on their + # estimated size. This works best for term queries. 
+ reader = searcher.reader() + smallqs = [] + bigqs = [] + for q in subs: + size = q.estimate_size(reader) + if size <= self.SPLIT_DOC_LIMIT: + smallqs.append(q) + else: + bigqs.append(q) + + # Build a pre-scored matcher for the small queries + minscore = 0 + smallmatcher = None + if smallqs: + smallmatcher = DefaultOr(smallqs).matcher(searcher, context) + smallmatcher = matching.ArrayMatcher(smallmatcher, context.limit) + minscore = smallmatcher.limit_quality() + if bigqs: + # Get a matcher for the big queries + m = DefaultOr(bigqs).matcher(searcher, context) + # Add the prescored matcher for the small queries + if smallmatcher: + m = matching.UnionMatcher(m, smallmatcher) + # Set the minimum score based on the prescored matcher + m.set_min_quality(minscore) + elif smallmatcher: + # If there are no big queries, just return the prescored matcher + m = smallmatcher + else: + m = matching.NullMatcher() + + return m + + +class PreloadedOr(Or): + JOINT = " pOR " + + def _matcher(self, subs, searcher, context): + if context: + scored = context.weighting is not None + else: + scored = True + + ms = [sub.matcher(searcher, context) for sub in subs] + doccount = searcher.doc_count_all() + am = matching.ArrayUnionMatcher(ms, doccount, boost=self.boost, + scored=scored) + return am + + +class DisjunctionMax(CompoundQuery): + """Matches all documents that match any of the subqueries, but scores each + document using the maximum score from the subqueries. + """ + + def __init__(self, subqueries, boost=1.0, tiebreak=0.0): + CompoundQuery.__init__(self, subqueries, boost=boost) + self.tiebreak = tiebreak + + def __unicode__(self): + r = u("DisMax(") + r += " ".join(sorted(text_type(s) for s in self.subqueries)) + r += u(")") + if self.tiebreak: + r += u("~") + text_type(self.tiebreak) + return r + + __str__ = __unicode__ + + def normalize(self): + norm = CompoundQuery.normalize(self) + if norm.__class__ is self.__class__: + norm.tiebreak = self.tiebreak + return norm + + def requires(self): + if len(self.subqueries) == 1: + return self.subqueries[0].requires() + else: + return set() + + def _matcher(self, subs, searcher, context): + r = searcher.reader() + q_weight_fn = lambda q: q.estimate_size(r) + return self._tree_matcher(subs, matching.DisjunctionMaxMatcher, + searcher, context, q_weight_fn, + tiebreak=self.tiebreak) + + +# Boolean queries + +class BinaryQuery(CompoundQuery): + """Base class for binary queries (queries which are composed of two + sub-queries). Subclasses should set the ``matcherclass`` attribute or + override ``matcher()``, and may also need to override ``normalize()``, + ``estimate_size()``, and/or ``estimate_min_size()``. 
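+
+ For example, ``Require`` below simply sets
+ ``matcherclass = matching.RequireMatcher``, while ``AndNot`` overrides
+ ``matcher()`` itself. A minimal sketch::
+
+ q = AndNot(Term("content", u"render"), Term("content", u"texture"))
+ q = q.normalize()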
+ """ + + boost = 1.0 + + def __init__(self, a, b): + self.a = a + self.b = b + self.subqueries = (a, b) + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.a == other.a and self.b == other.b) + + def __hash__(self): + return (hash(self.__class__.__name__) ^ hash(self.a) ^ hash(self.b)) + + def needs_spans(self): + return self.a.needs_spans() or self.b.needs_spans() + + def apply(self, fn): + return self.__class__(fn(self.a), fn(self.b)) + + def field(self): + f = self.a.field() + if self.b.field() == f: + return f + + def with_boost(self, boost): + return self.__class__(self.a.with_boost(boost), + self.b.with_boost(boost)) + + def normalize(self): + a = self.a.normalize() + b = self.b.normalize() + if a is qcore.NullQuery and b is qcore.NullQuery: + return qcore.NullQuery + elif a is qcore.NullQuery: + return b + elif b is qcore.NullQuery: + return a + + return self.__class__(a, b) + + def matcher(self, searcher, context=None): + return self.matcherclass(self.a.matcher(searcher, context), + self.b.matcher(searcher, context)) + + +class AndNot(BinaryQuery): + """Binary boolean query of the form 'a ANDNOT b', where documents that + match b are removed from the matches for a. + """ + + JOINT = " ANDNOT " + + def with_boost(self, boost): + return self.__class__(self.a.with_boost(boost), self.b) + + def normalize(self): + a = self.a.normalize() + b = self.b.normalize() + + if a is qcore.NullQuery: + return qcore.NullQuery + elif b is qcore.NullQuery: + return a + + return self.__class__(a, b) + + def requires(self): + return self.a.requires() + + def matcher(self, searcher, context=None): + scoredm = self.a.matcher(searcher, context) + notm = self.b.matcher(searcher, searcher.boolean_context()) + return matching.AndNotMatcher(scoredm, notm) + + +class Otherwise(BinaryQuery): + """A binary query that only matches the second clause if the first clause + doesn't match any documents. + """ + + JOINT = " OTHERWISE " + + def matcher(self, searcher, context=None): + m = self.a.matcher(searcher, context) + if not m.is_active(): + m = self.b.matcher(searcher, context) + return m + + +class Require(BinaryQuery): + """Binary query returns results from the first query that also appear in + the second query, but only uses the scores from the first query. This lets + you filter results without affecting scores. + """ + + JOINT = " REQUIRE " + matcherclass = matching.RequireMatcher + + def requires(self): + return self.a.requires() | self.b.requires() + + def estimate_size(self, ixreader): + return self.b.estimate_size(ixreader) + + def estimate_min_size(self, ixreader): + return self.b.estimate_min_size(ixreader) + + def with_boost(self, boost): + return self.__class__(self.a.with_boost(boost), self.b) + + def normalize(self): + a = self.a.normalize() + b = self.b.normalize() + if a is qcore.NullQuery or b is qcore.NullQuery: + return qcore.NullQuery + return self.__class__(a, b) + + def docs(self, searcher): + return And(self.subqueries).docs(searcher) + + def matcher(self, searcher, context=None): + scoredm = self.a.matcher(searcher, context) + requiredm = self.b.matcher(searcher, searcher.boolean_context()) + return matching.AndNotMatcher(scoredm, requiredm) + + +class AndMaybe(BinaryQuery): + """Binary query takes results from the first query. If and only if the + same document also appears in the results from the second query, the score + from the second query will be added to the score from the first query. 
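+
+ A minimal sketch (assumes a TEXT field named "content")::
+
+ # "alfa" is required; also matching "bravo" only increases the score
+ q = AndMaybe(Term("content", u"alfa"), Term("content", u"bravo"))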
+ """ + + JOINT = " ANDMAYBE " + matcherclass = matching.AndMaybeMatcher + + def normalize(self): + a = self.a.normalize() + b = self.b.normalize() + if a is qcore.NullQuery: + return qcore.NullQuery + if b is qcore.NullQuery: + return a + return self.__class__(a, b) + + def requires(self): + return self.a.requires() + + def estimate_min_size(self, ixreader): + return self.subqueries[0].estimate_min_size(ixreader) + + def docs(self, searcher): + return self.subqueries[0].docs(searcher) + + +def BooleanQuery(required, should, prohibited): + return AndNot(AndMaybe(And(required), Or(should)), + Or(prohibited)).normalize() diff --git a/src/whoosh/query/nested.py b/src/whoosh/query/nested.py new file mode 100644 index 0000000..a627adf --- /dev/null +++ b/src/whoosh/query/nested.py @@ -0,0 +1,412 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
+ +from whoosh import matching +from whoosh.compat import text_type, u, xrange +from whoosh.query import qcore +from whoosh.query.wrappers import WrappingQuery + + +class NestedParent(WrappingQuery): + """A query that allows you to search for "nested" documents, where you can + index (possibly multiple levels of) "parent" and "child" documents using + the :meth:`~whoosh.writing.IndexWriter.group` and/or + :meth:`~whoosh.writing.IndexWriter.start_group` methods of a + :class:`whoosh.writing.IndexWriter` to indicate that hierarchically related + documents should be kept together:: + + schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True)) + + with ix.writer() as w: + # Say we're indexing chapters (type=chap) and each chapter has a + # number of paragraphs (type=p) + with w.group(): + w.add_document(type="chap", text="Chapter 1") + w.add_document(type="p", text="Able baker") + w.add_document(type="p", text="Bright morning") + with w.group(): + w.add_document(type="chap", text="Chapter 2") + w.add_document(type="p", text="Car trip") + w.add_document(type="p", text="Dog eared") + w.add_document(type="p", text="Every day") + with w.group(): + w.add_document(type="chap", text="Chapter 3") + w.add_document(type="p", text="Fine day") + + The ``NestedParent`` query wraps two sub-queries: the "parent query" + matches a class of "parent documents". The "sub query" matches nested + documents you want to find. For each "sub document" the "sub query" finds, + this query acts as if it found the corresponding "parent document". + + >>> with ix.searcher() as s: + ... r = s.search(query.Term("text", "day")) + ... for hit in r: + ... print(hit["text"]) + ... + Chapter 2 + Chapter 3 + """ + + def __init__(self, parents, subq, per_parent_limit=None, score_fn=sum): + """ + :param parents: a query, DocIdSet object, or Results object + representing the documents you want to use as the "parent" + documents. Where the sub-query matches, the corresponding document + in these results will be returned as the match. + :param subq: a query matching the information you want to find. + :param per_parent_limit: a maximum number of "sub documents" to search + per parent. The default is None, meaning no limit. + :param score_fn: a function to use to combine the scores of matching + sub-documents to calculate the score returned for the parent + document. The default is ``sum``, that is, add up the scores of the + sub-documents. 
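+
+ An illustrative sketch, reusing the chapter/paragraph schema from the
+ class docstring above::
+
+ parents = query.Term("type", "chap")
+ q = NestedParent(parents, query.Term("text", "day"),
+ per_parent_limit=10)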
+ """ + + self.parents = parents + self.child = subq + self.per_parent_limit = per_parent_limit + self.score_fn = score_fn + + def normalize(self): + p = self.parents + if isinstance(p, qcore.Query): + p = p.normalize() + q = self.child.normalize() + + if p is qcore.NullQuery or q is qcore.NullQuery: + return qcore.NullQuery + + return self.__class__(p, q) + + def requires(self): + return self.child.requires() + + def matcher(self, searcher, context=None): + bits = searcher._filter_to_comb(self.parents) + if not bits: + return matching.NullMatcher + m = self.child.matcher(searcher, context) + if not m.is_active(): + return matching.NullMatcher + + return self.NestedParentMatcher(bits, m, self.per_parent_limit, + searcher.doc_count_all()) + + def deletion_docs(self, searcher): + bits = searcher._filter_to_comb(self.parents) + if not bits: + return + + m = self.child.matcher(searcher, searcher.boolean_context()) + maxdoc = searcher.doc_count_all() + while m.is_active(): + docnum = m.id() + parentdoc = bits.before(docnum + 1) + nextparent = bits.after(docnum) or maxdoc + for i in xrange(parentdoc, nextparent): + yield i + m.skip_to(nextparent) + + class NestedParentMatcher(matching.Matcher): + def __init__(self, comb, child, per_parent_limit, maxdoc): + self.comb = comb + self.child = child + self.per_parent_limit = per_parent_limit + self.maxdoc = maxdoc + + self._nextdoc = None + if self.child.is_active(): + self._gather() + + def is_active(self): + return self._nextdoc is not None + + def supports_block_quality(self): + return False + + def _gather(self): + # This is where the magic happens ;) + child = self.child + pplimit = self.per_parent_limit + + # The next document returned by this matcher is the parent of the + # child's current document. We don't have to worry about whether + # the parent is deleted, because the query that gave us the parents + # wouldn't return deleted documents. + self._nextdoc = self.comb.before(child.id() + 1) + # The next parent after the child matcher's current document + nextparent = self.comb.after(child.id()) or self.maxdoc + + # Sum the scores of all matching documents under the parent + count = 1 + score = 0 + while child.is_active() and child.id() < nextparent: + if pplimit and count > pplimit: + child.skip_to(nextparent) + break + + score += child.score() + child.next() + count += 1 + + self._nextscore = score + + def id(self): + return self._nextdoc + + def score(self): + return self._nextscore + + def reset(self): + self.child.reset() + self._gather() + + def next(self): + if self.child.is_active(): + self._gather() + else: + if self._nextdoc is None: + raise matching.ReadTooFar + else: + self._nextdoc = None + + def skip_to(self, id): + self.child.skip_to(id) + self._gather() + + def value(self): + raise NotImplementedError(self.__class__) + + def spans(self): + return [] + + +class NestedChildren(WrappingQuery): + """This is the reverse of a :class:`NestedParent` query: instead of taking + a query that matches children but returns the parent, this query matches + parents but returns the children. 
+ + This is useful, for example, to search for an album title and return the + songs in the album:: + + schema = fields.Schema(type=fields.ID(stored=True), + album_name=fields.TEXT(stored=True), + track_num=fields.NUMERIC(stored=True), + track_name=fields.TEXT(stored=True), + lyrics=fields.TEXT) + ix = RamStorage().create_index(schema) + + # Indexing + with ix.writer() as w: + # For each album, index a "group" of a parent "album" document and + # multiple child "track" documents. + with w.group(): + w.add_document(type="album", + artist="The Cure", album_name="Disintegration") + w.add_document(type="track", track_num=1, + track_name="Plainsong") + w.add_document(type="track", track_num=2, + track_name="Pictures of You") + # ... + # ... + + + # Find songs where the song name has "heaven" in the title and the + # album the song is on has "hell" in the title + qp = QueryParser("lyrics", ix.schema) + with ix.searcher() as s: + # A query that matches all parents + all_albums = qp.parse("type:album") + + # A query that matches the parents we want + albums_with_hell = qp.parse("album_name:hell") + + # A query that matches the desired albums but returns the tracks + songs_on_hell_albums = NestedChildren(all_albums, albums_with_hell) + + # A query that matches tracks with heaven in the title + songs_with_heaven = qp.parse("track_name:heaven") + + # A query that finds tracks with heaven in the title on albums + # with hell in the title + q = query.And([songs_on_hell_albums, songs_with_heaven]) + + """ + + def __init__(self, parents, subq, boost=1.0): + self.parents = parents + self.child = subq + self.boost = boost + + def matcher(self, searcher, context=None): + bits = searcher._filter_to_comb(self.parents) + if not bits: + return matching.NullMatcher + + m = self.child.matcher(searcher, context) + if not m.is_active(): + return matching.NullMatcher + + return self.NestedChildMatcher(bits, m, searcher.doc_count_all(), + searcher.reader().is_deleted, + boost=self.boost) + + class NestedChildMatcher(matching.WrappingMatcher): + def __init__(self, parent_comb, wanted_parent_matcher, limit, + is_deleted, boost=1.0): + self.parent_comb = parent_comb + self.child = wanted_parent_matcher + self.limit = limit + self.is_deleted = is_deleted + self.boost = boost + self._nextchild = -1 + self._nextparent = -1 + self._find_next_children() + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, + self.parent_comb, + self.child) + + def reset(self): + self.child.reset() + self._reset() + + def _reset(self): + self._nextchild = -1 + self._nextparent = -1 + self._find_next_children() + + def is_active(self): + return self._nextchild < self._nextparent + + def replace(self, minquality=0): + return self + + def _find_next_children(self): + # "comb" contains the doc IDs of all parent documents + comb = self.parent_comb + # "m" is the matcher for "wanted" parents + m = self.child + # Last doc ID + 1 + limit = self.limit + # A function that returns True if a doc ID is deleted + is_deleted = self.is_deleted + nextchild = self._nextchild + nextparent = self._nextparent + + while m.is_active(): + # Move the "child id" to the document after the current match + nextchild = m.id() + 1 + # Move the parent matcher to the next match + m.next() + + # Find the next parent document (matching or not) after this + nextparent = comb.after(nextchild) + if nextparent is None: + nextparent = limit + + # Skip any deleted child documents + while is_deleted(nextchild): + nextchild += 1 + + # If skipping deleted documents 
put us to or past the next + # parent doc, go again + if nextchild >= nextparent: + continue + else: + # Otherwise, we're done + break + + self._nextchild = nextchild + self._nextparent = nextparent + + def id(self): + return self._nextchild + + def all_ids(self): + while self.is_active(): + yield self.id() + self.next() + + def next(self): + is_deleted = self.is_deleted + limit = self.limit + nextparent = self._nextparent + + # Go to the next document + nextchild = self._nextchild + nextchild += 1 + + # Skip over any deleted child documents + while nextchild < nextparent and is_deleted(nextchild): + nextchild += 1 + + self._nextchild = nextchild + # If we're at or past the next parent doc, go to the next set of + # children + if nextchild >= limit: + return + elif nextchild >= nextparent: + self._find_next_children() + + def skip_to(self, docid): + comb = self.parent_comb + wanted = self.child + + # self._nextchild is the "current" matching child ID + if docid <= self._nextchild: + return + + # self._nextparent is the next parent ID (matching or not) + if docid < self._nextparent: + # Just iterate + while self.is_active() and self.id() < docid: + self.next() + elif wanted.is_active(): + # Find the parent before the target ID + pid = comb.before(docid) + # Skip the parent matcher to that ID + wanted.skip_to(pid) + # If that made the matcher inactive, then we're done + if not wanted.is_active(): + self._nextchild = self._nextparent = self.limit + else: + # Reestablish for the next child after the next matching + # parent + self._find_next_children() + else: + self._nextchild = self._nextparent = self.limit + + def value(self): + raise NotImplementedError(self.__class__) + + def score(self): + return self.boost + + def spans(self): + return [] diff --git a/src/whoosh/query/positional.py b/src/whoosh/query/positional.py new file mode 100644 index 0000000..4ad9320 --- /dev/null +++ b/src/whoosh/query/positional.py @@ -0,0 +1,249 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
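Before the positional queries that follow, a short usage sketch of the nested queries defined above may be helpful. This sketch is not part of the patch: the field names and sample data are made up, and it assumes ``NestedParent`` and ``NestedChildren`` are re-exported through ``whoosh.query`` as in this release::

    from whoosh import fields, query
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(kind=fields.ID(stored=True),
                           name=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        # Each group() block keeps a parent document and its children together
        with w.group():
            w.add_document(kind=u"album", name=u"Disintegration")
            w.add_document(kind=u"track", name=u"Plainsong")
            w.add_document(kind=u"track", name=u"Pictures of You")
        with w.group():
            w.add_document(kind=u"album", name=u"Wish")
            w.add_document(kind=u"track", name=u"High")

    # Both nested queries take the same "all parents" query as the first argument
    all_albums = query.Term("kind", u"album")

    with ix.searcher() as s:
        # NestedParent: match child documents, return the enclosing parent
        q1 = query.NestedParent(all_albums, query.Term("name", u"plainsong"))
        print([hit["name"] for hit in s.search(q1)])   # -> ["Disintegration"]

        # NestedChildren: match parent documents, return their children
        q2 = query.NestedChildren(all_albums, query.Term("name", u"wish"))
        print([hit["name"] for hit in s.search(q2)])   # -> ["High"]

The two classes are mirror images: both take the query describing every parent document as the first argument, and only the role of the second argument (match children and return parents, or match parents and return children) differs.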
+ +from __future__ import division +import copy + +from whoosh import matching +from whoosh.analysis import Token +from whoosh.compat import u +from whoosh.query import qcore, terms, compound + + +class Sequence(compound.CompoundQuery): + """Matches documents containing a list of sub-queries in adjacent + positions. + + This object has no sanity check to prevent you from using queries in + different fields. + """ + + JOINT = " NEAR " + intersect_merge = True + + def __init__(self, subqueries, slop=1, ordered=True, boost=1.0): + """ + :param subqueries: a list of :class:`whoosh.query.Query` objects to + match in sequence. + :param slop: the maximum difference in position allowed between the + subqueries. + :param ordered: if True, the position differences between subqueries + must be positive (that is, each subquery in the list must appear + after the previous subquery in the document). + :param boost: a boost factor to add to the score of documents matching + this query. + """ + + compound.CompoundQuery.__init__(self, subqueries, boost=boost) + self.slop = slop + self.ordered = ordered + + def __eq__(self, other): + return (other and type(self) is type(other) + and self.subqueries == other.subqueries + and self.boost == other.boost) + + def __repr__(self): + return "%s(%r, slop=%d, boost=%f)" % (self.__class__.__name__, + self.subqueries, self.slop, + self.boost) + + def __hash__(self): + h = hash(self.slop) ^ hash(self.boost) + for q in self.subqueries: + h ^= hash(q) + return h + + def normalize(self): + # Because the subqueries are in sequence, we can't do the fancy merging + # that CompoundQuery does + return self.__class__([q.normalize() for q in self.subqueries], + self.slop, self.ordered, self.boost) + + def _and_query(self): + return compound.And(self.subqueries) + + def estimate_size(self, ixreader): + return self._and_query().estimate_size(ixreader) + + def estimate_min_size(self, ixreader): + return self._and_query().estimate_min_size(ixreader) + + def _matcher(self, subs, searcher, context): + from whoosh.query.spans import SpanNear + + # Tell the sub-queries this matcher will need the current match to get + # spans + context = context.set(needs_current=True) + m = self._tree_matcher(subs, SpanNear.SpanNearMatcher, searcher, + context, None, slop=self.slop, + ordered=self.ordered) + return m + + +class Ordered(Sequence): + """Matches documents containing a list of sub-queries in the given order. + """ + + JOINT = " BEFORE " + + def _matcher(self, subs, searcher, context): + from whoosh.query.spans import SpanBefore + + return self._tree_matcher(subs, SpanBefore._Matcher, searcher, + context, None) + + +class Phrase(qcore.Query): + """Matches documents containing a given phrase.""" + + def __init__(self, fieldname, words, slop=1, boost=1.0, char_ranges=None): + """ + :param fieldname: the field to search. + :param words: a list of words (unicode strings) in the phrase. + :param slop: the number of words allowed between each "word" in the + phrase; the default of 1 means the phrase must match exactly. + :param boost: a boost factor that to apply to the raw score of + documents matched by this query. 
+ :param char_ranges: if a Phrase object is created by the query parser, + it will set this attribute to a list of (startchar, endchar) pairs + corresponding to the words in the phrase + """ + + self.fieldname = fieldname + self.words = words + self.slop = slop + self.boost = boost + self.char_ranges = char_ranges + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.fieldname == other.fieldname + and self.words == other.words + and self.slop == other.slop + and self.boost == other.boost) + + def __repr__(self): + return "%s(%r, %r, slop=%s, boost=%f)" % (self.__class__.__name__, + self.fieldname, self.words, + self.slop, self.boost) + + def __unicode__(self): + return u('%s:"%s"') % (self.fieldname, u(" ").join(self.words)) + + __str__ = __unicode__ + + def __hash__(self): + h = hash(self.fieldname) ^ hash(self.slop) ^ hash(self.boost) + for w in self.words: + h ^= hash(w) + return h + + def has_terms(self): + return True + + def terms(self, phrases=False): + if phrases and self.field(): + for word in self.words: + yield (self.field(), word) + + def tokens(self, boost=1.0): + char_ranges = self.char_ranges + startchar = endchar = None + for i, word in enumerate(self.words): + if char_ranges: + startchar, endchar = char_ranges[i] + + yield Token(fieldname=self.fieldname, text=word, + boost=boost * self.boost, startchar=startchar, + endchar=endchar, chars=True) + + def normalize(self): + if not self.words: + return qcore.NullQuery + if len(self.words) == 1: + t = terms.Term(self.fieldname, self.words[0]) + if self.char_ranges: + t.startchar, t.endchar = self.char_ranges[0] + return t + + words = [w for w in self.words if w is not None] + return self.__class__(self.fieldname, words, slop=self.slop, + boost=self.boost, char_ranges=self.char_ranges) + + def replace(self, fieldname, oldtext, newtext): + q = copy.copy(self) + if q.fieldname == fieldname: + for i, word in enumerate(q.words): + if word == oldtext: + q.words[i] = newtext + return q + + def _and_query(self): + return compound.And([terms.Term(self.fieldname, word) + for word in self.words]) + + def estimate_size(self, ixreader): + return self._and_query().estimate_size(ixreader) + + def estimate_min_size(self, ixreader): + return self._and_query().estimate_min_size(ixreader) + + def matcher(self, searcher, context=None): + from whoosh.query import Term, SpanNear2 + + fieldname = self.fieldname + if fieldname not in searcher.schema: + return matching.NullMatcher() + + field = searcher.schema[fieldname] + if not field.format or not field.format.supports("positions"): + raise qcore.QueryError("Phrase search: %r field has no positions" + % self.fieldname) + + terms = [] + # Build a list of Term queries from the words in the phrase + reader = searcher.reader() + for word in self.words: + try: + word = field.to_bytes(word) + except ValueError: + return matching.NullMatcher() + + if (fieldname, word) not in reader: + # Shortcut the query if one of the words doesn't exist. 
+                return matching.NullMatcher() +            terms.append(Term(fieldname, word)) + + # Create the equivalent SpanNear2 query from the terms + q = SpanNear2(terms, slop=self.slop, ordered=True, mindist=1) + # Get the matcher + m = q.matcher(searcher, context) + + if self.boost != 1.0: + m = matching.WrappingMatcher(m, boost=self.boost) + return m diff --git a/src/whoosh/query/qcolumns.py b/src/whoosh/query/qcolumns.py new file mode 100644 index 0000000..d2dd6af --- /dev/null +++ b/src/whoosh/query/qcolumns.py @@ -0,0 +1,117 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from whoosh.matching import ConstantScoreMatcher, NullMatcher, ReadTooFar +from whoosh.query import Query + + +class ColumnQuery(Query): + """A query that matches per-document values stored in a column rather than + terms in the inverted index. + + This may be useful in special circumstances, but note that this is MUCH + SLOWER than searching an indexed field. + """ + + def __init__(self, fieldname, condition): + """ + :param fieldname: the name of the field to look in. If the field does + not have a column, this query will not match anything. + :param condition: if this is a callable, it is called on each value + in the column, and the documents for which callable(docvalue) returns + True are returned as matching documents. If this is not a callable, + the document values are compared to it (using ``==``).
+ """ + + self.fieldname = fieldname + self.condition = condition + + def is_leaf(self): + return True + + def matcher(self, searcher, context=None): + fieldname = self.fieldname + condition = self.condition + if callable(condition): + comp = condition + else: + def comp(v): + # Made this a function instead of a lambda so I could put + # debug prints here if necessary ;) + return v == condition + + reader = searcher.reader() + if not reader.has_column(fieldname): + return NullMatcher() + + creader = reader.column_reader(fieldname) + return ColumnMatcher(creader, comp) + + +class ColumnMatcher(ConstantScoreMatcher): + def __init__(self, creader, condition): + self.creader = creader + self.condition = condition + self._i = 0 + self._find_next() + + def _find_next(self): + condition = self.condition + creader = self.creader + + while self._i < len(creader) and not condition(creader[self._i]): + self._i += 1 + + def is_active(self): + return self._i < len(self.creader) + + def next(self): + if not self.is_active(): + raise ReadTooFar + self._i += 1 + self._find_next() + + def reset(self): + self._i = 0 + self._find_next() + + def id(self): + return self._i + + def all_ids(self): + condition = self.condition + for docnum, v in enumerate(self.creader): + if condition(v): + yield docnum + + def supports(self, astype): + return False + + def skip_to_quality(self, minquality): + if self._score <= minquality: + self._i = len(self.creader) + return True diff --git a/src/whoosh/query/qcore.py b/src/whoosh/query/qcore.py new file mode 100644 index 0000000..883a90b --- /dev/null +++ b/src/whoosh/query/qcore.py @@ -0,0 +1,715 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import division +import copy +from array import array + +from whoosh import matching +from whoosh.compat import u +from whoosh.reading import TermNotFound +from whoosh.compat import methodcaller + + +# Exceptions + +class QueryError(Exception): + """Error encountered while running a query. 
+ """ + pass + + +# Functions + +def error_query(msg, q=None): + """Returns the query in the second argument (or a :class:`NullQuery` if the + second argument is not given) with its ``error`` attribute set to + ``msg``. + """ + + if q is None: + q = _NullQuery() + q.error = msg + return q + + +def token_lists(q, phrases=True): + """Returns the terms in the query tree, with the query hierarchy + represented as nested lists. + """ + + if q.is_leaf(): + from whoosh.query import Phrase + if phrases or not isinstance(q, Phrase): + return list(q.tokens()) + else: + ls = [] + for qq in q.children(): + t = token_lists(qq, phrases=phrases) + if len(t) == 1: + t = t[0] + if t: + ls.append(t) + return ls + + +# Utility classes + +class Lowest(object): + """A value that is always compares lower than any other object except + itself. + """ + + def __cmp__(self, other): + if other.__class__ is Lowest: + return 0 + return -1 + + def __eq__(self, other): + return self.__class__ is type(other) + + def __lt__(self, other): + return type(other) is not self.__class__ + + def __ne__(self, other): + return not self.__eq__(other) + + def __gt__(self, other): + return not (self.__lt__(other) or self.__eq__(other)) + + def __le__(self, other): + return self.__eq__(other) or self.__lt__(other) + + def __ge__(self, other): + return self.__eq__(other) or self.__gt__(other) + + +class Highest(object): + """A value that is always compares higher than any other object except + itself. + """ + + def __cmp__(self, other): + if other.__class__ is Highest: + return 0 + return 1 + + def __eq__(self, other): + return self.__class__ is type(other) + + def __lt__(self, other): + return type(other) is self.__class__ + + def __ne__(self, other): + return not self.__eq__(other) + + def __gt__(self, other): + return not (self.__lt__(other) or self.__eq__(other)) + + def __le__(self, other): + return self.__eq__(other) or self.__lt__(other) + + def __ge__(self, other): + return self.__eq__(other) or self.__gt__(other) + + +Lowest = Lowest() +Highest = Highest() + + +# Base classes + +class Query(object): + """Abstract base class for all queries. + + Note that this base class implements __or__, __and__, and __sub__ to allow + slightly more convenient composition of query objects:: + + >>> Term("content", u"a") | Term("content", u"b") + Or([Term("content", u"a"), Term("content", u"b")]) + + >>> Term("content", u"a") & Term("content", u"b") + And([Term("content", u"a"), Term("content", u"b")]) + + >>> Term("content", u"a") - Term("content", u"b") + And([Term("content", u"a"), Not(Term("content", u"b"))]) + """ + + # For queries produced by the query parser, record where in the user + # query this object originated + startchar = endchar = None + # For queries produced by the query parser, records an error that resulted + # in this query + error = None + + def __unicode__(self): + raise NotImplementedError(self.__class__.__name__) + + def __getitem__(self, item): + raise NotImplementedError + + def __or__(self, query): + """Allows you to use | between query objects to wrap them in an Or + query. + """ + + from whoosh.query import Or + return Or([self, query]).normalize() + + def __and__(self, query): + """Allows you to use & between query objects to wrap them in an And + query. + """ + + from whoosh.query import And + return And([self, query]).normalize() + + def __sub__(self, query): + """Allows you to use - between query objects to add the right-hand + query as a "NOT" query. 
+ """ + + from whoosh.query import And, Not + return And([self, Not(query)]).normalize() + + def __hash__(self): + raise NotImplementedError + + def __ne__(self, other): + return not self.__eq__(other) + + def is_leaf(self): + """Returns True if this is a leaf node in the query tree, or False if + this query has sub-queries. + """ + + return True + + def children(self): + """Returns an iterator of the subqueries of this object. + """ + + return iter([]) + + def is_range(self): + """Returns True if this object searches for values within a range. + """ + + return False + + def has_terms(self): + """Returns True if this specific object represents a search for a + specific term (as opposed to a pattern, as in Wildcard and Prefix) or + terms (i.e., whether the ``replace()`` method does something + meaningful on this instance). + """ + + return False + + def needs_spans(self): + for child in self.children(): + if child.needs_spans(): + return True + return False + + def apply(self, fn): + """If this query has children, calls the given function on each child + and returns a new copy of this node with the new children returned by + the function. If this is a leaf node, simply returns this object. + + This is useful for writing functions that transform a query tree. For + example, this function changes all Term objects in a query tree into + Variations objects:: + + def term2var(q): + if isinstance(q, Term): + return Variations(q.fieldname, q.text) + else: + return q.apply(term2var) + + q = And([Term("f", "alfa"), + Or([Term("f", "bravo"), + Not(Term("f", "charlie"))])]) + q = term2var(q) + + Note that this method does not automatically create copies of nodes. + To avoid modifying the original tree, your function should call the + :meth:`Query.copy` method on nodes before changing their attributes. + """ + + return self + + def accept(self, fn): + """Applies the given function to this query's subqueries (if any) and + then to this query itself:: + + def boost_phrases(q): + if isintance(q, Phrase): + q.boost *= 2.0 + return q + + myquery = myquery.accept(boost_phrases) + + This method automatically creates copies of the nodes in the original + tree before passing them to your function, so your function can change + attributes on nodes without altering the original tree. + + This method is less flexible than using :meth:`Query.apply` (in fact + it's implemented using that method) but is often more straightforward. + """ + + def fn_wrapper(q): + q = q.apply(fn_wrapper) + return fn(q) + + return fn_wrapper(self) + + def replace(self, fieldname, oldtext, newtext): + """Returns a copy of this query with oldtext replaced by newtext (if + oldtext was anywhere in this query). + + Note that this returns a *new* query with the given text replaced. It + *does not* modify the original query "in place". + """ + + # The default implementation uses the apply method to "pass down" the + # replace() method call + if self.is_leaf(): + return copy.copy(self) + else: + return self.apply(methodcaller("replace", fieldname, oldtext, + newtext)) + + def copy(self): + """Deprecated, just use ``copy.deepcopy``. + """ + + return copy.deepcopy(self) + + def all_terms(self, phrases=True): + """Returns a set of all terms in this query tree. + + This method exists for backwards-compatibility. Use iter_all_terms() + instead. + + :param phrases: Whether to add words found in Phrase queries. 
+ :rtype: set + """ + + return set(self.iter_all_terms(phrases=phrases)) + + def terms(self, phrases=False): + """Yields zero or more (fieldname, text) pairs queried by this object. + You can check whether a query object targets specific terms before you + call this method using :meth:`Query.has_terms`. + + To get all terms in a query tree, use :meth:`Query.iter_all_terms`. + """ + + return iter(()) + + def expanded_terms(self, ixreader, phrases=True): + return self.terms(phrases=phrases) + + def existing_terms(self, ixreader, phrases=True, expand=False, fieldname=None): + """Returns a set of all byteterms in this query tree that exist in + the given ixreader. + + :param ixreader: A :class:`whoosh.reading.IndexReader` object. + :param phrases: Whether to add words found in Phrase queries. + :param expand: If True, queries that match multiple terms + will return all matching expansions. + :rtype: set + """ + + schema = ixreader.schema + termset = set() + + for q in self.leaves(): + if fieldname and fieldname != q.field(): + continue + + if expand: + terms = q.expanded_terms(ixreader, phrases=phrases) + else: + terms = q.terms(phrases=phrases) + + for fieldname, text in terms: + if (fieldname, text) in termset: + continue + + if fieldname in schema: + field = schema[fieldname] + + try: + btext = field.to_bytes(text) + except ValueError: + continue + + if (fieldname, btext) in ixreader: + termset.add((fieldname, btext)) + return termset + + def leaves(self): + """Returns an iterator of all the leaf queries in this query tree as a + flat series. + """ + + if self.is_leaf(): + yield self + else: + for q in self.children(): + for qq in q.leaves(): + yield qq + + def iter_all_terms(self, phrases=True): + """Returns an iterator of (fieldname, text) pairs for all terms in + this query tree. + + >>> qp = qparser.QueryParser("text", myindex.schema) + >>> q = myparser.parse("alfa bravo title:charlie") + >>> # List the terms in a query + >>> list(q.iter_all_terms()) + [("text", "alfa"), ("text", "bravo"), ("title", "charlie")] + >>> # Get a set of all terms in the query that don't exist in the index + >>> r = myindex.reader() + >>> missing = set(t for t in q.iter_all_terms() if t not in r) + set([("text", "alfa"), ("title", "charlie")]) + >>> # All terms in the query that occur in fewer than 5 documents in + >>> # the index + >>> [t for t in q.iter_all_terms() if r.doc_frequency(t[0], t[1]) < 5] + [("title", "charlie")] + + :param phrases: Whether to add words found in Phrase queries. + """ + + for q in self.leaves(): + if q.has_terms(): + for t in q.terms(phrases=phrases): + yield t + + def all_tokens(self, boost=1.0): + """Returns an iterator of :class:`analysis.Token` objects corresponding + to all terms in this query tree. The Token objects will have the + ``fieldname``, ``text``, and ``boost`` attributes set. If the query + was built by the query parser, they Token objects will also have + ``startchar`` and ``endchar`` attributes indexing into the original + user query. + """ + + if self.is_leaf(): + for token in self.tokens(boost): + yield token + else: + boost *= self.boost if hasattr(self, "boost") else 1.0 + for child in self.children(): + for token in child.all_tokens(boost): + yield token + + def tokens(self, boost=1.0, exreader=None): + """Yields zero or more :class:`analysis.Token` objects corresponding to + the terms searched for by this query object. You can check whether a + query object targets specific terms before you call this method using + :meth:`Query.has_terms`. 
+ + The Token objects will have the ``fieldname``, ``text``, and ``boost`` + attributes set. If the query was built by the query parser, they Token + objects will also have ``startchar`` and ``endchar`` attributes + indexing into the original user query. + + To get all tokens for a query tree, use :meth:`Query.all_tokens`. + + :param exreader: a reader to use to expand multiterm queries such as + prefixes and wildcards. The default is None meaning do not expand. + """ + + return iter(()) + + def requires(self): + """Returns a set of queries that are *known* to be required to match + for the entire query to match. Note that other queries might also turn + out to be required but not be determinable by examining the static + query. + + >>> a = Term("f", u"a") + >>> b = Term("f", u"b") + >>> And([a, b]).requires() + set([Term("f", u"a"), Term("f", u"b")]) + >>> Or([a, b]).requires() + set([]) + >>> AndMaybe(a, b).requires() + set([Term("f", u"a")]) + >>> a.requires() + set([Term("f", u"a")]) + """ + + # Subclasses should implement the _add_required_to(qset) method + + return set([self]) + + def field(self): + """Returns the field this query matches in, or None if this query does + not match in a single field. + """ + + return self.fieldname + + def with_boost(self, boost): + """Returns a COPY of this query with the boost set to the given value. + + If a query type does not accept a boost itself, it will try to pass the + boost on to its children, if any. + """ + + q = self.copy() + q.boost = boost + return q + + def estimate_size(self, ixreader): + """Returns an estimate of how many documents this query could + potentially match (for example, the estimated size of a simple term + query is the document frequency of the term). It is permissible to + overestimate, but not to underestimate. + """ + raise NotImplementedError + + def estimate_min_size(self, ixreader): + """Returns an estimate of the minimum number of documents this query + could potentially match. + """ + + return self.estimate_size(ixreader) + + def matcher(self, searcher, context=None): + """Returns a :class:`~whoosh.matching.Matcher` object you can use to + retrieve documents and scores matching this query. + + :rtype: :class:`whoosh.matching.Matcher` + """ + + raise NotImplementedError + + def docs(self, searcher): + """Returns an iterator of docnums matching this query. + + >>> with my_index.searcher() as searcher: + ... list(my_query.docs(searcher)) + [10, 34, 78, 103] + + :param searcher: A :class:`whoosh.searching.Searcher` object. + """ + + try: + context = searcher.boolean_context() + return self.matcher(searcher, context).all_ids() + except TermNotFound: + return iter([]) + + def deletion_docs(self, searcher): + """Returns an iterator of docnums matching this query for the purpose + of deletion. The :meth:`~whoosh.writing.IndexWriter.delete_by_query` + method will use this method when deciding what documents to delete, + allowing special queries (e.g. nested queries) to override what + documents are deleted. The default implementation just forwards to + :meth:`Query.docs`. + """ + + return self.docs(searcher) + + def normalize(self): + """Returns a recursively "normalized" form of this query. The + normalized form removes redundancy and empty queries. This is called + automatically on query trees created by the query parser, but you may + want to call it yourself if you're writing your own parser or building + your own queries. + + >>> q = And([And([Term("f", u"a"), + ... Term("f", u"b")]), + ... 
Term("f", u"c"), Or([])]) + >>> q.normalize() + And([Term("f", u"a"), Term("f", u"b"), Term("f", u"c")]) + + Note that this returns a *new, normalized* query. It *does not* modify + the original query "in place". + """ + return self + + def simplify(self, ixreader): + """Returns a recursively simplified form of this query, where + "second-order" queries (such as Prefix and Variations) are re-written + into lower-level queries (such as Term and Or). + """ + return self + + +# Null query + +class _NullQuery(Query): + "Represents a query that won't match anything." + + boost = 1.0 + + def __init__(self): + self.error = None + + def __unicode__(self): + return u("<_NullQuery>") + + def __call__(self): + return self + + def __repr__(self): + return "<%s>" % (self.__class__.__name__) + + def __eq__(self, other): + return isinstance(other, _NullQuery) + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return id(self) + + def __copy__(self): + return self + + def __deepcopy__(self, memo): + return self + + def field(self): + return None + + def estimate_size(self, ixreader): + return 0 + + def normalize(self): + return self + + def simplify(self, ixreader): + return self + + def docs(self, searcher): + return [] + + def matcher(self, searcher, context=None): + return matching.NullMatcher() + + +NullQuery = _NullQuery() + + +# Every + +class Every(Query): + """A query that matches every document containing any term in a given + field. If you don't specify a field, the query matches every document. + + >>> # Match any documents with something in the "path" field + >>> q = Every("path") + >>> # Match every document + >>> q = Every() + + The unfielded form (matching every document) is efficient. + + The fielded form is more efficient than a prefix query with an empty prefix + or a '*' wildcard, but it can still be very slow on large indexes. It requires + the searcher to read the full posting list of every term in the given + field. + + Instead of using this query, it is much more efficient, when you create the + index, to include a single term that appears in all documents that have the + field you want to match. + + For example, instead of this:: + + # Match all documents that have something in the "path" field + q = Every("path") + + Do this when indexing:: + + # Add an extra field that indicates whether a document has a path + schema = fields.Schema(text=fields.TEXT, path=fields.ID, has_path=fields.ID) + + # When indexing, set the "has_path" field based on whether the document + # has anything in the "path" field + writer.add_document(text=text_value1) + writer.add_document(text=text_value2, path=path_value2, has_path="t") + + Then to find all documents with a path:: + + q = Term("has_path", "t") + """ + + def __init__(self, fieldname=None, boost=1.0): + """ + :param fieldname: the name of the field to match, or ``None`` or ``*`` + to match all documents.
+ """ + + if not fieldname or fieldname == "*": + fieldname = None + self.fieldname = fieldname + self.boost = boost + + def __repr__(self): + return "%s(%r, boost=%s)" % (self.__class__.__name__, self.fieldname, + self.boost) + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.fieldname == other.fieldname + and self.boost == other.boost) + + def __unicode__(self): + return u("%s:*") % self.fieldname + + __str__ = __unicode__ + + def __hash__(self): + return hash(self.fieldname) + + def estimate_size(self, ixreader): + return ixreader.doc_count() + + def matcher(self, searcher, context=None): + fieldname = self.fieldname + reader = searcher.reader() + + if fieldname in (None, "", "*"): + # This takes into account deletions + doclist = array("I", reader.all_doc_ids()) + else: + # This is a hacky hack, but just create an in-memory set of all the + # document numbers of every term in the field. This is SLOOOW for + # large indexes + doclist = set() + for text in searcher.lexicon(fieldname): + pr = searcher.postings(fieldname, text) + doclist.update(pr.all_ids()) + doclist = sorted(doclist) + + return matching.ListMatcher(doclist, all_weights=self.boost) diff --git a/src/whoosh/query/ranges.py b/src/whoosh/query/ranges.py new file mode 100644 index 0000000..b2b03d0 --- /dev/null +++ b/src/whoosh/query/ranges.py @@ -0,0 +1,347 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
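Similarly, before the range queries below, a brief sketch of two things documented in ``qcore.py`` above: composing query objects with the overloaded ``|`` and ``&`` operators (``-`` works analogously for NOT), and querying an indexed marker term instead of using a fielded ``Every`` query, as the ``Every`` docstring recommends. The sketch is not part of the patch; the ``has_path`` field and the sample values are illustrative only::

    from whoosh import fields, query
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(path=fields.ID(stored=True), has_path=fields.ID,
                           text=fields.TEXT)
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        w.add_document(text=u"no path on this document")
        w.add_document(path=u"/a/b", has_path=u"t", text=u"alfa bravo")
        w.add_document(path=u"/a/c", has_path=u"t", text=u"bravo charlie")

    # __or__ and __and__ wrap the operands in Or/And and normalize() the result
    q = ((query.Term("text", u"alfa") | query.Term("text", u"charlie"))
         & query.Term("has_path", u"t"))

    with ix.searcher() as s:
        print(sorted(hit["path"] for hit in s.search(q)))
        # -> ['/a/b', '/a/c']

        # Cheap substitute for Every("path"): query the marker term instead
        every_path = query.Term("has_path", u"t")
        print(sorted(hit["path"] for hit in s.search(every_path)))
        # -> ['/a/b', '/a/c']

Both searches return the two documents that have a path; the marker-term form avoids reading the posting list of every term in the ``path`` field.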
+ +from __future__ import division + +from whoosh.compat import b, u +from whoosh.query import qcore, terms, compound, wrappers +from whoosh.util.times import datetime_to_long + + +class RangeMixin(object): + # Contains methods shared by TermRange and NumericRange + + def __repr__(self): + return ('%s(%r, %r, %r, %s, %s, boost=%s, constantscore=%s)' + % (self.__class__.__name__, self.fieldname, self.start, + self.end, self.startexcl, self.endexcl, self.boost, + self.constantscore)) + + def __unicode__(self): + startchar = "{" if self.startexcl else "[" + endchar = "}" if self.endexcl else "]" + start = '' if self.start is None else self.start + end = '' if self.end is None else self.end + return u("%s:%s%s TO %s%s") % (self.fieldname, startchar, start, end, + endchar) + + __str__ = __unicode__ + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.fieldname == other.fieldname + and self.start == other.start and self.end == other.end + and self.startexcl == other.startexcl + and self.endexcl == other.endexcl + and self.boost == other.boost + and self.constantscore == other.constantscore) + + def __hash__(self): + return (hash(self.fieldname) ^ hash(self.start) ^ hash(self.startexcl) + ^ hash(self.end) ^ hash(self.endexcl) ^ hash(self.boost)) + + def is_range(self): + return True + + def _comparable_start(self): + if self.start is None: + return (qcore.Lowest, 0) + else: + second = 1 if self.startexcl else 0 + return (self.start, second) + + def _comparable_end(self): + if self.end is None: + return (qcore.Highest, 0) + else: + second = -1 if self.endexcl else 0 + return (self.end, second) + + def overlaps(self, other): + if not isinstance(other, TermRange): + return False + if self.fieldname != other.fieldname: + return False + + start1 = self._comparable_start() + start2 = other._comparable_start() + end1 = self._comparable_end() + end2 = other._comparable_end() + + return ((start1 >= start2 and start1 <= end2) + or (end1 >= start2 and end1 <= end2) + or (start2 >= start1 and start2 <= end1) + or (end2 >= start1 and end2 <= end1)) + + def merge(self, other, intersect=True): + assert self.fieldname == other.fieldname + + start1 = self._comparable_start() + start2 = other._comparable_start() + end1 = self._comparable_end() + end2 = other._comparable_end() + + if start1 >= start2 and end1 <= end2: + start = start2 + end = end2 + elif start2 >= start1 and end2 <= end1: + start = start1 + end = end1 + elif intersect: + start = max(start1, start2) + end = min(end1, end2) + else: + start = min(start1, start2) + end = max(end1, end2) + + startval = None if start[0] is qcore.Lowest else start[0] + startexcl = start[1] == 1 + endval = None if end[0] is qcore.Highest else end[0] + endexcl = end[1] == -1 + + boost = max(self.boost, other.boost) + constantscore = self.constantscore or other.constantscore + + return self.__class__(self.fieldname, startval, endval, startexcl, + endexcl, boost=boost, + constantscore=constantscore) + + +class TermRange(RangeMixin, terms.MultiTerm): + """Matches documents containing any terms in a given range. + + >>> # Match documents where the indexed "id" field is greater than or equal + >>> # to 'apple' and less than or equal to 'pear'. + >>> TermRange("id", u"apple", u"pear") + """ + + def __init__(self, fieldname, start, end, startexcl=False, endexcl=False, + boost=1.0, constantscore=True): + """ + :param fieldname: The name of the field to search. + :param start: Match terms equal to or greater than this. 
+ :param end: Match terms equal to or less than this. + :param startexcl: If True, the range start is exclusive. If False, the + range start is inclusive. + :param endexcl: If True, the range end is exclusive. If False, the + range end is inclusive. + :param boost: Boost factor that should be applied to the raw score of + results matched by this query. + """ + + self.fieldname = fieldname + self.start = start + self.end = end + self.startexcl = startexcl + self.endexcl = endexcl + self.boost = boost + self.constantscore = constantscore + + def normalize(self): + if self.start in ('', None) and self.end in (u('\uffff'), None): + from whoosh.query import Every + return Every(self.fieldname, boost=self.boost) + elif self.start == self.end: + if self.startexcl or self.endexcl: + return qcore.NullQuery + return terms.Term(self.fieldname, self.start, boost=self.boost) + else: + return TermRange(self.fieldname, self.start, self.end, + self.startexcl, self.endexcl, + boost=self.boost) + + #def replace(self, fieldname, oldtext, newtext): + # q = self.copy() + # if q.fieldname == fieldname: + # if q.start == oldtext: + # q.start = newtext + # if q.end == oldtext: + # q.end = newtext + # return q + + def _btexts(self, ixreader): + fieldname = self.fieldname + field = ixreader.schema[fieldname] + startexcl = self.startexcl + endexcl = self.endexcl + + if self.start is None: + start = b("") + else: + try: + start = field.to_bytes(self.start) + except ValueError: + return + + if self.end is None: + end = b("\xFF\xFF\xFF\xFF") + else: + try: + end = field.to_bytes(self.end) + except ValueError: + return + + for fname, t in ixreader.terms_from(fieldname, start): + if fname != fieldname: + break + if t == start and startexcl: + continue + if t == end and endexcl: + break + if t > end: + break + yield t + + +class NumericRange(RangeMixin, qcore.Query): + """A range query for NUMERIC fields. Takes advantage of tiered indexing + to speed up large ranges by matching at a high resolution at the edges of + the range and a low resolution in the middle. + + >>> # Match numbers from 10 to 5925 in the "number" field. + >>> nr = NumericRange("number", 10, 5925) + """ + + def __init__(self, fieldname, start, end, startexcl=False, endexcl=False, + boost=1.0, constantscore=True): + """ + :param fieldname: The name of the field to search. + :param start: Match terms equal to or greater than this number. This + should be a number type, not a string. + :param end: Match terms equal to or less than this number. This should + be a number type, not a string. + :param startexcl: If True, the range start is exclusive. If False, the + range start is inclusive. + :param endexcl: If True, the range end is exclusive. If False, the + range end is inclusive. + :param boost: Boost factor that should be applied to the raw score of + results matched by this query. + :param constantscore: If True, the compiled query returns a constant + score (the value of the ``boost`` keyword argument) instead of + actually scoring the matched terms. This gives a nice speed boost + and won't affect the results in most cases since numeric ranges + will almost always be used as a filter. 
+ """ + + self.fieldname = fieldname + self.start = start + self.end = end + self.startexcl = startexcl + self.endexcl = endexcl + self.boost = boost + self.constantscore = constantscore + + def simplify(self, ixreader): + return self._compile_query(ixreader).simplify(ixreader) + + def estimate_size(self, ixreader): + return self._compile_query(ixreader).estimate_size(ixreader) + + def estimate_min_size(self, ixreader): + return self._compile_query(ixreader).estimate_min_size(ixreader) + + def docs(self, searcher): + q = self._compile_query(searcher.reader()) + return q.docs(searcher) + + def _compile_query(self, ixreader): + from whoosh.fields import NUMERIC + from whoosh.util.numeric import tiered_ranges + + field = ixreader.schema[self.fieldname] + if not isinstance(field, NUMERIC): + raise Exception("NumericRange: field %r is not numeric" + % self.fieldname) + + start = self.start + if start is not None: + start = field.prepare_number(start) + end = self.end + if end is not None: + end = field.prepare_number(end) + + subqueries = [] + stb = field.sortable_to_bytes + # Get the term ranges for the different resolutions + ranges = tiered_ranges(field.numtype, field.bits, field.signed, + start, end, field.shift_step, + self.startexcl, self.endexcl) + for startnum, endnum, shift in ranges: + if startnum == endnum: + subq = terms.Term(self.fieldname, stb(startnum, shift)) + else: + startbytes = stb(startnum, shift) + endbytes = stb(endnum, shift) + subq = TermRange(self.fieldname, startbytes, endbytes) + subqueries.append(subq) + + if len(subqueries) == 1: + q = subqueries[0] + elif subqueries: + q = compound.Or(subqueries, boost=self.boost) + else: + return qcore.NullQuery + + if self.constantscore: + q = wrappers.ConstantScoreQuery(q, self.boost) + return q + + def matcher(self, searcher, context=None): + q = self._compile_query(searcher.reader()) + return q.matcher(searcher, context) + + +class DateRange(NumericRange): + """This is a very thin subclass of :class:`NumericRange` that only + overrides the initializer and ``__repr__()`` methods to work with datetime + objects instead of numbers. Internally this object converts the datetime + objects it's created with to numbers and otherwise acts like a + ``NumericRange`` query. + + >>> DateRange("date", datetime(2010, 11, 3, 3, 0), + ... datetime(2010, 11, 3, 17, 59)) + """ + + def __init__(self, fieldname, start, end, startexcl=False, endexcl=False, + boost=1.0, constantscore=True): + self.startdate = start + self.enddate = end + if start: + start = datetime_to_long(start) + if end: + end = datetime_to_long(end) + super(DateRange, self).__init__(fieldname, start, end, + startexcl=startexcl, endexcl=endexcl, + boost=boost, + constantscore=constantscore) + + def __repr__(self): + return '%s(%r, %r, %r, %s, %s, boost=%s)' % (self.__class__.__name__, + self.fieldname, + self.startdate, self.enddate, + self.startexcl, self.endexcl, + self.boost) diff --git a/src/whoosh/query/spans.py b/src/whoosh/query/spans.py new file mode 100644 index 0000000..8d6a968 --- /dev/null +++ b/src/whoosh/query/spans.py @@ -0,0 +1,872 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module contains Query objects that deal with "spans". + +Span queries allow for positional constraints on matching documents. For +example, the :class:`whoosh.spans.SpanNear` query matches documents where one +term occurs near another. Because you can nest span queries, and wrap them +around almost any non-span query, you can create very complex constraints. + +For example, to find documents containing "whoosh" at most 5 positions before +"library" in the "text" field:: + + from whoosh import query, spans + t1 = query.Term("text", "whoosh") + t2 = query.Term("text", "library") + q = spans.SpanNear(t1, t2, slop=5) + +""" + +from whoosh.matching import mcore, wrappers, binary +from whoosh.query import Query, And, AndMaybe, Or, Term +from whoosh.util import make_binary_tree + + +# Span class + +class Span(object): + __slots__ = ("start", "end", "startchar", "endchar", "boost") + + def __init__(self, start, end=None, startchar=None, endchar=None, + boost=1.0): + if end is None: + end = start + assert start <= end + self.start = start + self.end = end + self.startchar = startchar + self.endchar = endchar + self.boost = boost + + def __repr__(self): + if self.startchar is not None or self.endchar is not None: + return "<%d-%d %d:%d>" % (self.start, self.end, self.startchar, + self.endchar) + else: + return "<%d-%d>" % (self.start, self.end) + + def __eq__(self, span): + return (self.start == span.start + and self.end == span.end + and self.startchar == span.startchar + and self.endchar == span.endchar) + + def __ne__(self, span): + return self.start != span.start or self.end != span.end + + def __lt__(self, span): + return self.start < span.start + + def __gt__(self, span): + return self.start > span.start + + def __hash__(self): + return hash((self.start, self.end)) + + @classmethod + def merge(cls, spans): + """Merges overlapping and touches spans in the given list of spans. + + Note that this modifies the original list. 
+ + >>> spans = [Span(1,2), Span(3)] + >>> Span.merge(spans) + >>> spans + [<1-3>] + """ + + i = 0 + while i < len(spans) - 1: + here = spans[i] + j = i + 1 + while j < len(spans): + there = spans[j] + if there.start > here.end + 1: + break + if here.touches(there) or here.overlaps(there): + here = here.to(there) + spans[i] = here + del spans[j] + else: + j += 1 + i += 1 + return spans + + def to(self, span): + if self.startchar is None: + minchar = span.startchar + elif span.startchar is None: + minchar = self.startchar + else: + minchar = min(self.startchar, span.startchar) + if self.endchar is None: + maxchar = span.endchar + elif span.endchar is None: + maxchar = self.endchar + else: + maxchar = max(self.endchar, span.endchar) + + minpos = min(self.start, span.start) + maxpos = max(self.end, span.end) + return self.__class__(minpos, maxpos, minchar, maxchar) + + def overlaps(self, span): + return ((self.start >= span.start and self.start <= span.end) + or (self.end >= span.start and self.end <= span.end) + or (span.start >= self.start and span.start <= self.end) + or (span.end >= self.start and span.end <= self.end)) + + def surrounds(self, span): + return self.start < span.start and self.end > span.end + + def is_within(self, span): + return self.start >= span.start and self.end <= span.end + + def is_before(self, span): + return self.end < span.start + + def is_after(self, span): + return self.start > span.end + + def touches(self, span): + return self.start == span.end + 1 or self.end == span.start - 1 + + def distance_to(self, span): + if self.overlaps(span): + return 0 + elif self.is_before(span): + return span.start - self.end + else: + return self.start - span.end + + +def bisect_spans(spans, start): + lo = 0 + hi = len(spans) + while lo < hi: + mid = (lo + hi) // 2 + if spans[mid].start < start: + lo = mid + 1 + else: + hi = mid + return lo + + +# Base matchers + +class SpanWrappingMatcher(wrappers.WrappingMatcher): + """An abstract matcher class that wraps a "regular" matcher. This matcher + uses the sub-matcher's matching logic, but only matches documents that have + matching spans, i.e. where ``_get_spans()`` returns a non-empty list. + + Subclasses must implement the ``_get_spans()`` method, which returns a list + of valid spans for the current document. 
+ """ + + def __init__(self, child): + super(SpanWrappingMatcher, self).__init__(child) + self._spans = None + if self.is_active(): + self._find_next() + + def copy(self): + m = self.__class__(self.child.copy()) + m._spans = self._spans + return m + + def _replacement(self, newchild): + return self.__class__(newchild) + + def _find_next(self): + if not self.is_active(): + return + + child = self.child + r = False + + spans = self._get_spans() + while child.is_active() and not spans: + r = child.next() or r + if not child.is_active(): + return True + spans = self._get_spans() + self._spans = spans + + return r + + def spans(self): + return self._spans + + def next(self): + self.child.next() + self._find_next() + + def skip_to(self, id): + self.child.skip_to(id) + self._find_next() + + def all_ids(self): + while self.is_active(): + if self.spans(): + yield self.id() + self.next() + + +class SpanBiMatcher(SpanWrappingMatcher): + def copy(self): + return self.__class__(self.a.copy(), self.b.copy()) + + def depth(self): + return 1 + max(self.a.depth(), self.b.depth()) + + def replace(self, minquality=0): + # TODO: fix this + if not self.is_active(): + return mcore.NullMatcher() + return self + + +# Queries + +class SpanQuery(Query): + """Abstract base class for span-based queries. Each span query type wraps + a "regular" query that implements the basic document-matching functionality + (for example, SpanNear wraps an And query, because SpanNear requires that + the two sub-queries occur in the same documents. The wrapped query is + stored in the ``q`` attribute. + + Subclasses usually only need to implement the initializer to set the + wrapped query, and ``matcher()`` to return a span-aware matcher object. + """ + + def _subm(self, s, context=None): + return self.q.matcher(s, context) + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.q) + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.q == other.q) + + def __hash__(self): + return hash(self.__class__.__name__) ^ hash(self.q) + + def field(self): + return None + + def needs_spans(self): + return True + + +class WrappingSpan(SpanQuery): + def is_leaf(self): + return False + + def apply(self, fn): + return self.__class__(fn(self.q), limit=self.limit) + + def field(self): + return self.q.field() + + +class SpanFirst(WrappingSpan): + """Matches spans that end within the first N positions. This lets you + for example only match terms near the beginning of the document. + """ + + def __init__(self, q, limit=0): + """ + :param q: the query to match. + :param limit: the query must match within this position at the start + of a document. The default is ``0``, which means the query must + match at the first position. 
+ """ + + self.q = q + self.limit = limit + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.q == other.q and self.limit == other.limit) + + def __hash__(self): + return hash(self.q) ^ hash(self.limit) + + def matcher(self, searcher, context=None): + m = self._subm(searcher, context) + return SpanFirst.SpanFirstMatcher(m, limit=self.limit) + + class SpanFirstMatcher(SpanWrappingMatcher): + def __init__(self, child, limit=0): + self.limit = limit + super(SpanFirst.SpanFirstMatcher, self).__init__(child) + + def copy(self): + return self.__class__(self.child.copy(), limit=self.limit) + + def _replacement(self, newchild): + return self.__class__(newchild, limit=self.limit) + + def _get_spans(self): + return [span for span in self.child.spans() + if span.end <= self.limit] + + +class SpanNear(SpanQuery): + """ + Note: for new code, use :class:`SpanNear2` instead of this class. SpanNear2 + takes a list of sub-queries instead of requiring you to create a binary + tree of query objects. + + Matches queries that occur near each other. By default, only matches + queries that occur right next to each other (slop=1) and in order + (ordered=True). + + For example, to find documents where "whoosh" occurs next to "library" + in the "text" field:: + + from whoosh import query, spans + t1 = query.Term("text", "whoosh") + t2 = query.Term("text", "library") + q = spans.SpanNear(t1, t2) + + To find documents where "whoosh" occurs at most 5 positions before + "library":: + + q = spans.SpanNear(t1, t2, slop=5) + + To find documents where "whoosh" occurs at most 5 positions before or after + "library":: + + q = spans.SpanNear(t1, t2, slop=5, ordered=False) + + You can use the ``phrase()`` class method to create a tree of SpanNear + queries to match a list of terms:: + + q = spans.SpanNear.phrase("text", ["whoosh", "search", "library"], + slop=2) + """ + + def __init__(self, a, b, slop=1, ordered=True, mindist=1): + """ + :param a: the first query to match. + :param b: the second query that must occur within "slop" positions of + the first query. + :param slop: the number of positions within which the queries must + occur. Default is 1, meaning the queries must occur right next + to each other. + :param ordered: whether a must occur before b. Default is True. + :pram mindist: the minimum distance allowed between the queries. + """ + + self.q = And([a, b]) + self.a = a + self.b = b + self.slop = slop + self.ordered = ordered + self.mindist = mindist + + def __repr__(self): + return ("%s(%r, slop=%d, ordered=%s, mindist=%d)" + % (self.__class__.__name__, self.q, self.slop, self.ordered, + self.mindist)) + + def __eq__(self, other): + return (other and self.__class__ == other.__class__ + and self.q == other.q and self.slop == other.slop + and self.ordered == other.ordered + and self.mindist == other.mindist) + + def __hash__(self): + return (hash(self.a) ^ hash(self.b) ^ hash(self.slop) + ^ hash(self.ordered) ^ hash(self.mindist)) + + def is_leaf(self): + return False + + def apply(self, fn): + return self.__class__(fn(self.a), fn(self.b), slop=self.slop, + ordered=self.ordered, mindist=self.mindist) + + def matcher(self, searcher, context=None): + ma = self.a.matcher(searcher, context) + mb = self.b.matcher(searcher, context) + return SpanNear.SpanNearMatcher(ma, mb, slop=self.slop, + ordered=self.ordered, + mindist=self.mindist) + + @classmethod + def phrase(cls, fieldname, words, slop=1, ordered=True): + """Returns a tree of SpanNear queries to match a list of terms. 
+ + This class method is a convenience for constructing a phrase query + using a binary tree of SpanNear queries:: + + SpanNear.phrase("content", ["alfa", "bravo", "charlie", "delta"]) + + :param fieldname: the name of the field to search in. + :param words: a sequence of texts to search for. + :param slop: the number of positions within which the terms must + occur. Default is 1, meaning the terms must occur right next + to each other. + :param ordered: whether the terms must occur in order. Default is True. + """ + + terms = [Term(fieldname, word) for word in words] + return make_binary_tree(cls, terms, slop=slop, ordered=ordered) + + class SpanNearMatcher(SpanWrappingMatcher): + def __init__(self, a, b, slop=1, ordered=True, mindist=1): + self.a = a + self.b = b + self.slop = slop + self.ordered = ordered + self.mindist = mindist + isect = binary.IntersectionMatcher(a, b) + super(SpanNear.SpanNearMatcher, self).__init__(isect) + + def copy(self): + return self.__class__(self.a.copy(), self.b.copy(), slop=self.slop, + ordered=self.ordered, mindist=self.mindist) + + def replace(self, minquality=0): + # TODO: fix this + if not self.is_active(): + return mcore.NullMatcher() + return self + + def _get_spans(self): + slop = self.slop + mindist = self.mindist + ordered = self.ordered + spans = set() + + bspans = self.b.spans() + for aspan in self.a.spans(): + for bspan in bspans: + if (bspan.end < aspan.start - slop + or (ordered and aspan.start > bspan.start)): + # B is too far in front of A, or B is in front of A + # *at all* when ordered is True + continue + if bspan.start > aspan.end + slop: + # B is too far from A. Since spans are listed in + # start position order, we know that all spans after + # this one will also be too far. + break + + # Check the distance between the spans + dist = aspan.distance_to(bspan) + if mindist <= dist <= slop: + spans.add(aspan.to(bspan)) + + return sorted(spans) + + +class SpanNear2(SpanQuery): + """ + Matches queries that occur near each other. By default, only matches + queries that occur right next to each other (slop=1) and in order + (ordered=True). + + New code should use this query type instead of :class:`SpanNear`. + + (Unlike :class:`SpanNear`, this query takes a list of subqueries instead of + requiring you to build a binary tree of query objects. This query should + also be slightly faster due to less overhead.) + + For example, to find documents where "whoosh" occurs next to "library" + in the "text" field:: + + from whoosh import query, spans + t1 = query.Term("text", "whoosh") + t2 = query.Term("text", "library") + q = spans.SpanNear2([t1, t2]) + + To find documents where "whoosh" occurs at most 5 positions before + "library":: + + q = spans.SpanNear2([t1, t2], slop=5) + + To find documents where "whoosh" occurs at most 5 positions before or after + "library":: + + q = spans.SpanNear2(t1, t2, slop=5, ordered=False) + """ + + def __init__(self, qs, slop=1, ordered=True, mindist=1): + """ + :param qs: a sequence of sub-queries to match. + :param slop: the number of positions within which the queries must + occur. Default is 1, meaning the queries must occur right next + to each other. + :param ordered: whether a must occur before b. Default is True. + :pram mindist: the minimum distance allowed between the queries. 
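For example, a sketch of finding documents where three terms occur in order, each within two positions of the span built so far (the ``text`` field is assumed to store positions)::

    from whoosh import query, spans

    words = [query.Term("text", u"whoosh"),
             query.Term("text", u"search"),
             query.Term("text", u"library")]
    q = spans.SpanNear2(words, slop=2, ordered=True)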
+ """ + + self.qs = qs + self.slop = slop + self.ordered = ordered + self.mindist = mindist + + def __repr__(self): + return ("%s(%r, slop=%d, ordered=%s, mindist=%d)" + % (self.__class__.__name__, self.qs, self.slop, self.ordered, + self.mindist)) + + def __eq__(self, other): + return (other and self.__class__ == other.__class__ + and self.qs == other.qs and self.slop == other.slop + and self.ordered == other.ordered + and self.mindist == other.mindist) + + def __hash__(self): + h = hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist) + for q in self.qs: + h ^= hash(q) + return h + + def is_leaf(self): + return False + + def children(self): + return self.qs + + def apply(self, fn): + return self.__class__([fn(q) for q in self.qs], slop=self.slop, + ordered=self.ordered, mindist=self.mindist) + + def matcher(self, searcher, context=None): + ms = [q.matcher(searcher, context) for q in self.qs] + return self.SpanNear2Matcher(ms, slop=self.slop, ordered=self.ordered, + mindist=self.mindist) + + class SpanNear2Matcher(SpanWrappingMatcher): + def __init__(self, ms, slop=1, ordered=True, mindist=1): + self.ms = ms + self.slop = slop + self.ordered = ordered + self.mindist = mindist + isect = make_binary_tree(binary.IntersectionMatcher, ms) + super(SpanNear2.SpanNear2Matcher, self).__init__(isect) + + def copy(self): + return self.__class__([m.copy() for m in self.ms], slop=self.slop, + ordered=self.ordered, mindist=self.mindist) + + def replace(self, minquality=0): + # TODO: fix this + if not self.is_active(): + return mcore.NullMatcher() + return self + + def _get_spans(self): + slop = self.slop + mindist = self.mindist + ordered = self.ordered + ms = self.ms + + aspans = ms[0].spans() + i = 1 + while i < len(ms) and aspans: + bspans = ms[i].spans() + spans = set() + for aspan in aspans: + # Use a binary search to find the first position we should + # start looking for possible matches + if ordered: + start = aspan.start + else: + start = max(0, aspan.start - slop) + j = bisect_spans(bspans, start) + + while j < len(bspans): + bspan = bspans[j] + j += 1 + + if (bspan.end < aspan.start - slop + or (ordered and aspan.start > bspan.start)): + # B is too far in front of A, or B is in front of A + # *at all* when ordered is True + continue + if bspan.start > aspan.end + slop: + # B is too far from A. Since spans are listed in + # start position order, we know that all spans after + # this one will also be too far. + break + + # Check the distance between the spans + dist = aspan.distance_to(bspan) + if mindist <= dist <= slop: + spans.add(aspan.to(bspan)) + aspans = sorted(spans) + i += 1 + + if i == len(ms): + return aspans + else: + return [] + + +class SpanOr(SpanQuery): + """Matches documents that match any of a list of sub-queries. Unlike + query.Or, this class merges together matching spans from the different + sub-queries when they overlap. + """ + + def __init__(self, subqs): + """ + :param subqs: a list of queries to match. 
+ """ + + self.q = Or(subqs) + self.subqs = subqs + + def is_leaf(self): + return False + + def apply(self, fn): + return self.__class__([fn(sq) for sq in self.subqs]) + + def matcher(self, searcher, context=None): + matchers = [q.matcher(searcher, context) for q in self.subqs] + return make_binary_tree(SpanOr.SpanOrMatcher, matchers) + + class SpanOrMatcher(SpanBiMatcher): + def __init__(self, a, b): + self.a = a + self.b = b + um = binary.UnionMatcher(a, b) + super(SpanOr.SpanOrMatcher, self).__init__(um) + + def _get_spans(self): + a_active = self.a.is_active() + b_active = self.b.is_active() + + if a_active: + a_id = self.a.id() + if b_active: + b_id = self.b.id() + if a_id == b_id: + spans = sorted(set(self.a.spans()) + | set(self.b.spans())) + elif a_id < b_id: + spans = self.a.spans() + else: + spans = self.b.spans() + else: + spans = self.a.spans() + else: + spans = self.b.spans() + + Span.merge(spans) + return spans + + +class SpanBiQuery(SpanQuery): + # Intermediate base class for methods common to "a/b" span query types + + def is_leaf(self): + return False + + def apply(self, fn): + return self.__class__(fn(self.a), fn(self.b)) + + def matcher(self, searcher, context=None): + ma = self.a.matcher(searcher, context) + mb = self.b.matcher(searcher, context) + return self._Matcher(ma, mb) + + +class SpanNot(SpanBiQuery): + """Matches spans from the first query only if they don't overlap with + spans from the second query. If there are no non-overlapping spans, the + document does not match. + + For example, to match documents that contain "bear" at most 2 places after + "apple" in the "text" field but don't have "cute" between them:: + + from whoosh import query, spans + t1 = query.Term("text", "apple") + t2 = query.Term("text", "bear") + near = spans.SpanNear(t1, t2, slop=2) + q = spans.SpanNot(near, query.Term("text", "cute")) + """ + + def __init__(self, a, b): + """ + :param a: the query to match. + :param b: do not match any spans that overlap with spans from this + query. + """ + + self.q = AndMaybe(a, b) + self.a = a + self.b = b + + class _Matcher(SpanBiMatcher): + def __init__(self, a, b): + self.a = a + self.b = b + amm = binary.AndMaybeMatcher(a, b) + super(SpanNot._Matcher, self).__init__(amm) + + def _get_spans(self): + if self.a.id() == self.b.id(): + spans = [] + bspans = self.b.spans() + for aspan in self.a.spans(): + overlapped = False + for bspan in bspans: + if aspan.overlaps(bspan): + overlapped = True + break + if not overlapped: + spans.append(aspan) + return spans + else: + return self.a.spans() + + +class SpanContains(SpanBiQuery): + """Matches documents where the spans of the first query contain any spans + of the second query. + + For example, to match documents where "apple" occurs at most 10 places + before "bear" in the "text" field and "cute" is between them:: + + from whoosh import query, spans + t1 = query.Term("text", "apple") + t2 = query.Term("text", "bear") + near = spans.SpanNear(t1, t2, slop=10) + q = spans.SpanContains(near, query.Term("text", "cute")) + """ + + def __init__(self, a, b): + """ + :param a: the query to match. + :param b: the query whose spans must occur within the matching spans + of the first query. 
+ """ + + self.q = And([a, b]) + self.a = a + self.b = b + + class _Matcher(SpanBiMatcher): + def __init__(self, a, b): + self.a = a + self.b = b + im = binary.IntersectionMatcher(a, b) + super(SpanContains._Matcher, self).__init__(im) + + def _get_spans(self): + spans = [] + bspans = self.b.spans() + for aspan in self.a.spans(): + for bspan in bspans: + if aspan.start > bspan.end: + continue + if aspan.end < bspan.start: + break + + if bspan.is_within(aspan): + spans.append(aspan) + break + return spans + + +class SpanBefore(SpanBiQuery): + """Matches documents where the spans of the first query occur before any + spans of the second query. + + For example, to match documents where "apple" occurs anywhere before + "bear":: + + from whoosh import query, spans + t1 = query.Term("text", "apple") + t2 = query.Term("text", "bear") + q = spans.SpanBefore(t1, t2) + """ + + def __init__(self, a, b): + """ + :param a: the query that must occur before the second. + :param b: the query that must occur after the first. + """ + + self.a = a + self.b = b + self.q = And([a, b]) + + class _Matcher(SpanBiMatcher): + def __init__(self, a, b): + self.a = a + self.b = b + im = binary.IntersectionMatcher(a, b) + super(SpanBefore._Matcher, self).__init__(im) + + def _get_spans(self): + bminstart = min(bspan.start for bspan in self.b.spans()) + return [aspan for aspan in self.a.spans() if aspan.end < bminstart] + + +class SpanCondition(SpanBiQuery): + """Matches documents that satisfy both subqueries, but only uses the spans + from the first subquery. + + This is useful when you want to place conditions on matches but not have + those conditions affect the spans returned. + + For example, to get spans for the term ``alfa`` in documents that also + must contain the term ``bravo``:: + + SpanCondition(Term("text", u"alfa"), Term("text", u"bravo")) + + """ + + def __init__(self, a, b): + self.a = a + self.b = b + self.q = And([a, b]) + + class _Matcher(SpanBiMatcher): + def __init__(self, a, b): + self.a = a + im = binary.IntersectionMatcher(a, b) + super(SpanCondition._Matcher, self).__init__(im) + + def _get_spans(self): + return self.a.spans() + + + + + diff --git a/src/whoosh/query/terms.py b/src/whoosh/query/terms.py new file mode 100644 index 0000000..5b011f5 --- /dev/null +++ b/src/whoosh/query/terms.py @@ -0,0 +1,534 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import division +import copy +import fnmatch +import re +from collections import defaultdict + +from whoosh import matching +from whoosh.analysis import Token +from whoosh.compat import bytes_type, text_type, u +from whoosh.lang.morph_en import variations +from whoosh.query import qcore + + +class Term(qcore.Query): + """Matches documents containing the given term (fieldname+text pair). + + >>> Term("content", u"render") + """ + + __inittypes__ = dict(fieldname=str, text=text_type, boost=float) + + def __init__(self, fieldname, text, boost=1.0, minquality=None): + self.fieldname = fieldname + self.text = text + self.boost = boost + self.minquality = minquality + + def __eq__(self, other): + return (other + and self.__class__ is other.__class__ + and self.fieldname == other.fieldname + and self.text == other.text + and self.boost == other.boost) + + def __repr__(self): + r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) + if self.boost != 1.0: + r += ", boost=%s" % self.boost + r += ")" + return r + + def __unicode__(self): + text = self.text + if isinstance(text, bytes_type): + try: + text = text.decode("ascii") + except UnicodeDecodeError: + text = repr(text) + + t = u("%s:%s") % (self.fieldname, text) + if self.boost != 1: + t += u("^") + text_type(self.boost) + return t + + __str__ = __unicode__ + + def __hash__(self): + return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost) + + def has_terms(self): + return True + + def tokens(self, boost=1.0): + yield Token(fieldname=self.fieldname, text=self.text, + boost=boost * self.boost, startchar=self.startchar, + endchar=self.endchar, chars=True) + + def terms(self, phrases=False): + if self.field(): + yield (self.field(), self.text) + + def replace(self, fieldname, oldtext, newtext): + q = copy.copy(self) + if q.fieldname == fieldname and q.text == oldtext: + q.text = newtext + return q + + def estimate_size(self, ixreader): + fieldname = self.fieldname + if fieldname not in ixreader.schema: + return 0 + + field = ixreader.schema[fieldname] + try: + text = field.to_bytes(self.text) + except ValueError: + return 0 + + return ixreader.doc_frequency(fieldname, text) + + def matcher(self, searcher, context=None): + fieldname = self.fieldname + text = self.text + if fieldname not in searcher.schema: + return matching.NullMatcher() + + field = searcher.schema[fieldname] + try: + text = field.to_bytes(text) + except ValueError: + return matching.NullMatcher() + + if (self.fieldname, text) in searcher.reader(): + if context is None: + w = searcher.weighting + else: + w = context.weighting + + m = searcher.postings(self.fieldname, text, weighting=w) + if self.minquality: + m.set_min_quality(self.minquality) + if self.boost != 1.0: + m = matching.WrappingMatcher(m, 
boost=self.boost) + return m + else: + return matching.NullMatcher() + + +class MultiTerm(qcore.Query): + """Abstract base class for queries that operate on multiple terms in the + same field. + """ + + constantscore = False + + def _btexts(self, ixreader): + raise NotImplementedError(self.__class__.__name__) + + def expanded_terms(self, ixreader, phrases=False): + fieldname = self.field() + if fieldname: + for btext in self._btexts(ixreader): + yield (fieldname, btext) + + def tokens(self, boost=1.0, exreader=None): + fieldname = self.field() + if exreader is None: + btexts = [self.text] + else: + btexts = self._btexts(exreader) + + for btext in btexts: + yield Token(fieldname=fieldname, text=btext, + boost=boost * self.boost, startchar=self.startchar, + endchar=self.endchar, chars=True) + + def simplify(self, ixreader): + fieldname = self.field() + + if fieldname not in ixreader.schema: + return qcore.NullQuery() + field = ixreader.schema[fieldname] + + existing = [] + for btext in sorted(set(self._btexts(ixreader))): + text = field.from_bytes(btext) + existing.append(Term(fieldname, text, boost=self.boost)) + + if len(existing) == 1: + return existing[0] + elif existing: + from whoosh.query import Or + return Or(existing) + else: + return qcore.NullQuery + + def estimate_size(self, ixreader): + fieldname = self.field() + return sum(ixreader.doc_frequency(fieldname, btext) + for btext in self._btexts(ixreader)) + + def estimate_min_size(self, ixreader): + fieldname = self.field() + return min(ixreader.doc_frequency(fieldname, text) + for text in self._btexts(ixreader)) + + def matcher(self, searcher, context=None): + from whoosh.query import Or + + fieldname = self.field() + constantscore = self.constantscore + + reader = searcher.reader() + qs = [Term(fieldname, word) for word in self._btexts(reader)] + if not qs: + return matching.NullMatcher() + + if len(qs) == 1: + # If there's only one term, just use it + m = qs[0].matcher(searcher, context) + else: + if constantscore: + # To tell the sub-query that score doesn't matter, set weighting + # to None + if context: + context = context.set(weighting=None) + else: + from whoosh.searching import SearchContext + context = SearchContext(weighting=None) + # Or the terms together + m = Or(qs, boost=self.boost).matcher(searcher, context) + return m + + +class PatternQuery(MultiTerm): + """An intermediate base class for common methods of Prefix and Wildcard. 
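Subclasses supply a regular expression via ``_get_pattern()`` and, where possible, a literal prefix that narrows the candidate terms. As a sketch, a hypothetical ``Contains`` query (not part of the library, using the module-level ``re`` import) could be written as::

    class Contains(PatternQuery):
        # Sketch only: matches terms that contain the given text anywhere.
        def _get_pattern(self):
            return ".*" + re.escape(self.text) + ".*"

        def _find_prefix(self, text):
            # A substring can occur anywhere, so there is no literal
            # prefix with which to narrow the candidate terms.
            return ""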
+ """ + + __inittypes__ = dict(fieldname=str, text=text_type, boost=float) + + def __init__(self, fieldname, text, boost=1.0, constantscore=True): + self.fieldname = fieldname + self.text = text + self.boost = boost + self.constantscore = constantscore + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.fieldname == other.fieldname + and self.text == other.text and self.boost == other.boost + and self.constantscore == other.constantscore) + + def __repr__(self): + r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) + if self.boost != 1: + r += ", boost=%s" % self.boost + r += ")" + return r + + def __hash__(self): + return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost) + ^ hash(self.constantscore)) + + def _get_pattern(self): + raise NotImplementedError + + def _find_prefix(self, text): + # Subclasses/instances should set the SPECIAL_CHARS attribute to a set + # of characters that mark the end of the literal prefix + specialchars = self.SPECIAL_CHARS + i = 0 + for i, char in enumerate(text): + if char in specialchars: + break + return text[:i] + + def _btexts(self, ixreader): + field = ixreader.schema[self.fieldname] + + exp = re.compile(self._get_pattern()) + prefix = self._find_prefix(self.text) + if prefix: + candidates = ixreader.expand_prefix(self.fieldname, prefix) + else: + candidates = ixreader.lexicon(self.fieldname) + + from_bytes = field.from_bytes + for btext in candidates: + text = from_bytes(btext) + if exp.match(text): + yield btext + + +class Prefix(PatternQuery): + """Matches documents that contain any terms that start with the given text. + + >>> # Match documents containing words starting with 'comp' + >>> Prefix("content", u"comp") + """ + + def __unicode__(self): + return "%s:%s*" % (self.fieldname, self.text) + + __str__ = __unicode__ + + def _btexts(self, ixreader): + return ixreader.expand_prefix(self.fieldname, self.text) + + def matcher(self, searcher, context=None): + if self.text == "": + from whoosh.query import Every + eq = Every(self.fieldname, boost=self.boost) + return eq.matcher(searcher, context) + else: + return PatternQuery.matcher(self, searcher, context) + + +class Wildcard(PatternQuery): + """Matches documents that contain any terms that match a "glob" pattern. + See the Python ``fnmatch`` module for information about globs. + + >>> Wildcard("content", u"in*f?x") + """ + + SPECIAL_CHARS = frozenset("*?[") + + def __unicode__(self): + return "%s:%s" % (self.fieldname, self.text) + + __str__ = __unicode__ + + def _get_pattern(self): + return fnmatch.translate(self.text) + + def normalize(self): + # If there are no wildcard characters in this "wildcard", turn it into + # a simple Term + text = self.text + if text == "*": + from whoosh.query import Every + return Every(self.fieldname, boost=self.boost) + if "*" not in text and "?" not in text: + # If no wildcard chars, convert to a normal term. + return Term(self.fieldname, self.text, boost=self.boost) + elif ("?" not in text and text.endswith("*") + and text.find("*") == len(text) - 1): + # If the only wildcard char is an asterisk at the end, convert to a + # Prefix query. 
+ return Prefix(self.fieldname, self.text[:-1], boost=self.boost) + else: + return self + + def matcher(self, searcher, context=None): + if self.text == "*": + from whoosh.query import Every + eq = Every(self.fieldname, boost=self.boost) + return eq.matcher(searcher, context) + else: + return PatternQuery.matcher(self, searcher, context) + + # _btexts() implemented in PatternQuery + + +class Regex(PatternQuery): + """Matches documents that contain any terms that match a regular + expression. See the Python ``re`` module for information about regular + expressions. + """ + + SPECIAL_CHARS = frozenset("{}()[].?*+^$\\") + + def __unicode__(self): + return '%s:r"%s"' % (self.fieldname, self.text) + + __str__ = __unicode__ + + def _get_pattern(self): + return self.text + + def _find_prefix(self, text): + if "|" in text: + return "" + if text.startswith("^"): + text = text[1:] + elif text.startswith("\\A"): + text = text[2:] + + prefix = PatternQuery._find_prefix(self, text) + + lp = len(prefix) + if lp < len(text) and text[lp] in "*?": + # we stripped something starting from * or ? - they both MAY mean + # "0 times". As we had stripped starting from FIRST special char, + # that implies there were only ordinary chars left of it. Thus, + # the very last of them is not part of the real prefix: + prefix = prefix[:-1] + return prefix + + def matcher(self, searcher, context=None): + if self.text == ".*": + from whoosh.query import Every + eq = Every(self.fieldname, boost=self.boost) + return eq.matcher(searcher, context) + else: + return PatternQuery.matcher(self, searcher, context) + + # _btexts() implemented in PatternQuery + + +class ExpandingTerm(MultiTerm): + """Intermediate base class for queries such as FuzzyTerm and Variations + that expand into multiple queries, but come from a single term. + """ + + def has_terms(self): + return True + + def terms(self, phrases=False): + if self.field(): + yield (self.field(), self.text) + + +class FuzzyTerm(ExpandingTerm): + """Matches documents containing words similar to the given term. + """ + + __inittypes__ = dict(fieldname=str, text=text_type, boost=float, + maxdist=float, prefixlength=int) + + def __init__(self, fieldname, text, boost=1.0, maxdist=1, + prefixlength=1, constantscore=True): + """ + :param fieldname: The name of the field to search. + :param text: The text to search for. + :param boost: A boost factor to apply to scores of documents matching + this query. + :param maxdist: The maximum edit distance from the given text. + :param prefixlength: The matched terms must share this many initial + characters with 'text'. For example, if text is "light" and + prefixlength is 2, then only terms starting with "li" are checked + for similarity. 
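For example, a sketch of matching terms within one edit of ``whoosh`` that share its first two letters (the ``text`` field name is illustrative)::

    from whoosh.query import FuzzyTerm

    q = FuzzyTerm("text", u"whoosh", maxdist=1, prefixlength=2)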
+ """ + + self.fieldname = fieldname + self.text = text + self.boost = boost + self.maxdist = maxdist + self.prefixlength = prefixlength + self.constantscore = constantscore + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.fieldname == other.fieldname + and self.text == other.text + and self.maxdist == other.maxdist + and self.prefixlength == other.prefixlength + and self.boost == other.boost + and self.constantscore == other.constantscore) + + def __repr__(self): + r = "%s(%r, %r, boost=%f, maxdist=%d, prefixlength=%d)" + return r % (self.__class__.__name__, self.fieldname, self.text, + self.boost, self.maxdist, self.prefixlength) + + def __unicode__(self): + r = u("%s:%s") % (self.fieldname, self.text) + u("~") + if self.maxdist > 1: + r += u("%d") % self.maxdist + if self.boost != 1.0: + r += u("^%f") % self.boost + return r + + __str__ = __unicode__ + + def __hash__(self): + return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost) + ^ hash(self.maxdist) ^ hash(self.prefixlength) + ^ hash(self.constantscore)) + + def _btexts(self, ixreader): + return ixreader.terms_within(self.fieldname, self.text, self.maxdist, + prefix=self.prefixlength) + + def replace(self, fieldname, oldtext, newtext): + q = copy.copy(self) + if q.fieldname == fieldname and q.text == oldtext: + q.text = newtext + return q + + +class Variations(ExpandingTerm): + """Query that automatically searches for morphological variations of the + given word in the same field. + """ + + def __init__(self, fieldname, text, boost=1.0): + self.fieldname = fieldname + self.text = text + self.boost = boost + + def __repr__(self): + r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) + if self.boost != 1: + r += ", boost=%s" % self.boost + r += ")" + return r + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.fieldname == other.fieldname + and self.text == other.text and self.boost == other.boost) + + def __hash__(self): + return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost) + + def _btexts(self, ixreader): + fieldname = self.fieldname + to_bytes = ixreader.schema[fieldname].to_bytes + for word in variations(self.text): + try: + btext = to_bytes(word) + except ValueError: + continue + + if (fieldname, btext) in ixreader: + yield btext + + def __unicode__(self): + return u("%s:<%s>") % (self.fieldname, self.text) + + __str__ = __unicode__ + + def replace(self, fieldname, oldtext, newtext): + q = copy.copy(self) + if q.fieldname == fieldname and q.text == oldtext: + q.text = newtext + return q diff --git a/src/whoosh/query/wrappers.py b/src/whoosh/query/wrappers.py new file mode 100644 index 0000000..db59798 --- /dev/null +++ b/src/whoosh/query/wrappers.py @@ -0,0 +1,198 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import division +from array import array + +from whoosh import matching +from whoosh.compat import text_type, u, xrange +from whoosh.query import qcore + + +class WrappingQuery(qcore.Query): + def __init__(self, child): + self.child = child + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.child) + + def __hash__(self): + return hash(self.__class__.__name__) ^ hash(self.child) + + def _rewrap(self, child): + return self.__class__(child) + + def is_leaf(self): + return False + + def children(self): + yield self.child + + def apply(self, fn): + return self._rewrap(fn(self.child)) + + def requires(self): + return self.child.requires() + + def field(self): + return self.child.field() + + def with_boost(self, boost): + return self._rewrap(self.child.with_boost(boost)) + + def estimate_size(self, ixreader): + return self.child.estimate_size(ixreader) + + def estimate_min_size(self, ixreader): + return self.child.estimate_min_size(ixreader) + + def matcher(self, searcher, context=None): + return self.child.matcher(searcher, context) + + +class Not(qcore.Query): + """Excludes any documents that match the subquery. + + >>> # Match documents that contain 'render' but not 'texture' + >>> And([Term("content", u"render"), + ... Not(Term("content", u"texture"))]) + >>> # You can also do this + >>> Term("content", u"render") - Term("content", u"texture") + """ + + __inittypes__ = dict(query=qcore.Query) + + def __init__(self, query, boost=1.0): + """ + :param query: A :class:`Query` object. The results of this query + are *excluded* from the parent query. + :param boost: Boost is meaningless for excluded documents but this + keyword argument is accepted for the sake of a consistent + interface. 
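For example, a sketch of requiring ``whoosh`` in an assumed ``title`` field while excluding documents whose assumed ``status`` field contains ``draft``::

    from whoosh.query import And, Not, Term

    q = And([Term("title", u"whoosh"),
             Not(Term("status", u"draft"))])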
+ """ + + self.query = query + self.boost = boost + + def __eq__(self, other): + return other and self.__class__ is other.__class__ and\ + self.query == other.query + + def __repr__(self): + return "%s(%s)" % (self.__class__.__name__, repr(self.query)) + + def __unicode__(self): + return u("NOT ") + text_type(self.query) + + __str__ = __unicode__ + + def __hash__(self): + return (hash(self.__class__.__name__) + ^ hash(self.query) + ^ hash(self.boost)) + + def is_leaf(self): + return False + + def children(self): + yield self.query + + def apply(self, fn): + return self.__class__(fn(self.query)) + + def normalize(self): + q = self.query.normalize() + if q is qcore.NullQuery: + return q + else: + return self.__class__(q, boost=self.boost) + + def field(self): + return None + + def estimate_size(self, ixreader): + return ixreader.doc_count() + + def estimate_min_size(self, ixreader): + return 1 if ixreader.doc_count() else 0 + + def matcher(self, searcher, context=None): + # Usually only called if Not is the root query. Otherwise, queries such + # as And and Or do special handling of Not subqueries. + reader = searcher.reader() + child = self.query.matcher(searcher, searcher.boolean_context()) + return matching.InverseMatcher(child, reader.doc_count_all(), + missing=reader.is_deleted) + + +class ConstantScoreQuery(WrappingQuery): + """Wraps a query and uses a matcher that always gives a constant score + to all matching documents. This is a useful optimization when you don't + care about scores from a certain branch of the query tree because it is + simply acting as a filter. See also the :class:`AndMaybe` query. + """ + + def __init__(self, child, score=1.0): + WrappingQuery.__init__(self, child) + self.score = score + + def __eq__(self, other): + return (other and self.__class__ is other.__class__ + and self.child == other.child and self.score == other.score) + + def __hash__(self): + return hash(self.child) ^ hash(self.score) + + def _rewrap(self, child): + return self.__class__(child, self.score) + + def matcher(self, searcher, context=None): + from whoosh.searching import SearchContext + + context = context or SearchContext() + m = self.child.matcher(searcher, context) + if context.needs_current or isinstance(m, matching.NullMatcherClass): + return m + else: + ids = array("I", m.all_ids()) + return matching.ListMatcher(ids, all_weights=self.score, + term=m.term()) + + +class WeightingQuery(WrappingQuery): + """Wraps a query and uses a specific :class:`whoosh.sorting.WeightingModel` + to score documents that match the wrapped query. + """ + + def __init__(self, child, weighting): + WrappingQuery.__init__(self, child) + self.weighting = weighting + + def matcher(self, searcher, context=None): + # Replace the passed-in weighting with the one configured on this query + context.set(weighting=self.weighting) + return self.child.matcher(searcher, context) diff --git a/src/whoosh/reading.py b/src/whoosh/reading.py new file mode 100644 index 0000000..3ff65c4 --- /dev/null +++ b/src/whoosh/reading.py @@ -0,0 +1,1295 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +"""This module contains classes that allow reading from an index. +""" + +from math import log +from bisect import bisect_right +from heapq import heapify, heapreplace, heappop, nlargest + +from whoosh import columns +from whoosh.compat import abstractmethod +from whoosh.compat import xrange, zip_, next, iteritems +from whoosh.filedb.filestore import OverlayStorage +from whoosh.matching import MultiMatcher +from whoosh.support.levenshtein import distance +from whoosh.system import emptybytes + + +# Exceptions + +class ReaderClosed(Exception): + """Exception raised when you try to do some operation on a closed searcher + (or a Results object derived from a searcher that has since been closed). + """ + + message = "Operation on a closed reader" + + +class TermNotFound(Exception): + pass + + +# Term Info base class + +class TermInfo(object): + """Represents a set of statistics about a term. This object is returned by + :meth:`IndexReader.term_info`. These statistics may be useful for + optimizations and scoring algorithms. + """ + + def __init__(self, weight=0, df=0, minlength=None, + maxlength=0, maxweight=0, minid=None, maxid=0): + self._weight = weight + self._df = df + self._minlength = minlength + self._maxlength = maxlength + self._maxweight = maxweight + self._minid = minid + self._maxid = maxid + + def add_posting(self, docnum, weight, length=None): + if self._minid is None: + self._minid = docnum + self._maxid = docnum + self._weight += weight + self._df += 1 + self._maxweight = max(self._maxweight, weight) + + if length is not None: + if self._minlength is None: + self._minlength = length + else: + self._minlength = min(self._minlength, length) + self._maxlength = max(self._maxlength, length) + + def weight(self): + """Returns the total frequency of the term across all documents. + """ + + return self._weight + + def doc_frequency(self): + """Returns the number of documents the term appears in. + """ + + return self._df + + def min_length(self): + """Returns the length of the shortest field value the term appears + in. + """ + + return self._minlength + + def max_length(self): + """Returns the length of the longest field value the term appears + in. + """ + + return self._maxlength + + def max_weight(self): + """Returns the number of times the term appears in the document in + which it appears the most. 
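For example, a sketch of reading these statistics through a reader, assuming ``ix`` is an open index whose ``content`` field contains the term::

    with ix.reader() as r:
        ti = r.term_info("content", u"render")
        print(ti.doc_frequency(), ti.weight(), ti.max_weight())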
+ """ + + return self._maxweight + + def min_id(self): + """Returns the lowest document ID this term appears in. + """ + + return self._minid + + def max_id(self): + """Returns the highest document ID this term appears in. + """ + + return self._maxid + + +# Reader base class + +class IndexReader(object): + """Do not instantiate this object directly. Instead use Index.reader(). + """ + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + @abstractmethod + def __contains__(self, term): + """Returns True if the given term tuple (fieldname, text) is + in this reader. + """ + raise NotImplementedError + + def codec(self): + """Returns the :class:`whoosh.codec.base.Codec` object used to read + this reader's segment. If this reader is not atomic + (``reader.is_atomic() == True``), returns None. + """ + + return None + + def segment(self): + """Returns the :class:`whoosh.index.Segment` object used by this reader. + If this reader is not atomic (``reader.is_atomic() == True``), returns + None. + """ + + return None + + def storage(self): + """Returns the :class:`whoosh.filedb.filestore.Storage` object used by + this reader to read its files. If the reader is not atomic, + (``reader.is_atomic() == True``), returns None. + """ + + return None + + def is_atomic(self): + return True + + def _text_to_bytes(self, fieldname, text): + if fieldname not in self.schema: + raise TermNotFound((fieldname, text)) + return self.schema[fieldname].to_bytes(text) + + def close(self): + """Closes the open files associated with this reader. + """ + + pass + + def generation(self): + """Returns the generation of the index being read, or -1 if the backend + is not versioned. + """ + + return None + + @abstractmethod + def indexed_field_names(self): + """Returns an iterable of strings representing the names of the indexed + fields. This may include additional names not explicitly listed in the + Schema if you use "glob" fields. + """ + + raise NotImplementedError + + @abstractmethod + def all_terms(self): + """Yields (fieldname, text) tuples for every term in the index. + """ + + raise NotImplementedError + + def terms_from(self, fieldname, prefix): + """Yields (fieldname, text) tuples for every term in the index starting + at the given prefix. + """ + + # The default implementation just scans the whole list of terms + for fname, text in self.all_terms(): + if fname < fieldname or text < prefix: + continue + yield (fname, text) + + @abstractmethod + def term_info(self, fieldname, text): + """Returns a :class:`TermInfo` object allowing access to various + statistics about the given term. + """ + + raise NotImplementedError + + def expand_prefix(self, fieldname, prefix): + """Yields terms in the given field that start with the given prefix. + """ + + for fn, text in self.terms_from(fieldname, prefix): + if fn != fieldname or not text.startswith(prefix): + return + yield text + + def lexicon(self, fieldname): + """Yields all bytestrings in the given field. + """ + + for fn, btext in self.terms_from(fieldname, emptybytes): + if fn != fieldname: + return + yield btext + + def field_terms(self, fieldname): + """Yields all term values (converted from on-disk bytes) in the given + field. + """ + + from_bytes = self.schema[fieldname].from_bytes + for btext in self.lexicon(fieldname): + yield from_bytes(btext) + + def __iter__(self): + """Yields ((fieldname, text), terminfo) tuples for each term in the + reader, in lexical order. 
+ """ + + term_info = self.term_info + for term in self.all_terms(): + yield (term, term_info(*term)) + + def iter_from(self, fieldname, text): + """Yields ((fieldname, text), terminfo) tuples for all terms in the + reader, starting at the given term. + """ + + term_info = self.term_info + text = self._text_to_bytes(fieldname, text) + for term in self.terms_from(fieldname, text): + yield (term, term_info(*term)) + + def iter_field(self, fieldname, prefix=''): + """Yields (text, terminfo) tuples for all terms in the given field. + """ + + prefix = self._text_to_bytes(fieldname, prefix) + for (fn, text), terminfo in self.iter_from(fieldname, prefix): + if fn != fieldname: + return + yield text, terminfo + + def iter_prefix(self, fieldname, prefix): + """Yields (text, terminfo) tuples for all terms in the given field with + a certain prefix. + """ + + prefix = self._text_to_bytes(fieldname, prefix) + for text, terminfo in self.iter_field(fieldname, prefix): + if not text.startswith(prefix): + return + yield (text, terminfo) + + @abstractmethod + def has_deletions(self): + """Returns True if the underlying index/segment has deleted + documents. + """ + + raise NotImplementedError + + def all_doc_ids(self): + """Returns an iterator of all (undeleted) document IDs in the reader. + """ + + is_deleted = self.is_deleted + return (docnum for docnum in xrange(self.doc_count_all()) + if not is_deleted(docnum)) + + def iter_docs(self): + """Yields a series of ``(docnum, stored_fields_dict)`` + tuples for the undeleted documents in the reader. + """ + + for docnum in self.all_doc_ids(): + yield docnum, self.stored_fields(docnum) + + @abstractmethod + def is_deleted(self, docnum): + """Returns True if the given document number is marked deleted. + """ + + raise NotImplementedError + + @abstractmethod + def stored_fields(self, docnum): + """Returns the stored fields for the given document number. + + :param numerickeys: use field numbers as the dictionary keys instead of + field names. + """ + + raise NotImplementedError + + def all_stored_fields(self): + """Yields the stored fields for all non-deleted documents. + """ + + is_deleted = self.is_deleted + for docnum in xrange(self.doc_count_all()): + if not is_deleted(docnum): + yield self.stored_fields(docnum) + + @abstractmethod + def doc_count_all(self): + """Returns the total number of documents, DELETED OR UNDELETED, + in this reader. + """ + + raise NotImplementedError + + @abstractmethod + def doc_count(self): + """Returns the total number of UNDELETED documents in this reader. + """ + + return self.doc_count_all() - self.deleted_count() + + @abstractmethod + def frequency(self, fieldname, text): + """Returns the total number of instances of the given term in the + collection. + """ + raise NotImplementedError + + @abstractmethod + def doc_frequency(self, fieldname, text): + """Returns how many documents the given term appears in. + """ + raise NotImplementedError + + @abstractmethod + def field_length(self, fieldname): + """Returns the total number of terms in the given field. This is used + by some scoring algorithms. + """ + raise NotImplementedError + + @abstractmethod + def min_field_length(self, fieldname): + """Returns the minimum length of the field across all documents. This + is used by some scoring algorithms. + """ + raise NotImplementedError + + @abstractmethod + def max_field_length(self, fieldname): + """Returns the minimum length of the field across all documents. This + is used by some scoring algorithms. 
+ """ + raise NotImplementedError + + @abstractmethod + def doc_field_length(self, docnum, fieldname, default=0): + """Returns the number of terms in the given field in the given + document. This is used by some scoring algorithms. + """ + raise NotImplementedError + + def first_id(self, fieldname, text): + """Returns the first ID in the posting list for the given term. This + may be optimized in certain backends. + """ + + text = self._text_to_bytes(fieldname, text) + p = self.postings(fieldname, text) + if p.is_active(): + return p.id() + raise TermNotFound((fieldname, text)) + + def iter_postings(self): + """Low-level method, yields all postings in the reader as + ``(fieldname, text, docnum, weight, valuestring)`` tuples. + """ + + for fieldname, btext in self.all_terms(): + m = self.postings(fieldname, btext) + while m.is_active(): + yield (fieldname, btext, m.id(), m.weight(), m.value()) + m.next() + + @abstractmethod + def postings(self, fieldname, text): + """Returns a :class:`~whoosh.matching.Matcher` for the postings of the + given term. + + >>> pr = reader.postings("content", "render") + >>> pr.skip_to(10) + >>> pr.id + 12 + + :param fieldname: the field name or field number of the term. + :param text: the text of the term. + :rtype: :class:`whoosh.matching.Matcher` + """ + + raise NotImplementedError + + @abstractmethod + def has_vector(self, docnum, fieldname): + """Returns True if the given document has a term vector for the given + field. + """ + raise NotImplementedError + + @abstractmethod + def vector(self, docnum, fieldname, format_=None): + """Returns a :class:`~whoosh.matching.Matcher` object for the + given term vector. + + >>> docnum = searcher.document_number(path=u'/a/b/c') + >>> v = searcher.vector(docnum, "content") + >>> v.all_as("frequency") + [(u"apple", 3), (u"bear", 2), (u"cab", 2)] + + :param docnum: the document number of the document for which you want + the term vector. + :param fieldname: the field name or field number of the field for which + you want the term vector. + :rtype: :class:`whoosh.matching.Matcher` + """ + raise NotImplementedError + + def vector_as(self, astype, docnum, fieldname): + """Returns an iterator of (termtext, value) pairs for the terms in the + given term vector. This is a convenient shortcut to calling vector() + and using the Matcher object when all you want are the terms and/or + values. + + >>> docnum = searcher.document_number(path=u'/a/b/c') + >>> searcher.vector_as("frequency", docnum, "content") + [(u"apple", 3), (u"bear", 2), (u"cab", 2)] + + :param docnum: the document number of the document for which you want + the term vector. + :param fieldname: the field name or field number of the field for which + you want the term vector. + :param astype: a string containing the name of the format you want the + term vector's data in, for example "weights". + """ + + vec = self.vector(docnum, fieldname) + if astype == "weight": + while vec.is_active(): + yield (vec.id(), vec.weight()) + vec.next() + else: + format_ = self.schema[fieldname].format + decoder = format_.decoder(astype) + while vec.is_active(): + yield (vec.id(), decoder(vec.value())) + vec.next() + + def corrector(self, fieldname): + """Returns a :class:`whoosh.spelling.Corrector` object that suggests + corrections based on the terms in the given field. 
+ """ + + from whoosh.spelling import ReaderCorrector + + fieldobj = self.schema[fieldname] + return ReaderCorrector(self, fieldname, fieldobj) + + def terms_within(self, fieldname, text, maxdist, prefix=0): + """ + Returns a generator of words in the given field within ``maxdist`` + Damerau-Levenshtein edit distance of the given text. + + Important: the terms are returned in **no particular order**. The only + criterion is that they are within ``maxdist`` edits of ``text``. You + may want to run this method multiple times with increasing ``maxdist`` + values to ensure you get the closest matches first. You may also have + additional information (such as term frequency or an acoustic matching + algorithm) you can use to rank terms with the same edit distance. + + :param maxdist: the maximum edit distance. + :param prefix: require suggestions to share a prefix of this length + with the given word. This is often justifiable since most + misspellings do not involve the first letter of the word. + Using a prefix dramatically decreases the time it takes to generate + the list of words. + :param seen: an optional set object. Words that appear in the set will + not be yielded. + """ + + fieldobj = self.schema[fieldname] + for btext in self.expand_prefix(fieldname, text[:prefix]): + word = fieldobj.from_bytes(btext) + k = distance(word, text, limit=maxdist) + if k <= maxdist: + yield word + + def most_frequent_terms(self, fieldname, number=5, prefix=''): + """Returns the top 'number' most frequent terms in the given field as a + list of (frequency, text) tuples. + """ + + gen = ((terminfo.weight(), text) for text, terminfo + in self.iter_prefix(fieldname, prefix)) + return nlargest(number, gen) + + def most_distinctive_terms(self, fieldname, number=5, prefix=''): + """Returns the top 'number' terms with the highest `tf*idf` scores as + a list of (score, text) tuples. + """ + + N = float(self.doc_count()) + gen = ((terminfo.weight() * log(N / terminfo.doc_frequency()), text) + for text, terminfo in self.iter_prefix(fieldname, prefix)) + return nlargest(number, gen) + + def leaf_readers(self): + """Returns a list of (IndexReader, docbase) pairs for the child readers + of this reader if it is a composite reader. If this is not a composite + reader, it returns `[(self, 0)]`. + """ + + return [(self, 0)] + + def supports_caches(self): + return False + + def has_column(self, fieldname): + return False + + def column_reader(self, fieldname, column=None, reverse=False, + translate=False): + """ + + :param fieldname: the name of the field for which to get a reader. + :param column: if passed, use this Column object instead of the one + associated with the field in the Schema. + :param reverse: if passed, reverses the order of keys returned by the + reader's ``sort_key()`` method. If the column type is not + reversible, this will raise a ``NotImplementedError``. + :param translate: if True, wrap the reader to call the field's + ``from_bytes()`` method on the returned values. + :return: a :class:`whoosh.columns.ColumnReader` object. + """ + + raise NotImplementedError + + +# Segment-based reader + +class SegmentReader(IndexReader): + def __init__(self, storage, schema, segment, generation=None, codec=None): + self.schema = schema + self.is_closed = False + + self._segment = segment + self._segid = self._segment.segment_id() + self._gen = generation + + # self.files is a storage object from which to load the segment files. 
+ # This is different from the general storage (which will be used for + # caches) if the segment is in a compound file. + if segment.is_compound(): + # Open the compound file as a storage object + files = segment.open_compound_file(storage) + # Use an overlay here instead of just the compound storage, in rare + # circumstances a segment file may be added after the segment is + # written + self._storage = OverlayStorage(files, storage) + else: + self._storage = storage + + # Get subreaders from codec + self._codec = codec if codec else segment.codec() + self._terms = self._codec.terms_reader(self._storage, segment) + self._perdoc = self._codec.per_document_reader(self._storage, segment) + + def codec(self): + return self._codec + + def segment(self): + return self._segment + + def storage(self): + return self._storage + + def has_deletions(self): + if self.is_closed: + raise ReaderClosed + return self._perdoc.has_deletions() + + def doc_count(self): + if self.is_closed: + raise ReaderClosed + return self._perdoc.doc_count() + + def doc_count_all(self): + if self.is_closed: + raise ReaderClosed + return self._perdoc.doc_count_all() + + def is_deleted(self, docnum): + if self.is_closed: + raise ReaderClosed + return self._perdoc.is_deleted(docnum) + + def generation(self): + return self._gen + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, self._storage, + self._segment) + + def __contains__(self, term): + if self.is_closed: + raise ReaderClosed + fieldname, text = term + if fieldname not in self.schema: + return False + text = self._text_to_bytes(fieldname, text) + return (fieldname, text) in self._terms + + def close(self): + if self.is_closed: + raise ReaderClosed("Reader already closed") + self._terms.close() + self._perdoc.close() + + # It's possible some weird codec that doesn't use storage might have + # passed None instead of a storage object + if self._storage: + self._storage.close() + + self.is_closed = True + + def stored_fields(self, docnum): + if self.is_closed: + raise ReaderClosed + assert docnum >= 0 + schema = self.schema + sfs = self._perdoc.stored_fields(docnum) + # Double-check with schema to filter out removed fields + return dict(item for item in iteritems(sfs) if item[0] in schema) + + # Delegate doc methods to the per-doc reader + + def all_doc_ids(self): + if self.is_closed: + raise ReaderClosed + return self._perdoc.all_doc_ids() + + def iter_docs(self): + if self.is_closed: + raise ReaderClosed + return self._perdoc.iter_docs() + + def all_stored_fields(self): + if self.is_closed: + raise ReaderClosed + return self._perdoc.all_stored_fields() + + def field_length(self, fieldname): + if self.is_closed: + raise ReaderClosed + return self._perdoc.field_length(fieldname) + + def min_field_length(self, fieldname): + if self.is_closed: + raise ReaderClosed + return self._perdoc.min_field_length(fieldname) + + def max_field_length(self, fieldname): + if self.is_closed: + raise ReaderClosed + return self._perdoc.max_field_length(fieldname) + + def doc_field_length(self, docnum, fieldname, default=0): + if self.is_closed: + raise ReaderClosed + return self._perdoc.doc_field_length(docnum, fieldname, default) + + def has_vector(self, docnum, fieldname): + if self.is_closed: + raise ReaderClosed + return self._perdoc.has_vector(docnum, fieldname) + + # + + def _test_field(self, fieldname): + if self.is_closed: + raise ReaderClosed + if fieldname not in self.schema: + raise TermNotFound("No field %r" % fieldname) + if self.schema[fieldname].format is 
None: + raise TermNotFound("Field %r is not indexed" % fieldname) + + def indexed_field_names(self): + return self._terms.indexed_field_names() + + def all_terms(self): + if self.is_closed: + raise ReaderClosed + schema = self.schema + return ((fieldname, text) for fieldname, text in self._terms.terms() + if fieldname in schema) + + def terms_from(self, fieldname, prefix): + self._test_field(fieldname) + prefix = self._text_to_bytes(fieldname, prefix) + schema = self.schema + return ((fname, text) for fname, text + in self._terms.terms_from(fieldname, prefix) + if fname in schema) + + def term_info(self, fieldname, text): + self._test_field(fieldname) + text = self._text_to_bytes(fieldname, text) + try: + return self._terms.term_info(fieldname, text) + except KeyError: + raise TermNotFound("%s:%r" % (fieldname, text)) + + def expand_prefix(self, fieldname, prefix): + self._test_field(fieldname) + prefix = self._text_to_bytes(fieldname, prefix) + return IndexReader.expand_prefix(self, fieldname, prefix) + + def lexicon(self, fieldname): + self._test_field(fieldname) + return IndexReader.lexicon(self, fieldname) + + def __iter__(self): + if self.is_closed: + raise ReaderClosed + schema = self.schema + return ((term, terminfo) for term, terminfo in self._terms.items() + if term[0] in schema) + + def iter_from(self, fieldname, text): + self._test_field(fieldname) + schema = self.schema + text = self._text_to_bytes(fieldname, text) + for term, terminfo in self._terms.items_from(fieldname, text): + if term[0] not in schema: + continue + yield (term, terminfo) + + def frequency(self, fieldname, text): + self._test_field(fieldname) + text = self._text_to_bytes(fieldname, text) + try: + return self._terms.frequency(fieldname, text) + except KeyError: + return 0 + + def doc_frequency(self, fieldname, text): + self._test_field(fieldname) + text = self._text_to_bytes(fieldname, text) + try: + return self._terms.doc_frequency(fieldname, text) + except KeyError: + return 0 + + def postings(self, fieldname, text, scorer=None): + from whoosh.matching.wrappers import FilterMatcher + + if self.is_closed: + raise ReaderClosed + if fieldname not in self.schema: + raise TermNotFound("No field %r" % fieldname) + text = self._text_to_bytes(fieldname, text) + format_ = self.schema[fieldname].format + matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer) + deleted = frozenset(self._perdoc.deleted_docs()) + if deleted: + matcher = FilterMatcher(matcher, deleted, exclude=True) + return matcher + + def vector(self, docnum, fieldname, format_=None): + if self.is_closed: + raise ReaderClosed + if fieldname not in self.schema: + raise TermNotFound("No field %r" % fieldname) + vformat = format_ or self.schema[fieldname].vector + if not vformat: + raise Exception("No vectors are stored for field %r" % fieldname) + return self._perdoc.vector(docnum, fieldname, vformat) + + def cursor(self, fieldname): + if self.is_closed: + raise ReaderClosed + fieldobj = self.schema[fieldname] + return self._terms.cursor(fieldname, fieldobj) + + def terms_within(self, fieldname, text, maxdist, prefix=0): + # Replaces the horribly inefficient base implementation with one based + # on skipping through the word list efficiently using a DFA + + fieldobj = self.schema[fieldname] + spellfield = fieldobj.spelling_fieldname(fieldname) + auto = self._codec.automata(self._storage, self._segment) + fieldcur = self.cursor(spellfield) + return auto.terms_within(fieldcur, text, maxdist, prefix) + + # Column methods + + def 
has_column(self, fieldname): + if self.is_closed: + raise ReaderClosed + coltype = self.schema[fieldname].column_type + return coltype and self._perdoc.has_column(fieldname) + + def column_reader(self, fieldname, column=None, reverse=False, + translate=True): + if self.is_closed: + raise ReaderClosed + + fieldobj = self.schema[fieldname] + column = column or fieldobj.column_type + if not column: + raise Exception("No column for field %r in %r" + % (fieldname, self)) + + if self._perdoc.has_column(fieldname): + creader = self._perdoc.column_reader(fieldname, column) + if reverse: + creader.set_reverse() + else: + # This segment doesn't have a column file for this field, so create + # a fake column reader that always returns the default value. + default = column.default_value(reverse) + creader = columns.EmptyColumnReader(default, self.doc_count_all()) + + if translate: + # Wrap the column in a Translator to give the caller + # nice values instead of sortable representations + fcv = fieldobj.from_column_value + creader = columns.TranslatingColumnReader(creader, fcv) + + return creader + + +# Fake IndexReader class for empty indexes + +class EmptyReader(IndexReader): + def __init__(self, schema): + self.schema = schema + + def __contains__(self, term): + return False + + def __iter__(self): + return iter([]) + + def cursor(self, fieldname): + from whoosh.codec.base import EmptyCursor + + return EmptyCursor() + + def indexed_field_names(self): + return [] + + def all_terms(self): + return iter([]) + + def term_info(self, fieldname, text): + raise TermNotFound((fieldname, text)) + + def iter_from(self, fieldname, text): + return iter([]) + + def iter_field(self, fieldname, prefix=''): + return iter([]) + + def iter_prefix(self, fieldname, prefix=''): + return iter([]) + + def lexicon(self, fieldname): + return iter([]) + + def has_deletions(self): + return False + + def is_deleted(self, docnum): + return False + + def stored_fields(self, docnum): + raise KeyError("No document number %s" % docnum) + + def all_stored_fields(self): + return iter([]) + + def doc_count_all(self): + return 0 + + def doc_count(self): + return 0 + + def frequency(self, fieldname, text): + return 0 + + def doc_frequency(self, fieldname, text): + return 0 + + def field_length(self, fieldname): + return 0 + + def min_field_length(self, fieldname): + return 0 + + def max_field_length(self, fieldname): + return 0 + + def doc_field_length(self, docnum, fieldname, default=0): + return default + + def postings(self, fieldname, text, scorer=None): + raise TermNotFound("%s:%r" % (fieldname, text)) + + def has_vector(self, docnum, fieldname): + return False + + def vector(self, docnum, fieldname, format_=None): + raise KeyError("No document number %s" % docnum) + + def most_frequent_terms(self, fieldname, number=5, prefix=''): + return iter([]) + + def most_distinctive_terms(self, fieldname, number=5, prefix=None): + return iter([]) + + +# Multisegment reader class + +class MultiReader(IndexReader): + """Do not instantiate this object directly. Instead use Index.reader(). 
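As an illustrative sketch (assuming an index stored in a directory named "indexdir"), the usual way to end up with such a reader is simply to ask the index for one::

    from whoosh import index

    ix = index.open_dir("indexdir")
    with ix.reader() as r:
        # A single-segment index yields a SegmentReader; a multi-segment
        # index yields a MultiReader that merges the segment readers.
        print(r.doc_count(), r.is_atomic())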
+ """ + + def __init__(self, readers, generation=None): + self.readers = readers + self._gen = generation + self.schema = None + if readers: + self.schema = readers[0].schema + + self.doc_offsets = [] + self.base = 0 + for r in self.readers: + self.doc_offsets.append(self.base) + self.base += r.doc_count_all() + + self.is_closed = False + + def _document_segment(self, docnum): + return max(0, bisect_right(self.doc_offsets, docnum) - 1) + + def _segment_and_docnum(self, docnum): + segmentnum = self._document_segment(docnum) + offset = self.doc_offsets[segmentnum] + return segmentnum, docnum - offset + + def cursor(self, fieldname): + return MultiCursor([r.cursor(fieldname) for r in self.readers]) + + def is_atomic(self): + return False + + def leaf_readers(self): + return zip_(self.readers, self.doc_offsets) + + def add_reader(self, reader): + self.readers.append(reader) + self.doc_offsets.append(self.base) + self.base += reader.doc_count_all() + + def close(self): + for d in self.readers: + d.close() + self.is_closed = True + + def generation(self): + return self._gen + + def format(self, fieldname): + for r in self.readers: + fmt = r.format(fieldname) + if fmt is not None: + return fmt + + def vector_format(self, fieldname): + for r in self.readers: + vfmt = r.vector_format(fieldname) + if vfmt is not None: + return vfmt + + # Term methods + + def __contains__(self, term): + return any(r.__contains__(term) for r in self.readers) + + def _merge_terms(self, iterlist): + # Merge-sorts terms coming from a list of term iterators. + + # Create a map so we can look up each iterator by its id() value + itermap = {} + for it in iterlist: + itermap[id(it)] = it + + # Fill in the list with the head term from each iterator. + + current = [] + for it in iterlist: + try: + term = next(it) + except StopIteration: + continue + current.append((term, id(it))) + # Number of active iterators + active = len(current) + + # If only one iterator is active, just yield from it and return + if active == 1: + term, itid = current[0] + it = itermap[itid] + yield term + for term in it: + yield term + return + + # Otherwise, do a streaming heap sort of the terms from the iterators + heapify(current) + while active: + # Peek at the first term in the sorted list + term = current[0][0] + + # Re-iterate on all items in the list that have that term + while active and current[0][0] == term: + it = itermap[current[0][1]] + try: + nextterm = next(it) + heapreplace(current, (nextterm, id(it))) + except StopIteration: + heappop(current) + active -= 1 + + # Yield the term + yield term + + def indexed_field_names(self): + names = set() + for r in self.readers: + names.update(r.indexed_field_names()) + return iter(names) + + def all_terms(self): + return self._merge_terms([r.all_terms() for r in self.readers]) + + def terms_from(self, fieldname, prefix): + return self._merge_terms([r.terms_from(fieldname, prefix) + for r in self.readers]) + + def term_info(self, fieldname, text): + term = (fieldname, text) + + # Get the term infos for the sub-readers containing the term + tis = [(r.term_info(fieldname, text), offset) for r, offset + in zip_(self.readers, self.doc_offsets) if term in r] + + # If only one reader had the term, return its terminfo with the offset + # added + if not tis: + raise TermNotFound(term) + + return combine_terminfos(tis) + + def frequency(self, fieldname, text): + return sum(r.frequency(fieldname, text) for r in self.readers) + + def doc_frequency(self, fieldname, text): + return sum(r.doc_frequency(fieldname, 
text) for r in self.readers) + + def postings(self, fieldname, text): + # This method does not add a scorer; for that, use Searcher.postings() + + postreaders = [] + docoffsets = [] + term = (fieldname, text) + + for i, r in enumerate(self.readers): + if term in r: + offset = self.doc_offsets[i] + pr = r.postings(fieldname, text) + postreaders.append(pr) + docoffsets.append(offset) + + if not postreaders: + raise TermNotFound(fieldname, text) + + return MultiMatcher(postreaders, docoffsets) + + def first_id(self, fieldname, text): + for i, r in enumerate(self.readers): + try: + id = r.first_id(fieldname, text) + except (KeyError, TermNotFound): + pass + else: + if id is None: + raise TermNotFound((fieldname, text)) + else: + return self.doc_offsets[i] + id + + raise TermNotFound((fieldname, text)) + + # Deletion methods + + def has_deletions(self): + return any(r.has_deletions() for r in self.readers) + + def is_deleted(self, docnum): + segmentnum, segmentdoc = self._segment_and_docnum(docnum) + return self.readers[segmentnum].is_deleted(segmentdoc) + + def stored_fields(self, docnum): + segmentnum, segmentdoc = self._segment_and_docnum(docnum) + return self.readers[segmentnum].stored_fields(segmentdoc) + + # Columns + + def has_column(self, fieldname): + return any(r.has_column(fieldname) for r in self.readers) + + def column_reader(self, fieldname, column=None, reverse=False, + translate=True): + crs = [] + doc_offsets = [] + for i, r in enumerate(self.readers): + if r.has_column(fieldname): + cr = r.column_reader(fieldname, column=column, reverse=reverse, + translate=translate) + crs.append(cr) + doc_offsets.append(self.doc_offsets[i]) + return columns.MultiColumnReader(crs, doc_offsets) + + # Per doc methods + + def all_stored_fields(self): + for reader in self.readers: + for result in reader.all_stored_fields(): + yield result + + def doc_count_all(self): + return sum(dr.doc_count_all() for dr in self.readers) + + def doc_count(self): + return sum(dr.doc_count() for dr in self.readers) + + def field_length(self, fieldname): + return sum(dr.field_length(fieldname) for dr in self.readers) + + def min_field_length(self, fieldname): + return min(r.min_field_length(fieldname) for r in self.readers) + + def max_field_length(self, fieldname): + return max(r.max_field_length(fieldname) for r in self.readers) + + def doc_field_length(self, docnum, fieldname, default=0): + segmentnum, segmentdoc = self._segment_and_docnum(docnum) + reader = self.readers[segmentnum] + return reader.doc_field_length(segmentdoc, fieldname, default=default) + + def has_vector(self, docnum, fieldname): + segmentnum, segmentdoc = self._segment_and_docnum(docnum) + return self.readers[segmentnum].has_vector(segmentdoc, fieldname) + + def vector(self, docnum, fieldname, format_=None): + segmentnum, segmentdoc = self._segment_and_docnum(docnum) + return self.readers[segmentnum].vector(segmentdoc, fieldname) + + def vector_as(self, astype, docnum, fieldname): + segmentnum, segmentdoc = self._segment_and_docnum(docnum) + return self.readers[segmentnum].vector_as(astype, segmentdoc, + fieldname) + + +def combine_terminfos(tis): + if len(tis) == 1: + ti, offset = tis[0] + ti._minid += offset + ti._maxid += offset + return ti + + # Combine the various statistics + w = sum(ti.weight() for ti, _ in tis) + df = sum(ti.doc_frequency() for ti, _ in tis) + ml = min(ti.min_length() for ti, _ in tis) + xl = max(ti.max_length() for ti, _ in tis) + xw = max(ti.max_weight() for ti, _ in tis) + + # For min and max ID, we need to add the 
doc offsets
+    mid = min(ti.min_id() + offset for ti, offset in tis)
+    xid = max(ti.max_id() + offset for ti, offset in tis)
+
+    return TermInfo(w, df, ml, xl, xw, mid, xid)
+
+
+class MultiCursor(object):
+    def __init__(self, cursors):
+        self._cursors = [c for c in cursors if c.is_valid()]
+        self._low = []
+        self._text = None
+        self.next()
+
+    def _find_low(self):
+        low = []
+        lowterm = None
+
+        for c in self._cursors:
+            if c.is_valid():
+                cterm = c.term()
+                # Seed with the first valid cursor, replace on a lower term,
+                # and extend on a tie
+                if not low or cterm < lowterm:
+                    low = [c]
+                    lowterm = cterm
+                elif cterm == lowterm:
+                    low.append(c)
+
+        self._low = low
+        self._text = lowterm
+        return lowterm
+
+    def first(self):
+        for c in self._cursors:
+            c.first()
+        return self._find_low()
+
+    def find(self, term):
+        for c in self._cursors:
+            c.find(term)
+        return self._find_low()
+
+    def next(self):
+        for c in self._cursors:
+            c.next()
+        return self._find_low()
+
+    def term_info(self):
+        tis = [c.term_info() for c in self._low]
+        return combine_terminfos(tis) if tis else None
+
+    def is_valid(self):
+        return any(c.is_valid() for c in self._cursors)
diff --git a/src/whoosh/scoring.py b/src/whoosh/scoring.py
new file mode 100644
index 0000000..cd6c2d2
--- /dev/null
+++ b/src/whoosh/scoring.py
@@ -0,0 +1,616 @@
+# Copyright 2008 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+"""
+This module contains classes for scoring (and sorting) search results.
+"""
+
+from __future__ import division
+from math import log, pi
+
+from whoosh.compat import iteritems
+
+
+# Base classes
+
+class WeightingModel(object):
+    """Abstract base class for scoring models. A WeightingModel object provides
+    a method, ``scorer``, which returns an instance of
+    :class:`whoosh.scoring.Scorer`.
+
+    Basically, WeightingModel objects store the configuration information for
+    the model (for example, the values of B and K1 in the BM25F model), and
+    then create a scorer instance based on additional run-time information
+    (the searcher, the fieldname, and term text) to do the actual scoring.
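As an illustrative sketch of that split (the ``RawWeight`` name and its use of raw term weight are assumptions for the example, not upstream code), a minimal model might look like this::

    from whoosh import scoring

    class RawWeight(scoring.WeightingModel):
        def scorer(self, searcher, fieldname, text, qf=1):
            # Configuration would live on self; the scorer is built per term
            # at search time from the searcher's statistics.
            maxweight = searcher.term_info(fieldname, text).max_weight()
            return scoring.WeightScorer(maxweight)

A model like this could then be passed to ``ix.searcher(weighting=RawWeight())``.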
+ """ + + use_final = False + + def idf(self, searcher, fieldname, text): + """Returns the inverse document frequency of the given term. + """ + + parent = searcher.get_parent() + n = parent.doc_frequency(fieldname, text) + dc = parent.doc_count_all() + return log(dc / (n + 1)) + 1 + + def scorer(self, searcher, fieldname, text, qf=1): + """Returns an instance of :class:`whoosh.scoring.Scorer` configured + for the given searcher, fieldname, and term text. + """ + + raise NotImplementedError(self.__class__.__name__) + + def final(self, searcher, docnum, score): + """Returns a final score for each document. You can use this method + in subclasses to apply document-level adjustments to the score, for + example using the value of stored field to influence the score + (although that would be slow). + + WeightingModel sub-classes that use ``final()`` should have the + attribute ``use_final`` set to ``True``. + + :param searcher: :class:`whoosh.searching.Searcher` for the index. + :param docnum: the doc number of the document being scored. + :param score: the document's accumulated term score. + + :rtype: float + """ + + return score + + +class BaseScorer(object): + """Base class for "scorer" implementations. A scorer provides a method for + scoring a document, and sometimes methods for rating the "quality" of a + document and a matcher's current "block", to implement quality-based + optimizations. + + Scorer objects are created by WeightingModel objects. Basically, + WeightingModel objects store the configuration information for the model + (for example, the values of B and K1 in the BM25F model), and then creates + a scorer instance. + """ + + def supports_block_quality(self): + """Returns True if this class supports quality optimizations. + """ + + return False + + def score(self, matcher): + """Returns a score for the current document of the matcher. + """ + + raise NotImplementedError(self.__class__.__name__) + + def max_quality(self): + """Returns the *maximum limit* on the possible score the matcher can + give. This can be an estimate and not necessarily the actual maximum + score possible, but it must never be less than the actual maximum + score. + """ + + raise NotImplementedError(self.__class__.__name__) + + def block_quality(self, matcher): + """Returns the *maximum limit* on the possible score the matcher can + give **in its current "block"** (whatever concept of "block" the + backend might use). This can be an estimate and not necessarily the + actual maximum score possible, but it must never be less than the + actual maximum score. + + If this score is less than the minimum score + required to make the "top N" results, then we can tell the matcher to + skip ahead to another block with better "quality". + """ + + raise NotImplementedError(self.__class__.__name__) + + +# Scorer that just returns term weight + +class WeightScorer(BaseScorer): + """A scorer that simply returns the weight as the score. This is useful + for more complex weighting models to return when they are asked for a + scorer for fields that aren't scorable (don't store field lengths). 
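A sketch of the typical fallback pattern inside a model's ``scorer()`` method (``MyScorer`` is a hypothetical scorer class used only for illustration)::

    def scorer(self, searcher, fieldname, text, qf=1):
        if not searcher.schema[fieldname].scorable:
            # The field stores no lengths, so score by raw term weight
            return WeightScorer.for_(searcher, fieldname, text)
        return MyScorer(searcher, fieldname, text)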
+ """ + + def __init__(self, maxweight): + self._maxweight = maxweight + + def supports_block_quality(self): + return True + + def score(self, matcher): + return matcher.weight() + + def max_quality(self): + return self._maxweight + + def block_quality(self, matcher): + return matcher.block_max_weight() + + @classmethod + def for_(cls, searcher, fieldname, text): + ti = searcher.term_info(fieldname, text) + return cls(ti.max_weight()) + + +# Base scorer for models that only use weight and field length + +class WeightLengthScorer(BaseScorer): + """Base class for scorers where the only per-document variables are term + weight and field length. + + Subclasses should override the ``_score(weight, length)`` method to return + the score for a document with the given weight and length, and call the + ``setup()`` method at the end of the initializer to set up common + attributes. + """ + + def setup(self, searcher, fieldname, text): + """Initializes the scorer and then does the busy work of + adding the ``dfl()`` function and maximum quality attribute. + + This method assumes the initializers of WeightLengthScorer subclasses + always take ``searcher, offset, fieldname, text`` as the first three + arguments. Any additional arguments given to this method are passed + through to the initializer. + + Note: this method calls ``self._score()``, so you should only call it + in the initializer after setting up whatever attributes ``_score()`` + depends on:: + + class MyScorer(WeightLengthScorer): + def __init__(self, searcher, fieldname, text, parm=1.0): + self.parm = parm + self.setup(searcher, fieldname, text) + + def _score(self, weight, length): + return (weight / (length + 1)) * self.parm + """ + + ti = searcher.term_info(fieldname, text) + if not searcher.schema[fieldname].scorable: + return WeightScorer(ti.max_weight()) + + self.dfl = lambda docid: searcher.doc_field_length(docid, fieldname, 1) + self._maxquality = self._score(ti.max_weight(), ti.min_length()) + + def supports_block_quality(self): + return True + + def score(self, matcher): + return self._score(matcher.weight(), self.dfl(matcher.id())) + + def max_quality(self): + return self._maxquality + + def block_quality(self, matcher): + return self._score(matcher.block_max_weight(), + matcher.block_min_length()) + + def _score(self, weight, length): + # Override this method with the actual scoring function + raise NotImplementedError(self.__class__.__name__) + + +# WeightingModel implementations + +# Debugging model + +class DebugModel(WeightingModel): + def __init__(self): + self.log = [] + + def scorer(self, searcher, fieldname, text, qf=1): + return DebugScorer(searcher, fieldname, text, self.log) + + +class DebugScorer(BaseScorer): + def __init__(self, searcher, fieldname, text, log): + ti = searcher.term_info(fieldname, text) + self._maxweight = ti.max_weight() + + self.searcher = searcher + self.fieldname = fieldname + self.text = text + self.log = log + + def supports_block_quality(self): + return True + + def score(self, matcher): + fieldname, text = self.fieldname, self.text + docid = matcher.id() + w = matcher.weight() + length = self.searcher.doc_field_length(docid, fieldname) + self.log.append((fieldname, text, docid, w, length)) + return w + + def max_quality(self): + return self._maxweight + + def block_quality(self, matcher): + return matcher.block_max_weight() + + +# BM25F Model + +def bm25(idf, tf, fl, avgfl, B, K1): + # idf - inverse document frequency + # tf - term frequency in the current document + # fl - field length in 
the current document
+    # avgfl - average field length across documents in collection
+    # B, K1 - free parameters
+
+    return idf * ((tf * (K1 + 1)) / (tf + K1 * ((1 - B) + B * fl / avgfl)))
+
+
+class BM25F(WeightingModel):
+    """Implements the BM25F scoring algorithm.
+    """
+
+    def __init__(self, B=0.75, K1=1.2, **kwargs):
+        """
+
+        >>> from whoosh import scoring
+        >>> # Set a custom B value for the "content" field
+        >>> w = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
+
+        :param B: free parameter, see the BM25 literature. Keyword arguments of
+            the form ``fieldname_B`` (for example, ``body_B``) set field-
+            specific values for B.
+        :param K1: free parameter, see the BM25 literature.
+        """
+
+        self.B = B
+        self.K1 = K1
+
+        self._field_B = {}
+        for k, v in iteritems(kwargs):
+            if k.endswith("_B"):
+                fieldname = k[:-2]
+                self._field_B[fieldname] = v
+
+    def supports_block_quality(self):
+        return True
+
+    def scorer(self, searcher, fieldname, text, qf=1):
+        if not searcher.schema[fieldname].scorable:
+            return WeightScorer.for_(searcher, fieldname, text)
+
+        if fieldname in self._field_B:
+            B = self._field_B[fieldname]
+        else:
+            B = self.B
+
+        return BM25FScorer(searcher, fieldname, text, B, self.K1, qf=qf)
+
+
+class BM25FScorer(WeightLengthScorer):
+    def __init__(self, searcher, fieldname, text, B, K1, qf=1):
+        # IDF and average field length are global statistics, so get them from
+        # the top-level searcher
+        parent = searcher.get_parent()  # Returns self if no parent
+        self.idf = parent.idf(fieldname, text)
+        self.avgfl = parent.avg_field_length(fieldname) or 1
+
+        self.B = B
+        self.K1 = K1
+        self.qf = qf
+        self.setup(searcher, fieldname, text)
+
+    def _score(self, weight, length):
+        s = bm25(self.idf, weight, length, self.avgfl, self.B, self.K1)
+        return s
+
+
+# DFree model
+
+def dfree(tf, cf, qf, dl, fl):
+    # tf - term frequency in current document
+    # cf - term frequency in collection
+    # qf - term frequency in query
+    # dl - field length in current document
+    # fl - total field length across all documents in collection
+    prior = tf / dl
+    post = (tf + 1.0) / (dl + 1.0)
+    invpriorcol = fl / cf
+    norm = tf * log(post / prior)
+
+    return qf * norm * (tf * (log(prior * invpriorcol)) +
+                        (tf + 1.0) * (log(post * invpriorcol)) +
+                        0.5 * log(post / prior))
+
+
+class DFree(WeightingModel):
+    """Implements the DFree scoring model from Terrier.
+ + See http://terrier.org/ + """ + + def supports_block_quality(self): + return True + + def scorer(self, searcher, fieldname, text, qf=1): + if not searcher.schema[fieldname].scorable: + return WeightScorer.for_(searcher, fieldname, text) + + return DFreeScorer(searcher, fieldname, text, qf=qf) + + +class DFreeScorer(WeightLengthScorer): + def __init__(self, searcher, fieldname, text, qf=1): + # Total term weight and total field length are global statistics, so + # get them from the top-level searcher + parent = searcher.get_parent() # Returns self if no parent + self.cf = parent.weight(fieldname, text) + self.fl = parent.field_length(fieldname) + + self.qf = qf + self.setup(searcher, fieldname, text) + + def _score(self, weight, length): + return dfree(weight, self.cf, self.qf, length, self.fl) + + +# PL2 model + +rec_log2_of_e = 1.0 / log(2) + + +def pl2(tf, cf, qf, dc, fl, avgfl, c): + # tf - term frequency in the current document + # cf - term frequency in the collection + # qf - term frequency in the query + # dc - doc count + # fl - field length in the current document + # avgfl - average field length across all documents + # c -free parameter + + TF = tf * log(1.0 + (c * avgfl) / fl) + norm = 1.0 / (TF + 1.0) + f = cf / dc + return norm * qf * (TF * log(1.0 / f) + + f * rec_log2_of_e + + 0.5 * log(2 * pi * TF) + + TF * (log(TF) - rec_log2_of_e)) + + +class PL2(WeightingModel): + """Implements the PL2 scoring model from Terrier. + + See http://terrier.org/ + """ + + def __init__(self, c=1.0): + self.c = c + + def scorer(self, searcher, fieldname, text, qf=1): + if not searcher.schema[fieldname].scorable: + return WeightScorer.for_(searcher, fieldname, text) + + return PL2Scorer(searcher, fieldname, text, self.c, qf=qf) + + +class PL2Scorer(WeightLengthScorer): + def __init__(self, searcher, fieldname, text, c, qf=1): + # Total term weight, document count, and average field length are + # global statistics, so get them from the top-level searcher + parent = searcher.get_parent() # Returns self if no parent + self.cf = parent.frequency(fieldname, text) + self.dc = parent.doc_count_all() + self.avgfl = parent.avg_field_length(fieldname) or 1 + + self.c = c + self.qf = qf + self.setup(searcher, fieldname, text) + + def _score(self, weight, length): + return pl2(weight, self.cf, self.qf, self.dc, length, self.avgfl, + self.c) + + +# Simple models + +class Frequency(WeightingModel): + def scorer(self, searcher, fieldname, text, qf=1): + maxweight = searcher.term_info(fieldname, text).max_weight() + return WeightScorer(maxweight) + + +class TF_IDF(WeightingModel): + def scorer(self, searcher, fieldname, text, qf=1): + # IDF is a global statistic, so get it from the top-level searcher + parent = searcher.get_parent() # Returns self if no parent + idf = parent.idf(fieldname, text) + + maxweight = searcher.term_info(fieldname, text).max_weight() + return TF_IDFScorer(maxweight, idf) + + +class TF_IDFScorer(BaseScorer): + def __init__(self, maxweight, idf): + self._maxquality = maxweight * idf + self.idf = idf + + def supports_block_quality(self): + return True + + def score(self, matcher): + return matcher.weight() * self.idf + + def max_quality(self): + return self._maxquality + + def block_quality(self, matcher): + return matcher.block_max_weight() * self.idf + + +# Utility models + +class Weighting(WeightingModel): + """This class provides backwards-compatibility with the old weighting + class architecture, so any existing custom scorers don't need to be + rewritten. 
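A minimal sketch of such a legacy scorer (the class name and the length-normalised formula are illustrative assumptions, not upstream code)::

    from whoosh import scoring

    class WeightOverLength(scoring.Weighting):
        def score(self, searcher, fieldname, text, docnum, weight):
            # Old-style hook: called once per matched document
            length = searcher.doc_field_length(docnum, fieldname, 1)
            return weight / length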
+ """ + + def scorer(self, searcher, fieldname, text, qf=1): + return self.CompatibilityScorer(searcher, fieldname, text, self.score) + + def score(self, searcher, fieldname, text, docnum, weight): + raise NotImplementedError + + class CompatibilityScorer(BaseScorer): + def __init__(self, searcher, fieldname, text, scoremethod): + self.searcher = searcher + self.fieldname = fieldname + self.text = text + self.scoremethod = scoremethod + + def score(self, matcher): + return self.scoremethod(self.searcher, self.fieldname, self.text, + matcher.id(), matcher.weight()) + + +class FunctionWeighting(WeightingModel): + """Uses a supplied function to do the scoring. For simple scoring functions + and experiments this may be simpler to use than writing a full weighting + model class and scorer class. + + The function should accept the arguments + ``searcher, fieldname, text, matcher``. + + For example, the following function will score documents based on the + earliest position of the query term in the document:: + + def pos_score_fn(searcher, fieldname, text, matcher): + poses = matcher.value_as("positions") + return 1.0 / (poses[0] + 1) + + pos_weighting = scoring.FunctionWeighting(pos_score_fn) + with myindex.searcher(weighting=pos_weighting) as s: + results = s.search(q) + + Note that the searcher passed to the function may be a per-segment searcher + for performance reasons. If you want to get global statistics inside the + function, you should use ``searcher.get_parent()`` to get the top-level + searcher. (However, if you are using global statistics, you should probably + write a real model/scorer combo so you can cache them on the object.) + """ + + def __init__(self, fn): + self.fn = fn + + def scorer(self, searcher, fieldname, text, qf=1): + return self.FunctionScorer(self.fn, searcher, fieldname, text, qf=qf) + + class FunctionScorer(BaseScorer): + def __init__(self, fn, searcher, fieldname, text, qf=1): + self.fn = fn + self.searcher = searcher + self.fieldname = fieldname + self.text = text + self.qf = qf + + def score(self, matcher): + return self.fn(self.searcher, self.fieldname, self.text, matcher) + + +class MultiWeighting(WeightingModel): + """Chooses from multiple scoring algorithms based on the field. + """ + + def __init__(self, default, **weightings): + """The only non-keyword argument specifies the default + :class:`Weighting` instance to use. Keyword arguments specify + Weighting instances for specific fields. + + For example, to use ``BM25`` for most fields, but ``Frequency`` for + the ``id`` field and ``TF_IDF`` for the ``keys`` field:: + + mw = MultiWeighting(BM25(), id=Frequency(), keys=TF_IDF()) + + :param default: the Weighting instance to use for fields not + specified in the keyword arguments. + """ + + self.default = default + # Store weighting functions by field name + self.weightings = weightings + + def scorer(self, searcher, fieldname, text, qf=1): + w = self.weightings.get(fieldname, self.default) + return w.scorer(searcher, fieldname, text, qf=qf) + + +class ReverseWeighting(WeightingModel): + """Wraps a weighting object and subtracts the wrapped model's scores from + 0, essentially reversing the weighting model. 
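Sketch of typical use (``myindex`` and the parsed query ``q`` are assumed to exist elsewhere)::

    from whoosh import scoring

    # Lowest-scoring BM25F documents are returned first
    lowest_first = scoring.ReverseWeighting(scoring.BM25F())
    with myindex.searcher(weighting=lowest_first) as s:
        results = s.search(q)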
+    """
+
+    def __init__(self, weighting):
+        self.weighting = weighting
+
+    def scorer(self, searcher, fieldname, text, qf=1):
+        subscorer = self.weighting.scorer(searcher, fieldname, text, qf=qf)
+        return ReverseWeighting.ReverseScorer(subscorer)
+
+    class ReverseScorer(BaseScorer):
+        def __init__(self, subscorer):
+            self.subscorer = subscorer
+
+        def supports_block_quality(self):
+            return self.subscorer.supports_block_quality()
+
+        def score(self, matcher):
+            return 0 - self.subscorer.score(matcher)
+
+        def max_quality(self):
+            return 0 - self.subscorer.max_quality()
+
+        def block_quality(self, matcher):
+            return 0 - self.subscorer.block_quality(matcher)
+
+
+#class PositionWeighting(WeightingModel):
+#    def __init__(self, reversed=False):
+#        self.reversed = reversed
+#
+#    def scorer(self, searcher, fieldname, text, qf=1):
+#        return PositionWeighting.PositionScorer()
+#
+#    class PositionScorer(BaseScorer):
+#        def score(self, matcher):
+#            p = min(span.pos for span in matcher.spans())
+#            if self.reversed:
+#                return p
+#            else:
+#                return 0 - p
diff --git a/src/whoosh/searching.py b/src/whoosh/searching.py
new file mode 100644
index 0000000..4e8cea6
--- /dev/null
+++ b/src/whoosh/searching.py
@@ -0,0 +1,1658 @@
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+"""This module contains classes and functions related to searching the index.
+"""
+
+
+from __future__ import division
+import copy
+import weakref
+from math import ceil
+
+from whoosh import classify, highlight, query, scoring
+from whoosh.compat import iteritems, itervalues, iterkeys, xrange
+from whoosh.idsets import DocIdSet, BitSet
+from whoosh.reading import TermNotFound
+from whoosh.util.cache import lru_cache
+
+
+class NoTermsException(Exception):
+    """Exception raised when you try to access matched terms on a
+    :class:`Results` object that was created without them. To record which
+    terms matched in which document, you need to call the
+    :meth:`Searcher.search` method with ``terms=True``.
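A sketch of the pattern that avoids this exception (``ix`` and ``q`` are assumed to exist; the ``matched_terms()`` call is only meaningful when the search recorded terms)::

    with ix.searcher() as s:
        results = s.search(q, terms=True)
        for hit in results:
            print(hit.matched_terms())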
+    """
+
+    message = "Results were created without recording terms"
+
+
+class TimeLimit(Exception):
+    """Raised by :class:`TimeLimitedCollector` if the time limit is reached
+    before the search finishes. If you have a reference to the collector, you
+    can get partial results by calling :meth:`TimeLimitedCollector.results`.
+    """
+
+    pass
+
+
+# Context class
+
+class SearchContext(object):
+    """A container for information about the current search that may be used
+    by the collector or the query objects to change how they operate.
+    """
+
+    def __init__(self, needs_current=False, weighting=None, top_query=None,
+                 limit=0):
+        """
+        :param needs_current: if True, the search requires that the matcher
+            tree be "valid" and able to access information about the current
+            match. For queries during matcher instantiation, this means they
+            should not instantiate a matcher that doesn't allow access to the
+            current match's value, weight, and so on. For collectors, this
+            means they should advance the matcher doc-by-doc rather than using
+            shortcut methods such as all_ids().
+        :param weighting: the Weighting object to use for scoring documents.
+        :param top_query: a reference to the top-level query object.
+        :param limit: the number of results requested by the user.
+        """
+
+        self.needs_current = needs_current
+        self.weighting = weighting
+        self.top_query = top_query
+        self.limit = limit
+
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self.__dict__)
+
+    def set(self, **kwargs):
+        ctx = copy.copy(self)
+        ctx.__dict__.update(kwargs)
+        return ctx
+
+
+# Searcher class
+
+class Searcher(object):
+    """Wraps an :class:`~whoosh.reading.IndexReader` object and provides
+    methods for searching the index.
+    """
+
+    def __init__(self, reader, weighting=scoring.BM25F, closereader=True,
+                 fromindex=None, parent=None):
+        """
+        :param reader: An :class:`~whoosh.reading.IndexReader` object for
+            the index to search.
+        :param weighting: A :class:`whoosh.scoring.Weighting` object to use to
+            score found documents.
+        :param closereader: Whether the underlying reader will be closed when
+            the searcher is closed.
+        :param fromindex: An optional reference to the index of the underlying
+            reader. This is required for :meth:`Searcher.up_to_date` and
+            :meth:`Searcher.refresh` to work.
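In practice a Searcher is usually obtained from the index rather than constructed directly; a minimal sketch, assuming an index stored in "indexdir"::

    from whoosh import index, scoring

    ix = index.open_dir("indexdir")
    with ix.searcher(weighting=scoring.TF_IDF()) as s:
        print(s.doc_count())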
+ """ + + self.ixreader = reader + self.is_closed = False + self._closereader = closereader + self._ix = fromindex + self._doccount = self.ixreader.doc_count_all() + # Cache for PostingCategorizer objects (supports fields without columns) + self._field_caches = {} + + if parent: + self.parent = weakref.ref(parent) + self.schema = parent.schema + self._idf_cache = parent._idf_cache + self._filter_cache = parent._filter_cache + else: + self.parent = None + self.schema = self.ixreader.schema + self._idf_cache = {} + self._filter_cache = {} + + if type(weighting) is type: + self.weighting = weighting() + else: + self.weighting = weighting + + self.leafreaders = None + self.subsearchers = None + if not self.ixreader.is_atomic(): + self.leafreaders = self.ixreader.leaf_readers() + self.subsearchers = [(self._subsearcher(r), offset) for r, offset + in self.leafreaders] + + # Copy attributes/methods from wrapped reader + for name in ("stored_fields", "all_stored_fields", "has_vector", + "vector", "vector_as", "lexicon", "field_terms", + "frequency", "doc_frequency", "term_info", + "doc_field_length", "corrector", "iter_docs"): + setattr(self, name, getattr(self.ixreader, name)) + + def __enter__(self): + return self + + def __exit__(self, *exc_info): + self.close() + + def _subsearcher(self, reader): + return self.__class__(reader, fromindex=self._ix, + weighting=self.weighting, parent=self) + + def _offset_for_subsearcher(self, subsearcher): + for ss, offset in self.subsearchers: + if ss is subsearcher: + return offset + + def leaf_searchers(self): + if self.is_atomic(): + return [(self, 0)] + else: + return self.subsearchers + + def is_atomic(self): + return self.reader().is_atomic() + + def has_parent(self): + return self.parent is not None + + def get_parent(self): + """Returns the parent of this searcher (if has_parent() is True), or + else self. + """ + + if self.has_parent(): + # Call the weak reference to get the parent searcher + return self.parent() + else: + return self + + def doc_count(self): + """Returns the number of UNDELETED documents in the index. + """ + + return self.ixreader.doc_count() + + def doc_count_all(self): + """Returns the total number of documents, DELETED OR UNDELETED, in + the index. + """ + + return self._doccount + + def field_length(self, fieldname): + if self.has_parent(): + return self.get_parent().field_length(fieldname) + else: + return self.reader().field_length(fieldname) + + def max_field_length(self, fieldname): + if self.has_parent(): + return self.get_parent().max_field_length(fieldname) + else: + return self.reader().max_field_length(fieldname) + + def up_to_date(self): + """Returns True if this Searcher represents the latest version of the + index, for backends that support versioning. + """ + + if not self._ix: + raise Exception("No reference to index") + return self._ix.latest_generation() == self.ixreader.generation() + + def refresh(self): + """Returns a fresh searcher for the latest version of the index:: + + my_searcher = my_searcher.refresh() + + If the index has not changed since this searcher was created, this + searcher is simply returned. + + This method may CLOSE underlying resources that are no longer needed + by the refreshed searcher, so you CANNOT continue to use the original + searcher after calling ``refresh()`` on it. 
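A sketch of the usual pattern in a long-running process that keeps a searcher open::

    # Cheap no-op if the index generation has not changed
    if not searcher.up_to_date():
        searcher = searcher.refresh()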
+ """ + + if not self._ix: + raise Exception("No reference to index") + if self._ix.latest_generation() == self.reader().generation(): + return self + + # Get a new reader, re-using resources from the current reader if + # possible + self.is_closed = True + newreader = self._ix.reader(reuse=self.ixreader) + return self.__class__(newreader, fromindex=self._ix, + weighting=self.weighting) + + def close(self): + if self._closereader: + self.ixreader.close() + self.is_closed = True + + def avg_field_length(self, fieldname, default=None): + if not self.schema[fieldname].scorable: + return default + return self.field_length(fieldname) / (self._doccount or 1) + + def reader(self): + """Returns the underlying :class:`~whoosh.reading.IndexReader`. + """ + return self.ixreader + + def context(self, **kwargs): + """Generates a :class:`SearchContext` for this searcher. + """ + + if "weighting" not in kwargs: + kwargs["weighting"] = self.weighting + + return SearchContext(**kwargs) + + def boolean_context(self): + """Shortcut returns a SearchContext set for unscored (boolean) + searching. + """ + + return self.context(needs_current=False, weighting=None) + + def postings(self, fieldname, text, weighting=None, qf=1): + """Returns a :class:`whoosh.matching.Matcher` for the postings of the + given term. Unlike the :func:`whoosh.reading.IndexReader.postings` + method, this method automatically sets the scoring functions on the + matcher from the searcher's weighting object. + """ + + weighting = weighting or self.weighting + globalscorer = weighting.scorer(self, fieldname, text, qf=qf) + + if self.is_atomic(): + return self.ixreader.postings(fieldname, text, scorer=globalscorer) + else: + from whoosh.matching import MultiMatcher + + matchers = [] + docoffsets = [] + term = (fieldname, text) + for subsearcher, offset in self.subsearchers: + r = subsearcher.reader() + if term in r: + # Make a segment-specific scorer; the scorer should call + # searcher.parent() to get global stats + scorer = weighting.scorer(subsearcher, fieldname, text, qf=qf) + m = r.postings(fieldname, text, scorer=scorer) + matchers.append(m) + docoffsets.append(offset) + + if not matchers: + raise TermNotFound(fieldname, text) + + return MultiMatcher(matchers, docoffsets, globalscorer) + + def idf(self, fieldname, text): + """Calculates the Inverse Document Frequency of the current term (calls + idf() on the searcher's Weighting object). + """ + + # This method just calls the Weighting object's idf() method, but + # caches the result. So Weighting objects should call *this* method + # which will then call *their own* idf() methods. + + cache = self._idf_cache + term = (fieldname, text) + if term in cache: + return cache[term] + + idf = self.weighting.idf(self, fieldname, text) + cache[term] = idf + return idf + + def document(self, **kw): + """Convenience method returns the stored fields of a document + matching the given keyword arguments, where the keyword keys are + field names and the values are terms that must appear in the field. + + This method is equivalent to:: + + searcher.stored_fields(searcher.document_number()) + + Where Searcher.documents() returns a generator, this function returns + either a dictionary or None. Use it when you assume the given keyword + arguments either match zero or one documents (i.e. at least one of the + fields is a unique key). + + >>> stored_fields = searcher.document(path=u"/a/b") + >>> if stored_fields: + ... print(stored_fields['title']) + ... else: + ... 
print("There is no document with the path /a/b") + """ + + for p in self.documents(**kw): + return p + + def documents(self, **kw): + """Convenience method returns the stored fields of a document + matching the given keyword arguments, where the keyword keys are field + names and the values are terms that must appear in the field. + + Returns a generator of dictionaries containing the stored fields of any + documents matching the keyword arguments. If you do not specify any + arguments (``Searcher.documents()``), this method will yield **all** + documents. + + >>> for stored_fields in searcher.documents(emailto=u"matt@whoosh.ca"): + ... print("Email subject:", stored_fields['subject']) + """ + + ixreader = self.ixreader + return (ixreader.stored_fields(docnum) + for docnum in self.document_numbers(**kw)) + + def _kw_to_text(self, kw): + for k, v in iteritems(kw): + field = self.schema[k] + kw[k] = field.to_bytes(v) + + def _query_for_kw(self, kw): + subqueries = [] + for key, value in iteritems(kw): + subqueries.append(query.Term(key, value)) + if subqueries: + q = query.And(subqueries).normalize() + else: + q = query.Every() + return q + + def document_number(self, **kw): + """Returns the document number of the document matching the given + keyword arguments, where the keyword keys are field names and the + values are terms that must appear in the field. + + >>> docnum = searcher.document_number(path=u"/a/b") + + Where Searcher.document_numbers() returns a generator, this function + returns either an int or None. Use it when you assume the given keyword + arguments either match zero or one documents (i.e. at least one of the + fields is a unique key). + + :rtype: int + """ + + # In the common case where only one keyword was given, just use + # first_id() instead of building a query. + + self._kw_to_text(kw) + if len(kw) == 1: + k, v = list(kw.items())[0] + try: + return self.reader().first_id(k, v) + except TermNotFound: + return None + else: + m = self._query_for_kw(kw).matcher(self, self.boolean_context()) + if m.is_active(): + return m.id() + + def document_numbers(self, **kw): + """Returns a generator of the document numbers for documents matching + the given keyword arguments, where the keyword keys are field names and + the values are terms that must appear in the field. If you do not + specify any arguments (``Searcher.document_numbers()``), this method + will yield **all** document numbers. 
+
+        >>> docnums = list(searcher.document_numbers(emailto="matt@whoosh.ca"))
+        """
+
+        self._kw_to_text(kw)
+        return self.docs_for_query(self._query_for_kw(kw))
+
+    def _find_unique(self, uniques):
+        # uniques is a list of ("unique_field_name", "field_value") tuples
+        delset = set()
+        for name, value in uniques:
+            docnum = self.document_number(**{name: value})
+            if docnum is not None:
+                delset.add(docnum)
+        return delset
+
+    @lru_cache(20)
+    def _query_to_comb(self, fq):
+        return BitSet(self.docs_for_query(fq), size=self.doc_count_all())
+
+    def _filter_to_comb(self, obj):
+        if obj is None:
+            return None
+        if isinstance(obj, (set, DocIdSet)):
+            c = obj
+        elif isinstance(obj, Results):
+            c = obj.docs()
+        elif isinstance(obj, ResultsPage):
+            c = obj.results.docs()
+        elif isinstance(obj, query.Query):
+            c = self._query_to_comb(obj)
+        else:
+            raise Exception("Don't know what to do with filter object %r"
+                            % obj)
+
+        return c
+
+    def suggest(self, fieldname, text, limit=5, maxdist=2, prefix=0):
+        """Returns a sorted list of suggested corrections for the given
+        mis-typed word ``text`` based on the contents of the given field::
+
+            >>> searcher.suggest("content", "specail")
+            ["special"]
+
+        This is a convenience method. If you are planning to get suggestions
+        for multiple words in the same field, it is more efficient to get a
+        :class:`~whoosh.spelling.Corrector` object and use it directly::
+
+            corrector = searcher.corrector("fieldname")
+            for word in words:
+                print(corrector.suggest(word))
+
+        :param limit: only return up to this many suggestions. If there are not
+            enough terms in the field within ``maxdist`` of the given word, the
+            returned list will be shorter than this number.
+        :param maxdist: the largest edit distance from the given word to look
+            at. Numbers higher than 2 are not very effective or efficient.
+        :param prefix: require suggestions to share a prefix of this length
+            with the given word. This is often justifiable since most
+            misspellings do not involve the first letter of the word. Using a
+            prefix dramatically decreases the time it takes to generate the
+            list of words.
+        """
+
+        c = self.reader().corrector(fieldname)
+        return c.suggest(text, limit=limit, maxdist=maxdist, prefix=prefix)
+
+    def key_terms(self, docnums, fieldname, numterms=5,
+                  model=classify.Bo1Model, normalize=True):
+        """Returns the 'numterms' most important terms from the documents
+        listed (by number) in 'docnums'. You can get document numbers for the
+        documents you're interested in with the document_number() and
+        document_numbers() methods.
+
+        "Most important" is generally defined as terms that occur frequently in
+        the top hits but relatively infrequently in the collection as a whole.
+
+        >>> docnum = searcher.document_number(path=u"/a/b")
+        >>> keywords_and_scores = searcher.key_terms([docnum], "content")
+
+        This method returns a list of ("term", score) tuples. The score may be
+        useful if you want to know the "strength" of the key terms; to get just
+        the terms themselves you can do this:
+
+        >>> kws = [kw for kw, score in searcher.key_terms([docnum], "content")]
+
+        :param fieldname: Look at the terms in this field. This field must
+            store vectors.
+        :param docnums: A sequence of document numbers specifying which
+            documents to extract key terms from.
+        :param numterms: Return this number of important terms.
+        :param model: The classify.ExpansionModel to use. See the classify
+            module.
+        :param normalize: normalize the scores.
+        :returns: a list of ("term", score) tuples.
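A sketch of feeding the key terms back into a follow-up query (this mirrors what ``more_like()`` does internally; the "content" field and ``docnum`` are assumed to exist)::

    from whoosh import query

    kts = searcher.key_terms([docnum], "content", numterms=5)
    q = query.Or([query.Term("content", word, boost=weight)
                  for word, weight in kts])
    similar = searcher.search(q, limit=10)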
+        """
+
+        expander = classify.Expander(self.ixreader, fieldname, model=model)
+        for docnum in docnums:
+            expander.add_document(docnum)
+        return expander.expanded_terms(numterms, normalize=normalize)
+
+    def key_terms_from_text(self, fieldname, text, numterms=5,
+                            model=classify.Bo1Model, normalize=True):
+        """Return the 'numterms' most important terms from the given text.
+
+        :param numterms: Return this number of important terms.
+        :param model: The classify.ExpansionModel to use. See the classify
+            module.
+        """
+
+        expander = classify.Expander(self.ixreader, fieldname, model=model)
+        expander.add_text(text)
+        return expander.expanded_terms(numterms, normalize=normalize)
+
+    def more_like(self, docnum, fieldname, text=None, top=10, numterms=5,
+                  model=classify.Bo1Model, normalize=False, filter=None):
+        """Returns a :class:`Results` object containing documents similar to
+        the given document, based on "key terms" in the given field::
+
+            # Get the ID for the document you're interested in
+            docnum = searcher.document_number(path=u"/a/b/c")
+
+            r = searcher.more_like(docnum)
+
+            print("Documents like", searcher.stored_fields(docnum)["title"])
+            for hit in r:
+                print(hit["title"])
+
+        :param fieldname: the name of the field to use to test similarity.
+        :param text: by default, the method will attempt to load the contents
+            of the field from the stored fields for the document, or from a
+            term vector. If the field isn't stored or vectored in the index,
+            but you have access to the text another way (for example, loading
+            from a file or a database), you can supply it using the ``text``
+            parameter.
+        :param top: the number of results to return.
+        :param numterms: the number of "key terms" to extract from the hit and
+            search for. Using more terms is slower but potentially gives more,
+            and more accurate, results.
+        :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use
+            to compute "key terms".
+        :param normalize: whether to normalize term weights.
+        :param filter: a query, Results object, or set of docnums. The results
+            will only contain documents that are also in the filter object.
+        """
+
+        if text:
+            kts = self.key_terms_from_text(fieldname, text, numterms=numterms,
+                                           model=model, normalize=normalize)
+        else:
+            kts = self.key_terms([docnum], fieldname, numterms=numterms,
+                                 model=model, normalize=normalize)
+        # Create an Or query from the key terms
+        q = query.Or([query.Term(fieldname, word, boost=weight)
+                      for word, weight in kts])
+
+        return self.search(q, limit=top, filter=filter, mask=set([docnum]))
+
+    def search_page(self, query, pagenum, pagelen=10, **kwargs):
+        """This method is like the :meth:`Searcher.search` method, but returns
+        a :class:`ResultsPage` object. This is a convenience function for
+        getting a certain "page" of the results for the given query, which is
+        often useful in web search interfaces.
+
+        For example::
+
+            querystring = request.get("q")
+            query = queryparser.parse("content", querystring)
+
+            pagenum = int(request.get("page", 1))
+            pagelen = int(request.get("perpage", 10))
+
+            results = searcher.search_page(query, pagenum, pagelen=pagelen)
+            print("Page %d of %d" % (results.pagenum, results.pagecount))
+            print("Showing results %d-%d of %d"
+                  % (results.offset + 1, results.offset + results.pagelen + 1,
+                     len(results)))
+            for hit in results:
+                print("%d: %s" % (hit.rank + 1, hit["title"]))
+
+        (Note that results.pagelen might be less than the pagelen argument if
+        there aren't enough results to fill a page.)
+
+        Any additional keyword arguments you supply are passed through to
+        :meth:`Searcher.search`. For example, you can get paged results of a
+        sorted search::
+
+            results = searcher.search_page(q, 2, sortedby="date", reverse=True)
+
+        Currently, searching for page 100 with pagelen of 10 takes the same
+        amount of time as using :meth:`Searcher.search` to find the first 1000
+        results. That is, this method does not have any special optimizations
+        or efficiencies for getting a page from the middle of the full results
+        list. (A future enhancement may allow using previous page results to
+        improve the efficiency of finding the next page.)
+
+        This method will raise a ``ValueError`` if you ask for a page number
+        higher than the number of pages in the results.
+
+        :param query: the :class:`whoosh.query.Query` object to match.
+        :param pagenum: the page number to retrieve, starting at ``1`` for the
+            first page.
+        :param pagelen: the number of results per page.
+        :returns: :class:`ResultsPage`
+        """
+
+        if pagenum < 1:
+            raise ValueError("pagenum must be >= 1")
+
+        results = self.search(query, limit=pagenum * pagelen, **kwargs)
+        return ResultsPage(results, pagenum, pagelen)
+
+    def find(self, defaultfield, querystring, **kwargs):
+        from whoosh.qparser import QueryParser
+        qp = QueryParser(defaultfield, schema=self.ixreader.schema)
+        q = qp.parse(querystring)
+        return self.search(q, **kwargs)
+
+    def docs_for_query(self, q, for_deletion=False):
+        """Returns an iterator of document numbers for documents matching the
+        given :class:`whoosh.query.Query` object.
+        """
+
+        # If we're getting the document numbers so we can delete them, use the
+        # deletion_docs method instead of docs; this lets special queries
+        # (e.g. nested queries) override what gets deleted
+        if for_deletion:
+            method = q.deletion_docs
+        else:
+            method = q.docs
+
+        if self.subsearchers:
+            for s, offset in self.subsearchers:
+                for docnum in method(s):
+                    yield docnum + offset
+        else:
+            for docnum in method(self):
+                yield docnum
+
+    def collector(self, limit=10, sortedby=None, reverse=False, groupedby=None,
+                  collapse=None, collapse_limit=1, collapse_order=None,
+                  optimize=True, filter=None, mask=None, terms=False,
+                  maptype=None, scored=True):
+        """Low-level method: returns a configured
+        :class:`whoosh.collectors.Collector` object based on the given
+        arguments. You can use this object with
+        :meth:`Searcher.search_with_collector` to search.
+
+        See the documentation for the :meth:`Searcher.search` method for a
+        description of the parameters.
+
+        This method may be useful to get a basic collector object and then wrap
+        it with another collector from ``whoosh.collectors`` or with a custom
+        collector of your own::
+
+            # Equivalent of
+            #   results = mysearcher.search(myquery, limit=10)
+            # but with a time limit...
+ + # Create a TopCollector + c = mysearcher.collector(limit=10) + + # Wrap it with a TimeLimitedCollector with a time limit of + # 10.5 seconds + from whoosh.collectors import TimeLimitedCollector + c = TimeLimitCollector(c, 10.5) + + # Search using the custom collector + results = mysearcher.search_with_collector(myquery, c) + """ + + from whoosh import collectors + + if limit is not None and limit < 1: + raise ValueError("limit must be >= 1") + + if not scored and not sortedby: + c = collectors.UnsortedCollector() + elif sortedby: + c = collectors.SortingCollector(sortedby, limit=limit, + reverse=reverse) + elif groupedby or reverse or not limit or limit >= self.doc_count(): + # A collector that gathers every matching document + c = collectors.UnlimitedCollector(reverse=reverse) + else: + # A collector that uses block quality optimizations and a heap + # queue to only collect the top N documents + c = collectors.TopCollector(limit, usequality=optimize) + + if groupedby: + c = collectors.FacetCollector(c, groupedby, maptype=maptype) + if terms: + c = collectors.TermsCollector(c) + if collapse: + c = collectors.CollapseCollector(c, collapse, limit=collapse_limit, + order=collapse_order) + + # Filtering wraps last so it sees the docs first + if filter or mask: + c = collectors.FilterCollector(c, filter, mask) + return c + + def search(self, q, **kwargs): + """Runs a :class:`whoosh.query.Query` object on this searcher and + returns a :class:`Results` object. See :doc:`/searching` for more + information. + + This method takes many keyword arguments (documented below). + + See :doc:`/facets` for information on using ``sortedby`` and/or + ``groupedby``. See :ref:`collapsing` for more information on using + ``collapse``, ``collapse_limit``, and ``collapse_order``. + + :param query: a :class:`whoosh.query.Query` object to use to match + documents. + :param limit: the maximum number of documents to score. If you're only + interested in the top N documents, you can set limit=N to limit the + scoring for a faster search. Default is 10. + :param scored: whether to score the results. Overriden by ``sortedby``. + If both ``scored=False`` and ``sortedby=None``, the results will be + in arbitrary order, but will usually be computed faster than + scored or sorted results. + :param sortedby: see :doc:`/facets`. + :param reverse: Reverses the direction of the sort. Default is False. + :param groupedby: see :doc:`/facets`. + :param optimize: use optimizations to get faster results when possible. + Default is True. + :param filter: a query, Results object, or set of docnums. The results + will only contain documents that are also in the filter object. + :param mask: a query, Results object, or set of docnums. The results + will not contain any documents that are in the mask object. + :param terms: if True, record which terms were found in each matching + document. See :doc:`/searching` for more information. Default is + False. + :param maptype: by default, the results of faceting with ``groupedby`` + is a dictionary mapping group names to ordered lists of document + numbers in the group. You can pass a + :class:`whoosh.sorting.FacetMap` subclass to this keyword argument + to specify a different (usually faster) method for grouping. For + example, ``maptype=sorting.Count`` would store only the count of + documents in each group, instead of the full list of document IDs. + :param collapse: a :doc:`facet ` to use to collapse the + results. See :ref:`collapsing` for more information. 
+ :param collapse_limit: the maximum number of documents to allow with + the same collapse key. See :ref:`collapsing` for more information. + :param collapse_order: an optional ordering :doc:`facet ` + to control which documents are kept when collapsing. The default + (``collapse_order=None``) uses the results order (e.g. the highest + scoring documents in a scored search). + :rtype: :class:`Results` + """ + + # Call the collector() method to build a collector based on the + # parameters passed to this method + c = self.collector(**kwargs) + # Call the lower-level method to run the collector + self.search_with_collector(q, c) + # Return the results object from the collector + return c.results() + + def search_with_collector(self, q, collector, context=None): + """Low-level method: runs a :class:`whoosh.query.Query` object on this + searcher using the given :class:`whoosh.collectors.Collector` object + to collect the results:: + + myquery = query.Term("content", "cabbage") + + uc = collectors.UnlimitedCollector() + tc = TermsCollector(uc) + + mysearcher.search_with_collector(myquery, tc) + print(tc.docterms) + print(tc.results()) + + Note that this method does not return a :class:`Results` object. You + need to access the collector to get a results object or other + information the collector might hold after the search. + + :param q: a :class:`whoosh.query.Query` object to use to match + documents. + :param collector: a :class:`whoosh.collectors.Collector` object to feed + the results into. + """ + + # Get the search context object from the searcher + context = context or self.context() + # Allow collector to set up based on the top-level information + collector.prepare(self, q, context) + + collector.run() + + def correct_query(self, q, qstring, correctors=None, terms=None, maxdist=2, + prefix=0, aliases=None): + """ + Returns a corrected version of the given user query using a default + :class:`whoosh.spelling.ReaderCorrector`. + + The default: + + * Corrects any words that don't appear in the index. + + * Takes suggestions from the words in the index. To make certain fields + use custom correctors, use the ``correctors`` argument to pass a + dictionary mapping field names to :class:`whoosh.spelling.Corrector` + objects. + + * ONLY CORRECTS FIELDS THAT HAVE THE ``spelling`` ATTRIBUTE in the + schema (or for which you pass a custom corrector). To automatically + check all fields, use ``allfields=True``. Spell checking fields + without ``spelling`` is slower. + + Expert users who want more sophisticated correction behavior can create + a custom :class:`whoosh.spelling.QueryCorrector` and use that instead + of this method. + + Returns a :class:`whoosh.spelling.Correction` object with a ``query`` + attribute containing the corrected :class:`whoosh.query.Query` object + and a ``string`` attributes containing the corrected query string. + + >>> from whoosh import qparser, highlight + >>> qtext = 'mary "litle lamb"' + >>> q = qparser.QueryParser("text", myindex.schema) + >>> mysearcher = myindex.searcher() + >>> correction = mysearcher().correct_query(q, qtext) + >>> correction.query + + >>> correction.string + 'mary "little lamb"' + >>> mysearcher.close() + + You can use the ``Correction`` object's ``format_string`` method to + format the corrected query string using a + :class:`whoosh.highlight.Formatter` object. For example, you can format + the corrected string as HTML, emphasizing the changed words. 
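A sketch of combining ``collector()`` and ``search_with_collector()`` above into a time-limited search; it assumes ``whoosh.collectors`` provides a ``TimeLimitCollector`` wrapper and a ``TimeLimit`` exception, and that ``searcher`` and ``myquery`` already exist as in the examples above::

    from whoosh.collectors import TimeLimitCollector, TimeLimit

    c = searcher.collector(limit=10)
    tlc = TimeLimitCollector(c, timelimit=10.5)
    try:
        searcher.search_with_collector(myquery, tlc)
    except TimeLimit:
        # The time limit was hit; the collector keeps whatever it
        # matched before being interrupted
        pass
    results = tlc.results()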
+ + >>> hf = highlight.HtmlFormatter(classname="change") + >>> correction.format_string(hf) + 'mary "little lamb"' + + :param q: the :class:`whoosh.query.Query` object to correct. + :param qstring: the original user query from which the query object was + created. You can pass None instead of a string, in which the + second item in the returned tuple will also be None. + :param correctors: an optional dictionary mapping fieldnames to + :class:`whoosh.spelling.Corrector` objects. By default, this method + uses the contents of the index to spell check the terms in the + query. You can use this argument to "override" some fields with a + different correct, for example a + :class:`whoosh.spelling.GraphCorrector`. + :param terms: a sequence of ``("fieldname", "text")`` tuples to correct + in the query. By default, this method corrects terms that don't + appear in the index. You can use this argument to override that + behavior and explicitly specify the terms that should be corrected. + :param maxdist: the maximum number of "edits" (insertions, deletions, + subsitutions, or transpositions of letters) allowed between the + original word and any suggestion. Values higher than ``2`` may be + slow. + :param prefix: suggested replacement words must share this number of + initial characters with the original word. Increasing this even to + just ``1`` can dramatically speed up suggestions, and may be + justifiable since spellling mistakes rarely involve the first + letter of a word. + :param aliases: an optional dictionary mapping field names in the query + to different field names to use as the source of spelling + suggestions. The mappings in ``correctors`` are applied after this. + :rtype: :class:`whoosh.spelling.Correction` + """ + + reader = self.reader() + + # Dictionary of field name alias mappings + if aliases is None: + aliases = {} + # Dictionary of custom per-field correctors + if correctors is None: + correctors = {} + + # Remap correctors dict according to aliases + d = {} + for fieldname, corr in iteritems(correctors): + fieldname = aliases.get(fieldname, fieldname) + d[fieldname] = corr + correctors = d + + # Fill in default corrector objects for fields that don't have a custom + # one in the "correctors" dictionary + fieldnames = self.schema.names() + for fieldname in fieldnames: + fieldname = aliases.get(fieldname, fieldname) + if fieldname not in correctors: + correctors[fieldname] = self.reader().corrector(fieldname) + + # Get any missing terms in the query in the fields we're correcting + if terms is None: + terms = [] + for token in q.all_tokens(): + aname = aliases.get(token.fieldname, token.fieldname) + text = token.text + if aname in correctors and (aname, text) not in reader: + # Note that we use the original, not aliases fieldname here + # so if we correct the query we know what it was + terms.append((token.fieldname, token.text)) + + # Make q query corrector + from whoosh import spelling + sqc = spelling.SimpleQueryCorrector(correctors, terms, aliases) + return sqc.correct_query(q, qstring) + + +class Results(object): + """This object is returned by a Searcher. This object represents the + results of a search query. You can mostly use it as if it was a list of + dictionaries, where each dictionary is the stored fields of the document at + that position in the results. + + Note that a Results object keeps a reference to the Searcher that created + it, so keeping a reference to a Results object keeps the Searcher alive and + so keeps all files used by it open. 
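A hedged end-to-end sketch of the ``correct_query`` flow described above, as it might appear in a "did you mean" feature; the index ``ix`` and the ``text`` field are assumptions::

    from whoosh import qparser

    qstring = u'mary "litle lamb"'
    qp = qparser.QueryParser("text", ix.schema)
    q = qp.parse(qstring)

    with ix.searcher() as s:
        corrected = s.correct_query(q, qstring, prefix=1, maxdist=1)
        if corrected.string != qstring:
            print("Did you mean: %s?" % corrected.string)
        # Search with the corrected query object
        results = s.search(corrected.query, limit=10)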
+ """ + + def __init__(self, searcher, q, top_n, docset=None, facetmaps=None, + runtime=0, highlighter=None): + """ + :param searcher: the :class:`Searcher` object that produced these + results. + :param query: the original query that created these results. + :param top_n: a list of (score, docnum) tuples representing the top + N search results. + """ + + self.searcher = searcher + self.q = q + self.top_n = top_n + self.docset = docset + self._facetmaps = facetmaps or {} + self.runtime = runtime + self.highlighter = highlighter or highlight.Highlighter() + self.collector = None + self._total = None + self._char_cache = {} + + def __repr__(self): + return "" % (len(self.top_n), + self.q, + self.runtime) + + def __len__(self): + """Returns the total number of documents that matched the query. Note + this may be more than the number of scored documents, given the value + of the ``limit`` keyword argument to :meth:`Searcher.search`. + + If this Results object was created by searching with a ``limit`` + keyword, then computing the exact length of the result set may be + expensive for large indexes or large result sets. You may consider + using :meth:`Results.has_exact_length`, + :meth:`Results.estimated_length`, and + :meth:`Results.estimated_min_length` to display an estimated size of + the result set instead of an exact number. + """ + + if self._total is None: + self._total = self.collector.count() + return self._total + + def __getitem__(self, n): + if isinstance(n, slice): + start, stop, step = n.indices(len(self.top_n)) + return [Hit(self, self.top_n[i][1], i, self.top_n[i][0]) + for i in xrange(start, stop, step)] + else: + if n >= len(self.top_n): + raise IndexError("results[%r]: Results only has %s hits" + % (n, len(self.top_n))) + return Hit(self, self.top_n[n][1], n, self.top_n[n][0]) + + def __iter__(self): + """Yields a :class:`Hit` object for each result in ranked order. + """ + + for i in xrange(len(self.top_n)): + yield Hit(self, self.top_n[i][1], i, self.top_n[i][0]) + + def __contains__(self, docnum): + """Returns True if the given document number matched the query. + """ + + return docnum in self.docs() + + def __nonzero__(self): + return not self.is_empty() + + __bool__ = __nonzero__ + + def is_empty(self): + """Returns True if not documents matched the query. + """ + + return self.scored_length() == 0 + + def items(self): + """Returns an iterator of (docnum, score) pairs for the scored + documents in the results. + """ + + return ((docnum, score) for score, docnum in self.top_n) + + def fields(self, n): + """Returns the stored fields for the document at the ``n`` th position + in the results. Use :meth:`Results.docnum` if you want the raw + document number instead of the stored fields. + """ + + return self.searcher.stored_fields(self.top_n[n][1]) + + def facet_names(self): + """Returns the available facet names, for use with the ``groups()`` + method. + """ + + return self._facetmaps.keys() + + def groups(self, name=None): + """If you generated facet groupings for the results using the + `groupedby` keyword argument to the ``search()`` method, you can use + this method to retrieve the groups. You can use the ``facet_names()`` + method to get the list of available facet names. 
+ + >>> results = searcher.search(my_query, groupedby=["tag", "price"]) + >>> results.facet_names() + ["tag", "price"] + >>> results.groups("tag") + {"new": [12, 1, 4], "apple": [3, 10, 5], "search": [11]} + + If you only used one facet, you can call the method without a facet + name to get the groups for the facet. + + >>> results = searcher.search(my_query, groupedby="tag") + >>> results.groups() + {"new": [12, 1, 4], "apple": [3, 10, 5, 0], "search": [11]} + + By default, this returns a dictionary mapping category names to a list + of document numbers, in the same relative order as they appear in the + results. + + >>> results = mysearcher.search(myquery, groupedby="tag") + >>> docnums = results.groups() + >>> docnums['new'] + [12, 1, 4] + + You can then use :meth:`Searcher.stored_fields` to get the stored + fields associated with a document ID. + + If you specified a different ``maptype`` for the facet when you + searched, the values in the dictionary depend on the + :class:`whoosh.sorting.FacetMap`. + + >>> myfacet = sorting.FieldFacet("tag", maptype=sorting.Count) + >>> results = mysearcher.search(myquery, groupedby=myfacet) + >>> counts = results.groups() + {"new": 3, "apple": 4, "search": 1} + """ + + if (name is None or name == "facet") and len(self._facetmaps) == 1: + # If there's only one facet, just use it; convert keys() to list + # for Python 3 + name = list(self._facetmaps.keys())[0] + elif name not in self._facetmaps: + raise KeyError("%r not in facet names %r" + % (name, self.facet_names())) + return self._facetmaps[name].as_dict() + + def has_exact_length(self): + """Returns True if this results object already knows the exact number + of matching documents. + """ + + if self.collector: + return self.collector.computes_count() + else: + return self._total is not None + + def estimated_length(self): + """The estimated maximum number of matching documents, or the + exact number of matching documents if it's known. + """ + + if self.has_exact_length(): + return len(self) + else: + return self.q.estimate_size(self.searcher.reader()) + + def estimated_min_length(self): + """The estimated minimum number of matching documents, or the + exact number of matching documents if it's known. + """ + + if self.has_exact_length(): + return len(self) + else: + return self.q.estimate_min_size(self.searcher.reader()) + + def scored_length(self): + """Returns the number of scored documents in the results, equal to or + less than the ``limit`` keyword argument to the search. + + >>> r = mysearcher.search(myquery, limit=20) + >>> len(r) + 1246 + >>> r.scored_length() + 20 + + This may be fewer than the total number of documents that match the + query, which is what ``len(Results)`` returns. + """ + + return len(self.top_n) + + def docs(self): + """Returns a set-like object containing the document numbers that + matched the query. + """ + + if self.docset is None: + self.docset = set(self.collector.all_ids()) + return self.docset + + def copy(self): + """Returns a deep copy of this results object. + """ + + # Shallow copy self to get attributes + r = copy.copy(self) + # Deep copies of docset and top_n in case they're modified + r.docset = copy.deepcopy(self.docset) + r.top_n = copy.deepcopy(self.top_n) + return r + + def score(self, n): + """Returns the score for the document at the Nth position in the list + of ranked documents. If the search was not scored, this may return + None. 
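Following the hints in ``__len__`` and the ``estimated_*`` methods above, a result count can be displayed without forcing a possibly expensive exact count; ``results`` is assumed to come from a prior ``search()`` call::

    if results.has_exact_length():
        print("%d documents matched" % len(results))
    else:
        low = results.estimated_min_length()
        high = results.estimated_length()
        print("Roughly %d-%d documents matched" % (low, high))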
+ """ + + return self.top_n[n][0] + + def docnum(self, n): + """Returns the document number of the result at position n in the list + of ranked documents. + """ + return self.top_n[n][1] + + def query_terms(self, expand=False, fieldname=None): + return self.q.existing_terms(self.searcher.reader(), + fieldname=fieldname, expand=expand) + + def has_matched_terms(self): + """Returns True if the search recorded which terms matched in which + documents. + + >>> r = searcher.search(myquery) + >>> r.has_matched_terms() + False + >>> + """ + + return hasattr(self, "docterms") and hasattr(self, "termdocs") + + def matched_terms(self): + """Returns the set of ``("fieldname", "text")`` tuples representing + terms from the query that matched one or more of the TOP N documents + (this does not report terms for documents that match the query but did + not score high enough to make the top N results). You can compare this + set to the terms from the original query to find terms which didn't + occur in any matching documents. + + This is only valid if you used ``terms=True`` in the search call to + record matching terms. Otherwise it will raise an exception. + + >>> q = myparser.parse("alfa OR bravo OR charlie") + >>> results = searcher.search(q, terms=True) + >>> results.terms() + set([("content", "alfa"), ("content", "charlie")]) + >>> q.all_terms() - results.terms() + set([("content", "bravo")]) + """ + + if not self.has_matched_terms(): + raise NoTermsException + return set(self.termdocs.keys()) + + def _get_fragmenter(self): + return self.highlighter.fragmenter + + def _set_fragmenter(self, f): + self.highlighter.fragmenter = f + + fragmenter = property(_get_fragmenter, _set_fragmenter) + + def _get_formatter(self): + return self.highlighter.formatter + + def _set_formatter(self, f): + self.highlighter.formatter = f + + formatter = property(_get_formatter, _set_formatter) + + def _get_scorer(self): + return self.highlighter.scorer + + def _set_scorer(self, s): + self.highlighter.scorer = s + + scorer = property(_get_scorer, _set_scorer) + + def _get_order(self): + return self.highlighter.order + + def _set_order(self, o): + self.highlighter.order = o + + order = property(_get_order, _set_order) + + def key_terms(self, fieldname, docs=10, numterms=5, + model=classify.Bo1Model, normalize=True): + """Returns the 'numterms' most important terms from the top 'docs' + documents in these results. "Most important" is generally defined as + terms that occur frequently in the top hits but relatively infrequently + in the collection as a whole. + + :param fieldname: Look at the terms in this field. This field must + store vectors. + :param docs: Look at this many of the top documents of the results. + :param numterms: Return this number of important terms. + :param model: The classify.ExpansionModel to use. See the classify + module. + :returns: list of unicode strings. + """ + + if not len(self): + return [] + docs = min(docs, len(self)) + + reader = self.searcher.reader() + + expander = classify.Expander(reader, fieldname, model=model) + for _, docnum in self.top_n[:docs]: + expander.add_document(docnum) + + return expander.expanded_terms(numterms, normalize=normalize) + + def extend(self, results): + """Appends hits from 'results' (that are not already in this + results object) to the end of these results. + + :param results: another results object. 
+ """ + + docs = self.docs() + for item in results.top_n: + if item[1] not in docs: + self.top_n.append(item) + self.docset = docs | results.docs() + + def filter(self, results): + """Removes any hits that are not also in the other results object. + """ + + if not len(results): + return + + otherdocs = results.docs() + items = [item for item in self.top_n if item[1] in otherdocs] + self.docset = self.docs() & otherdocs + self.top_n = items + + def upgrade(self, results, reverse=False): + """Re-sorts the results so any hits that are also in 'results' appear + before hits not in 'results', otherwise keeping their current relative + positions. This does not add the documents in the other results object + to this one. + + :param results: another results object. + :param reverse: if True, lower the position of hits in the other + results object instead of raising them. + """ + + if not len(results): + return + + otherdocs = results.docs() + arein = [item for item in self.top_n if item[1] in otherdocs] + notin = [item for item in self.top_n if item[1] not in otherdocs] + + if reverse: + items = notin + arein + else: + items = arein + notin + + self.top_n = items + + def upgrade_and_extend(self, results): + """Combines the effects of extend() and upgrade(): hits that are also + in 'results' are raised. Then any hits from the other results object + that are not in this results object are appended to the end. + + :param results: another results object. + """ + + if not len(results): + return + + docs = self.docs() + otherdocs = results.docs() + + arein = [item for item in self.top_n if item[1] in otherdocs] + notin = [item for item in self.top_n if item[1] not in otherdocs] + other = [item for item in results.top_n if item[1] not in docs] + + self.docset = docs | otherdocs + self.top_n = arein + notin + other + + +class Hit(object): + """Represents a single search result ("hit") in a Results object. + + This object acts like a dictionary of the matching document's stored + fields. If for some reason you need an actual ``dict`` object, use + ``Hit.fields()`` to get one. + + >>> r = searcher.search(query.Term("content", "render")) + >>> r[0] + < Hit {title = u"Rendering the scene"} > + >>> r[0].rank + 0 + >>> r[0].docnum == 4592 + True + >>> r[0].score + 2.52045682 + >>> r[0]["title"] + "Rendering the scene" + >>> r[0].keys() + ["title"] + """ + + def __init__(self, results, docnum, pos=None, score=None): + """ + :param results: the Results object this hit belongs to. + :param pos: the position in the results list of this hit, for example + pos = 0 means this is the first (highest scoring) hit. + :param docnum: the document number of this hit. + :param score: the score of this hit. + """ + + self.results = results + self.searcher = results.searcher + self.reader = self.searcher.reader() + self.pos = self.rank = pos + self.docnum = docnum + self.score = score + self._fields = None + + def fields(self): + """Returns a dictionary of the stored fields of the document this + object represents. + """ + + if self._fields is None: + self._fields = self.searcher.stored_fields(self.docnum) + return self._fields + + def matched_terms(self): + """Returns the set of ``("fieldname", "text")`` tuples representing + terms from the query that matched in this document. You can + compare this set to the terms from the original query to find terms + which didn't occur in this document. + + This is only valid if you used ``terms=True`` in the search call to + record matching terms. Otherwise it will raise an exception. 
+ + >>> q = myparser.parse("alfa OR bravo OR charlie") + >>> results = searcher.search(q, terms=True) + >>> for hit in results: + ... print(hit["title"]) + ... print("Contains:", hit.matched_terms()) + ... print("Doesn't contain:", q.all_terms() - hit.matched_terms()) + """ + + if not self.results.has_matched_terms(): + raise NoTermsException + return self.results.docterms.get(self.docnum, []) + + def highlights(self, fieldname, text=None, top=3, minscore=1): + """Returns highlighted snippets from the given field:: + + r = searcher.search(myquery) + for hit in r: + print(hit["title"]) + print(hit.highlights("content")) + + See :doc:`/highlight`. + + To change the fragmeter, formatter, order, or scorer used in + highlighting, you can set attributes on the results object:: + + from whoosh import highlight + + results = searcher.search(myquery, terms=True) + results.fragmenter = highlight.SentenceFragmenter() + + ...or use a custom :class:`whoosh.highlight.Highlighter` object:: + + hl = highlight.Highlighter(fragmenter=sf) + results.highlighter = hl + + :param fieldname: the name of the field you want to highlight. + :param text: by default, the method will attempt to load the contents + of the field from the stored fields for the document. If the field + you want to highlight isn't stored in the index, but you have + access to the text another way (for example, loading from a file or + a database), you can supply it using the ``text`` parameter. + :param top: the maximum number of fragments to return. + :param minscore: the minimum score for fragments to appear in the + highlights. + """ + + hliter = self.results.highlighter + return hliter.highlight_hit(self, fieldname, text=text, top=top, + minscore=minscore) + + def more_like_this(self, fieldname, text=None, top=10, numterms=5, + model=classify.Bo1Model, normalize=True, filter=None): + """Returns a new Results object containing documents similar to this + hit, based on "key terms" in the given field:: + + r = searcher.search(myquery) + for hit in r: + print(hit["title"]) + print("Top 3 similar documents:") + for subhit in hit.more_like_this("content", top=3): + print(" ", subhit["title"]) + + :param fieldname: the name of the field to use to test similarity. + :param text: by default, the method will attempt to load the contents + of the field from the stored fields for the document, or from a + term vector. If the field isn't stored or vectored in the index, + but you have access to the text another way (for example, loading + from a file or a database), you can supply it using the ``text`` + parameter. + :param top: the number of results to return. + :param numterms: the number of "key terms" to extract from the hit and + search for. Using more terms is slower but gives potentially more + and more accurate results. + :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use + to compute "key terms". + :param normalize: whether to normalize term weights. 
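A minimal sketch of the highlighting hooks described above, assuming the searched field is called ``content``, the documents store a ``title``, and ``searcher`` and ``myquery`` exist as in the earlier examples::

    from whoosh import highlight

    results = searcher.search(myquery, terms=True)
    results.fragmenter = highlight.SentenceFragmenter()
    results.formatter = highlight.UppercaseFormatter()

    for hit in results:
        print(hit["title"])
        print(hit.highlights("content", top=2))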
+ """ + + return self.searcher.more_like(self.docnum, fieldname, text=text, + top=top, numterms=numterms, model=model, + normalize=normalize, filter=filter) + + def __repr__(self): + return "<%s %r>" % (self.__class__.__name__, self.fields()) + + def __eq__(self, other): + if isinstance(other, Hit): + return self.fields() == other.fields() + elif isinstance(other, dict): + return self.fields() == other + else: + return False + + def __len__(self): + return len(self.fields()) + + def __iter__(self): + return iterkeys(self.fields()) + + def __getitem__(self, fieldname): + if fieldname in self.fields(): + return self._fields[fieldname] + + reader = self.reader + if reader.has_column(fieldname): + cr = reader.column_reader(fieldname) + return cr[self.docnum] + + raise KeyError(fieldname) + + def __contains__(self, key): + return (key in self.fields() + or self.reader.has_column(key)) + + def items(self): + return list(self.fields().items()) + + def keys(self): + return list(self.fields().keys()) + + def values(self): + return list(self.fields().values()) + + def iteritems(self): + return iteritems(self.fields()) + + def iterkeys(self): + return iterkeys(self.fields()) + + def itervalues(self): + return itervalues(self.fields()) + + def get(self, key, default=None): + return self.fields().get(key, default) + + def __setitem__(self, key, value): + raise NotImplementedError("You cannot modify a search result") + + def __delitem__(self, key, value): + raise NotImplementedError("You cannot modify a search result") + + def clear(self): + raise NotImplementedError("You cannot modify a search result") + + def update(self, dict=None, **kwargs): + raise NotImplementedError("You cannot modify a search result") + + +class ResultsPage(object): + """Represents a single page out of a longer list of results, as returned + by :func:`whoosh.searching.Searcher.search_page`. Supports a subset of the + interface of the :class:`~whoosh.searching.Results` object, namely getting + stored fields with __getitem__ (square brackets), iterating, and the + ``score()`` and ``docnum()`` methods. + + The ``offset`` attribute contains the results number this page starts at + (numbered from 0). For example, if the page length is 10, the ``offset`` + attribute on the second page will be ``10``. + + The ``pagecount`` attribute contains the number of pages available. + + The ``pagenum`` attribute contains the page number. This may be less than + the page you requested if the results had too few pages. For example, if + you do:: + + ResultsPage(results, 5) + + but the results object only contains 3 pages worth of hits, ``pagenum`` + will be 3. + + The ``pagelen`` attribute contains the number of results on this page + (which may be less than the page length you requested if this is the last + page of the results). + + The ``total`` attribute contains the total number of hits in the results. + + >>> mysearcher = myindex.searcher() + >>> pagenum = 2 + >>> page = mysearcher.find_page(pagenum, myquery) + >>> print("Page %s of %s, results %s to %s of %s" % + ... (pagenum, page.pagecount, page.offset+1, + ... page.offset+page.pagelen, page.total)) + >>> for i, fields in enumerate(page): + ... print("%s. 
%r" % (page.offset + i + 1, fields)) + >>> mysearcher.close() + + To set highlighter attributes (for example ``formatter``), access the + underlying :class:`Results` object:: + + page.results.formatter = highlight.UppercaseFormatter() + + """ + + def __init__(self, results, pagenum, pagelen=10): + """ + :param results: a :class:`~whoosh.searching.Results` object. + :param pagenum: which page of the results to use, numbered from ``1``. + :param pagelen: the number of hits per page. + """ + + self.results = results + self.total = len(results) + + if pagenum < 1: + raise ValueError("pagenum must be >= 1") + + self.pagecount = int(ceil(self.total / pagelen)) + self.pagenum = min(self.pagecount, pagenum) + + offset = (self.pagenum - 1) * pagelen + if (offset + pagelen) > self.total: + pagelen = self.total - offset + self.offset = offset + self.pagelen = pagelen + + def __getitem__(self, n): + offset = self.offset + if isinstance(n, slice): + start, stop, step = n.indices(self.pagelen) + return self.results.__getitem__(slice(start + offset, + stop + offset, step)) + else: + return self.results.__getitem__(n + offset) + + def __iter__(self): + return iter(self.results[self.offset:self.offset + self.pagelen]) + + def __len__(self): + return self.total + + def scored_length(self): + return self.results.scored_length() + + def score(self, n): + """Returns the score of the hit at the nth position on this page. + """ + return self.results.score(n + self.offset) + + def docnum(self, n): + """Returns the document number of the hit at the nth position on this + page. + """ + return self.results.docnum(n + self.offset) + + def is_last_page(self): + """Returns True if this object represents the last page of results. + """ + + return self.pagecount == 0 or self.pagenum == self.pagecount diff --git a/src/whoosh/sorting.py b/src/whoosh/sorting.py new file mode 100644 index 0000000..3cd24a0 --- /dev/null +++ b/src/whoosh/sorting.py @@ -0,0 +1,1156 @@ +# Copyright 2011 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
+ +from array import array +from collections import defaultdict + +from whoosh.compat import string_type +from whoosh.compat import iteritems, izip, xrange + + +# Faceting objects + +class FacetType(object): + """Base class for "facets", aspects that can be sorted/faceted. + """ + + maptype = None + + def categorizer(self, global_searcher): + """Returns a :class:`Categorizer` corresponding to this facet. + + :param global_searcher: A parent searcher. You can use this searcher if + you need global document ID references. + """ + + raise NotImplementedError + + def map(self, default=None): + t = self.maptype + if t is None: + t = default + + if t is None: + return OrderedList() + elif type(t) is type: + return t() + else: + return t + + def default_name(self): + return "facet" + + +class Categorizer(object): + """Base class for categorizer objects which compute a key value for a + document based on certain criteria, for use in sorting/faceting. + + Categorizers are created by FacetType objects through the + :meth:`FacetType.categorizer` method. The + :class:`whoosh.searching.Searcher` object passed to the ``categorizer`` + method may be a composite searcher (that is, wrapping a multi-reader), but + categorizers are always run **per-segment**, with segment-relative document + numbers. + + The collector will call a categorizer's ``set_searcher`` method as it + searches each segment to let the cateogorizer set up whatever segment- + specific data it needs. + + ``Collector.allow_overlap`` should be ``True`` if the caller can use the + ``keys_for`` method instead of ``key_for`` to group documents into + potentially overlapping groups. The default is ``False``. + + If a categorizer subclass can categorize the document using only the + document number, it should set ``Collector.needs_current`` to ``False`` + (this is the default) and NOT USE the given matcher in the ``key_for`` or + ``keys_for`` methods, since in that case ``segment_docnum`` is not + guaranteed to be consistent with the given matcher. If a categorizer + subclass needs to access information on the matcher, it should set + ``needs_current`` to ``True``. This will prevent the caller from using + optimizations that might leave the matcher in an inconsistent state. + """ + + allow_overlap = False + needs_current = False + + def set_searcher(self, segment_searcher, docoffset): + """Called by the collector when the collector moves to a new segment. + The ``segment_searcher`` will be atomic. The ``docoffset`` is the + offset of the segment's document numbers relative to the entire index. + You can use the offset to get absolute index docnums by adding the + offset to segment-relative docnums. + """ + + pass + + def key_for(self, matcher, segment_docnum): + """Returns a key for the current match. + + :param matcher: a :class:`whoosh.matching.Matcher` object. If + ``self.needs_current`` is ``False``, DO NOT use this object, + since it may be inconsistent. Use the given ``segment_docnum`` + instead. + :param segment_docnum: the segment-relative document number of the + current match. + """ + + # Backwards compatibility + if hasattr(self, "key_for_id"): + return self.key_for_id(segment_docnum) + elif hasattr(self, "key_for_matcher"): + return self.key_for_matcher(matcher) + + raise NotImplementedError(self.__class__) + + def keys_for(self, matcher, segment_docnum): + """Yields a series of keys for the current match. + + This method will be called instead of ``key_for`` if + ``self.allow_overlap`` is ``True``. 
+ + :param matcher: a :class:`whoosh.matching.Matcher` object. If + ``self.needs_current`` is ``False``, DO NOT use this object, + since it may be inconsistent. Use the given ``segment_docnum`` + instead. + :param segment_docnum: the segment-relative document number of the + current match. + """ + + # Backwards compatibility + if hasattr(self, "keys_for_id"): + return self.keys_for_id(segment_docnum) + + raise NotImplementedError(self.__class__) + + def key_to_name(self, key): + """Returns a representation of the key to be used as a dictionary key + in faceting. For example, the sorting key for date fields is a large + integer; this method translates it into a ``datetime`` object to make + the groupings clearer. + """ + + return key + + +# General field facet + +class FieldFacet(FacetType): + """Sorts/facets by the contents of a field. + + For example, to sort by the contents of the "path" field in reverse order, + and facet by the contents of the "tag" field:: + + paths = FieldFacet("path", reverse=True) + tags = FieldFacet("tag") + results = searcher.search(myquery, sortedby=paths, groupedby=tags) + + This facet returns different categorizers based on the field type. + """ + + def __init__(self, fieldname, reverse=False, allow_overlap=False, + maptype=None): + """ + :param fieldname: the name of the field to sort/facet on. + :param reverse: if True, when sorting, reverse the sort order of this + facet. + :param allow_overlap: if True, when grouping, allow documents to appear + in multiple groups when they have multiple terms in the field. + """ + + self.fieldname = fieldname + self.reverse = reverse + self.allow_overlap = allow_overlap + self.maptype = maptype + + def default_name(self): + return self.fieldname + + def categorizer(self, global_searcher): + # The searcher we're passed here may wrap a multireader, but the + # actual key functions will always be called per-segment following a + # Categorizer.set_searcher method call + fieldname = self.fieldname + fieldobj = global_searcher.schema[fieldname] + + # If we're grouping with allow_overlap=True, all we can use is + # OverlappingCategorizer + if self.allow_overlap: + return OverlappingCategorizer(global_searcher, fieldname) + + if global_searcher.reader().has_column(fieldname): + coltype = fieldobj.column_type + if coltype.reversible or not self.reverse: + c = ColumnCategorizer(global_searcher, fieldname, self.reverse) + else: + c = ReversedColumnCategorizer(global_searcher, fieldname) + else: + c = PostingCategorizer(global_searcher, fieldname, + self.reverse) + return c + + +class ColumnCategorizer(Categorizer): + def __init__(self, global_searcher, fieldname, reverse=False): + self._fieldname = fieldname + self._fieldobj = global_searcher.schema[self._fieldname] + self._column_type = self._fieldobj.column_type + self._reverse = reverse + + # The column reader is set in set_searcher() as we iterate over the + # sub-searchers + self._creader = None + + def __repr__(self): + return "%s(%r, %r, reverse=%r)" % (self.__class__.__name__, + self._fieldobj, self._fieldname, + self._reverse) + + def set_searcher(self, segment_searcher, docoffset): + r = segment_searcher.reader() + self._creader = r.column_reader(self._fieldname, + reverse=self._reverse, + translate=False) + + def key_for(self, matcher, segment_docnum): + return self._creader.sort_key(segment_docnum) + + def key_to_name(self, key): + return self._fieldobj.from_column_value(key) + + +class ReversedColumnCategorizer(ColumnCategorizer): + """Categorizer that reverses column 
values for columns that aren't + naturally reversible. + """ + + def __init__(self, global_searcher, fieldname): + ColumnCategorizer.__init__(self, global_searcher, fieldname) + + reader = global_searcher.reader() + self._doccount = reader.doc_count_all() + + global_creader = reader.column_reader(fieldname, translate=False) + self._values = sorted(set(global_creader)) + + def key_for(self, matcher, segment_docnum): + value = self._creader[segment_docnum] + order = self._values.index(value) + # Subtract from 0 to reverse the order + return 0 - order + + def key_to_name(self, key): + # Re-reverse the key to get the index into _values + key = self._values[0 - key] + return ColumnCategorizer.key_to_name(self, key) + + +class OverlappingCategorizer(Categorizer): + allow_overlap = True + + def __init__(self, global_searcher, fieldname): + self._fieldname = fieldname + self._fieldobj = global_searcher.schema[fieldname] + + field = global_searcher.schema[fieldname] + reader = global_searcher.reader() + self._use_vectors = bool(field.vector) + self._use_column = (reader.has_column(fieldname) + and field.column_type.stores_lists()) + + # These are set in set_searcher() as we iterate over the sub-searchers + self._segment_searcher = None + self._creader = None + self._lists = None + + def set_searcher(self, segment_searcher, docoffset): + fieldname = self._fieldname + self._segment_searcher = segment_searcher + reader = segment_searcher.reader() + + if self._use_vectors: + pass + elif self._use_column: + self._creader = reader.column_reader(fieldname, translate=False) + else: + # Otherwise, cache the values in each document in a huge list + # of lists + dc = segment_searcher.doc_count_all() + field = segment_searcher.schema[fieldname] + from_bytes = field.from_bytes + + self._lists = [[] for _ in xrange(dc)] + for btext in field.sortable_terms(reader, fieldname): + text = from_bytes(btext) + postings = reader.postings(fieldname, btext) + for docid in postings.all_ids(): + self._lists[docid].append(text) + + def keys_for(self, matcher, docid): + if self._use_vectors: + try: + v = self._segment_searcher.vector(docid, self._fieldname) + return list(v.all_ids()) + except KeyError: + return [] + elif self._use_column: + return self._creader[docid] + else: + return self._lists[docid] or [None] + + def key_for(self, matcher, docid): + if self._use_vectors: + try: + v = self._segment_searcher.vector(docid, self._fieldname) + return v.id() + except KeyError: + return None + elif self._use_column: + return self._creader.sort_key(docid) + else: + ls = self._lists[docid] + if ls: + return ls[0] + else: + return None + + +class PostingCategorizer(Categorizer): + """ + Categorizer for fields that don't store column values. This is very + inefficient. Instead of relying on this categorizer you should plan for + which fields you'll want to sort on and set ``sortable=True`` in their + field type. + + This object builds an array caching the order of all documents according to + the field, then uses the cached order as a numeric key. This is useful when + a field cache is not available, and also for reversed fields (since field + cache keys for non- numeric fields are arbitrary data, it's not possible to + "negate" them to reverse the sort order). 
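As the ``PostingCategorizer`` notes above suggest, sorting is much cheaper when the field stores per-document column values. A schema sketch under that assumption; the field names are illustrative, ``myquery`` is a placeholder, and the ``indexdir`` directory is assumed to exist::

    from whoosh import fields, index, sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           price=fields.NUMERIC(sortable=True))
    ix = index.create_in("indexdir", schema)

    # ... after indexing some documents ...
    with ix.searcher() as s:
        results = s.search(myquery, sortedby=sorting.FieldFacet("price"))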
+ """ + + def __init__(self, global_searcher, fieldname, reverse): + self.reverse = reverse + + if fieldname in global_searcher._field_caches: + self.values, self.array = global_searcher._field_caches[fieldname] + else: + # Cache the relative positions of all docs with the given field + # across the entire index + reader = global_searcher.reader() + dc = reader.doc_count_all() + self._fieldobj = global_searcher.schema[fieldname] + from_bytes = self._fieldobj.from_bytes + + self.values = [] + self.array = array("i", [dc + 1] * dc) + + btexts = self._fieldobj.sortable_terms(reader, fieldname) + for i, btext in enumerate(btexts): + self.values.append(from_bytes(btext)) + # Get global docids from global reader + postings = reader.postings(fieldname, btext) + for docid in postings.all_ids(): + self.array[docid] = i + + global_searcher._field_caches[fieldname] = (self.values, self.array) + + def set_searcher(self, segment_searcher, docoffset): + self._searcher = segment_searcher + self.docoffset = docoffset + + def key_for(self, matcher, segment_docnum): + global_docnum = self.docoffset + segment_docnum + i = self.array[global_docnum] + if self.reverse: + i = len(self.values) - i + return i + + def key_to_name(self, i): + if i >= len(self.values): + return None + if self.reverse: + i = len(self.values) - i + return self.values[i] + + +# Special facet types + +class QueryFacet(FacetType): + """Sorts/facets based on the results of a series of queries. + """ + + def __init__(self, querydict, other=None, allow_overlap=False, + maptype=None): + """ + :param querydict: a dictionary mapping keys to + :class:`whoosh.query.Query` objects. + :param other: the key to use for documents that don't match any of the + queries. + """ + + self.querydict = querydict + self.other = other + self.maptype = maptype + self.allow_overlap = allow_overlap + + def categorizer(self, global_searcher): + return self.QueryCategorizer(self.querydict, self.other, self.allow_overlap) + + class QueryCategorizer(Categorizer): + def __init__(self, querydict, other, allow_overlap=False): + self.querydict = querydict + self.other = other + self.allow_overlap = allow_overlap + + def set_searcher(self, segment_searcher, offset): + self.docsets = {} + for qname, q in self.querydict.items(): + docset = set(q.docs(segment_searcher)) + if docset: + self.docsets[qname] = docset + self.offset = offset + + def key_for(self, matcher, docid): + for qname in self.docsets: + if docid in self.docsets[qname]: + return qname + return self.other + + def keys_for(self, matcher, docid): + found = False + for qname in self.docsets: + if docid in self.docsets[qname]: + yield qname + found = True + if not found: + yield None + + +class RangeFacet(QueryFacet): + """Sorts/facets based on numeric ranges. For textual ranges, use + :class:`QueryFacet`. + + For example, to facet the "price" field into $100 buckets, up to $1000:: + + prices = RangeFacet("price", 0, 1000, 100) + results = searcher.search(myquery, groupedby=prices) + + The ranges/buckets are always **inclusive** at the start and **exclusive** + at the end. + """ + + def __init__(self, fieldname, start, end, gap, hardend=False, + maptype=None): + """ + :param fieldname: the numeric field to sort/facet on. + :param start: the start of the entire range. + :param end: the end of the entire range. + :param gap: the size of each "bucket" in the range. This can be a + sequence of sizes. 
For example, ``gap=[1,5,10]`` will use 1 as the + size of the first bucket, 5 as the size of the second bucket, and + 10 as the size of all subsequent buckets. + :param hardend: if True, the end of the last bucket is clamped to the + value of ``end``. If False (the default), the last bucket is always + ``gap`` sized, even if that means the end of the last bucket is + after ``end``. + """ + + self.fieldname = fieldname + self.start = start + self.end = end + self.gap = gap + self.hardend = hardend + self.maptype = maptype + self._queries() + + def default_name(self): + return self.fieldname + + def _rangetype(self): + from whoosh import query + + return query.NumericRange + + def _range_name(self, startval, endval): + return (startval, endval) + + def _queries(self): + if not self.gap: + raise Exception("No gap secified (%r)" % self.gap) + if isinstance(self.gap, (list, tuple)): + gaps = self.gap + gapindex = 0 + else: + gaps = [self.gap] + gapindex = -1 + + rangetype = self._rangetype() + self.querydict = {} + cstart = self.start + while cstart < self.end: + thisgap = gaps[gapindex] + if gapindex >= 0: + gapindex += 1 + if gapindex == len(gaps): + gapindex = -1 + + cend = cstart + thisgap + if self.hardend: + cend = min(self.end, cend) + + rangename = self._range_name(cstart, cend) + q = rangetype(self.fieldname, cstart, cend, endexcl=True) + self.querydict[rangename] = q + + cstart = cend + + def categorizer(self, global_searcher): + return QueryFacet(self.querydict).categorizer(global_searcher) + + +class DateRangeFacet(RangeFacet): + """Sorts/facets based on date ranges. This is the same as RangeFacet + except you are expected to use ``daterange`` objects as the start and end + of the range, and ``timedelta`` or ``relativedelta`` objects as the gap(s), + and it generates :class:`~whoosh.query.DateRange` queries instead of + :class:`~whoosh.query.TermRange` queries. + + For example, to facet a "birthday" range into 5 year buckets:: + + from datetime import datetime + from whoosh.support.relativedelta import relativedelta + + startdate = datetime(1920, 0, 0) + enddate = datetime.now() + gap = relativedelta(years=5) + bdays = DateRangeFacet("birthday", startdate, enddate, gap) + results = searcher.search(myquery, groupedby=bdays) + + The ranges/buckets are always **inclusive** at the start and **exclusive** + at the end. + """ + + def _rangetype(self): + from whoosh import query + + return query.DateRange + + +class ScoreFacet(FacetType): + """Uses a document's score as a sorting criterion. + + For example, to sort by the ``tag`` field, and then within that by relative + score:: + + tag_score = MultiFacet(["tag", ScoreFacet()]) + results = searcher.search(myquery, sortedby=tag_score) + """ + + def categorizer(self, global_searcher): + return self.ScoreCategorizer(global_searcher) + + class ScoreCategorizer(Categorizer): + needs_current = True + + def __init__(self, global_searcher): + w = global_searcher.weighting + self.use_final = w.use_final + if w.use_final: + self.final = w.final + + def set_searcher(self, segment_searcher, offset): + self.segment_searcher = segment_searcher + + def key_for(self, matcher, docid): + score = matcher.score() + if self.use_final: + score = self.final(self.segment_searcher, docid, score) + # Negate the score so higher values sort first + return 0 - score + + +class FunctionFacet(FacetType): + """This facet type is low-level. In most cases you should use + :class:`TranslateFacet` instead. 
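Putting the range facets above together, a sketch with a numeric ``price`` field and a ``birthday`` date field, both assumed to exist in the schema; ``searcher`` and ``myquery`` are placeholders::

    from datetime import datetime
    from whoosh import sorting
    from whoosh.support.relativedelta import relativedelta

    # $0-$1000 in $100 buckets; hardend clamps the last bucket at 1000
    prices = sorting.RangeFacet("price", 0, 1000, 100, hardend=True)

    # Five-year buckets from 1920 up to today
    bdays = sorting.DateRangeFacet("birthday", datetime(1920, 1, 1),
                                   datetime.now(), relativedelta(years=5))

    results = searcher.search(myquery, groupedby={"prices": prices,
                                                  "bdays": bdays})
    print(results.groups("prices"))
    print(results.groups("bdays"))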
+ + This facet type ets you pass an arbitrary function that will compute the + key. This may be easier than subclassing FacetType and Categorizer to set up + the desired behavior. + + The function is called with the arguments ``(searcher, docid)``, where the + ``searcher`` may be a composite searcher, and the ``docid`` is an absolute + index document number (not segment-relative). + + For example, to use the number of words in the document's "content" field + as the sorting/faceting key:: + + fn = lambda s, docid: s.doc_field_length(docid, "content") + lengths = FunctionFacet(fn) + """ + + def __init__(self, fn, maptype=None): + self.fn = fn + self.maptype = maptype + + def categorizer(self, global_searcher): + return self.FunctionCategorizer(global_searcher, self.fn) + + class FunctionCategorizer(Categorizer): + def __init__(self, global_searcher, fn): + self.global_searcher = global_searcher + self.fn = fn + + def set_searcher(self, segment_searcher, docoffset): + self.offset = docoffset + + def key_for(self, matcher, docid): + return self.fn(self.global_searcher, docid + self.offset) + + +class TranslateFacet(FacetType): + """Lets you specify a function to compute the key based on a key generated + by a wrapped facet. + + This is useful if you want to use a custom ordering of a sortable field. For + example, if you want to use an implementation of the Unicode Collation + Algorithm (UCA) to sort a field using the rules from a particular language:: + + from pyuca import Collator + + # The Collator object has a sort_key() method which takes a unicode + # string and returns a sort key + c = Collator("allkeys.txt") + + # Make a facet object for the field you want to sort on + facet = sorting.FieldFacet("name") + # Wrap the facet in a TranslateFacet with the translation function + # (the Collator object's sort_key method) + facet = sorting.TranslateFacet(c.sort_key, facet) + + # Use the facet to sort the search results + results = searcher.search(myquery, sortedby=facet) + + You can pass multiple facets to the + """ + + def __init__(self, fn, *facets): + """ + :param fn: The function to apply. For each matching document, this + function will be called with the values of the given facets as + arguments. + :param facets: One or more :class:`FacetType` objects. These facets are + used to compute facet value(s) for a matching document, and then the + value(s) is/are passed to the function. + """ + self.fn = fn + self.facets = facets + self.maptype = None + + def categorizer(self, global_searcher): + catters = [facet.categorizer(global_searcher) for facet in self.facets] + return self.TranslateCategorizer(self.fn, catters) + + class TranslateCategorizer(Categorizer): + def __init__(self, fn, catters): + self.fn = fn + self.catters = catters + + def set_searcher(self, segment_searcher, docoffset): + for catter in self.catters: + catter.set_searcher(segment_searcher, docoffset) + + def key_for(self, matcher, segment_docnum): + keys = [catter.key_for(matcher, segment_docnum) + for catter in self.catters] + return self.fn(*keys) + + +class StoredFieldFacet(FacetType): + """Lets you sort/group using the value in an unindexed, stored field (e.g. + :class:`whoosh.fields.STORED`). This is usually slower than using an indexed + field. + + For fields where the stored value is a space-separated list of keywords, + (e.g. 
``"tag1 tag2 tag3"``), you can use the ``allow_overlap`` keyword + argument to allow overlapped faceting on the result of calling the + ``split()`` method on the field value (or calling a custom split function + if one is supplied). + """ + + def __init__(self, fieldname, allow_overlap=False, split_fn=None, + maptype=None): + """ + :param fieldname: the name of the stored field. + :param allow_overlap: if True, when grouping, allow documents to appear + in multiple groups when they have multiple terms in the field. The + categorizer uses ``string.split()`` or the custom ``split_fn`` to + convert the stored value into a list of facet values. + :param split_fn: a custom function to split a stored field value into + multiple facet values when ``allow_overlap`` is True. If not + supplied, the categorizer simply calls the value's ``split()`` + method. + """ + + self.fieldname = fieldname + self.allow_overlap = allow_overlap + self.split_fn = None + self.maptype = maptype + + def default_name(self): + return self.fieldname + + def categorizer(self, global_searcher): + return self.StoredFieldCategorizer(self.fieldname, self.allow_overlap, + self.split_fn) + + class StoredFieldCategorizer(Categorizer): + def __init__(self, fieldname, allow_overlap, split_fn): + self.fieldname = fieldname + self.allow_overlap = allow_overlap + self.split_fn = split_fn + + def set_searcher(self, segment_searcher, docoffset): + self.segment_searcher = segment_searcher + + def keys_for(self, matcher, docid): + d = self.segment_searcher.stored_fields(docid) + value = d.get(self.fieldname) + if self.split_fn: + return self.split_fn(value) + else: + return value.split() + + def key_for(self, matcher, docid): + d = self.segment_searcher.stored_fields(docid) + return d.get(self.fieldname) + + +class MultiFacet(FacetType): + """Sorts/facets by the combination of multiple "sub-facets". 
+ + For example, to sort by the value of the "tag" field, and then (for + documents where the tag is the same) by the value of the "path" field:: + + facet = MultiFacet(FieldFacet("tag"), FieldFacet("path") + results = searcher.search(myquery, sortedby=facet) + + As a shortcut, you can use strings to refer to field names, and they will + be assumed to be field names and turned into FieldFacet objects:: + + facet = MultiFacet("tag", "path") + + You can also use the ``add_*`` methods to add criteria to the multifacet:: + + facet = MultiFacet() + facet.add_field("tag") + facet.add_field("path", reverse=True) + facet.add_query({"a-m": TermRange("name", "a", "m"), + "n-z": TermRange("name", "n", "z")}) + """ + + def __init__(self, items=None, maptype=None): + self.facets = [] + if items: + for item in items: + self._add(item) + self.maptype = maptype + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, + self.facets, + self.maptype) + + @classmethod + def from_sortedby(cls, sortedby): + multi = cls() + if isinstance(sortedby, string_type): + multi._add(sortedby) + elif (isinstance(sortedby, (list, tuple)) + or hasattr(sortedby, "__iter__")): + for item in sortedby: + multi._add(item) + else: + multi._add(sortedby) + return multi + + def _add(self, item): + if isinstance(item, FacetType): + self.add_facet(item) + elif isinstance(item, string_type): + self.add_field(item) + else: + raise Exception("Don't know what to do with facet %r" % (item,)) + + def add_field(self, fieldname, reverse=False): + self.facets.append(FieldFacet(fieldname, reverse=reverse)) + return self + + def add_query(self, querydict, other=None, allow_overlap=False): + self.facets.append(QueryFacet(querydict, other=other, + allow_overlap=allow_overlap)) + return self + + def add_score(self): + self.facets.append(ScoreFacet()) + return self + + def add_facet(self, facet): + if not isinstance(facet, FacetType): + raise TypeError("%r is not a facet object, perhaps you meant " + "add_field()" % (facet,)) + self.facets.append(facet) + return self + + def categorizer(self, global_searcher): + if not self.facets: + raise Exception("No facets") + elif len(self.facets) == 1: + catter = self.facets[0].categorizer(global_searcher) + else: + catter = self.MultiCategorizer([facet.categorizer(global_searcher) + for facet in self.facets]) + return catter + + class MultiCategorizer(Categorizer): + def __init__(self, catters): + self.catters = catters + + @property + def needs_current(self): + return any(c.needs_current for c in self.catters) + + def set_searcher(self, segment_searcher, docoffset): + for catter in self.catters: + catter.set_searcher(segment_searcher, docoffset) + + def key_for(self, matcher, docid): + return tuple(catter.key_for(matcher, docid) + for catter in self.catters) + + def key_to_name(self, key): + return tuple(catter.key_to_name(keypart) + for catter, keypart + in izip(self.catters, key)) + + +class Facets(object): + """Maps facet names to :class:`FacetType` objects, for creating multiple + groupings of documents. + + For example, to group by tag, and **also** group by price range:: + + facets = Facets() + facets.add_field("tag") + facets.add_facet("price", RangeFacet("price", 0, 1000, 100)) + results = searcher.search(myquery, groupedby=facets) + + tag_groups = results.groups("tag") + price_groups = results.groups("price") + + (To group by the combination of multiple facets, use :class:`MultiFacet`.) 
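A brief sketch combining ``MultiFacet`` and ``Facets`` as described above; the ``tag`` and ``price`` fields, ``searcher``, and ``myquery`` are assumptions::

    from whoosh import sorting

    # Sort by tag, then by reverse price within each tag
    tag_price = sorting.MultiFacet()
    tag_price.add_field("tag")
    tag_price.add_field("price", reverse=True)

    # Group by tag and by price range in the same search
    facets = sorting.Facets()
    facets.add_field("tag")
    facets.add_facet("price", sorting.RangeFacet("price", 0, 1000, 100))

    results = searcher.search(myquery, sortedby=tag_price, groupedby=facets)
    print(results.groups("tag"))
    print(results.groups("price"))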
+ """ + + def __init__(self, x=None): + self.facets = {} + if x: + self.add_facets(x) + + @classmethod + def from_groupedby(cls, groupedby): + facets = cls() + if isinstance(groupedby, (cls, dict)): + facets.add_facets(groupedby) + elif isinstance(groupedby, string_type): + facets.add_field(groupedby) + elif isinstance(groupedby, FacetType): + facets.add_facet(groupedby.default_name(), groupedby) + elif isinstance(groupedby, (list, tuple)): + for item in groupedby: + facets.add_facets(cls.from_groupedby(item)) + else: + raise Exception("Don't know what to do with groupedby=%r" + % groupedby) + + return facets + + def names(self): + """Returns an iterator of the facet names in this object. + """ + + return iter(self.facets) + + def items(self): + """Returns a list of (facetname, facetobject) tuples for the facets in + this object. + """ + + return self.facets.items() + + def add_field(self, fieldname, **kwargs): + """Adds a :class:`FieldFacet` for the given field name (the field name + is automatically used as the facet name). + """ + + self.facets[fieldname] = FieldFacet(fieldname, **kwargs) + return self + + def add_query(self, name, querydict, **kwargs): + """Adds a :class:`QueryFacet` under the given ``name``. + + :param name: a name for the facet. + :param querydict: a dictionary mapping keys to + :class:`whoosh.query.Query` objects. + """ + + self.facets[name] = QueryFacet(querydict, **kwargs) + return self + + def add_facet(self, name, facet): + """Adds a :class:`FacetType` object under the given ``name``. + """ + + if not isinstance(facet, FacetType): + raise Exception("%r:%r is not a facet" % (name, facet)) + self.facets[name] = facet + return self + + def add_facets(self, facets, replace=True): + """Adds the contents of the given ``Facets`` or ``dict`` object to this + object. + """ + + if not isinstance(facets, (dict, Facets)): + raise Exception("%r is not a Facets object or dict" % facets) + for name, facet in facets.items(): + if replace or name not in self.facets: + self.facets[name] = facet + return self + + +# Objects for holding facet groups + +class FacetMap(object): + """Base class for objects holding the results of grouping search results by + a Facet. Use an object's ``as_dict()`` method to access the results. + + You can pass a subclass of this to the ``maptype`` keyword argument when + creating a ``FacetType`` object to specify what information the facet + should record about the group. For example:: + + # Record each document in each group in its sorted order + myfacet = FieldFacet("size", maptype=OrderedList) + + # Record only the count of documents in each group + myfacet = FieldFacet("size", maptype=Count) + """ + + def add(self, groupname, docid, sortkey): + """Adds a document to the facet results. + + :param groupname: the name of the group to add this document to. + :param docid: the document number of the document to add. + :param sortkey: a value representing the sort position of the document + in the full results. + """ + + raise NotImplementedError + + def as_dict(self): + """Returns a dictionary object mapping group names to + implementation-specific values. For example, the value might be a list + of document numbers, or a integer representing the number of documents + in the group. + """ + + raise NotImplementedError + + +class OrderedList(FacetMap): + """Stores a list of document numbers for each group, in the same order as + they appear in the search results. 
+ + The ``as_dict`` method returns a dictionary mapping group names to lists + of document numbers. + """ + + def __init__(self): + self.dict = defaultdict(list) + + def __repr__(self): + return "<%s %r>" % (self.__class__.__name__, self.dict) + + def add(self, groupname, docid, sortkey): + self.dict[groupname].append((sortkey, docid)) + + def as_dict(self): + d = {} + for key, items in iteritems(self.dict): + d[key] = [docnum for _, docnum in sorted(items)] + return d + + +class UnorderedList(FacetMap): + """Stores a list of document numbers for each group, in arbitrary order. + This is slightly faster and uses less memory than + :class:`OrderedList` if you don't care about the ordering of the + documents within groups. + + The ``as_dict`` method returns a dictionary mapping group names to lists + of document numbers. + """ + + def __init__(self): + self.dict = defaultdict(list) + + def __repr__(self): + return "<%s %r>" % (self.__class__.__name__, self.dict) + + def add(self, groupname, docid, sortkey): + self.dict[groupname].append(docid) + + def as_dict(self): + return dict(self.dict) + + +class Count(FacetMap): + """Stores the number of documents in each group. + + The ``as_dict`` method returns a dictionary mapping group names to + integers. + """ + + def __init__(self): + self.dict = defaultdict(int) + + def __repr__(self): + return "<%s %r>" % (self.__class__.__name__, self.dict) + + def add(self, groupname, docid, sortkey): + self.dict[groupname] += 1 + + def as_dict(self): + return dict(self.dict) + + +class Best(FacetMap): + """Stores the "best" document in each group (that is, the one with the + lowest sort key, which sorts first in the results). + + The ``as_dict`` method returns a dictionary mapping group names to + document numbers. + """ + + def __init__(self): + self.bestids = {} + self.bestkeys = {} + + def __repr__(self): + return "<%s %r>" % (self.__class__.__name__, self.bestids) + + def add(self, groupname, docid, sortkey): + if groupname not in self.bestids or sortkey < self.bestkeys[groupname]: + self.bestids[groupname] = docid + self.bestkeys[groupname] = sortkey + + def as_dict(self): + return self.bestids + + +# Helper functions + +def add_sortable(writer, fieldname, facet, column=None): + """Adds a per-document value column to an existing field which was created + without the ``sortable`` keyword argument. + + >>> from whoosh import index, sorting + >>> ix = index.open_dir("indexdir") + >>> with ix.writer() as w: + ... facet = sorting.FieldFacet("price") + ... sorting.add_sortable(w, "price", facet) + ... + + :param writer: a :class:`whoosh.writing.IndexWriter` object. + :param fieldname: the name of the field to add the per-document sortable + values to. If this field doesn't exist in the writer's schema, the + function will add a :class:`whoosh.fields.COLUMN` field to the schema, + and you must specify the column object using the ``column`` keyword + argument. + :param facet: a :class:`FacetType` object to use to generate the + per-document values. + :param column: a :class:`whoosh.columns.ColumnType` object to use to store + the per-document values. If you don't specify a column object, the + function will use the default column type for the given field. 
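+
+    For a brand-new index it is usually simpler to make the field sortable
+    when the schema is created instead of retrofitting a column with this
+    function; a minimal sketch of that alternative (assuming the standard
+    ``whoosh.fields`` API and a hypothetical numeric ``price`` field)::
+
+        from whoosh import fields
+        schema = fields.Schema(price=fields.NUMERIC(sortable=True))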
+ """ + + storage = writer.storage + schema = writer.schema + + field = None + if fieldname in schema: + field = schema[fieldname] + if field.column_type: + raise Exception("%r field is already sortable" % fieldname) + + if column: + if fieldname not in schema: + from whoosh.fields import COLUMN + field = COLUMN(column) + schema.add(fieldname, field) + else: + if fieldname in schema: + column = field.default_column() + else: + raise Exception("Field %r does not exist" % fieldname) + + searcher = writer.searcher() + catter = facet.categorizer(searcher) + for subsearcher, docoffset in searcher.leaf_searchers(): + catter.set_searcher(subsearcher, docoffset) + reader = subsearcher.reader() + + if reader.has_column(fieldname): + raise Exception("%r field already has a column" % fieldname) + + codec = reader.codec() + segment = reader.segment() + + colname = codec.column_filename(segment, fieldname) + colfile = storage.create_file(colname) + try: + colwriter = column.writer(colfile) + for docnum in reader.all_doc_ids(): + v = catter.key_to_name(catter.key_for(None, docnum)) + cv = field.to_column_value(v) + colwriter.add(docnum, cv) + colwriter.finish(reader.doc_count_all()) + finally: + colfile.close() + + field.column_type = column + + + diff --git a/src/whoosh/spelling.py b/src/whoosh/spelling.py new file mode 100644 index 0000000..d1dde0c --- /dev/null +++ b/src/whoosh/spelling.py @@ -0,0 +1,343 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module contains helper functions for correcting typos in user queries. +""" + +from bisect import bisect_left +from heapq import heappush, heapreplace + +from whoosh import highlight +from whoosh.compat import iteritems, xrange + + +# Corrector objects + +class Corrector(object): + """ + Base class for spelling correction objects. Concrete sub-classes should + implement the ``_suggestions`` method. + """ + + def suggest(self, text, limit=5, maxdist=2, prefix=0): + """ + :param text: the text to check. 
This word will **not** be added to the + suggestions, even if it appears in the word graph. + :param limit: only return up to this many suggestions. If there are not + enough terms in the field within ``maxdist`` of the given word, the + returned list will be shorter than this number. + :param maxdist: the largest edit distance from the given word to look + at. Values higher than 2 are not very effective or efficient. + :param prefix: require suggestions to share a prefix of this length + with the given word. This is often justifiable since most + misspellings do not involve the first letter of the word. Using a + prefix dramatically decreases the time it takes to generate the + list of words. + """ + + _suggestions = self._suggestions + + heap = [] + for item in _suggestions(text, maxdist, prefix): + # Note that the *higher* scores (item[0]) are better! + if len(heap) < limit: + heappush(heap, item) + elif item > heap[0]: + heapreplace(heap, item) + + sugs = sorted(heap, key=lambda x: (0 - x[0], x[1])) + return [sug for _, sug in sugs] + + def _suggestions(self, text, maxdist, prefix): + """ + Low-level method that yields a series of (score, "suggestion") + tuples. + + :param text: the text to check. + :param maxdist: the maximum edit distance. + :param prefix: require suggestions to share a prefix of this length + with the given word. + """ + + raise NotImplementedError + + +class ReaderCorrector(Corrector): + """ + Suggests corrections based on the content of a field in a reader. + + Ranks suggestions by the edit distance, then by highest to lowest + frequency. + """ + + def __init__(self, reader, fieldname, fieldobj): + self.reader = reader + self.fieldname = fieldname + self.fieldobj = fieldobj + + def _suggestions(self, text, maxdist, prefix): + reader = self.reader + freq = reader.frequency + + fieldname = self.fieldname + fieldobj = reader.schema[fieldname] + sugfield = fieldobj.spelling_fieldname(fieldname) + + for sug in reader.terms_within(sugfield, text, maxdist, prefix=prefix): + # Higher scores are better, so negate the distance and frequency + f = freq(fieldname, sug) or 1 + score = 0 - (maxdist + (1.0 / f * 0.5)) + yield (score, sug) + + +class ListCorrector(Corrector): + """ + Suggests corrections based on the content of a sorted list of strings. + """ + + def __init__(self, wordlist): + self.wordlist = wordlist + + def _suggestions(self, text, maxdist, prefix): + from whoosh.automata.lev import levenshtein_automaton + from whoosh.automata.fsa import find_all_matches + + seen = set() + for i in xrange(1, maxdist + 1): + dfa = levenshtein_automaton(text, maxdist, prefix).to_dfa() + sk = self.Skipper(self.wordlist) + for sug in find_all_matches(dfa, sk): + if sug not in seen: + seen.add(sug) + yield (0 - maxdist), sug + + class Skipper(object): + def __init__(self, data): + self.data = data + self.i = 0 + + def __call__(self, w): + if self.data[self.i] == w: + return w + self.i += 1 + pos = bisect_left(self.data, w, self.i) + if pos < len(self.data): + return self.data[pos] + else: + return None + + +class MultiCorrector(Corrector): + """ + Merges suggestions from a list of sub-correctors. 
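+
+    ``op`` is a callable used to combine the scores when more than one
+    sub-corrector returns the same suggestion; since higher scores are better,
+    ``max`` keeps the best score. A minimal usage sketch, assuming two
+    already-constructed corrector objects ``c1`` and ``c2``::
+
+        multi = MultiCorrector([c1, c2], op=max)
+        print(multi.suggest("speling"))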
+ """ + + def __init__(self, correctors, op): + self.correctors = correctors + self.op = op + + def _suggestions(self, text, maxdist, prefix): + op = self.op + seen = {} + for corr in self.correctors: + for score, sug in corr._suggestions(text, maxdist, prefix): + if sug in seen: + seen[sug] = op(seen[sug], score) + else: + seen[sug] = score + return iteritems(seen) + + +# Query correction + +class Correction(object): + """ + Represents the corrected version of a user query string. Has the + following attributes: + + ``query`` + The corrected :class:`whoosh.query.Query` object. + ``string`` + The corrected user query string. + ``original_query`` + The original :class:`whoosh.query.Query` object that was corrected. + ``original_string`` + The original user query string. + ``tokens`` + A list of token objects representing the corrected words. + + You can also use the :meth:`Correction.format_string` method to reformat the + corrected query string using a :class:`whoosh.highlight.Formatter` class. + For example, to display the corrected query string as HTML with the + changed words emphasized:: + + from whoosh import highlight + + correction = mysearcher.correct_query(q, qstring) + + hf = highlight.HtmlFormatter(classname="change") + html = correction.format_string(hf) + """ + + def __init__(self, q, qstring, corr_q, tokens): + self.original_query = q + self.query = corr_q + self.original_string = qstring + self.tokens = tokens + + if self.original_string: + self.string = self.format_string(highlight.NullFormatter()) + else: + self.string = '' + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, self.query, + self.string) + + def format_string(self, formatter): + """ + Highlights the corrected words in the original query string using the + given :class:`~whoosh.highlight.Formatter`. + + :param formatter: A :class:`whoosh.highlight.Formatter` instance. + :return: the output of the formatter (usually a string). + """ + + if not self.original_string: + return '' + if isinstance(formatter, type): + formatter = formatter() + + fragment = highlight.Fragment(self.original_string, self.tokens) + return formatter.format_fragment(fragment, replace=True) + + +# QueryCorrector objects + +class QueryCorrector(object): + """ + Base class for objects that correct words in a user query. + """ + + def __init__(self, fieldname): + self.fieldname = fieldname + + def correct_query(self, q, qstring): + """ + Returns a :class:`Correction` object representing the corrected + form of the given query. + + :param q: the original :class:`whoosh.query.Query` tree to be + corrected. + :param qstring: the original user query. This may be None if the + original query string is not available, in which case the + ``Correction.string`` attribute will also be None. + :rtype: :class:`Correction` + """ + + raise NotImplementedError + + def field(self): + return self.fieldname + + +class SimpleQueryCorrector(QueryCorrector): + """ + A simple query corrector based on a mapping of field names to + :class:`Corrector` objects, and a list of ``("fieldname", "text")`` tuples + to correct. And terms in the query that appear in list of term tuples are + corrected using the appropriate corrector. + """ + + def __init__(self, correctors, terms, aliases=None, prefix=0, maxdist=2): + """ + :param correctors: a dictionary mapping field names to + :class:`Corrector` objects. + :param terms: a sequence of ``("fieldname", "text")`` tuples + representing terms to be corrected. 
+ :param aliases: a dictionary mapping field names in the query to + field names for spelling suggestions. + :param prefix: suggested replacement words must share this number of + initial characters with the original word. Increasing this even to + just ``1`` can dramatically speed up suggestions, and may be + justifiable since spelling mistakes rarely involve the first + letter of a word. + :param maxdist: the maximum number of "edits" (insertions, deletions, + substitutions, or transpositions of letters) allowed between the + original word and any suggestion. Values higher than ``2`` may be + slow. + """ + + self.correctors = correctors + self.aliases = aliases or {} + self.termset = frozenset(terms) + self.prefix = prefix + self.maxdist = maxdist + + def correct_query(self, q, qstring): + correctors = self.correctors + aliases = self.aliases + termset = self.termset + prefix = self.prefix + maxdist = self.maxdist + + # A list of tokens that were changed by a corrector + corrected_tokens = [] + + # The corrected query tree. We don't need to deepcopy the original + # because we use Query.replace() to find-and-replace the corrected + # words and it returns a copy of the query tree. + corrected_q = q + + # For every word in the original query... + # Note we can't put these in a set, because we must preserve WHERE + # in the query each token occurred so we can format them later + for token in q.all_tokens(): + fname = token.fieldname + aname = aliases.get(fname, fname) + + # If this is one of the words we're supposed to correct... + if (fname, token.text) in termset: + c = correctors[aname] + sugs = c.suggest(token.text, prefix=prefix, maxdist=maxdist) + if sugs: + # This is a "simple" corrector, so we just pick the first + # suggestion :/ + sug = sugs[0] + + # Return a new copy of the original query with this word + # replaced by the correction + corrected_q = corrected_q.replace(token.fieldname, + token.text, sug) + # Add the token to the list of corrected tokens (for the + # formatter to use later) + token.original = token.text + token.text = sug + corrected_tokens.append(token) + + return Correction(q, qstring, corrected_q, corrected_tokens) diff --git a/src/whoosh/support/__init__.py b/src/whoosh/support/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/whoosh/support/base85.py b/src/whoosh/support/base85.py new file mode 100644 index 0000000..87a4df8 --- /dev/null +++ b/src/whoosh/support/base85.py @@ -0,0 +1,103 @@ +""" +This module contains generic base85 encoding and decoding functions. The +whoosh.util.numeric module contains faster variants for encoding and +decoding integers. + +Modified from: +http://paste.lisp.org/display/72815 +""" + +import struct + +from whoosh.compat import xrange + + +# Instead of using the character set from the ascii85 algorithm, I put the +# characters in order so that the encoded text sorts properly (my life would be +# a lot easier if they had just done that from the start) +b85chars = ("!$%&*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "^_abcdefghijklmnopqrstuvwxyz{|}~") +b85dec = {} +for i in range(len(b85chars)): + b85dec[b85chars[i]] = i + + +# Integer encoding and decoding functions + +def to_base85(x, islong=False): + "Encodes the given integer using base 85." + + size = 10 if islong else 5 + rems = "" + for i in xrange(size): + rems = b85chars[x % 85] + rems + x //= 85 + return rems + + +def from_base85(text): + "Decodes the given base 85 text into an integer." 
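+    # Example: from_base85(to_base85(1000)) round-trips back to 1000, since
+    # both helpers use the same sort-friendly 85-character alphabet above.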
+ + acc = 0 + for c in text: + acc = acc * 85 + b85dec[c] + return acc + + +# Bytes encoding and decoding functions + +def b85encode(text, pad=False): + l = len(text) + r = l % 4 + if r: + text += '\0' * (4 - r) + longs = len(text) >> 2 + out = [] + words = struct.unpack('>' + 'L' * longs, text[0:longs * 4]) + for word in words: + rems = [0, 0, 0, 0, 0] + for i in range(4, -1, -1): + rems[i] = b85chars[word % 85] + word //= 85 + out.extend(rems) + + out = ''.join(out) + if pad: + return out + + # Trim padding + olen = l % 4 + if olen: + olen += 1 + olen += l // 4 * 5 + return out[0:olen] + + +def b85decode(text): + l = len(text) + out = [] + for i in range(0, len(text), 5): + chunk = text[i:i + 5] + acc = 0 + for j in range(len(chunk)): + try: + acc = acc * 85 + b85dec[chunk[j]] + except KeyError: + raise TypeError('Bad base85 character at byte %d' % (i + j)) + if acc > 4294967295: + raise OverflowError('Base85 overflow in hunk starting at byte %d' % i) + out.append(acc) + + # Pad final chunk if necessary + cl = l % 5 + if cl: + acc *= 85 ** (5 - cl) + if cl > 1: + acc += 0xffffff >> (cl - 2) * 8 + out[-1] = acc + + out = struct.pack('>' + 'L' * ((l + 4) // 5), *out) + if cl: + out = out[:-(5 - cl)] + + return out diff --git a/src/whoosh/support/bench.py b/src/whoosh/support/bench.py new file mode 100644 index 0000000..22d526e --- /dev/null +++ b/src/whoosh/support/bench.py @@ -0,0 +1,610 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
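+
+# Typical use (a sketch based on Bench.run() below; the Spec subclass, here
+# called MySpec, is a hypothetical name supplied by the benchmark script):
+#
+#     if __name__ == "__main__":
+#         Bench().run(MySpec)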
+ +from __future__ import division +import os.path +from optparse import OptionParser +from shutil import rmtree + +from whoosh import index, qparser, query, scoring +from whoosh.util import now, find_object + +try: + import xappy +except ImportError: + pass +try: + import xapian +except ImportError: + pass +try: + import pysolr +except ImportError: + pass + +try: + from persistent import Persistent + + class ZDoc(Persistent): + def __init__(self, d): + self.__dict__.update(d) +except ImportError: + pass + + +class Module(object): + def __init__(self, bench, options, args): + self.bench = bench + self.options = options + self.args = args + + def __repr__(self): + return self.__class__.__name__ + + def indexer(self, **kwargs): + pass + + def index_document(self, d): + raise NotImplementedError + + def finish(self, **kwargs): + pass + + def _process_result(self, d): + attrname = "process_result_%s" % self.options.lib + if hasattr(self.bench.spec, attrname): + method = getattr(self.bench.spec, attrname) + self._process_result = method + return method(d) + else: + self._process_result = lambda x: x + return d + + def searcher(self): + pass + + def query(self): + raise NotImplementedError + + def find(self, q): + raise NotImplementedError + + def findterms(self, terms): + raise NotImplementedError + + def results(self, r): + for hit in r: + yield self._process_result(hit) + + +class Spec(object): + headline_field = "title" + main_field = "body" + + def __init__(self, options, args): + self.options = options + self.args = args + + def documents(self): + raise NotImplementedError + + def setup(self): + pass + + def print_results(self, ls): + showbody = self.options.showbody + snippets = self.options.snippets + limit = self.options.limit + for i, hit in enumerate(ls): + if i >= limit: + break + + print("%d. 
%s" % (i + 1, hit.get(self.headline_field))) + if snippets: + print(self.show_snippet(hit)) + if showbody: + print(hit.get(self.main_field)) + + +class WhooshModule(Module): + def indexer(self, create=True): + schema = self.bench.spec.whoosh_schema() + path = os.path.join(self.options.dir, "%s_whoosh" + % self.options.indexname) + + if not os.path.exists(path): + os.mkdir(path) + if create: + ix = index.create_in(path, schema) + else: + ix = index.open_dir(path) + + poolclass = None + if self.options.pool: + poolclass = find_object(self.options.pool) + + self.writer = ix.writer(limitmb=int(self.options.limitmb), + poolclass=poolclass, + dir=self.options.tempdir, + procs=int(self.options.procs), + batchsize=int(self.options.batch), + multisegment=self.options.xms) + self._procdoc = None + if hasattr(self.bench.spec, "process_document_whoosh"): + self._procdoc = self.bench.spec.process_document_whoosh + + def index_document(self, d): + _procdoc = self._procdoc + if _procdoc: + _procdoc(d) + self.writer.add_document(**d) + + def finish(self, merge=True, optimize=False): + self.writer.commit(merge=merge, optimize=optimize) + + def searcher(self): + path = os.path.join(self.options.dir, "%s_whoosh" + % self.options.indexname) + ix = index.open_dir(path) + self.srch = ix.searcher(weighting=scoring.PL2()) + self.parser = qparser.QueryParser(self.bench.spec.main_field, + schema=ix.schema) + + def query(self): + qstring = " ".join(self.args).decode("utf-8") + return self.parser.parse(qstring) + + def find(self, q): + return self.srch.search(q, limit=int(self.options.limit), + optimize=self.options.optimize) + + def findterms(self, terms): + limit = int(self.options.limit) + s = self.srch + q = query.Term(self.bench.spec.main_field, None) + for term in terms: + q.text = term + yield s.search(q, limit=limit) + + +class XappyModule(Module): + def indexer(self, **kwargs): + path = os.path.join(self.options.dir, "%s_xappy" + % self.options.indexname) + conn = self.bench.spec.xappy_connection(path) + return conn + + def index_document(self, conn, d): + if hasattr(self.bench, "process_document_xappy"): + self.bench.process_document_xappy(d) + doc = xappy.UnprocessedDocument() + for key, values in d: + if not isinstance(values, list): + values = [values] + for value in values: + doc.fields.append(xappy.Field(key, value)) + conn.add(doc) + + def finish(self, conn): + conn.flush() + + def searcher(self): + path = os.path.join(self.options.dir, "%s_xappy" + % self.options.indexname) + return xappy.SearchConnection(path) + + def query(self, conn): + return conn.query_parse(" ".join(self.args)) + + def find(self, conn, q): + return conn.search(q, 0, int(self.options.limit)) + + def findterms(self, conn, terms): + limit = int(self.options.limit) + for term in terms: + q = conn.query_field(self.bench.spec.main_field, term) + yield conn.search(q, 0, limit) + + def results(self, r): + hf = self.bench.spec.headline_field + mf = self.bench.spec.main_field + for hit in r: + yield self._process_result({hf: hit.data[hf], mf: hit.data[mf]}) + + +class XapianModule(Module): + def indexer(self, **kwargs): + path = os.path.join(self.options.dir, "%s_xapian" + % self.options.indexname) + self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN) + self.ixer = xapian.TermGenerator() + + def index_document(self, d): + if hasattr(self.bench, "process_document_xapian"): + self.bench.process_document_xapian(d) + doc = xapian.Document() + doc.add_value(0, d.get(self.bench.spec.headline_field, "-")) + 
doc.set_data(d[self.bench.spec.main_field]) + self.ixer.set_document(doc) + self.ixer.index_text(d[self.bench.spec.main_field]) + self.database.add_document(doc) + + def finish(self, **kwargs): + self.database.flush() + + def searcher(self): + path = os.path.join(self.options.dir, "%s_xappy" + % self.options.indexname) + self.db = xapian.Database(path) + self.enq = xapian.Enquire(self.db) + self.qp = xapian.QueryParser() + self.qp.set_database(self.db) + + def query(self): + return self.qp.parse_query(" ".join(self.args)) + + def find(self, q): + self.enq.set_query(q) + return self.enq.get_mset(0, int(self.options.limit)) + + def findterms(self, terms): + limit = int(self.options.limit) + for term in terms: + q = self.qp.parse_query(term) + self.enq.set_query(q) + yield self.enq.get_mset(0, limit) + + def results(self, matches): + hf = self.bench.spec.headline_field + mf = self.bench.spec.main_field + for m in matches: + yield self._process_result({hf: m.document.get_value(0), + mf: m.document.get_data()}) + + +class SolrModule(Module): + def indexer(self, **kwargs): + self.solr_doclist = [] + self.conn = pysolr.Solr(self.options.url) + self.conn.delete("*:*") + self.conn.commit() + + def index_document(self, d): + self.solr_doclist.append(d) + if len(self.solr_doclist) >= int(self.options.batch): + self.conn.add(self.solr_doclist, commit=False) + self.solr_doclist = [] + + def finish(self, **kwargs): + if self.solr_doclist: + self.conn.add(self.solr_doclist) + del self.solr_doclist + self.conn.optimize(block=True) + + def searcher(self): + self.solr = pysolr.Solr(self.options.url) + + def query(self): + return " ".join(self.args) + + def find(self, q): + return self.solr.search(q, limit=int(self.options.limit)) + + def findterms(self, terms): + limit = int(self.options.limit) + for term in terms: + yield self.solr.search("body:" + term, limit=limit) + + +class ZcatalogModule(Module): + def indexer(self, **kwargs): + from ZODB.FileStorage import FileStorage # @UnresolvedImport + from ZODB.DB import DB # @UnresolvedImport + from zcatalog import catalog # @UnresolvedImport + from zcatalog import indexes # @UnresolvedImport + import transaction # @UnresolvedImport + + dir = os.path.join(self.options.dir, "%s_zcatalog" + % self.options.indexname) + if os.path.exists(dir): + rmtree(dir) + os.mkdir(dir) + + storage = FileStorage(os.path.join(dir, "index")) + db = DB(storage) + conn = db.open() + + self.cat = catalog.Catalog() + self.bench.spec.zcatalog_setup(self.cat) + conn.root()["cat"] = self.cat + transaction.commit() + + self.zcatalog_count = 0 + + def index_document(self, d): + if hasattr(self.bench, "process_document_zcatalog"): + self.bench.process_document_zcatalog(d) + doc = ZDoc(d) + self.cat.index_doc(doc) + self.zcatalog_count += 1 + if self.zcatalog_count >= 100: + import transaction # @UnresolvedImport + transaction.commit() + self.zcatalog_count = 0 + + def finish(self, **kwargs): + import transaction # @UnresolvedImport + transaction.commit() + del self.zcatalog_count + + def searcher(self): + from ZODB.FileStorage import FileStorage # @UnresolvedImport + from ZODB.DB import DB # @UnresolvedImport + from zcatalog import catalog # @UnresolvedImport + from zcatalog import indexes # @UnresolvedImport + import transaction # @UnresolvedImport + + path = os.path.join(self.options.dir, "%s_zcatalog" + % self.options.indexname, "index") + storage = FileStorage(path) + db = DB(storage) + conn = db.open() + + self.cat = conn.root()["cat"] + + def query(self): + return " ".join(self.args) 
+ + def find(self, q): + return self.cat.searchResults(body=q) + + def findterms(self, terms): + for term in terms: + yield self.cat.searchResults(body=term) + + def results(self, r): + hf = self.bench.spec.headline_field + mf = self.bench.spec.main_field + for hit in r: + # Have to access the attributes for them to be retrieved + yield self._process_result({hf: getattr(hit, hf), + mf: getattr(hit, mf)}) + + +class NucularModule(Module): + def indexer(self, create=True): + import shutil + from nucular import Nucular + + dir = os.path.join(self.options.dir, "%s_nucular" + % self.options.indexname) + if create: + if os.path.exists(dir): + shutil.rmtree(dir) + os.mkdir(dir) + self.archive = Nucular.Nucular(dir) + if create: + self.archive.create() + self.count = 0 + + def index_document(self, d): + try: + self.archive.indexDictionary(str(self.count), d) + except ValueError: + print("d=", d) + raise + self.count += 1 + if not self.count % int(self.options.batch): + t = now() + self.archive.store(lazy=True) + self.indexer(create=False) + + def finish(self, **kwargs): + self.archive.store(lazy=False) + self.archive.aggregateRecent(fast=False, verbose=True) + self.archive.moveTransientToBase(verbose=True) + self.archive.cleanUp() + + def searcher(self): + from nucular import Nucular + + dir = os.path.join(self.options.dir, "%s_nucular" + % self.options.indexname) + self.archive = Nucular.Nucular(dir) + + def query(self): + return " ".join(self.args) + + def find(self, q): + return self.archive.dictionaries(q) + + def findterms(self, terms): + for term in terms: + q = self.archive.Query() + q.anyWord(term) + yield q.resultDictionaries() + + +class Bench(object): + libs = {"whoosh": WhooshModule, "xappy": XappyModule, + "xapian": XapianModule, "solr": SolrModule, + "zcatalog": ZcatalogModule, "nucular": NucularModule} + + def index(self, lib): + print("Indexing with %s..." 
% lib) + + options = self.options + every = None if options.every is None else int(options.every) + merge = options.merge + chunk = int(options.chunk) + skip = int(options.skip) + upto = int(options.upto) + count = 0 + skipc = skip + + starttime = chunkstarttime = now() + + lib.indexer() + + for d in self.spec.documents(): + skipc -= 1 + if not skipc: + lib.index_document(d) + count += 1 + skipc = skip + if chunk and not count % chunk: + t = now() + sofar = t - starttime + print("Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s" % (count, t - chunkstarttime, chunk, sofar, count / sofar)) + chunkstarttime = t + if count > upto: + break + if every and not count % every: + print("----Commit") + lib.finish(merge=merge) + lib.indexer(create=False) + + spooltime = now() + print("Spool time:", spooltime - starttime) + lib.finish(merge=merge) + committime = now() + print("Commit time:", committime - spooltime) + totaltime = committime - starttime + print("Total time to index %d documents: %0.3f secs (%0.3f minutes)" % (count, totaltime, totaltime / 60.0)) + print("Indexed %0.3f docs/s" % (count / totaltime)) + + def search(self, lib): + lib.searcher() + + t = now() + q = lib.query() + print("Query:", q) + r = lib.find(q) + print("Search time:", now() - t) + + t = now() + self.spec.print_results(lib.results(r)) + print("Print time:", now() - t) + + def search_file(self, lib): + f = open(self.options.termfile, "rb") + terms = [line.strip() for line in f] + f.close() + + print("Searching %d terms with %s" % (len(terms), lib)) + lib.searcher() + starttime = now() + for r in lib.findterms(terms): + pass + searchtime = now() - starttime + print("Search time:", searchtime, "searches/s:", float(len(terms)) / searchtime) + + def _parser(self, name): + p = OptionParser() + p.add_option("-x", "--lib", dest="lib", + help="Name of the library to use to index/search.", + default="whoosh") + p.add_option("-d", "--dir", dest="dir", metavar="DIRNAME", + help="Directory in which to store index.", default=".") + p.add_option("-s", "--setup", dest="setup", action="store_true", + help="Set up any support files or caches.", default=False) + p.add_option("-i", "--index", dest="index", action="store_true", + help="Index the documents.", default=False) + p.add_option("-n", "--name", dest="indexname", metavar="PREFIX", + help="Index name prefix.", default="%s_index" % name) + p.add_option("-U", "--url", dest="url", metavar="URL", + help="Solr URL", default="http://localhost:8983/solr") + p.add_option("-m", "--mb", dest="limitmb", + help="Max. 
memory usage, in MB", default="128") + p.add_option("-c", "--chunk", dest="chunk", + help="Number of documents to index between progress messages.", + default=1000) + p.add_option("-B", "--batch", dest="batch", + help="Batch size for batch adding documents.", + default=1000) + p.add_option("-k", "--skip", dest="skip", metavar="N", + help="Index every Nth document.", default=1) + p.add_option("-e", "--commit-every", dest="every", metavar="NUM", + help="Commit every NUM documents", default=None) + p.add_option("-M", "--no-merge", dest="merge", action="store_false", + help="Don't merge segments when doing multiple commits", + default=True) + p.add_option("-u", "--upto", dest="upto", metavar="N", + help="Index up to this document number.", default=600000) + p.add_option("-p", "--procs", dest="procs", metavar="NUMBER", + help="Number of processors to use.", default=0) + p.add_option("-l", "--limit", dest="limit", metavar="N", + help="Maximum number of search results to retrieve.", + default=10) + p.add_option("-b", "--body", dest="showbody", action="store_true", + help="Show the body text in search results.", + default=False) + p.add_option("-g", "--gen", dest="generate", metavar="N", + help="Generate a list at most N terms present in all libraries.", + default=None) + p.add_option("-f", "--file", dest="termfile", metavar="FILENAME", + help="Search using the list of terms in this file.", + default=None) + p.add_option("-t", "--tempdir", dest="tempdir", metavar="DIRNAME", + help="Whoosh temp dir", default=None) + p.add_option("-P", "--pool", dest="pool", metavar="CLASSNAME", + help="Whoosh pool class", default=None) + p.add_option("-X", "--xms", dest="xms", action="store_true", + help="Experimental Whoosh feature", default=False) + p.add_option("-Z", "--storebody", dest="storebody", action="store_true", + help="Store the body text in index", default=False) + p.add_option("-q", "--snippets", dest="snippets", action="store_true", + help="Show highlighted snippets", default=False) + p.add_option("-O", "--no-optimize", dest="optimize", action="store_false", + help="Turn off searcher optimization", default=True) + + return p + + def run(self, specclass): + parser = self._parser(specclass.name) + options, args = parser.parse_args() + self.options = options + self.args = args + + if options.lib not in self.libs: + raise Exception("Unknown library: %r" % options.lib) + lib = self.libs[options.lib](self, options, args) + + self.spec = specclass(options, args) + + if options.setup: + self.spec.setup() + + action = self.search + if options.index: + action = self.index + if options.termfile: + action = self.search_file + if options.generate: + action = self.generate_search_file + + action(lib) diff --git a/src/whoosh/support/charset.py b/src/whoosh/support/charset.py new file mode 100644 index 0000000..29d936a --- /dev/null +++ b/src/whoosh/support/charset.py @@ -0,0 +1,1379 @@ +# coding=utf-8 + +"""This module contains tools for working with Sphinx charset table files. These files +are useful for doing case and accent folding. +See :class:`whoosh.analysis.CharsetTokenizer` and :class:`whoosh.analysis.CharsetFilter`. +""" + +from collections import defaultdict +import re + +from whoosh.compat import izip, u, iteritems, unichr, xrange + +# This is a straightforward accent-folding charset taken from Carlos Bueno's +# article "Accent Folding for Auto-Complete", for use with CharsetFilter. 
+# +# http://www.alistapart.com/articles/accent-folding-for-auto-complete/ +# +# See the article for information and caveats. The code is lifted directly +# from here: +# +# http://github.com/aristus/accent-folding/blob/master/accent_fold.py + +accent_map = { + u('H'): u('h'), # H -> h + u('I'): u('i'), # I -> i + u('J'): u('j'), # J -> j + u('N'): u('n'), # N -> n + u('P'): u('p'), # P -> p + u('S'): u('s'), # S -> s + u('T'): u('t'), # T -> t + u('W'): u('w'), # W -> w + u('Y'): u('y'), # Y -> y + u('i'): u('i'), # i -> i + u('n'): u('n'), # n -> n + u('p'): u('p'), # p -> p + u('s'): u('s'), # s -> s + u('\xc0'): u('a'), # À -> a + u('\xc1'): u('a'), # Á -> a + u('\xc2'): u('a'), # Â -> a + u('\xc3'): u('a'), # Ã -> a + u('\xc4'): u('a'), # Ä -> a + u('\xc5'): u('a'), # Å -> a + u('\xc7'): u('c'), # Ç -> c + u('\xc8'): u('e'), # È -> e + u('\xc9'): u('e'), # É -> e + u('\xca'): u('e'), # Ê -> e + u('\xcb'): u('e'), # Ë -> e + u('\xcc'): u('i'), # Ì -> i + u('\xcd'): u('i'), # Í -> i + u('\xce'): u('i'), # Î -> i + u('\xcf'): u('i'), # Ï -> i + u('\xd1'): u('n'), # Ñ -> n + u('\xd2'): u('o'), # Ò -> o + u('\xd3'): u('o'), # Ó -> o + u('\xd4'): u('o'), # Ô -> o + u('\xd5'): u('o'), # Õ -> o + u('\xd6'): u('o'), # Ö -> o + u('\xd8'): u('o'), # Ø -> o + u('\xd9'): u('u'), # Ù -> u + u('\xda'): u('u'), # Ú -> u + u('\xdb'): u('u'), # Û -> u + u('\xdc'): u('u'), # Ü -> u + u('\xdd'): u('y'), # Ý -> y + u('\xde'): u('t'), # Þ -> t + u('\xdf'): u('s'), # ß -> s + u('\xe0'): u('a'), # à -> a + u('\xe1'): u('a'), # á -> a + u('\xe2'): u('a'), # â -> a + u('\xe3'): u('a'), # ã -> a + u('\xe4'): u('a'), # ä -> a + u('\xe5'): u('a'), # å -> a + u('\xe7'): u('c'), # ç -> c + u('\xe8'): u('e'), # è -> e + u('\xe9'): u('e'), # é -> e + u('\xea'): u('e'), # ê -> e + u('\xeb'): u('e'), # ë -> e + u('\xec'): u('i'), # ì -> i + u('\xed'): u('i'), # í -> i + u('\xee'): u('i'), # î -> i + u('\xef'): u('i'), # ï -> i + u('\xf0'): u('d'), # ð -> d + u('\xf1'): u('n'), # ñ -> n + u('\xf2'): u('o'), # ò -> o + u('\xf3'): u('o'), # ó -> o + u('\xf4'): u('o'), # ô -> o + u('\xf5'): u('o'), # õ -> o + u('\xf6'): u('o'), # ö -> o + u('\xf8'): u('o'), # ø -> o + u('\xf9'): u('u'), # ù -> u + u('\xfa'): u('u'), # ú -> u + u('\xfb'): u('u'), # û -> u + u('\xfc'): u('u'), # ü -> u + u('\xfd'): u('y'), # ý -> y + u('\xfe'): u('t'), # þ -> t + u('\xff'): u('y'), # ÿ -> y + u('\u0100'): u('a'), # Ā -> a + u('\u0101'): u('a'), # ā -> a + u('\u0102'): u('a'), # Ă -> a + u('\u0103'): u('a'), # ă -> a + u('\u0104'): u('a'), # Ą -> a + u('\u0105'): u('a'), # ą -> a + u('\u0106'): u('c'), # Ć -> c + u('\u0107'): u('c'), # ć -> c + u('\u0108'): u('c'), # Ĉ -> c + u('\u0109'): u('c'), # ĉ -> c + u('\u010a'): u('c'), # Ċ -> c + u('\u010b'): u('c'), # ċ -> c + u('\u010c'): u('c'), # Č -> c + u('\u010d'): u('c'), # č -> c + u('\u010e'): u('d'), # Ď -> d + u('\u010f'): u('d'), # ď -> d + u('\u0110'): u('d'), # Đ -> d + u('\u0111'): u('d'), # đ -> d + u('\u0112'): u('e'), # Ē -> e + u('\u0113'): u('e'), # ē -> e + u('\u0114'): u('e'), # Ĕ -> e + u('\u0115'): u('e'), # ĕ -> e + u('\u0116'): u('e'), # Ė -> e + u('\u0117'): u('e'), # ė -> e + u('\u0118'): u('e'), # Ę -> e + u('\u0119'): u('e'), # ę -> e + u('\u011a'): u('e'), # Ě -> e + u('\u011b'): u('e'), # ě -> e + u('\u011c'): u('g'), # Ĝ -> g + u('\u011d'): u('g'), # ĝ -> g + u('\u011e'): u('g'), # Ğ -> g + u('\u011f'): u('g'), # ğ -> g + u('\u0120'): u('g'), # Ġ -> g + u('\u0121'): u('g'), # ġ -> g + u('\u0122'): u('g'), # Ģ -> g + u('\u0123'): u('g'), # ģ -> g + u('\u0124'): 
u('h'), # Ĥ -> h + u('\u0125'): u('h'), # ĥ -> h + u('\u0126'): u('h'), # Ħ -> h + u('\u0127'): u('h'), # ħ -> h + u('\u0128'): u('i'), # Ĩ -> i + u('\u0129'): u('i'), # ĩ -> i + u('\u012a'): u('i'), # Ī -> i + u('\u012b'): u('i'), # ī -> i + u('\u012c'): u('i'), # Ĭ -> i + u('\u012d'): u('i'), # ĭ -> i + u('\u012e'): u('i'), # Į -> i + u('\u012f'): u('i'), # į -> i + u('\u0130'): u('i'), # İ -> i + u('\u0131'): u('i'), # ı -> i + u('\u0134'): u('j'), # Ĵ -> j + u('\u0135'): u('j'), # ĵ -> j + u('\u0136'): u('k'), # Ķ -> k + u('\u0137'): u('k'), # ķ -> k + u('\u0139'): u('a'), # Ĺ -> a + u('\u013a'): u('l'), # ĺ -> l + u('\u013b'): u('l'), # Ļ -> l + u('\u013c'): u('l'), # ļ -> l + u('\u013d'): u('l'), # Ľ -> l + u('\u013e'): u('l'), # ľ -> l + u('\u013f'): u('l'), # Ŀ -> l + u('\u0140'): u('l'), # ŀ -> l + u('\u0141'): u('l'), # Ł -> l + u('\u0142'): u('l'), # ł -> l + u('\u0143'): u('n'), # Ń -> n + u('\u0144'): u('n'), # ń -> n + u('\u0145'): u('n'), # Ņ -> n + u('\u0146'): u('n'), # ņ -> n + u('\u0147'): u('n'), # Ň -> n + u('\u0148'): u('n'), # ň -> n + u('\u014c'): u('o'), # Ō -> o + u('\u014d'): u('o'), # ō -> o + u('\u014e'): u('o'), # Ŏ -> o + u('\u014f'): u('o'), # ŏ -> o + u('\u0150'): u('o'), # Ő -> o + u('\u0151'): u('o'), # ő -> o + u('\u0154'): u('r'), # Ŕ -> r + u('\u0155'): u('r'), # ŕ -> r + u('\u0156'): u('r'), # Ŗ -> r + u('\u0157'): u('r'), # ŗ -> r + u('\u0158'): u('r'), # Ř -> r + u('\u0159'): u('r'), # ř -> r + u('\u015a'): u('s'), # Ś -> s + u('\u015b'): u('s'), # ś -> s + u('\u015c'): u('s'), # Ŝ -> s + u('\u015d'): u('s'), # ŝ -> s + u('\u015e'): u('s'), # Ş -> s + u('\u015f'): u('s'), # ş -> s + u('\u0160'): u('s'), # Š -> s + u('\u0161'): u('s'), # š -> s + u('\u0162'): u('t'), # Ţ -> t + u('\u0163'): u('t'), # ţ -> t + u('\u0164'): u('t'), # Ť -> t + u('\u0165'): u('t'), # ť -> t + u('\u0166'): u('t'), # Ŧ -> t + u('\u0167'): u('t'), # ŧ -> t + u('\u0168'): u('u'), # Ũ -> u + u('\u0169'): u('u'), # ũ -> u + u('\u016a'): u('u'), # Ū -> u + u('\u016b'): u('u'), # ū -> u + u('\u016c'): u('u'), # Ŭ -> u + u('\u016d'): u('u'), # ŭ -> u + u('\u016e'): u('u'), # Ů -> u + u('\u016f'): u('u'), # ů -> u + u('\u0170'): u('u'), # Ű -> u + u('\u0171'): u('u'), # ű -> u + u('\u0172'): u('u'), # Ų -> u + u('\u0173'): u('u'), # ų -> u + u('\u0174'): u('w'), # Ŵ -> w + u('\u0175'): u('w'), # ŵ -> w + u('\u0176'): u('y'), # Ŷ -> y + u('\u0177'): u('y'), # ŷ -> y + u('\u0178'): u('y'), # Ÿ -> y + u('\u0179'): u('z'), # Ź -> z + u('\u017a'): u('z'), # ź -> z + u('\u017b'): u('z'), # Ż -> z + u('\u017c'): u('z'), # ż -> z + u('\u017d'): u('z'), # Ž -> z + u('\u017e'): u('z'), # ž -> z + u('\u0180'): u('b'), # ƀ -> b + u('\u0181'): u('b'), # Ɓ -> b + u('\u0182'): u('b'), # Ƃ -> b + u('\u0183'): u('b'), # ƃ -> b + u('\u0187'): u('c'), # Ƈ -> c + u('\u0188'): u('c'), # ƈ -> c + u('\u0189'): u('d'), # Ɖ -> d + u('\u018a'): u('d'), # Ɗ -> d + u('\u018b'): u('d'), # Ƌ -> d + u('\u018c'): u('d'), # ƌ -> d + u('\u018e'): u('e'), # Ǝ -> e + u('\u018f'): u('e'), # Ə -> e + u('\u0191'): u('f'), # Ƒ -> f + u('\u0192'): u('f'), # ƒ -> f + u('\u0193'): u('g'), # Ɠ -> g + u('\u0197'): u('i'), # Ɨ -> i + u('\u0198'): u('k'), # Ƙ -> k + u('\u0199'): u('k'), # ƙ -> k + u('\u019a'): u('l'), # ƚ -> l + u('\u019d'): u('n'), # Ɲ -> n + u('\u019e'): u('n'), # ƞ -> n + u('\u019f'): u('o'), # Ɵ -> o + u('\u01a0'): u('o'), # Ơ -> o + u('\u01a1'): u('o'), # ơ -> o + u('\u01a4'): u('p'), # Ƥ -> p + u('\u01a5'): u('p'), # ƥ -> p + u('\u01ab'): u('t'), # ƫ -> t + u('\u01ac'): u('t'), # Ƭ -> t + u('\u01ad'): 
u('t'), # ƭ -> t + u('\u01ae'): u('t'), # Ʈ -> t + u('\u01af'): u('u'), # Ư -> u + u('\u01b0'): u('u'), # ư -> u + u('\u01b2'): u('v'), # Ʋ -> v + u('\u01b3'): u('y'), # Ƴ -> y + u('\u01b4'): u('y'), # ƴ -> y + u('\u01b5'): u('z'), # Ƶ -> z + u('\u01b6'): u('z'), # ƶ -> z + u('\u01ba'): u('z'), # ƺ -> z + u('\u01cd'): u('a'), # Ǎ -> a + u('\u01ce'): u('a'), # ǎ -> a + u('\u01cf'): u('i'), # Ǐ -> i + u('\u01d0'): u('i'), # ǐ -> i + u('\u01d1'): u('o'), # Ǒ -> o + u('\u01d2'): u('o'), # ǒ -> o + u('\u01d3'): u('u'), # Ǔ -> u + u('\u01d4'): u('u'), # ǔ -> u + u('\u01d5'): u('u'), # Ǖ -> u + u('\u01d6'): u('u'), # ǖ -> u + u('\u01d7'): u('u'), # Ǘ -> u + u('\u01d8'): u('u'), # ǘ -> u + u('\u01d9'): u('u'), # Ǚ -> u + u('\u01da'): u('u'), # ǚ -> u + u('\u01db'): u('u'), # Ǜ -> u + u('\u01dc'): u('u'), # ǜ -> u + u('\u01dd'): u('e'), # ǝ -> e + u('\u01de'): u('a'), # Ǟ -> a + u('\u01df'): u('a'), # ǟ -> a + u('\u01e0'): u('a'), # Ǡ -> a + u('\u01e1'): u('a'), # ǡ -> a + u('\u01e2'): u('a'), # Ǣ -> a + u('\u01e3'): u('a'), # ǣ -> a + u('\u01e4'): u('g'), # Ǥ -> g + u('\u01e5'): u('g'), # ǥ -> g + u('\u01e6'): u('g'), # Ǧ -> g + u('\u01e7'): u('g'), # ǧ -> g + u('\u01e8'): u('k'), # Ǩ -> k + u('\u01e9'): u('k'), # ǩ -> k + u('\u01ea'): u('o'), # Ǫ -> o + u('\u01eb'): u('o'), # ǫ -> o + u('\u01ec'): u('o'), # Ǭ -> o + u('\u01ed'): u('o'), # ǭ -> o + u('\u01ee'): u('z'), # Ǯ -> z + u('\u01ef'): u('z'), # ǯ -> z + u('\u01f0'): u('j'), # ǰ -> j + u('\u01f4'): u('g'), # Ǵ -> g + u('\u01f5'): u('g'), # ǵ -> g + u('\u01f8'): u('n'), # Ǹ -> n + u('\u01f9'): u('n'), # ǹ -> n + u('\u01fa'): u('a'), # Ǻ -> a + u('\u01fb'): u('a'), # ǻ -> a + u('\u01fc'): u('a'), # Ǽ -> a + u('\u01fd'): u('a'), # ǽ -> a + u('\u01fe'): u('o'), # Ǿ -> o + u('\u01ff'): u('o'), # ǿ -> o + u('\u0200'): u('a'), # Ȁ -> a + u('\u0201'): u('a'), # ȁ -> a + u('\u0202'): u('a'), # Ȃ -> a + u('\u0203'): u('a'), # ȃ -> a + u('\u0204'): u('e'), # Ȅ -> e + u('\u0205'): u('e'), # ȅ -> e + u('\u0206'): u('e'), # Ȇ -> e + u('\u0207'): u('e'), # ȇ -> e + u('\u0208'): u('i'), # Ȉ -> i + u('\u0209'): u('i'), # ȉ -> i + u('\u020a'): u('i'), # Ȋ -> i + u('\u020b'): u('i'), # ȋ -> i + u('\u020c'): u('o'), # Ȍ -> o + u('\u020d'): u('o'), # ȍ -> o + u('\u020e'): u('o'), # Ȏ -> o + u('\u020f'): u('o'), # ȏ -> o + u('\u0210'): u('r'), # Ȑ -> r + u('\u0211'): u('r'), # ȑ -> r + u('\u0212'): u('r'), # Ȓ -> r + u('\u0213'): u('r'), # ȓ -> r + u('\u0214'): u('u'), # Ȕ -> u + u('\u0215'): u('u'), # ȕ -> u + u('\u0216'): u('u'), # Ȗ -> u + u('\u0217'): u('u'), # ȗ -> u + u('\u0218'): u('s'), # Ș -> s + u('\u0219'): u('s'), # ș -> s + u('\u021a'): u('t'), # Ț -> t + u('\u021b'): u('t'), # ț -> t + u('\u021e'): u('h'), # Ȟ -> h + u('\u021f'): u('h'), # ȟ -> h + u('\u0220'): u('n'), # Ƞ -> n + u('\u0221'): u('d'), # ȡ -> d + u('\u0224'): u('z'), # Ȥ -> z + u('\u0225'): u('z'), # ȥ -> z + u('\u0226'): u('a'), # Ȧ -> a + u('\u0227'): u('a'), # ȧ -> a + u('\u0228'): u('e'), # Ȩ -> e + u('\u0229'): u('e'), # ȩ -> e + u('\u022a'): u('o'), # Ȫ -> o + u('\u022b'): u('o'), # ȫ -> o + u('\u022c'): u('o'), # Ȭ -> o + u('\u022d'): u('o'), # ȭ -> o + u('\u022e'): u('o'), # Ȯ -> o + u('\u022f'): u('o'), # ȯ -> o + u('\u0230'): u('o'), # Ȱ -> o + u('\u0231'): u('o'), # ȱ -> o + u('\u0232'): u('y'), # Ȳ -> y + u('\u0233'): u('y'), # ȳ -> y + u('\u0234'): u('l'), # ȴ -> l + u('\u0235'): u('n'), # ȵ -> n + u('\u0236'): u('t'), # ȶ -> t + u('\u0237'): u('j'), # ȷ -> j + u('\u023a'): u('a'), # Ⱥ -> a + u('\u023b'): u('c'), # Ȼ -> c + u('\u023c'): u('c'), # ȼ -> c + u('\u023d'): 
u('l'), # Ƚ -> l + u('\u023e'): u('t'), # Ⱦ -> t + u('\u0243'): u('b'), # Ƀ -> b + u('\u0244'): u('u'), # Ʉ -> u + u('\u0246'): u('e'), # Ɇ -> e + u('\u0247'): u('e'), # ɇ -> e + u('\u0248'): u('j'), # Ɉ -> j + u('\u0249'): u('j'), # ɉ -> j + u('\u024a'): u('q'), # Ɋ -> q + u('\u024b'): u('q'), # ɋ -> q + u('\u024c'): u('r'), # Ɍ -> r + u('\u024d'): u('r'), # ɍ -> r + u('\u024e'): u('y'), # Ɏ -> y + u('\u024f'): u('y'), # ɏ -> y + u('\u0253'): u('b'), # ɓ -> b + u('\u0255'): u('c'), # ɕ -> c + u('\u0256'): u('d'), # ɖ -> d + u('\u0257'): u('d'), # ɗ -> d + u('\u025a'): u('e'), # ɚ -> e + u('\u025d'): u('e'), # ɝ -> e + u('\u025f'): u('j'), # ɟ -> j + u('\u0260'): u('g'), # ɠ -> g + u('\u0268'): u('i'), # ɨ -> i + u('\u026b'): u('l'), # ɫ -> l + u('\u026c'): u('l'), # ɬ -> l + u('\u026d'): u('l'), # ɭ -> l + u('\u0271'): u('m'), # ɱ -> m + u('\u0272'): u('n'), # ɲ -> n + u('\u0273'): u('n'), # ɳ -> n + u('\u0275'): u('o'), # ɵ -> o + u('\u027c'): u('r'), # ɼ -> r + u('\u027d'): u('r'), # ɽ -> r + u('\u027e'): u('r'), # ɾ -> r + u('\u0282'): u('s'), # ʂ -> s + u('\u0284'): u('j'), # ʄ -> j + u('\u0288'): u('t'), # ʈ -> t + u('\u0289'): u('u'), # ʉ -> u + u('\u028b'): u('v'), # ʋ -> v + u('\u028f'): u('y'), # ʏ -> y + u('\u0290'): u('z'), # ʐ -> z + u('\u0291'): u('z'), # ʑ -> z + u('\u029d'): u('j'), # ʝ -> j + u('\u02a0'): u('q'), # ʠ -> q + u('\u0303'): u('p'), # ̃ -> p + u('\u0308'): u('t'), # ̈ -> t + u('\u030a'): u('y'), # ̊ -> y + u('\u030c'): u('j'), # ̌ -> j + u('\u0323'): u('l'), # ̣ -> l + u('\u0329'): u('s'), # ̩ -> s + u('\u0331'): u('h'), # ̱ -> h + u('\u1d6c'): u('b'), # ᵬ -> b + u('\u1d6d'): u('d'), # ᵭ -> d + u('\u1d6e'): u('f'), # ᵮ -> f + u('\u1d72'): u('r'), # ᵲ -> r + u('\u1d73'): u('r'), # ᵳ -> r + u('\u1d75'): u('t'), # ᵵ -> t + u('\u1e00'): u('a'), # Ḁ -> a + u('\u1e01'): u('a'), # ḁ -> a + u('\u1e02'): u('b'), # Ḃ -> b + u('\u1e03'): u('b'), # ḃ -> b + u('\u1e04'): u('b'), # Ḅ -> b + u('\u1e05'): u('b'), # ḅ -> b + u('\u1e06'): u('b'), # Ḇ -> b + u('\u1e07'): u('b'), # ḇ -> b + u('\u1e08'): u('c'), # Ḉ -> c + u('\u1e09'): u('c'), # ḉ -> c + u('\u1e0a'): u('d'), # Ḋ -> d + u('\u1e0b'): u('d'), # ḋ -> d + u('\u1e0c'): u('d'), # Ḍ -> d + u('\u1e0d'): u('d'), # ḍ -> d + u('\u1e0e'): u('d'), # Ḏ -> d + u('\u1e0f'): u('d'), # ḏ -> d + u('\u1e10'): u('d'), # Ḑ -> d + u('\u1e11'): u('d'), # ḑ -> d + u('\u1e12'): u('d'), # Ḓ -> d + u('\u1e13'): u('d'), # ḓ -> d + u('\u1e14'): u('e'), # Ḕ -> e + u('\u1e15'): u('e'), # ḕ -> e + u('\u1e16'): u('e'), # Ḗ -> e + u('\u1e17'): u('e'), # ḗ -> e + u('\u1e18'): u('e'), # Ḙ -> e + u('\u1e19'): u('e'), # ḙ -> e + u('\u1e1a'): u('e'), # Ḛ -> e + u('\u1e1b'): u('e'), # ḛ -> e + u('\u1e1c'): u('e'), # Ḝ -> e + u('\u1e1d'): u('e'), # ḝ -> e + u('\u1e1e'): u('f'), # Ḟ -> f + u('\u1e1f'): u('f'), # ḟ -> f + u('\u1e20'): u('g'), # Ḡ -> g + u('\u1e21'): u('g'), # ḡ -> g + u('\u1e22'): u('h'), # Ḣ -> h + u('\u1e23'): u('h'), # ḣ -> h + u('\u1e24'): u('h'), # Ḥ -> h + u('\u1e25'): u('h'), # ḥ -> h + u('\u1e26'): u('h'), # Ḧ -> h + u('\u1e27'): u('h'), # ḧ -> h + u('\u1e28'): u('h'), # Ḩ -> h + u('\u1e29'): u('h'), # ḩ -> h + u('\u1e2a'): u('h'), # Ḫ -> h + u('\u1e2b'): u('h'), # ḫ -> h + u('\u1e2c'): u('i'), # Ḭ -> i + u('\u1e2d'): u('i'), # ḭ -> i + u('\u1e2e'): u('i'), # Ḯ -> i + u('\u1e2f'): u('i'), # ḯ -> i + u('\u1e30'): u('k'), # Ḱ -> k + u('\u1e31'): u('k'), # ḱ -> k + u('\u1e32'): u('k'), # Ḳ -> k + u('\u1e33'): u('k'), # ḳ -> k + u('\u1e34'): u('k'), # Ḵ -> k + u('\u1e35'): u('k'), # ḵ -> k + u('\u1e36'): u('l'), # Ḷ -> l + u('\u1e37'): 
u('l'), # ḷ -> l + u('\u1e38'): u('l'), # Ḹ -> l + u('\u1e39'): u('l'), # ḹ -> l + u('\u1e3a'): u('l'), # Ḻ -> l + u('\u1e3b'): u('l'), # ḻ -> l + u('\u1e3c'): u('l'), # Ḽ -> l + u('\u1e3d'): u('l'), # ḽ -> l + u('\u1e3e'): u('m'), # Ḿ -> m + u('\u1e3f'): u('m'), # ḿ -> m + u('\u1e40'): u('m'), # Ṁ -> m + u('\u1e41'): u('m'), # ṁ -> m + u('\u1e42'): u('m'), # Ṃ -> m + u('\u1e43'): u('m'), # ṃ -> m + u('\u1e44'): u('n'), # Ṅ -> n + u('\u1e45'): u('n'), # ṅ -> n + u('\u1e46'): u('n'), # Ṇ -> n + u('\u1e47'): u('n'), # ṇ -> n + u('\u1e48'): u('n'), # Ṉ -> n + u('\u1e49'): u('n'), # ṉ -> n + u('\u1e4a'): u('n'), # Ṋ -> n + u('\u1e4b'): u('n'), # ṋ -> n + u('\u1e4c'): u('o'), # Ṍ -> o + u('\u1e4d'): u('o'), # ṍ -> o + u('\u1e4e'): u('o'), # Ṏ -> o + u('\u1e4f'): u('o'), # ṏ -> o + u('\u1e50'): u('o'), # Ṑ -> o + u('\u1e51'): u('o'), # ṑ -> o + u('\u1e52'): u('o'), # Ṓ -> o + u('\u1e53'): u('o'), # ṓ -> o + u('\u1e54'): u('p'), # Ṕ -> p + u('\u1e55'): u('p'), # ṕ -> p + u('\u1e56'): u('p'), # Ṗ -> p + u('\u1e57'): u('p'), # ṗ -> p + u('\u1e58'): u('r'), # Ṙ -> r + u('\u1e59'): u('r'), # ṙ -> r + u('\u1e5a'): u('r'), # Ṛ -> r + u('\u1e5b'): u('r'), # ṛ -> r + u('\u1e5c'): u('r'), # Ṝ -> r + u('\u1e5d'): u('r'), # ṝ -> r + u('\u1e5e'): u('r'), # Ṟ -> r + u('\u1e5f'): u('r'), # ṟ -> r + u('\u1e60'): u('s'), # Ṡ -> s + u('\u1e61'): u('s'), # ṡ -> s + u('\u1e62'): u('s'), # Ṣ -> s + u('\u1e63'): u('s'), # ṣ -> s + u('\u1e64'): u('s'), # Ṥ -> s + u('\u1e65'): u('s'), # ṥ -> s + u('\u1e66'): u('s'), # Ṧ -> s + u('\u1e67'): u('s'), # ṧ -> s + u('\u1e68'): u('s'), # Ṩ -> s + u('\u1e69'): u('s'), # ṩ -> s + u('\u1e6a'): u('t'), # Ṫ -> t + u('\u1e6b'): u('t'), # ṫ -> t + u('\u1e6c'): u('t'), # Ṭ -> t + u('\u1e6d'): u('t'), # ṭ -> t + u('\u1e6e'): u('t'), # Ṯ -> t + u('\u1e6f'): u('t'), # ṯ -> t + u('\u1e70'): u('t'), # Ṱ -> t + u('\u1e71'): u('t'), # ṱ -> t + u('\u1e72'): u('u'), # Ṳ -> u + u('\u1e73'): u('u'), # ṳ -> u + u('\u1e74'): u('u'), # Ṵ -> u + u('\u1e75'): u('u'), # ṵ -> u + u('\u1e76'): u('u'), # Ṷ -> u + u('\u1e77'): u('u'), # ṷ -> u + u('\u1e78'): u('u'), # Ṹ -> u + u('\u1e79'): u('u'), # ṹ -> u + u('\u1e7a'): u('u'), # Ṻ -> u + u('\u1e7b'): u('u'), # ṻ -> u + u('\u1e7c'): u('v'), # Ṽ -> v + u('\u1e7d'): u('v'), # ṽ -> v + u('\u1e7e'): u('v'), # Ṿ -> v + u('\u1e7f'): u('v'), # ṿ -> v + u('\u1e80'): u('w'), # Ẁ -> w + u('\u1e81'): u('w'), # ẁ -> w + u('\u1e82'): u('w'), # Ẃ -> w + u('\u1e83'): u('w'), # ẃ -> w + u('\u1e84'): u('w'), # Ẅ -> w + u('\u1e85'): u('w'), # ẅ -> w + u('\u1e86'): u('w'), # Ẇ -> w + u('\u1e87'): u('w'), # ẇ -> w + u('\u1e88'): u('w'), # Ẉ -> w + u('\u1e89'): u('w'), # ẉ -> w + u('\u1e8a'): u('x'), # Ẋ -> x + u('\u1e8b'): u('x'), # ẋ -> x + u('\u1e8c'): u('x'), # Ẍ -> x + u('\u1e8d'): u('x'), # ẍ -> x + u('\u1e8e'): u('y'), # Ẏ -> y + u('\u1e8f'): u('y'), # ẏ -> y + u('\u1e90'): u('z'), # Ẑ -> z + u('\u1e91'): u('z'), # ẑ -> z + u('\u1e92'): u('z'), # Ẓ -> z + u('\u1e93'): u('z'), # ẓ -> z + u('\u1e94'): u('z'), # Ẕ -> z + u('\u1e95'): u('z'), # ẕ -> z + u('\u1e96'): u('h'), # ẖ -> h + u('\u1e97'): u('t'), # ẗ -> t + u('\u1e98'): u('w'), # ẘ -> w + u('\u1e99'): u('y'), # ẙ -> y + u('\u1e9a'): u('a'), # ẚ -> a + u('\u1e9b'): u('s'), # ẛ -> s + u('\u1ea0'): u('a'), # Ạ -> a + u('\u1ea1'): u('a'), # ạ -> a + u('\u1ea2'): u('a'), # Ả -> a + u('\u1ea3'): u('a'), # ả -> a + u('\u1ea4'): u('a'), # Ấ -> a + u('\u1ea5'): u('a'), # ấ -> a + u('\u1ea6'): u('a'), # Ầ -> a + u('\u1ea7'): u('a'), # ầ -> a + u('\u1ea8'): u('a'), # Ẩ -> a + u('\u1ea9'): u('a'), # ẩ -> a + u('\u1eaa'): 
u('a'), # Ẫ -> a + u('\u1eab'): u('a'), # ẫ -> a + u('\u1eac'): u('a'), # Ậ -> a + u('\u1ead'): u('a'), # ậ -> a + u('\u1eae'): u('a'), # Ắ -> a + u('\u1eaf'): u('a'), # ắ -> a + u('\u1eb0'): u('a'), # Ằ -> a + u('\u1eb1'): u('a'), # ằ -> a + u('\u1eb2'): u('a'), # Ẳ -> a + u('\u1eb3'): u('a'), # ẳ -> a + u('\u1eb4'): u('a'), # Ẵ -> a + u('\u1eb5'): u('a'), # ẵ -> a + u('\u1eb6'): u('a'), # Ặ -> a + u('\u1eb7'): u('a'), # ặ -> a + u('\u1eb8'): u('e'), # Ẹ -> e + u('\u1eb9'): u('e'), # ẹ -> e + u('\u1eba'): u('e'), # Ẻ -> e + u('\u1ebb'): u('e'), # ẻ -> e + u('\u1ebc'): u('e'), # Ẽ -> e + u('\u1ebd'): u('e'), # ẽ -> e + u('\u1ebe'): u('e'), # Ế -> e + u('\u1ebf'): u('e'), # ế -> e + u('\u1ec0'): u('e'), # Ề -> e + u('\u1ec1'): u('e'), # ề -> e + u('\u1ec2'): u('e'), # Ể -> e + u('\u1ec3'): u('e'), # ể -> e + u('\u1ec4'): u('e'), # Ễ -> e + u('\u1ec5'): u('e'), # ễ -> e + u('\u1ec6'): u('e'), # Ệ -> e + u('\u1ec7'): u('e'), # ệ -> e + u('\u1ec8'): u('i'), # Ỉ -> i + u('\u1ec9'): u('i'), # ỉ -> i + u('\u1eca'): u('i'), # Ị -> i + u('\u1ecb'): u('i'), # ị -> i + u('\u1ecc'): u('o'), # Ọ -> o + u('\u1ecd'): u('o'), # ọ -> o + u('\u1ece'): u('o'), # Ỏ -> o + u('\u1ecf'): u('o'), # ỏ -> o + u('\u1ed0'): u('o'), # Ố -> o + u('\u1ed1'): u('o'), # ố -> o + u('\u1ed2'): u('o'), # Ồ -> o + u('\u1ed3'): u('o'), # ồ -> o + u('\u1ed4'): u('o'), # Ổ -> o + u('\u1ed5'): u('o'), # ổ -> o + u('\u1ed6'): u('o'), # Ỗ -> o + u('\u1ed7'): u('o'), # ỗ -> o + u('\u1ed8'): u('o'), # Ộ -> o + u('\u1ed9'): u('o'), # ộ -> o + u('\u1eda'): u('o'), # Ớ -> o + u('\u1edb'): u('o'), # ớ -> o + u('\u1edc'): u('o'), # Ờ -> o + u('\u1edd'): u('o'), # ờ -> o + u('\u1ede'): u('o'), # Ở -> o + u('\u1edf'): u('o'), # ở -> o + u('\u1ee0'): u('o'), # Ỡ -> o + u('\u1ee1'): u('o'), # ỡ -> o + u('\u1ee2'): u('o'), # Ợ -> o + u('\u1ee3'): u('o'), # ợ -> o + u('\u1ee4'): u('u'), # Ụ -> u + u('\u1ee5'): u('u'), # ụ -> u + u('\u1ee6'): u('u'), # Ủ -> u + u('\u1ee7'): u('u'), # ủ -> u + u('\u1ee8'): u('u'), # Ứ -> u + u('\u1ee9'): u('u'), # ứ -> u + u('\u1eea'): u('u'), # Ừ -> u + u('\u1eeb'): u('u'), # ừ -> u + u('\u1eec'): u('u'), # Ử -> u + u('\u1eed'): u('u'), # ử -> u + u('\u1eee'): u('u'), # Ữ -> u + u('\u1eef'): u('u'), # ữ -> u + u('\u1ef0'): u('u'), # Ự -> u + u('\u1ef1'): u('u'), # ự -> u + u('\u1ef2'): u('y'), # Ỳ -> y + u('\u1ef3'): u('y'), # ỳ -> y + u('\u1ef4'): u('y'), # Ỵ -> y + u('\u1ef5'): u('y'), # ỵ -> y + u('\u1ef6'): u('y'), # Ỷ -> y + u('\u1ef7'): u('y'), # ỷ -> y + u('\u1ef8'): u('y'), # Ỹ -> y + u('\u1ef9'): u('y'), # ỹ -> y + u('\u2c60'): u('l'), # Ⱡ -> l + u('\u2c61'): u('l'), # ⱡ -> l + u('\u2c62'): u('l'), # Ɫ -> l + u('\u2c63'): u('p'), # Ᵽ -> p + u('\u2c64'): u('r'), # Ɽ -> r + u('\u2c65'): u('a'), # ⱥ -> a + u('\u2c66'): u('t'), # ⱦ -> t + u('\u2c67'): u('h'), # Ⱨ -> h + u('\u2c68'): u('h'), # ⱨ -> h + u('\u2c69'): u('k'), # Ⱪ -> k + u('\u2c6a'): u('k'), # ⱪ -> k + u('\u2c6b'): u('z'), # Ⱬ -> z + u('\u2c6c'): u('z'), # ⱬ -> z + u('\uff10'): u('0'), # 0 -> 0 + u('\uff11'): u('1'), # 1 -> 1 + u('\uff12'): u('2'), # 2 -> 2 + u('\uff13'): u('3'), # 3 -> 3 + u('\uff14'): u('4'), # 4 -> 4 + u('\uff15'): u('5'), # 5 -> 5 + u('\uff16'): u('6'), # 6 -> 6 + u('\uff17'): u('7'), # 7 -> 7 + u('\uff18'): u('8'), # 8 -> 8 + u('\uff19'): u('9'), # 9 -> 9 + u('\uff21'): u('A'), # A -> A + u('\uff22'): u('B'), # B -> B + u('\uff23'): u('C'), # C -> C + u('\uff24'): u('D'), # D -> D + u('\uff25'): u('E'), # E -> E + u('\uff26'): u('F'), # F -> F + u('\uff27'): u('G'), # G -> G + u('\uff28'): u('H'), # H -> H + u('\uff29'): 
u('I'), # I -> I + u('\uff2a'): u('J'), # J -> J + u('\uff2b'): u('K'), # K -> K + u('\uff2c'): u('L'), # L -> L + u('\uff2d'): u('M'), # M -> M + u('\uff2e'): u('N'), # N -> N + u('\uff2f'): u('O'), # O -> O + u('\uff30'): u('P'), # P -> P + u('\uff31'): u('Q'), # Q -> Q + u('\uff32'): u('R'), # R -> R + u('\uff33'): u('S'), # S -> S + u('\uff34'): u('T'), # T -> T + u('\uff35'): u('U'), # U -> U + u('\uff36'): u('V'), # V -> V + u('\uff37'): u('W'), # W -> W + u('\uff38'): u('X'), # X -> X + u('\uff39'): u('Y'), # Y -> Y + u('\uff3a'): u('Z'), # Z -> Z + u('\uff41'): u('a'), # a -> a + u('\uff42'): u('b'), # b -> b + u('\uff43'): u('c'), # c -> c + u('\uff44'): u('d'), # d -> d + u('\uff45'): u('e'), # e -> e + u('\uff46'): u('f'), # f -> f + u('\uff47'): u('g'), # g -> g + u('\uff48'): u('h'), # h -> h + u('\uff49'): u('i'), # i -> i + u('\uff4a'): u('j'), # j -> j + u('\uff4b'): u('k'), # k -> k + u('\uff4c'): u('l'), # l -> l + u('\uff4d'): u('m'), # m -> m + u('\uff4e'): u('n'), # n -> n + u('\uff4f'): u('o'), # o -> o + u('\uff50'): u('p'), # p -> p + u('\uff51'): u('q'), # q -> q + u('\uff52'): u('r'), # r -> r + u('\uff53'): u('s'), # s -> s + u('\uff54'): u('t'), # t -> t + u('\uff55'): u('u'), # u -> u + u('\uff56'): u('v'), # v -> v + u('\uff57'): u('w'), # w -> w + u('\uff58'): u('x'), # x -> x + u('\uff59'): u('y'), # y -> y + u('\uff5a'): u('z'), # z -> z +} + +# The unicode.translate() method actually requires a dictionary mapping +# character *numbers* to characters, for some reason. +accent_map = dict((ord(k), v) for k, v in iteritems(accent_map)) + + +# This Sphinx charset table taken from http://speeple.com/unicode-maps.txt + +default_charset = """ +################################################## +# Latin +# A +U+00C0->a, U+00C1->a, U+00C2->a, U+00C3->a, U+00C4->a, U+00C5->a, U+00E0->a, U+00E1->a, U+00E2->a, U+00E3->a, U+00E4->a, U+00E5->a, +U+0100->a, U+0101->a, U+0102->a, U+0103->a, U+010300->a, U+0104->a, U+0105->a, U+01CD->a, U+01CE->a, U+01DE->a, U+01DF->a, U+01E0->a, +U+01E1->a, U+01FA->a, U+01FB->a, U+0200->a, U+0201->a, U+0202->a, U+0203->a, U+0226->a, U+0227->a, U+023A->a, U+0250->a, U+04D0->a, +U+04D1->a, U+1D2C->a, U+1D43->a, U+1D44->a, U+1D8F->a, U+1E00->a, U+1E01->a, U+1E9A->a, U+1EA0->a, U+1EA1->a, U+1EA2->a, U+1EA3->a, +U+1EA4->a, U+1EA5->a, U+1EA6->a, U+1EA7->a, U+1EA8->a, U+1EA9->a, U+1EAA->a, U+1EAB->a, U+1EAC->a, U+1EAD->a, U+1EAE->a, U+1EAF->a, +U+1EB0->a, U+1EB1->a, U+1EB2->a, U+1EB3->a, U+1EB4->a, U+1EB5->a, U+1EB6->a, U+1EB7->a, U+2090->a, U+2C65->a + +# B +U+0180->b, U+0181->b, U+0182->b, U+0183->b, U+0243->b, U+0253->b, U+0299->b, U+16D2->b, U+1D03->b, U+1D2E->b, U+1D2F->b, U+1D47->b, +U+1D6C->b, U+1D80->b, U+1E02->b, U+1E03->b, U+1E04->b, U+1E05->b, U+1E06->b, U+1E07->b + +# C +U+00C7->c, U+00E7->c, U+0106->c, U+0107->c, U+0108->c, U+0109->c, U+010A->c, U+010B->c, U+010C->c, U+010D->c, U+0187->c, U+0188->c, +U+023B->c, U+023C->c, U+0255->c, U+0297->c, U+1D9C->c, U+1D9D->c, U+1E08->c, U+1E09->c, U+212D->c, U+2184->c + +# D +U+010E->d, U+010F->d, U+0110->d, U+0111->d, U+0189->d, U+018A->d, U+018B->d, U+018C->d, U+01C5->d, U+01F2->d, U+0221->d, U+0256->d, +U+0257->d, U+1D05->d, U+1D30->d, U+1D48->d, U+1D6D->d, U+1D81->d, U+1D91->d, U+1E0A->d, U+1E0B->d, U+1E0C->d, U+1E0D->d, U+1E0E->d, +U+1E0F->d, U+1E10->d, U+1E11->d, U+1E12->d, U+1E13->d + +# E +U+00C8->e, U+00C9->e, U+00CA->e, U+00CB->e, U+00E8->e, U+00E9->e, U+00EA->e, U+00EB->e, U+0112->e, U+0113->e, U+0114->e, U+0115->e, +U+0116->e, U+0117->e, U+0118->e, U+0119->e, U+011A->e, U+011B->e, 
U+018E->e, U+0190->e, U+01DD->e, U+0204->e, U+0205->e, U+0206->e, +U+0207->e, U+0228->e, U+0229->e, U+0246->e, U+0247->e, U+0258->e, U+025B->e, U+025C->e, U+025D->e, U+025E->e, U+029A->e, U+1D07->e, +U+1D08->e, U+1D31->e, U+1D32->e, U+1D49->e, U+1D4B->e, U+1D4C->e, U+1D92->e, U+1D93->e, U+1D94->e, U+1D9F->e, U+1E14->e, U+1E15->e, +U+1E16->e, U+1E17->e, U+1E18->e, U+1E19->e, U+1E1A->e, U+1E1B->e, U+1E1C->e, U+1E1D->e, U+1EB8->e, U+1EB9->e, U+1EBA->e, U+1EBB->e, +U+1EBC->e, U+1EBD->e, U+1EBE->e, U+1EBF->e, U+1EC0->e, U+1EC1->e, U+1EC2->e, U+1EC3->e, U+1EC4->e, U+1EC5->e, U+1EC6->e, U+1EC7->e, +U+2091->e + +# F +U+0191->f, U+0192->f, U+1D6E->f, U+1D82->f, U+1DA0->f, U+1E1E->f, U+1E1F->f + +# G +U+011C->g, U+011D->g, U+011E->g, U+011F->g, U+0120->g, U+0121->g, U+0122->g, U+0123->g, U+0193->g, U+01E4->g, U+01E5->g, U+01E6->g, +U+01E7->g, U+01F4->g, U+01F5->g, U+0260->g, U+0261->g, U+0262->g, U+029B->g, U+1D33->g, U+1D4D->g, U+1D77->g, U+1D79->g, U+1D83->g, +U+1DA2->g, U+1E20->g, U+1E21->g + +# H +U+0124->h, U+0125->h, U+0126->h, U+0127->h, U+021E->h, U+021F->h, U+0265->h, U+0266->h, U+029C->h, U+02AE->h, U+02AF->h, U+02B0->h, +U+02B1->h, U+1D34->h, U+1DA3->h, U+1E22->h, U+1E23->h, U+1E24->h, U+1E25->h, U+1E26->h, U+1E27->h, U+1E28->h, U+1E29->h, U+1E2A->h, +U+1E2B->h, U+1E96->h, U+210C->h, U+2C67->h, U+2C68->h, U+2C75->h, U+2C76->h + +# I +U+00CC->i, U+00CD->i, U+00CE->i, U+00CF->i, U+00EC->i, U+00ED->i, U+00EE->i, U+00EF->i, U+010309->i, U+0128->i, U+0129->i, U+012A->i, +U+012B->i, U+012C->i, U+012D->i, U+012E->i, U+012F->i, U+0130->i, U+0131->i, U+0197->i, U+01CF->i, U+01D0->i, U+0208->i, U+0209->i, +U+020A->i, U+020B->i, U+0268->i, U+026A->i, U+040D->i, U+0418->i, U+0419->i, U+0438->i, U+0439->i, U+0456->i, U+1D09->i, U+1D35->i, +U+1D4E->i, U+1D62->i, U+1D7B->i, U+1D96->i, U+1DA4->i, U+1DA6->i, U+1DA7->i, U+1E2C->i, U+1E2D->i, U+1E2E->i, U+1E2F->i, U+1EC8->i, +U+1EC9->i, U+1ECA->i, U+1ECB->i, U+2071->i, U+2111->i + +# J +U+0134->j, U+0135->j, U+01C8->j, U+01CB->j, U+01F0->j, U+0237->j, U+0248->j, U+0249->j, U+025F->j, U+0284->j, U+029D->j, U+02B2->j, +U+1D0A->j, U+1D36->j, U+1DA1->j, U+1DA8->j + +# K +U+0136->k, U+0137->k, U+0198->k, U+0199->k, U+01E8->k, U+01E9->k, U+029E->k, U+1D0B->k, U+1D37->k, U+1D4F->k, U+1D84->k, U+1E30->k, +U+1E31->k, U+1E32->k, U+1E33->k, U+1E34->k, U+1E35->k, U+2C69->k, U+2C6A->k + +# L +U+0139->l, U+013A->l, U+013B->l, U+013C->l, U+013D->l, U+013E->l, U+013F->l, U+0140->l, U+0141->l, U+0142->l, U+019A->l, U+01C8->l, +U+0234->l, U+023D->l, U+026B->l, U+026C->l, U+026D->l, U+029F->l, U+02E1->l, U+1D0C->l, U+1D38->l, U+1D85->l, U+1DA9->l, U+1DAA->l, +U+1DAB->l, U+1E36->l, U+1E37->l, U+1E38->l, U+1E39->l, U+1E3A->l, U+1E3B->l, U+1E3C->l, U+1E3D->l, U+2C60->l, U+2C61->l, U+2C62->l + +# M +U+019C->m, U+026F->m, U+0270->m, U+0271->m, U+1D0D->m, U+1D1F->m, U+1D39->m, U+1D50->m, U+1D5A->m, U+1D6F->m, U+1D86->m, U+1DAC->m, +U+1DAD->m, U+1E3E->m, U+1E3F->m, U+1E40->m, U+1E41->m, U+1E42->m, U+1E43->m + +# N +U+00D1->n, U+00F1->n, U+0143->n, U+0144->n, U+0145->n, U+0146->n, U+0147->n, U+0148->n, U+0149->n, U+019D->n, U+019E->n, U+01CB->n, +U+01F8->n, U+01F9->n, U+0220->n, U+0235->n, U+0272->n, U+0273->n, U+0274->n, U+1D0E->n, U+1D3A->n, U+1D3B->n, U+1D70->n, U+1D87->n, +U+1DAE->n, U+1DAF->n, U+1DB0->n, U+1E44->n, U+1E45->n, U+1E46->n, U+1E47->n, U+1E48->n, U+1E49->n, U+1E4A->n, U+1E4B->n, U+207F->n + +# O +U+00D2->o, U+00D3->o, U+00D4->o, U+00D5->o, U+00D6->o, U+00D8->o, U+00F2->o, U+00F3->o, U+00F4->o, U+00F5->o, U+00F6->o, U+00F8->o, +U+01030F->o, U+014C->o, U+014D->o, 
U+014E->o, U+014F->o, U+0150->o, U+0151->o, U+0186->o, U+019F->o, U+01A0->o, U+01A1->o, U+01D1->o, +U+01D2->o, U+01EA->o, U+01EB->o, U+01EC->o, U+01ED->o, U+01FE->o, U+01FF->o, U+020C->o, U+020D->o, U+020E->o, U+020F->o, U+022A->o, +U+022B->o, U+022C->o, U+022D->o, U+022E->o, U+022F->o, U+0230->o, U+0231->o, U+0254->o, U+0275->o, U+043E->o, U+04E6->o, U+04E7->o, +U+04E8->o, U+04E9->o, U+04EA->o, U+04EB->o, U+1D0F->o, U+1D10->o, U+1D11->o, U+1D12->o, U+1D13->o, U+1D16->o, U+1D17->o, U+1D3C->o, +U+1D52->o, U+1D53->o, U+1D54->o, U+1D55->o, U+1D97->o, U+1DB1->o, U+1E4C->o, U+1E4D->o, U+1E4E->o, U+1E4F->o, U+1E50->o, U+1E51->o, +U+1E52->o, U+1E53->o, U+1ECC->o, U+1ECD->o, U+1ECE->o, U+1ECF->o, U+1ED0->o, U+1ED1->o, U+1ED2->o, U+1ED3->o, U+1ED4->o, U+1ED5->o, +U+1ED6->o, U+1ED7->o, U+1ED8->o, U+1ED9->o, U+1EDA->o, U+1EDB->o, U+1EDC->o, U+1EDD->o, U+1EDE->o, U+1EDF->o, U+1EE0->o, U+1EE1->o, +U+1EE2->o, U+1EE3->o, U+2092->o, U+2C9E->o, U+2C9F->o + +# P +U+01A4->p, U+01A5->p, U+1D18->p, U+1D3E->p, U+1D56->p, U+1D71->p, U+1D7D->p, U+1D88->p, U+1E54->p, U+1E55->p, U+1E56->p, U+1E57->p, +U+2C63->p + +# Q +U+024A->q, U+024B->q, U+02A0->q + +# R +U+0154->r, U+0155->r, U+0156->r, U+0157->r, U+0158->r, U+0159->r, U+0210->r, U+0211->r, U+0212->r, U+0213->r, U+024C->r, U+024D->r, +U+0279->r, U+027A->r, U+027B->r, U+027C->r, U+027D->r, U+027E->r, U+027F->r, U+0280->r, U+0281->r, U+02B3->r, U+02B4->r, U+02B5->r, +U+02B6->r, U+1D19->r, U+1D1A->r, U+1D3F->r, U+1D63->r, U+1D72->r, U+1D73->r, U+1D89->r, U+1DCA->r, U+1E58->r, U+1E59->r, U+1E5A->r, +U+1E5B->r, U+1E5C->r, U+1E5D->r, U+1E5E->r, U+1E5F->r, U+211C->r, U+2C64->r + +# S +U+00DF->s, U+015A->s, U+015B->s, U+015C->s, U+015D->s, U+015E->s, U+015F->s, U+0160->s, U+0161->s, U+017F->s, U+0218->s, U+0219->s, +U+023F->s, U+0282->s, U+02E2->s, U+1D74->s, U+1D8A->s, U+1DB3->s, U+1E60->s, U+1E61->s, U+1E62->s, U+1E63->s, U+1E64->s, U+1E65->s, +U+1E66->s, U+1E67->s, U+1E68->s, U+1E69->s, U+1E9B->s + +# T +U+0162->t, U+0163->t, U+0164->t, U+0165->t, U+0166->t, U+0167->t, U+01AB->t, U+01AC->t, U+01AD->t, U+01AE->t, U+021A->t, U+021B->t, +U+0236->t, U+023E->t, U+0287->t, U+0288->t, U+1D1B->t, U+1D40->t, U+1D57->t, U+1D75->t, U+1DB5->t, U+1E6A->t, U+1E6B->t, U+1E6C->t, +U+1E6D->t, U+1E6E->t, U+1E6F->t, U+1E70->t, U+1E71->t, U+1E97->t, U+2C66->t + +# U +U+00D9->u, U+00DA->u, U+00DB->u, U+00DC->u, U+00F9->u, U+00FA->u, U+00FB->u, U+00FC->u, U+010316->u, U+0168->u, U+0169->u, U+016A->u, +U+016B->u, U+016C->u, U+016D->u, U+016E->u, U+016F->u, U+0170->u, U+0171->u, U+0172->u, U+0173->u, U+01AF->u, U+01B0->u, U+01D3->u, +U+01D4->u, U+01D5->u, U+01D6->u, U+01D7->u, U+01D8->u, U+01D9->u, U+01DA->u, U+01DB->u, U+01DC->u, U+0214->u, U+0215->u, U+0216->u, +U+0217->u, U+0244->u, U+0289->u, U+1D1C->u, U+1D1D->u, U+1D1E->u, U+1D41->u, U+1D58->u, U+1D59->u, U+1D64->u, U+1D7E->u, U+1D99->u, +U+1DB6->u, U+1DB8->u, U+1E72->u, U+1E73->u, U+1E74->u, U+1E75->u, U+1E76->u, U+1E77->u, U+1E78->u, U+1E79->u, U+1E7A->u, U+1E7B->u, +U+1EE4->u, U+1EE5->u, U+1EE6->u, U+1EE7->u, U+1EE8->u, U+1EE9->u, U+1EEA->u, U+1EEB->u, U+1EEC->u, U+1EED->u, U+1EEE->u, U+1EEF->u, +U+1EF0->u, U+1EF1->u + +# V +U+01B2->v, U+0245->v, U+028B->v, U+028C->v, U+1D20->v, U+1D5B->v, U+1D65->v, U+1D8C->v, U+1DB9->v, U+1DBA->v, U+1E7C->v, U+1E7D->v, +U+1E7E->v, U+1E7F->v, U+2C74->v + +# W +U+0174->w, U+0175->w, U+028D->w, U+02B7->w, U+1D21->w, U+1D42->w, U+1E80->w, U+1E81->w, U+1E82->w, U+1E83->w, U+1E84->w, U+1E85->w, +U+1E86->w, U+1E87->w, U+1E88->w, U+1E89->w, U+1E98->w + +# X +U+02E3->x, U+1D8D->x, U+1E8A->x, U+1E8B->x, 
U+1E8C->x, U+1E8D->x, U+2093->x + +# Y +U+00DD->y, U+00FD->y, U+00FF->y, U+0176->y, U+0177->y, U+0178->y, U+01B3->y, U+01B4->y, U+0232->y, U+0233->y, U+024E->y, U+024F->y, +U+028E->y, U+028F->y, U+02B8->y, U+1E8E->y, U+1E8F->y, U+1E99->y, U+1EF2->y, U+1EF3->y, U+1EF4->y, U+1EF5->y, U+1EF6->y, U+1EF7->y, +U+1EF8->y, U+1EF9->y + +# Z +U+0179->z, U+017A->z, U+017B->z, U+017C->z, U+017D->z, U+017E->z, U+01B5->z, U+01B6->z, U+0224->z, U+0225->z, U+0240->z, U+0290->z, +U+0291->z, U+1D22->z, U+1D76->z, U+1D8E->z, U+1DBB->z, U+1DBC->z, U+1DBD->z, U+1E90->z, U+1E91->z, U+1E92->z, U+1E93->z, U+1E94->z, +U+1E95->z, U+2128->z, U+2C6B->z, U+2C6C->z + +# Latin Extras: +U+00C6->U+00E6, U+01E2->U+00E6, U+01E3->U+00E6, U+01FC->U+00E6, U+01FD->U+00E6, U+1D01->U+00E6, U+1D02->U+00E6, U+1D2D->U+00E6, +U+1D46->U+00E6, U+00E6 + +################################################## +# Arabic +U+0622->U+0627, U+0623->U+0627, U+0624->U+0648, U+0625->U+0627, U+0626->U+064A, U+06C0->U+06D5, U+06C2->U+06C1, U+06D3->U+06D2, +U+FB50->U+0671, U+FB51->U+0671, U+FB52->U+067B, U+FB53->U+067B, U+FB54->U+067B, U+FB56->U+067E, U+FB57->U+067E, U+FB58->U+067E, +U+FB5A->U+0680, U+FB5B->U+0680, U+FB5C->U+0680, U+FB5E->U+067A, U+FB5F->U+067A, U+FB60->U+067A, U+FB62->U+067F, U+FB63->U+067F, +U+FB64->U+067F, U+FB66->U+0679, U+FB67->U+0679, U+FB68->U+0679, U+FB6A->U+06A4, U+FB6B->U+06A4, U+FB6C->U+06A4, U+FB6E->U+06A6, +U+FB6F->U+06A6, U+FB70->U+06A6, U+FB72->U+0684, U+FB73->U+0684, U+FB74->U+0684, U+FB76->U+0683, U+FB77->U+0683, U+FB78->U+0683, +U+FB7A->U+0686, U+FB7B->U+0686, U+FB7C->U+0686, U+FB7E->U+0687, U+FB7F->U+0687, U+FB80->U+0687, U+FB82->U+068D, U+FB83->U+068D, +U+FB84->U+068C, U+FB85->U+068C, U+FB86->U+068E, U+FB87->U+068E, U+FB88->U+0688, U+FB89->U+0688, U+FB8A->U+0698, U+FB8B->U+0698, +U+FB8C->U+0691, U+FB8D->U+0691, U+FB8E->U+06A9, U+FB8F->U+06A9, U+FB90->U+06A9, U+FB92->U+06AF, U+FB93->U+06AF, U+FB94->U+06AF, +U+FB96->U+06B3, U+FB97->U+06B3, U+FB98->U+06B3, U+FB9A->U+06B1, U+FB9B->U+06B1, U+FB9C->U+06B1, U+FB9E->U+06BA, U+FB9F->U+06BA, +U+FBA0->U+06BB, U+FBA1->U+06BB, U+FBA2->U+06BB, U+FBA4->U+06C0, U+FBA5->U+06C0, U+FBA6->U+06C1, U+FBA7->U+06C1, U+FBA8->U+06C1, +U+FBAA->U+06BE, U+FBAB->U+06BE, U+FBAC->U+06BE, U+FBAE->U+06D2, U+FBAF->U+06D2, U+FBB0->U+06D3, U+FBB1->U+06D3, U+FBD3->U+06AD, +U+FBD4->U+06AD, U+FBD5->U+06AD, U+FBD7->U+06C7, U+FBD8->U+06C7, U+FBD9->U+06C6, U+FBDA->U+06C6, U+FBDB->U+06C8, U+FBDC->U+06C8, +U+FBDD->U+0677, U+FBDE->U+06CB, U+FBDF->U+06CB, U+FBE0->U+06C5, U+FBE1->U+06C5, U+FBE2->U+06C9, U+FBE3->U+06C9, U+FBE4->U+06D0, +U+FBE5->U+06D0, U+FBE6->U+06D0, U+FBE8->U+0649, U+FBFC->U+06CC, U+FBFD->U+06CC, U+FBFE->U+06CC, U+0621, U+0627..U+063A, U+0641..U+064A, +U+0660..U+0669, U+066E, U+066F, U+0671..U+06BF, U+06C1, U+06C3..U+06D2, U+06D5, U+06EE..U+06FC, U+06FF, U+0750..U+076D, U+FB55, U+FB59, +U+FB5D, U+FB61, U+FB65, U+FB69, U+FB6D, U+FB71, U+FB75, U+FB79, U+FB7D, U+FB81, U+FB91, U+FB95, U+FB99, U+FB9D, U+FBA3, U+FBA9, U+FBAD, +U+FBD6, U+FBE7, U+FBE9, U+FBFF + +################################################## +# Armenian +U+0531..U+0556->U+0561..U+0586, U+0561..U+0586, U+0587 + +################################################# +# Bengali +U+09DC->U+09A1, U+09DD->U+09A2, U+09DF->U+09AF, U+09F0->U+09AC, U+09F1->U+09AC, U+0985..U+0990, U+0993..U+09B0, U+09B2, U+09B6..U+09B9, +U+09CE, U+09E0, U+09E1, U+09E6..U+09EF + +################################################# +# CJK* +U+F900->U+8C48, U+F901->U+66F4, U+F902->U+8ECA, U+F903->U+8CC8, U+F904->U+6ED1, U+F905->U+4E32, U+F906->U+53E5, U+F907->U+9F9C, 
+U+F908->U+9F9C, U+F909->U+5951, U+F90A->U+91D1, U+F90B->U+5587, U+F90C->U+5948, U+F90D->U+61F6, U+F90E->U+7669, U+F90F->U+7F85, +U+F910->U+863F, U+F911->U+87BA, U+F912->U+88F8, U+F913->U+908F, U+F914->U+6A02, U+F915->U+6D1B, U+F916->U+70D9, U+F917->U+73DE, +U+F918->U+843D, U+F919->U+916A, U+F91A->U+99F1, U+F91B->U+4E82, U+F91C->U+5375, U+F91D->U+6B04, U+F91E->U+721B, U+F91F->U+862D, +U+F920->U+9E1E, U+F921->U+5D50, U+F922->U+6FEB, U+F923->U+85CD, U+F924->U+8964, U+F925->U+62C9, U+F926->U+81D8, U+F927->U+881F, +U+F928->U+5ECA, U+F929->U+6717, U+F92A->U+6D6A, U+F92B->U+72FC, U+F92C->U+90CE, U+F92D->U+4F86, U+F92E->U+51B7, U+F92F->U+52DE, +U+F930->U+64C4, U+F931->U+6AD3, U+F932->U+7210, U+F933->U+76E7, U+F934->U+8001, U+F935->U+8606, U+F936->U+865C, U+F937->U+8DEF, +U+F938->U+9732, U+F939->U+9B6F, U+F93A->U+9DFA, U+F93B->U+788C, U+F93C->U+797F, U+F93D->U+7DA0, U+F93E->U+83C9, U+F93F->U+9304, +U+F940->U+9E7F, U+F941->U+8AD6, U+F942->U+58DF, U+F943->U+5F04, U+F944->U+7C60, U+F945->U+807E, U+F946->U+7262, U+F947->U+78CA, +U+F948->U+8CC2, U+F949->U+96F7, U+F94A->U+58D8, U+F94B->U+5C62, U+F94C->U+6A13, U+F94D->U+6DDA, U+F94E->U+6F0F, U+F94F->U+7D2F, +U+F950->U+7E37, U+F951->U+964B, U+F952->U+52D2, U+F953->U+808B, U+F954->U+51DC, U+F955->U+51CC, U+F956->U+7A1C, U+F957->U+7DBE, +U+F958->U+83F1, U+F959->U+9675, U+F95A->U+8B80, U+F95B->U+62CF, U+F95C->U+6A02, U+F95D->U+8AFE, U+F95E->U+4E39, U+F95F->U+5BE7, +U+F960->U+6012, U+F961->U+7387, U+F962->U+7570, U+F963->U+5317, U+F964->U+78FB, U+F965->U+4FBF, U+F966->U+5FA9, U+F967->U+4E0D, +U+F968->U+6CCC, U+F969->U+6578, U+F96A->U+7D22, U+F96B->U+53C3, U+F96C->U+585E, U+F96D->U+7701, U+F96E->U+8449, U+F96F->U+8AAA, +U+F970->U+6BBA, U+F971->U+8FB0, U+F972->U+6C88, U+F973->U+62FE, U+F974->U+82E5, U+F975->U+63A0, U+F976->U+7565, U+F977->U+4EAE, +U+F978->U+5169, U+F979->U+51C9, U+F97A->U+6881, U+F97B->U+7CE7, U+F97C->U+826F, U+F97D->U+8AD2, U+F97E->U+91CF, U+F97F->U+52F5, +U+F980->U+5442, U+F981->U+5973, U+F982->U+5EEC, U+F983->U+65C5, U+F984->U+6FFE, U+F985->U+792A, U+F986->U+95AD, U+F987->U+9A6A, +U+F988->U+9E97, U+F989->U+9ECE, U+F98A->U+529B, U+F98B->U+66C6, U+F98C->U+6B77, U+F98D->U+8F62, U+F98E->U+5E74, U+F98F->U+6190, +U+F990->U+6200, U+F991->U+649A, U+F992->U+6F23, U+F993->U+7149, U+F994->U+7489, U+F995->U+79CA, U+F996->U+7DF4, U+F997->U+806F, +U+F998->U+8F26, U+F999->U+84EE, U+F99A->U+9023, U+F99B->U+934A, U+F99C->U+5217, U+F99D->U+52A3, U+F99E->U+54BD, U+F99F->U+70C8, +U+F9A0->U+88C2, U+F9A1->U+8AAA, U+F9A2->U+5EC9, U+F9A3->U+5FF5, U+F9A4->U+637B, U+F9A5->U+6BAE, U+F9A6->U+7C3E, U+F9A7->U+7375, +U+F9A8->U+4EE4, U+F9A9->U+56F9, U+F9AA->U+5BE7, U+F9AB->U+5DBA, U+F9AC->U+601C, U+F9AD->U+73B2, U+F9AE->U+7469, U+F9AF->U+7F9A, +U+F9B0->U+8046, U+F9B1->U+9234, U+F9B2->U+96F6, U+F9B3->U+9748, U+F9B4->U+9818, U+F9B5->U+4F8B, U+F9B6->U+79AE, U+F9B7->U+91B4, +U+F9B8->U+96B8, U+F9B9->U+60E1, U+F9BA->U+4E86, U+F9BB->U+50DA, U+F9BC->U+5BEE, U+F9BD->U+5C3F, U+F9BE->U+6599, U+F9BF->U+6A02, +U+F9C0->U+71CE, U+F9C1->U+7642, U+F9C2->U+84FC, U+F9C3->U+907C, U+F9C4->U+9F8D, U+F9C5->U+6688, U+F9C6->U+962E, U+F9C7->U+5289, +U+F9C8->U+677B, U+F9C9->U+67F3, U+F9CA->U+6D41, U+F9CB->U+6E9C, U+F9CC->U+7409, U+F9CD->U+7559, U+F9CE->U+786B, U+F9CF->U+7D10, +U+F9D0->U+985E, U+F9D1->U+516D, U+F9D2->U+622E, U+F9D3->U+9678, U+F9D4->U+502B, U+F9D5->U+5D19, U+F9D6->U+6DEA, U+F9D7->U+8F2A, +U+F9D8->U+5F8B, U+F9D9->U+6144, U+F9DA->U+6817, U+F9DB->U+7387, U+F9DC->U+9686, U+F9DD->U+5229, U+F9DE->U+540F, U+F9DF->U+5C65, +U+F9E0->U+6613, U+F9E1->U+674E, U+F9E2->U+68A8, U+F9E3->U+6CE5, 
U+F9E4->U+7406, U+F9E5->U+75E2, U+F9E6->U+7F79, U+F9E7->U+88CF, +U+F9E8->U+88E1, U+F9E9->U+91CC, U+F9EA->U+96E2, U+F9EB->U+533F, U+F9EC->U+6EBA, U+F9ED->U+541D, U+F9EE->U+71D0, U+F9EF->U+7498, +U+F9F0->U+85FA, U+F9F1->U+96A3, U+F9F2->U+9C57, U+F9F3->U+9E9F, U+F9F4->U+6797, U+F9F5->U+6DCB, U+F9F6->U+81E8, U+F9F7->U+7ACB, +U+F9F8->U+7B20, U+F9F9->U+7C92, U+F9FA->U+72C0, U+F9FB->U+7099, U+F9FC->U+8B58, U+F9FD->U+4EC0, U+F9FE->U+8336, U+F9FF->U+523A, +U+FA00->U+5207, U+FA01->U+5EA6, U+FA02->U+62D3, U+FA03->U+7CD6, U+FA04->U+5B85, U+FA05->U+6D1E, U+FA06->U+66B4, U+FA07->U+8F3B, +U+FA08->U+884C, U+FA09->U+964D, U+FA0A->U+898B, U+FA0B->U+5ED3, U+FA0C->U+5140, U+FA0D->U+55C0, U+FA10->U+585A, U+FA12->U+6674, +U+FA15->U+51DE, U+FA16->U+732A, U+FA17->U+76CA, U+FA18->U+793C, U+FA19->U+795E, U+FA1A->U+7965, U+FA1B->U+798F, U+FA1C->U+9756, +U+FA1D->U+7CBE, U+FA1E->U+7FBD, U+FA20->U+8612, U+FA22->U+8AF8, U+FA25->U+9038, U+FA26->U+90FD, U+FA2A->U+98EF, U+FA2B->U+98FC, +U+FA2C->U+9928, U+FA2D->U+9DB4, U+FA30->U+4FAE, U+FA31->U+50E7, U+FA32->U+514D, U+FA33->U+52C9, U+FA34->U+52E4, U+FA35->U+5351, +U+FA36->U+559D, U+FA37->U+5606, U+FA38->U+5668, U+FA39->U+5840, U+FA3A->U+58A8, U+FA3B->U+5C64, U+FA3C->U+5C6E, U+FA3D->U+6094, +U+FA3E->U+6168, U+FA3F->U+618E, U+FA40->U+61F2, U+FA41->U+654F, U+FA42->U+65E2, U+FA43->U+6691, U+FA44->U+6885, U+FA45->U+6D77, +U+FA46->U+6E1A, U+FA47->U+6F22, U+FA48->U+716E, U+FA49->U+722B, U+FA4A->U+7422, U+FA4B->U+7891, U+FA4C->U+793E, U+FA4D->U+7949, +U+FA4E->U+7948, U+FA4F->U+7950, U+FA50->U+7956, U+FA51->U+795D, U+FA52->U+798D, U+FA53->U+798E, U+FA54->U+7A40, U+FA55->U+7A81, +U+FA56->U+7BC0, U+FA57->U+7DF4, U+FA58->U+7E09, U+FA59->U+7E41, U+FA5A->U+7F72, U+FA5B->U+8005, U+FA5C->U+81ED, U+FA5D->U+8279, +U+FA5E->U+8279, U+FA5F->U+8457, U+FA60->U+8910, U+FA61->U+8996, U+FA62->U+8B01, U+FA63->U+8B39, U+FA64->U+8CD3, U+FA65->U+8D08, +U+FA66->U+8FB6, U+FA67->U+9038, U+FA68->U+96E3, U+FA69->U+97FF, U+FA6A->U+983B, U+FA70->U+4E26, U+FA71->U+51B5, U+FA72->U+5168, +U+FA73->U+4F80, U+FA74->U+5145, U+FA75->U+5180, U+FA76->U+52C7, U+FA77->U+52FA, U+FA78->U+559D, U+FA79->U+5555, U+FA7A->U+5599, +U+FA7B->U+55E2, U+FA7C->U+585A, U+FA7D->U+58B3, U+FA7E->U+5944, U+FA7F->U+5954, U+FA80->U+5A62, U+FA81->U+5B28, U+FA82->U+5ED2, +U+FA83->U+5ED9, U+FA84->U+5F69, U+FA85->U+5FAD, U+FA86->U+60D8, U+FA87->U+614E, U+FA88->U+6108, U+FA89->U+618E, U+FA8A->U+6160, +U+FA8B->U+61F2, U+FA8C->U+6234, U+FA8D->U+63C4, U+FA8E->U+641C, U+FA8F->U+6452, U+FA90->U+6556, U+FA91->U+6674, U+FA92->U+6717, +U+FA93->U+671B, U+FA94->U+6756, U+FA95->U+6B79, U+FA96->U+6BBA, U+FA97->U+6D41, U+FA98->U+6EDB, U+FA99->U+6ECB, U+FA9A->U+6F22, +U+FA9B->U+701E, U+FA9C->U+716E, U+FA9D->U+77A7, U+FA9E->U+7235, U+FA9F->U+72AF, U+FAA0->U+732A, U+FAA1->U+7471, U+FAA2->U+7506, +U+FAA3->U+753B, U+FAA4->U+761D, U+FAA5->U+761F, U+FAA6->U+76CA, U+FAA7->U+76DB, U+FAA8->U+76F4, U+FAA9->U+774A, U+FAAA->U+7740, +U+FAAB->U+78CC, U+FAAC->U+7AB1, U+FAAD->U+7BC0, U+FAAE->U+7C7B, U+FAAF->U+7D5B, U+FAB0->U+7DF4, U+FAB1->U+7F3E, U+FAB2->U+8005, +U+FAB3->U+8352, U+FAB4->U+83EF, U+FAB5->U+8779, U+FAB6->U+8941, U+FAB7->U+8986, U+FAB8->U+8996, U+FAB9->U+8ABF, U+FABA->U+8AF8, +U+FABB->U+8ACB, U+FABC->U+8B01, U+FABD->U+8AFE, U+FABE->U+8AED, U+FABF->U+8B39, U+FAC0->U+8B8A, U+FAC1->U+8D08, U+FAC2->U+8F38, +U+FAC3->U+9072, U+FAC4->U+9199, U+FAC5->U+9276, U+FAC6->U+967C, U+FAC7->U+96E3, U+FAC8->U+9756, U+FAC9->U+97DB, U+FACA->U+97FF, +U+FACB->U+980B, U+FACC->U+983B, U+FACD->U+9B12, U+FACE->U+9F9C, U+FACF->U+2284A, U+FAD0->U+22844, U+FAD1->U+233D5, U+FAD2->U+3B9D, 
+U+FAD3->U+4018, U+FAD4->U+4039, U+FAD5->U+25249, U+FAD6->U+25CD0, U+FAD7->U+27ED3, U+FAD8->U+9F43, U+FAD9->U+9F8E, U+2F800->U+4E3D, +U+2F801->U+4E38, U+2F802->U+4E41, U+2F803->U+20122, U+2F804->U+4F60, U+2F805->U+4FAE, U+2F806->U+4FBB, U+2F807->U+5002, U+2F808->U+507A, +U+2F809->U+5099, U+2F80A->U+50E7, U+2F80B->U+50CF, U+2F80C->U+349E, U+2F80D->U+2063A, U+2F80E->U+514D, U+2F80F->U+5154, U+2F810->U+5164, +U+2F811->U+5177, U+2F812->U+2051C, U+2F813->U+34B9, U+2F814->U+5167, U+2F815->U+518D, U+2F816->U+2054B, U+2F817->U+5197, +U+2F818->U+51A4, U+2F819->U+4ECC, U+2F81A->U+51AC, U+2F81B->U+51B5, U+2F81C->U+291DF, U+2F81D->U+51F5, U+2F81E->U+5203, +U+2F81F->U+34DF, U+2F820->U+523B, U+2F821->U+5246, U+2F822->U+5272, U+2F823->U+5277, U+2F824->U+3515, U+2F825->U+52C7, +U+2F826->U+52C9, U+2F827->U+52E4, U+2F828->U+52FA, U+2F829->U+5305, U+2F82A->U+5306, U+2F82B->U+5317, U+2F82C->U+5349, +U+2F82D->U+5351, U+2F82E->U+535A, U+2F82F->U+5373, U+2F830->U+537D, U+2F831->U+537F, U+2F832->U+537F, U+2F833->U+537F, +U+2F834->U+20A2C, U+2F835->U+7070, U+2F836->U+53CA, U+2F837->U+53DF, U+2F838->U+20B63, U+2F839->U+53EB, U+2F83A->U+53F1, +U+2F83B->U+5406, U+2F83C->U+549E, U+2F83D->U+5438, U+2F83E->U+5448, U+2F83F->U+5468, U+2F840->U+54A2, U+2F841->U+54F6, +U+2F842->U+5510, U+2F843->U+5553, U+2F844->U+5563, U+2F845->U+5584, U+2F846->U+5584, U+2F847->U+5599, U+2F848->U+55AB, +U+2F849->U+55B3, U+2F84A->U+55C2, U+2F84B->U+5716, U+2F84C->U+5606, U+2F84D->U+5717, U+2F84E->U+5651, U+2F84F->U+5674, +U+2F850->U+5207, U+2F851->U+58EE, U+2F852->U+57CE, U+2F853->U+57F4, U+2F854->U+580D, U+2F855->U+578B, U+2F856->U+5832, +U+2F857->U+5831, U+2F858->U+58AC, U+2F859->U+214E4, U+2F85A->U+58F2, U+2F85B->U+58F7, U+2F85C->U+5906, U+2F85D->U+591A, +U+2F85E->U+5922, U+2F85F->U+5962, U+2F860->U+216A8, U+2F861->U+216EA, U+2F862->U+59EC, U+2F863->U+5A1B, U+2F864->U+5A27, +U+2F865->U+59D8, U+2F866->U+5A66, U+2F867->U+36EE, U+2F868->U+36FC, U+2F869->U+5B08, U+2F86A->U+5B3E, U+2F86B->U+5B3E, +U+2F86C->U+219C8, U+2F86D->U+5BC3, U+2F86E->U+5BD8, U+2F86F->U+5BE7, U+2F870->U+5BF3, U+2F871->U+21B18, U+2F872->U+5BFF, +U+2F873->U+5C06, U+2F874->U+5F53, U+2F875->U+5C22, U+2F876->U+3781, U+2F877->U+5C60, U+2F878->U+5C6E, U+2F879->U+5CC0, +U+2F87A->U+5C8D, U+2F87B->U+21DE4, U+2F87C->U+5D43, U+2F87D->U+21DE6, U+2F87E->U+5D6E, U+2F87F->U+5D6B, U+2F880->U+5D7C, +U+2F881->U+5DE1, U+2F882->U+5DE2, U+2F883->U+382F, U+2F884->U+5DFD, U+2F885->U+5E28, U+2F886->U+5E3D, U+2F887->U+5E69, +U+2F888->U+3862, U+2F889->U+22183, U+2F88A->U+387C, U+2F88B->U+5EB0, U+2F88C->U+5EB3, U+2F88D->U+5EB6, U+2F88E->U+5ECA, +U+2F88F->U+2A392, U+2F890->U+5EFE, U+2F891->U+22331, U+2F892->U+22331, U+2F893->U+8201, U+2F894->U+5F22, U+2F895->U+5F22, +U+2F896->U+38C7, U+2F897->U+232B8, U+2F898->U+261DA, U+2F899->U+5F62, U+2F89A->U+5F6B, U+2F89B->U+38E3, U+2F89C->U+5F9A, +U+2F89D->U+5FCD, U+2F89E->U+5FD7, U+2F89F->U+5FF9, U+2F8A0->U+6081, U+2F8A1->U+393A, U+2F8A2->U+391C, U+2F8A3->U+6094, +U+2F8A4->U+226D4, U+2F8A5->U+60C7, U+2F8A6->U+6148, U+2F8A7->U+614C, U+2F8A8->U+614E, U+2F8A9->U+614C, U+2F8AA->U+617A, +U+2F8AB->U+618E, U+2F8AC->U+61B2, U+2F8AD->U+61A4, U+2F8AE->U+61AF, U+2F8AF->U+61DE, U+2F8B0->U+61F2, U+2F8B1->U+61F6, +U+2F8B2->U+6210, U+2F8B3->U+621B, U+2F8B4->U+625D, U+2F8B5->U+62B1, U+2F8B6->U+62D4, U+2F8B7->U+6350, U+2F8B8->U+22B0C, +U+2F8B9->U+633D, U+2F8BA->U+62FC, U+2F8BB->U+6368, U+2F8BC->U+6383, U+2F8BD->U+63E4, U+2F8BE->U+22BF1, U+2F8BF->U+6422, +U+2F8C0->U+63C5, U+2F8C1->U+63A9, U+2F8C2->U+3A2E, U+2F8C3->U+6469, U+2F8C4->U+647E, U+2F8C5->U+649D, U+2F8C6->U+6477, 
+U+2F8C7->U+3A6C, U+2F8C8->U+654F, U+2F8C9->U+656C, U+2F8CA->U+2300A, U+2F8CB->U+65E3, U+2F8CC->U+66F8, U+2F8CD->U+6649, +U+2F8CE->U+3B19, U+2F8CF->U+6691, U+2F8D0->U+3B08, U+2F8D1->U+3AE4, U+2F8D2->U+5192, U+2F8D3->U+5195, U+2F8D4->U+6700, +U+2F8D5->U+669C, U+2F8D6->U+80AD, U+2F8D7->U+43D9, U+2F8D8->U+6717, U+2F8D9->U+671B, U+2F8DA->U+6721, U+2F8DB->U+675E, +U+2F8DC->U+6753, U+2F8DD->U+233C3, U+2F8DE->U+3B49, U+2F8DF->U+67FA, U+2F8E0->U+6785, U+2F8E1->U+6852, U+2F8E2->U+6885, +U+2F8E3->U+2346D, U+2F8E4->U+688E, U+2F8E5->U+681F, U+2F8E6->U+6914, U+2F8E7->U+3B9D, U+2F8E8->U+6942, U+2F8E9->U+69A3, +U+2F8EA->U+69EA, U+2F8EB->U+6AA8, U+2F8EC->U+236A3, U+2F8ED->U+6ADB, U+2F8EE->U+3C18, U+2F8EF->U+6B21, U+2F8F0->U+238A7, +U+2F8F1->U+6B54, U+2F8F2->U+3C4E, U+2F8F3->U+6B72, U+2F8F4->U+6B9F, U+2F8F5->U+6BBA, U+2F8F6->U+6BBB, U+2F8F7->U+23A8D, +U+2F8F8->U+21D0B, U+2F8F9->U+23AFA, U+2F8FA->U+6C4E, U+2F8FB->U+23CBC, U+2F8FC->U+6CBF, U+2F8FD->U+6CCD, U+2F8FE->U+6C67, +U+2F8FF->U+6D16, U+2F900->U+6D3E, U+2F901->U+6D77, U+2F902->U+6D41, U+2F903->U+6D69, U+2F904->U+6D78, U+2F905->U+6D85, +U+2F906->U+23D1E, U+2F907->U+6D34, U+2F908->U+6E2F, U+2F909->U+6E6E, U+2F90A->U+3D33, U+2F90B->U+6ECB, U+2F90C->U+6EC7, +U+2F90D->U+23ED1, U+2F90E->U+6DF9, U+2F90F->U+6F6E, U+2F910->U+23F5E, U+2F911->U+23F8E, U+2F912->U+6FC6, U+2F913->U+7039, +U+2F914->U+701E, U+2F915->U+701B, U+2F916->U+3D96, U+2F917->U+704A, U+2F918->U+707D, U+2F919->U+7077, U+2F91A->U+70AD, +U+2F91B->U+20525, U+2F91C->U+7145, U+2F91D->U+24263, U+2F91E->U+719C, U+2F91F->U+243AB, U+2F920->U+7228, U+2F921->U+7235, +U+2F922->U+7250, U+2F923->U+24608, U+2F924->U+7280, U+2F925->U+7295, U+2F926->U+24735, U+2F927->U+24814, U+2F928->U+737A, +U+2F929->U+738B, U+2F92A->U+3EAC, U+2F92B->U+73A5, U+2F92C->U+3EB8, U+2F92D->U+3EB8, U+2F92E->U+7447, U+2F92F->U+745C, +U+2F930->U+7471, U+2F931->U+7485, U+2F932->U+74CA, U+2F933->U+3F1B, U+2F934->U+7524, U+2F935->U+24C36, U+2F936->U+753E, +U+2F937->U+24C92, U+2F938->U+7570, U+2F939->U+2219F, U+2F93A->U+7610, U+2F93B->U+24FA1, U+2F93C->U+24FB8, U+2F93D->U+25044, +U+2F93E->U+3FFC, U+2F93F->U+4008, U+2F940->U+76F4, U+2F941->U+250F3, U+2F942->U+250F2, U+2F943->U+25119, U+2F944->U+25133, +U+2F945->U+771E, U+2F946->U+771F, U+2F947->U+771F, U+2F948->U+774A, U+2F949->U+4039, U+2F94A->U+778B, U+2F94B->U+4046, +U+2F94C->U+4096, U+2F94D->U+2541D, U+2F94E->U+784E, U+2F94F->U+788C, U+2F950->U+78CC, U+2F951->U+40E3, U+2F952->U+25626, +U+2F953->U+7956, U+2F954->U+2569A, U+2F955->U+256C5, U+2F956->U+798F, U+2F957->U+79EB, U+2F958->U+412F, U+2F959->U+7A40, +U+2F95A->U+7A4A, U+2F95B->U+7A4F, U+2F95C->U+2597C, U+2F95D->U+25AA7, U+2F95E->U+25AA7, U+2F95F->U+7AEE, U+2F960->U+4202, +U+2F961->U+25BAB, U+2F962->U+7BC6, U+2F963->U+7BC9, U+2F964->U+4227, U+2F965->U+25C80, U+2F966->U+7CD2, U+2F967->U+42A0, +U+2F968->U+7CE8, U+2F969->U+7CE3, U+2F96A->U+7D00, U+2F96B->U+25F86, U+2F96C->U+7D63, U+2F96D->U+4301, U+2F96E->U+7DC7, +U+2F96F->U+7E02, U+2F970->U+7E45, U+2F971->U+4334, U+2F972->U+26228, U+2F973->U+26247, U+2F974->U+4359, U+2F975->U+262D9, +U+2F976->U+7F7A, U+2F977->U+2633E, U+2F978->U+7F95, U+2F979->U+7FFA, U+2F97A->U+8005, U+2F97B->U+264DA, U+2F97C->U+26523, +U+2F97D->U+8060, U+2F97E->U+265A8, U+2F97F->U+8070, U+2F980->U+2335F, U+2F981->U+43D5, U+2F982->U+80B2, U+2F983->U+8103, +U+2F984->U+440B, U+2F985->U+813E, U+2F986->U+5AB5, U+2F987->U+267A7, U+2F988->U+267B5, U+2F989->U+23393, U+2F98A->U+2339C, +U+2F98B->U+8201, U+2F98C->U+8204, U+2F98D->U+8F9E, U+2F98E->U+446B, U+2F98F->U+8291, U+2F990->U+828B, U+2F991->U+829D, +U+2F992->U+52B3, 
U+2F993->U+82B1, U+2F994->U+82B3, U+2F995->U+82BD, U+2F996->U+82E6, U+2F997->U+26B3C, U+2F998->U+82E5, +U+2F999->U+831D, U+2F99A->U+8363, U+2F99B->U+83AD, U+2F99C->U+8323, U+2F99D->U+83BD, U+2F99E->U+83E7, U+2F99F->U+8457, +U+2F9A0->U+8353, U+2F9A1->U+83CA, U+2F9A2->U+83CC, U+2F9A3->U+83DC, U+2F9A4->U+26C36, U+2F9A5->U+26D6B, U+2F9A6->U+26CD5, +U+2F9A7->U+452B, U+2F9A8->U+84F1, U+2F9A9->U+84F3, U+2F9AA->U+8516, U+2F9AB->U+273CA, U+2F9AC->U+8564, U+2F9AD->U+26F2C, +U+2F9AE->U+455D, U+2F9AF->U+4561, U+2F9B0->U+26FB1, U+2F9B1->U+270D2, U+2F9B2->U+456B, U+2F9B3->U+8650, U+2F9B4->U+865C, +U+2F9B5->U+8667, U+2F9B6->U+8669, U+2F9B7->U+86A9, U+2F9B8->U+8688, U+2F9B9->U+870E, U+2F9BA->U+86E2, U+2F9BB->U+8779, +U+2F9BC->U+8728, U+2F9BD->U+876B, U+2F9BE->U+8786, U+2F9BF->U+45D7, U+2F9C0->U+87E1, U+2F9C1->U+8801, U+2F9C2->U+45F9, +U+2F9C3->U+8860, U+2F9C4->U+8863, U+2F9C5->U+27667, U+2F9C6->U+88D7, U+2F9C7->U+88DE, U+2F9C8->U+4635, U+2F9C9->U+88FA, +U+2F9CA->U+34BB, U+2F9CB->U+278AE, U+2F9CC->U+27966, U+2F9CD->U+46BE, U+2F9CE->U+46C7, U+2F9CF->U+8AA0, U+2F9D0->U+8AED, +U+2F9D1->U+8B8A, U+2F9D2->U+8C55, U+2F9D3->U+27CA8, U+2F9D4->U+8CAB, U+2F9D5->U+8CC1, U+2F9D6->U+8D1B, U+2F9D7->U+8D77, +U+2F9D8->U+27F2F, U+2F9D9->U+20804, U+2F9DA->U+8DCB, U+2F9DB->U+8DBC, U+2F9DC->U+8DF0, U+2F9DD->U+208DE, U+2F9DE->U+8ED4, +U+2F9DF->U+8F38, U+2F9E0->U+285D2, U+2F9E1->U+285ED, U+2F9E2->U+9094, U+2F9E3->U+90F1, U+2F9E4->U+9111, U+2F9E5->U+2872E, +U+2F9E6->U+911B, U+2F9E7->U+9238, U+2F9E8->U+92D7, U+2F9E9->U+92D8, U+2F9EA->U+927C, U+2F9EB->U+93F9, U+2F9EC->U+9415, +U+2F9ED->U+28BFA, U+2F9EE->U+958B, U+2F9EF->U+4995, U+2F9F0->U+95B7, U+2F9F1->U+28D77, U+2F9F2->U+49E6, U+2F9F3->U+96C3, +U+2F9F4->U+5DB2, U+2F9F5->U+9723, U+2F9F6->U+29145, U+2F9F7->U+2921A, U+2F9F8->U+4A6E, U+2F9F9->U+4A76, U+2F9FA->U+97E0, +U+2F9FB->U+2940A, U+2F9FC->U+4AB2, U+2F9FD->U+29496, U+2F9FE->U+980B, U+2F9FF->U+980B, U+2FA00->U+9829, U+2FA01->U+295B6, +U+2FA02->U+98E2, U+2FA03->U+4B33, U+2FA04->U+9929, U+2FA05->U+99A7, U+2FA06->U+99C2, U+2FA07->U+99FE, U+2FA08->U+4BCE, +U+2FA09->U+29B30, U+2FA0A->U+9B12, U+2FA0B->U+9C40, U+2FA0C->U+9CFD, U+2FA0D->U+4CCE, U+2FA0E->U+4CED, U+2FA0F->U+9D67, +U+2FA10->U+2A0CE, U+2FA11->U+4CF8, U+2FA12->U+2A105, U+2FA13->U+2A20E, U+2FA14->U+2A291, U+2FA15->U+9EBB, U+2FA16->U+4D56, +U+2FA17->U+9EF9, U+2FA18->U+9EFE, U+2FA19->U+9F05, U+2FA1A->U+9F0F, U+2FA1B->U+9F16, U+2FA1C->U+9F3B, U+2FA1D->U+2A600, +U+2F00->U+4E00, U+2F01->U+4E28, U+2F02->U+4E36, U+2F03->U+4E3F, U+2F04->U+4E59, U+2F05->U+4E85, U+2F06->U+4E8C, U+2F07->U+4EA0, +U+2F08->U+4EBA, U+2F09->U+513F, U+2F0A->U+5165, U+2F0B->U+516B, U+2F0C->U+5182, U+2F0D->U+5196, U+2F0E->U+51AB, U+2F0F->U+51E0, +U+2F10->U+51F5, U+2F11->U+5200, U+2F12->U+529B, U+2F13->U+52F9, U+2F14->U+5315, U+2F15->U+531A, U+2F16->U+5338, U+2F17->U+5341, +U+2F18->U+535C, U+2F19->U+5369, U+2F1A->U+5382, U+2F1B->U+53B6, U+2F1C->U+53C8, U+2F1D->U+53E3, U+2F1E->U+56D7, U+2F1F->U+571F, +U+2F20->U+58EB, U+2F21->U+5902, U+2F22->U+590A, U+2F23->U+5915, U+2F24->U+5927, U+2F25->U+5973, U+2F26->U+5B50, U+2F27->U+5B80, +U+2F28->U+5BF8, U+2F29->U+5C0F, U+2F2A->U+5C22, U+2F2B->U+5C38, U+2F2C->U+5C6E, U+2F2D->U+5C71, U+2F2E->U+5DDB, U+2F2F->U+5DE5, +U+2F30->U+5DF1, U+2F31->U+5DFE, U+2F32->U+5E72, U+2F33->U+5E7A, U+2F34->U+5E7F, U+2F35->U+5EF4, U+2F36->U+5EFE, U+2F37->U+5F0B, +U+2F38->U+5F13, U+2F39->U+5F50, U+2F3A->U+5F61, U+2F3B->U+5F73, U+2F3C->U+5FC3, U+2F3D->U+6208, U+2F3E->U+6236, U+2F3F->U+624B, +U+2F40->U+652F, U+2F41->U+6534, U+2F42->U+6587, U+2F43->U+6597, U+2F44->U+65A4, U+2F45->U+65B9, 
U+2F46->U+65E0, U+2F47->U+65E5, +U+2F48->U+66F0, U+2F49->U+6708, U+2F4A->U+6728, U+2F4B->U+6B20, U+2F4C->U+6B62, U+2F4D->U+6B79, U+2F4E->U+6BB3, U+2F4F->U+6BCB, +U+2F50->U+6BD4, U+2F51->U+6BDB, U+2F52->U+6C0F, U+2F53->U+6C14, U+2F54->U+6C34, U+2F55->U+706B, U+2F56->U+722A, U+2F57->U+7236, +U+2F58->U+723B, U+2F59->U+723F, U+2F5A->U+7247, U+2F5B->U+7259, U+2F5C->U+725B, U+2F5D->U+72AC, U+2F5E->U+7384, U+2F5F->U+7389, +U+2F60->U+74DC, U+2F61->U+74E6, U+2F62->U+7518, U+2F63->U+751F, U+2F64->U+7528, U+2F65->U+7530, U+2F66->U+758B, U+2F67->U+7592, +U+2F68->U+7676, U+2F69->U+767D, U+2F6A->U+76AE, U+2F6B->U+76BF, U+2F6C->U+76EE, U+2F6D->U+77DB, U+2F6E->U+77E2, U+2F6F->U+77F3, +U+2F70->U+793A, U+2F71->U+79B8, U+2F72->U+79BE, U+2F73->U+7A74, U+2F74->U+7ACB, U+2F75->U+7AF9, U+2F76->U+7C73, U+2F77->U+7CF8, +U+2F78->U+7F36, U+2F79->U+7F51, U+2F7A->U+7F8A, U+2F7B->U+7FBD, U+2F7C->U+8001, U+2F7D->U+800C, U+2F7E->U+8012, U+2F7F->U+8033, +U+2F80->U+807F, U+2F81->U+8089, U+2F82->U+81E3, U+2F83->U+81EA, U+2F84->U+81F3, U+2F85->U+81FC, U+2F86->U+820C, U+2F87->U+821B, +U+2F88->U+821F, U+2F89->U+826E, U+2F8A->U+8272, U+2F8B->U+8278, U+2F8C->U+864D, U+2F8D->U+866B, U+2F8E->U+8840, U+2F8F->U+884C, +U+2F90->U+8863, U+2F91->U+897E, U+2F92->U+898B, U+2F93->U+89D2, U+2F94->U+8A00, U+2F95->U+8C37, U+2F96->U+8C46, U+2F97->U+8C55, +U+2F98->U+8C78, U+2F99->U+8C9D, U+2F9A->U+8D64, U+2F9B->U+8D70, U+2F9C->U+8DB3, U+2F9D->U+8EAB, U+2F9E->U+8ECA, U+2F9F->U+8F9B, +U+2FA0->U+8FB0, U+2FA1->U+8FB5, U+2FA2->U+9091, U+2FA3->U+9149, U+2FA4->U+91C6, U+2FA5->U+91CC, U+2FA6->U+91D1, U+2FA7->U+9577, +U+2FA8->U+9580, U+2FA9->U+961C, U+2FAA->U+96B6, U+2FAB->U+96B9, U+2FAC->U+96E8, U+2FAD->U+9751, U+2FAE->U+975E, U+2FAF->U+9762, +U+2FB0->U+9769, U+2FB1->U+97CB, U+2FB2->U+97ED, U+2FB3->U+97F3, U+2FB4->U+9801, U+2FB5->U+98A8, U+2FB6->U+98DB, U+2FB7->U+98DF, +U+2FB8->U+9996, U+2FB9->U+9999, U+2FBA->U+99AC, U+2FBB->U+9AA8, U+2FBC->U+9AD8, U+2FBD->U+9ADF, U+2FBE->U+9B25, U+2FBF->U+9B2F, +U+2FC0->U+9B32, U+2FC1->U+9B3C, U+2FC2->U+9B5A, U+2FC3->U+9CE5, U+2FC4->U+9E75, U+2FC5->U+9E7F, U+2FC6->U+9EA5, U+2FC7->U+9EBB, +U+2FC8->U+9EC3, U+2FC9->U+9ECD, U+2FCA->U+9ED1, U+2FCB->U+9EF9, U+2FCC->U+9EFD, U+2FCD->U+9F0E, U+2FCE->U+9F13, U+2FCF->U+9F20, +U+2FD0->U+9F3B, U+2FD1->U+9F4A, U+2FD2->U+9F52, U+2FD3->U+9F8D, U+2FD4->U+9F9C, U+2FD5->U+9FA0, U+3042->U+3041, U+3044->U+3043, +U+3046->U+3045, U+3048->U+3047, U+304A->U+3049, U+304C->U+304B, U+304E->U+304D, U+3050->U+304F, U+3052->U+3051, U+3054->U+3053, +U+3056->U+3055, U+3058->U+3057, U+305A->U+3059, U+305C->U+305B, U+305E->U+305D, U+3060->U+305F, U+3062->U+3061, U+3064->U+3063, +U+3065->U+3063, U+3067->U+3066, U+3069->U+3068, U+3070->U+306F, U+3071->U+306F, U+3073->U+3072, U+3074->U+3072, U+3076->U+3075, +U+3077->U+3075, U+3079->U+3078, U+307A->U+3078, U+307C->U+307B, U+307D->U+307B, U+3084->U+3083, U+3086->U+3085, U+3088->U+3087, +U+308F->U+308E, U+3094->U+3046, U+3095->U+304B, U+3096->U+3051, U+30A2->U+30A1, U+30A4->U+30A3, U+30A6->U+30A5, U+30A8->U+30A7, +U+30AA->U+30A9, U+30AC->U+30AB, U+30AE->U+30AD, U+30B0->U+30AF, U+30B2->U+30B1, U+30B4->U+30B3, U+30B6->U+30B5, U+30B8->U+30B7, +U+30BA->U+30B9, U+30BC->U+30BB, U+30BE->U+30BD, U+30C0->U+30BF, U+30C2->U+30C1, U+30C5->U+30C4, U+30C7->U+30C6, U+30C9->U+30C8, +U+30D0->U+30CF, U+30D1->U+30CF, U+30D3->U+30D2, U+30D4->U+30D2, U+30D6->U+30D5, U+30D7->U+30D5, U+30D9->U+30D8, U+30DA->U+30D8, +U+30DC->U+30DB, U+30DD->U+30DB, U+30E4->U+30E3, U+30E6->U+30E5, U+30E8->U+30E7, U+30EF->U+30EE, U+30F4->U+30A6, U+30AB->U+30F5, +U+30B1->U+30F6, U+30F7->U+30EF, 
U+30F8->U+30F0, U+30F9->U+30F1, U+30FA->U+30F2, U+30AF->U+31F0, U+30B7->U+31F1, U+30B9->U+31F2, +U+30C8->U+31F3, U+30CC->U+31F4, U+30CF->U+31F5, U+30D2->U+31F6, U+30D5->U+31F7, U+30D8->U+31F8, U+30DB->U+31F9, U+30E0->U+31FA, +U+30E9->U+31FB, U+30EA->U+31FC, U+30EB->U+31FD, U+30EC->U+31FE, U+30ED->U+31FF, U+FF66->U+30F2, U+FF67->U+30A1, U+FF68->U+30A3, +U+FF69->U+30A5, U+FF6A->U+30A7, U+FF6B->U+30A9, U+FF6C->U+30E3, U+FF6D->U+30E5, U+FF6E->U+30E7, U+FF6F->U+30C3, U+FF71->U+30A1, +U+FF72->U+30A3, U+FF73->U+30A5, U+FF74->U+30A7, U+FF75->U+30A9, U+FF76->U+30AB, U+FF77->U+30AD, U+FF78->U+30AF, U+FF79->U+30B1, +U+FF7A->U+30B3, U+FF7B->U+30B5, U+FF7C->U+30B7, U+FF7D->U+30B9, U+FF7E->U+30BB, U+FF7F->U+30BD, U+FF80->U+30BF, U+FF81->U+30C1, +U+FF82->U+30C3, U+FF83->U+30C6, U+FF84->U+30C8, U+FF85->U+30CA, U+FF86->U+30CB, U+FF87->U+30CC, U+FF88->U+30CD, U+FF89->U+30CE, +U+FF8A->U+30CF, U+FF8B->U+30D2, U+FF8C->U+30D5, U+FF8D->U+30D8, U+FF8E->U+30DB, U+FF8F->U+30DE, U+FF90->U+30DF, U+FF91->U+30E0, +U+FF92->U+30E1, U+FF93->U+30E2, U+FF94->U+30E3, U+FF95->U+30E5, U+FF96->U+30E7, U+FF97->U+30E9, U+FF98->U+30EA, U+FF99->U+30EB, +U+FF9A->U+30EC, U+FF9B->U+30ED, U+FF9C->U+30EF, U+FF9D->U+30F3, U+FFA0->U+3164, U+FFA1->U+3131, U+FFA2->U+3132, U+FFA3->U+3133, +U+FFA4->U+3134, U+FFA5->U+3135, U+FFA6->U+3136, U+FFA7->U+3137, U+FFA8->U+3138, U+FFA9->U+3139, U+FFAA->U+313A, U+FFAB->U+313B, +U+FFAC->U+313C, U+FFAD->U+313D, U+FFAE->U+313E, U+FFAF->U+313F, U+FFB0->U+3140, U+FFB1->U+3141, U+FFB2->U+3142, U+FFB3->U+3143, +U+FFB4->U+3144, U+FFB5->U+3145, U+FFB6->U+3146, U+FFB7->U+3147, U+FFB8->U+3148, U+FFB9->U+3149, U+FFBA->U+314A, U+FFBB->U+314B, +U+FFBC->U+314C, U+FFBD->U+314D, U+FFBE->U+314E, U+FFC2->U+314F, U+FFC3->U+3150, U+FFC4->U+3151, U+FFC5->U+3152, U+FFC6->U+3153, +U+FFC7->U+3154, U+FFCA->U+3155, U+FFCB->U+3156, U+FFCC->U+3157, U+FFCD->U+3158, U+FFCE->U+3159, U+FFCF->U+315A, U+FFD2->U+315B, +U+FFD3->U+315C, U+FFD4->U+315D, U+FFD5->U+315E, U+FFD6->U+315F, U+FFD7->U+3160, U+FFDA->U+3161, U+FFDB->U+3162, U+FFDC->U+3163, +U+3131->U+1100, U+3132->U+1101, U+3133->U+11AA, U+3134->U+1102, U+3135->U+11AC, U+3136->U+11AD, U+3137->U+1103, U+3138->U+1104, +U+3139->U+1105, U+313A->U+11B0, U+313B->U+11B1, U+313C->U+11B2, U+313D->U+11B3, U+313E->U+11B4, U+313F->U+11B5, U+3140->U+111A, +U+3141->U+1106, U+3142->U+1107, U+3143->U+1108, U+3144->U+1121, U+3145->U+1109, U+3146->U+110A, U+3147->U+110B, U+3148->U+110C, +U+3149->U+110D, U+314A->U+110E, U+314B->U+110F, U+314C->U+1110, U+314D->U+1111, U+314E->U+1112, U+314F->U+1161, U+3150->U+1162, +U+3151->U+1163, U+3152->U+1164, U+3153->U+1165, U+3154->U+1166, U+3155->U+1167, U+3156->U+1168, U+3157->U+1169, U+3158->U+116A, +U+3159->U+116B, U+315A->U+116C, U+315B->U+116D, U+315C->U+116E, U+315D->U+116F, U+315E->U+1170, U+315F->U+1171, U+3160->U+1172, +U+3161->U+1173, U+3162->U+1174, U+3163->U+1175, U+3165->U+1114, U+3166->U+1115, U+3167->U+11C7, U+3168->U+11C8, U+3169->U+11CC, +U+316A->U+11CE, U+316B->U+11D3, U+316C->U+11D7, U+316D->U+11D9, U+316E->U+111C, U+316F->U+11DD, U+3170->U+11DF, U+3171->U+111D, +U+3172->U+111E, U+3173->U+1120, U+3174->U+1122, U+3175->U+1123, U+3176->U+1127, U+3177->U+1129, U+3178->U+112B, U+3179->U+112C, +U+317A->U+112D, U+317B->U+112E, U+317C->U+112F, U+317D->U+1132, U+317E->U+1136, U+317F->U+1140, U+3180->U+1147, U+3181->U+114C, +U+3182->U+11F1, U+3183->U+11F2, U+3184->U+1157, U+3185->U+1158, U+3186->U+1159, U+3187->U+1184, U+3188->U+1185, U+3189->U+1188, +U+318A->U+1191, U+318B->U+1192, U+318C->U+1194, U+318D->U+119E, U+318E->U+11A1, U+A490->U+A408, 
U+A491->U+A1B9, U+4E00..U+9FBB, +U+3400..U+4DB5, U+20000..U+2A6D6, U+FA0E, U+FA0F, U+FA11, U+FA13, U+FA14, U+FA1F, U+FA21, U+FA23, U+FA24, U+FA27, U+FA28, U+FA29, +U+3105..U+312C, U+31A0..U+31B7, U+3041, U+3043, U+3045, U+3047, U+3049, U+304B, U+304D, U+304F, U+3051, U+3053, U+3055, U+3057, +U+3059, U+305B, U+305D, U+305F, U+3061, U+3063, U+3066, U+3068, U+306A..U+306F, U+3072, U+3075, U+3078, U+307B, U+307E..U+3083, +U+3085, U+3087, U+3089..U+308E, U+3090..U+3093, U+30A1, U+30A3, U+30A5, U+30A7, U+30A9, U+30AD, U+30AF, U+30B3, U+30B5, U+30BB, +U+30BD, U+30BF, U+30C1, U+30C3, U+30C4, U+30C6, U+30CA, U+30CB, U+30CD, U+30CE, U+30DE, U+30DF, U+30E1, U+30E2, U+30E3, U+30E5, +U+30E7, U+30EE, U+30F0..U+30F3, U+30F5, U+30F6, U+31F0, U+31F1, U+31F2, U+31F3, U+31F4, U+31F5, U+31F6, U+31F7, U+31F8, U+31F9, +U+31FA, U+31FB, U+31FC, U+31FD, U+31FE, U+31FF, U+AC00..U+D7A3, U+1100..U+1159, U+1161..U+11A2, U+11A8..U+11F9, U+A000..U+A48C, +U+A492..U+A4C6 + +################################################## +# Coptic +# Notes: Some shared Greek characters, may require amendments. +U+2C80->U+2C81, U+2C81, U+2C82->U+2C83, U+2C83, U+2C84->U+2C85, U+2C85, U+2C86->U+2C87, U+2C87, U+2C88->U+2C89, U+2C89, U+2C8A->U+2C8B, +U+2C8B, U+2C8C->U+2C8D, U+2C8D, U+2C8E->U+2C8F, U+2C8F, U+2C90->U+2C91, U+2C91, U+2C92->U+2C93, U+2C93, U+2C94->U+2C95, U+2C95, +U+2C96->U+2C97, U+2C97, U+2C98->U+2C99, U+2C99, U+2C9A->U+2C9B, U+2C9B, U+2C9C->U+2C9D, U+2C9D, U+2C9E->U+2C9F, U+2C9F, U+2CA0->U+2CA1, +U+2CA1, U+2CA2->U+2CA3, U+2CA3, U+2CA4->U+2CA5, U+2CA5, U+2CA6->U+2CA7, U+2CA7, U+2CA8->U+2CA9, U+2CA9, U+2CAA->U+2CAB, U+2CAB, +U+2CAC->U+2CAD, U+2CAD, U+2CAE->U+2CAF, U+2CAF, U+2CB0->U+2CB1, U+2CB1, U+2CB2->U+2CB3, U+2CB3, U+2CB4->U+2CB5, U+2CB5, +U+2CB6->U+2CB7, U+2CB7, U+2CB8->U+2CB9, U+2CB9, U+2CBA->U+2CBB, U+2CBB, U+2CBC->U+2CBD, U+2CBD, U+2CBE->U+2CBF, U+2CBF, +U+2CC0->U+2CC1, U+2CC1, U+2CC2->U+2CC3, U+2CC3, U+2CC4->U+2CC5, U+2CC5, U+2CC6->U+2CC7, U+2CC7, U+2CC8->U+2CC9, U+2CC9, +U+2CCA->U+2CCB, U+2CCB, U+2CCC->U+2CCD, U+2CCD, U+2CCE->U+2CCF, U+2CCF, U+2CD0->U+2CD1, U+2CD1, U+2CD2->U+2CD3, U+2CD3, +U+2CD4->U+2CD5, U+2CD5, U+2CD6->U+2CD7, U+2CD7, U+2CD8->U+2CD9, U+2CD9, U+2CDA->U+2CDB, U+2CDB, U+2CDC->U+2CDD, U+2CDD, +U+2CDE->U+2CDF, U+2CDF, U+2CE0->U+2CE1, U+2CE1, U+2CE2->U+2CE3, U+2CE3 + +################################################## +# Cryllic* +U+0400->U+0435, U+0401->U+0435, U+0402->U+0452, U+0452, U+0403->U+0433, U+0404->U+0454, U+0454, U+0405->U+0455, U+0455, +U+0406->U+0456, U+0407->U+0456, U+0457->U+0456, U+0456, U+0408..U+040B->U+0458..U+045B, U+0458..U+045B, U+040C->U+043A, +U+040D->U+0438, U+040E->U+0443, U+040F->U+045F, U+045F, U+0450->U+0435, U+0451->U+0435, U+0453->U+0433, U+045C->U+043A, +U+045D->U+0438, U+045E->U+0443, U+0460->U+0461, U+0461, U+0462->U+0463, U+0463, U+0464->U+0465, U+0465, U+0466->U+0467, +U+0467, U+0468->U+0469, U+0469, U+046A->U+046B, U+046B, U+046C->U+046D, U+046D, U+046E->U+046F, U+046F, U+0470->U+0471, +U+0471, U+0472->U+0473, U+0473, U+0474->U+0475, U+0476->U+0475, U+0477->U+0475, U+0475, U+0478->U+0479, U+0479, U+047A->U+047B, +U+047B, U+047C->U+047D, U+047D, U+047E->U+047F, U+047F, U+0480->U+0481, U+0481, U+048A->U+0438, U+048B->U+0438, U+048C->U+044C, +U+048D->U+044C, U+048E->U+0440, U+048F->U+0440, U+0490->U+0433, U+0491->U+0433, U+0490->U+0433, U+0491->U+0433, U+0492->U+0433, +U+0493->U+0433, U+0494->U+0433, U+0495->U+0433, U+0496->U+0436, U+0497->U+0436, U+0498->U+0437, U+0499->U+0437, U+049A->U+043A, +U+049B->U+043A, U+049C->U+043A, U+049D->U+043A, U+049E->U+043A, 
U+049F->U+043A, U+04A0->U+043A, U+04A1->U+043A, U+04A2->U+043D, +U+04A3->U+043D, U+04A4->U+043D, U+04A5->U+043D, U+04A6->U+043F, U+04A7->U+043F, U+04A8->U+04A9, U+04A9, U+04AA->U+0441, +U+04AB->U+0441, U+04AC->U+0442, U+04AD->U+0442, U+04AE->U+0443, U+04AF->U+0443, U+04B0->U+0443, U+04B1->U+0443, U+04B2->U+0445, +U+04B3->U+0445, U+04B4->U+04B5, U+04B5, U+04B6->U+0447, U+04B7->U+0447, U+04B8->U+0447, U+04B9->U+0447, U+04BA->U+04BB, U+04BB, +U+04BC->U+04BD, U+04BE->U+04BD, U+04BF->U+04BD, U+04BD, U+04C0->U+04CF, U+04CF, U+04C1->U+0436, U+04C2->U+0436, U+04C3->U+043A, +U+04C4->U+043A, U+04C5->U+043B, U+04C6->U+043B, U+04C7->U+043D, U+04C8->U+043D, U+04C9->U+043D, U+04CA->U+043D, U+04CB->U+0447, +U+04CC->U+0447, U+04CD->U+043C, U+04CE->U+043C, U+04D0->U+0430, U+04D1->U+0430, U+04D2->U+0430, U+04D3->U+0430, U+04D4->U+00E6, +U+04D5->U+00E6, U+04D6->U+0435, U+04D7->U+0435, U+04D8->U+04D9, U+04DA->U+04D9, U+04DB->U+04D9, U+04D9, U+04DC->U+0436, +U+04DD->U+0436, U+04DE->U+0437, U+04DF->U+0437, U+04E0->U+04E1, U+04E1, U+04E2->U+0438, U+04E3->U+0438, U+04E4->U+0438, +U+04E5->U+0438, U+04E6->U+043E, U+04E7->U+043E, U+04E8->U+043E, U+04E9->U+043E, U+04EA->U+043E, U+04EB->U+043E, U+04EC->U+044D, +U+04ED->U+044D, U+04EE->U+0443, U+04EF->U+0443, U+04F0->U+0443, U+04F1->U+0443, U+04F2->U+0443, U+04F3->U+0443, U+04F4->U+0447, +U+04F5->U+0447, U+04F6->U+0433, U+04F7->U+0433, U+04F8->U+044B, U+04F9->U+044B, U+04FA->U+0433, U+04FB->U+0433, U+04FC->U+0445, +U+04FD->U+0445, U+04FE->U+0445, U+04FF->U+0445, U+0410..U+0418->U+0430..U+0438, U+0419->U+0438, U+0430..U+0438, +U+041A..U+042F->U+043A..U+044F, U+043A..U+044F + +################################################## +# Devanagari +U+0929->U+0928, U+0931->U+0930, U+0934->U+0933, U+0958->U+0915, U+0959->U+0916, U+095A->U+0917, U+095B->U+091C, U+095C->U+0921, +U+095D->U+0922, U+095E->U+092B, U+095F->U+092F, U+0904..U+0928, U+092A..U+0930, U+0932, U+0933, U+0935..U+0939, U+0960, U+0961, +U+0966..U+096F, U+097B..U+097F + +################################################## +# Georgian +U+10FC->U+10DC, U+10D0..U+10FA, U+10A0..U+10C5->U+2D00..U+2D25, U+2D00..U+2D25 + +################################################## +# Greek +U+0386->U+03B1, U+0388->U+03B5, U+0389->U+03B7, U+038A->U+03B9, U+038C->U+03BF, U+038E->U+03C5, U+038F->U+03C9, U+0390->U+03B9, +U+03AA->U+03B9, U+03AB->U+03C5, U+03AC->U+03B1, U+03AD->U+03B5, U+03AE->U+03B7, U+03AF->U+03B9, U+03B0->U+03C5, U+03CA->U+03B9, +U+03CB->U+03C5, U+03CC->U+03BF, U+03CD->U+03C5, U+03CE->U+03C9, U+03D0->U+03B2, U+03D1->U+03B8, U+03D2->U+03C5, U+03D3->U+03C5, +U+03D4->U+03C5, U+03D5->U+03C6, U+03D6->U+03C0, U+03D8->U+03D9, U+03DA->U+03DB, U+03DC->U+03DD, U+03DE->U+03DF, U+03E0->U+03E1, +U+03E2->U+03E3, U+03E4->U+03E5, U+03E6->U+03E7, U+03E8->U+03E9, U+03EA->U+03EB, U+03EC->U+03ED, U+03EE->U+03EF, U+03F0->U+03BA, +U+03F1->U+03C1, U+03F2->U+03C3, U+03F4->U+03B8, U+03F5->U+03B5, U+03F6->U+03B5, U+03F7->U+03F8, U+03F9->U+03C3, U+03FA->U+03FB, +U+1F00->U+03B1, U+1F01->U+03B1, U+1F02->U+03B1, U+1F03->U+03B1, U+1F04->U+03B1, U+1F05->U+03B1, U+1F06->U+03B1, U+1F07->U+03B1, +U+1F08->U+03B1, U+1F09->U+03B1, U+1F0A->U+03B1, U+1F0B->U+03B1, U+1F0C->U+03B1, U+1F0D->U+03B1, U+1F0E->U+03B1, U+1F0F->U+03B1, +U+1F10->U+03B5, U+1F11->U+03B5, U+1F12->U+03B5, U+1F13->U+03B5, U+1F14->U+03B5, U+1F15->U+03B5, U+1F18->U+03B5, U+1F19->U+03B5, +U+1F1A->U+03B5, U+1F1B->U+03B5, U+1F1C->U+03B5, U+1F1D->U+03B5, U+1F20->U+03B7, U+1F21->U+03B7, U+1F22->U+03B7, U+1F23->U+03B7, +U+1F24->U+03B7, U+1F25->U+03B7, U+1F26->U+03B7, U+1F27->U+03B7, 
U+1F28->U+03B7, U+1F29->U+03B7, U+1F2A->U+03B7, U+1F2B->U+03B7, +U+1F2C->U+03B7, U+1F2D->U+03B7, U+1F2E->U+03B7, U+1F2F->U+03B7, U+1F30->U+03B9, U+1F31->U+03B9, U+1F32->U+03B9, U+1F33->U+03B9, +U+1F34->U+03B9, U+1F35->U+03B9, U+1F36->U+03B9, U+1F37->U+03B9, U+1F38->U+03B9, U+1F39->U+03B9, U+1F3A->U+03B9, U+1F3B->U+03B9, +U+1F3C->U+03B9, U+1F3D->U+03B9, U+1F3E->U+03B9, U+1F3F->U+03B9, U+1F40->U+03BF, U+1F41->U+03BF, U+1F42->U+03BF, U+1F43->U+03BF, +U+1F44->U+03BF, U+1F45->U+03BF, U+1F48->U+03BF, U+1F49->U+03BF, U+1F4A->U+03BF, U+1F4B->U+03BF, U+1F4C->U+03BF, U+1F4D->U+03BF, +U+1F50->U+03C5, U+1F51->U+03C5, U+1F52->U+03C5, U+1F53->U+03C5, U+1F54->U+03C5, U+1F55->U+03C5, U+1F56->U+03C5, U+1F57->U+03C5, +U+1F59->U+03C5, U+1F5B->U+03C5, U+1F5D->U+03C5, U+1F5F->U+03C5, U+1F60->U+03C9, U+1F61->U+03C9, U+1F62->U+03C9, U+1F63->U+03C9, +U+1F64->U+03C9, U+1F65->U+03C9, U+1F66->U+03C9, U+1F67->U+03C9, U+1F68->U+03C9, U+1F69->U+03C9, U+1F6A->U+03C9, U+1F6B->U+03C9, +U+1F6C->U+03C9, U+1F6D->U+03C9, U+1F6E->U+03C9, U+1F6F->U+03C9, U+1F70->U+03B1, U+1F71->U+03B1, U+1F72->U+03B5, U+1F73->U+03B5, +U+1F74->U+03B7, U+1F75->U+03B7, U+1F76->U+03B9, U+1F77->U+03B9, U+1F78->U+03BF, U+1F79->U+03BF, U+1F7A->U+03C5, U+1F7B->U+03C5, +U+1F7C->U+03C9, U+1F7D->U+03C9, U+1F80->U+03B1, U+1F81->U+03B1, U+1F82->U+03B1, U+1F83->U+03B1, U+1F84->U+03B1, U+1F85->U+03B1, +U+1F86->U+03B1, U+1F87->U+03B1, U+1F88->U+03B1, U+1F89->U+03B1, U+1F8A->U+03B1, U+1F8B->U+03B1, U+1F8C->U+03B1, U+1F8D->U+03B1, +U+1F8E->U+03B1, U+1F8F->U+03B1, U+1F90->U+03B7, U+1F91->U+03B7, U+1F92->U+03B7, U+1F93->U+03B7, U+1F94->U+03B7, U+1F95->U+03B7, +U+1F96->U+03B7, U+1F97->U+03B7, U+1F98->U+03B7, U+1F99->U+03B7, U+1F9A->U+03B7, U+1F9B->U+03B7, U+1F9C->U+03B7, U+1F9D->U+03B7, +U+1F9E->U+03B7, U+1F9F->U+03B7, U+1FA0->U+03C9, U+1FA1->U+03C9, U+1FA2->U+03C9, U+1FA3->U+03C9, U+1FA4->U+03C9, U+1FA5->U+03C9, +U+1FA6->U+03C9, U+1FA7->U+03C9, U+1FA8->U+03C9, U+1FA9->U+03C9, U+1FAA->U+03C9, U+1FAB->U+03C9, U+1FAC->U+03C9, U+1FAD->U+03C9, +U+1FAE->U+03C9, U+1FAF->U+03C9, U+1FB0->U+03B1, U+1FB1->U+03B1, U+1FB2->U+03B1, U+1FB3->U+03B1, U+1FB4->U+03B1, U+1FB6->U+03B1, +U+1FB7->U+03B1, U+1FB8->U+03B1, U+1FB9->U+03B1, U+1FBA->U+03B1, U+1FBB->U+03B1, U+1FBC->U+03B1, U+1FC2->U+03B7, U+1FC3->U+03B7, +U+1FC4->U+03B7, U+1FC6->U+03B7, U+1FC7->U+03B7, U+1FC8->U+03B5, U+1FC9->U+03B5, U+1FCA->U+03B7, U+1FCB->U+03B7, U+1FCC->U+03B7, +U+1FD0->U+03B9, U+1FD1->U+03B9, U+1FD2->U+03B9, U+1FD3->U+03B9, U+1FD6->U+03B9, U+1FD7->U+03B9, U+1FD8->U+03B9, U+1FD9->U+03B9, +U+1FDA->U+03B9, U+1FDB->U+03B9, U+1FE0->U+03C5, U+1FE1->U+03C5, U+1FE2->U+03C5, U+1FE3->U+03C5, U+1FE4->U+03C1, U+1FE5->U+03C1, +U+1FE6->U+03C5, U+1FE7->U+03C5, U+1FE8->U+03C5, U+1FE9->U+03C5, U+1FEA->U+03C5, U+1FEB->U+03C5, U+1FEC->U+03C1, U+1FF2->U+03C9, +U+1FF3->U+03C9, U+1FF4->U+03C9, U+1FF6->U+03C9, U+1FF7->U+03C9, U+1FF8->U+03BF, U+1FF9->U+03BF, U+1FFA->U+03C9, U+1FFB->U+03C9, +U+1FFC->U+03C9, U+0391..U+03A1->U+03B1..U+03C1, U+03B1..U+03C1, U+03A3..U+03A9->U+03C3..U+03C9, U+03C3..U+03C9, U+03C2, U+03D9, +U+03DB, U+03DD, U+03DF, U+03E1, U+03E3, U+03E5, U+03E7, U+03E9, U+03EB, U+03ED, U+03EF, U+03F3, U+03F8, U+03FB + +################################################## +# Gujarati +U+0A85..U+0A8C, U+0A8F, U+0A90, U+0A93..U+0AB0, U+0AB2, U+0AB3, U+0AB5..U+0AB9, U+0AE0, U+0AE1, U+0AE6..U+0AEF + +################################################## +# Gurmukhi +U+0A33->U+0A32, U+0A36->U+0A38, U+0A59->U+0A16, U+0A5A->U+0A17, U+0A5B->U+0A1C, U+0A5E->U+0A2B, U+0A05..U+0A0A, U+0A0F, U+0A10, +U+0A13..U+0A28, U+0A2A..U+0A30, 
U+0A32, U+0A35, U+0A38, U+0A39, U+0A5C, U+0A66..U+0A6F
+
+#################################################
+# Hebrew*
+U+FB1D->U+05D9, U+FB1F->U+05F2, U+FB20->U+05E2, U+FB21->U+05D0, U+FB22->U+05D3, U+FB23->U+05D4, U+FB24->U+05DB, U+FB25->U+05DC,
+U+FB26->U+05DD, U+FB27->U+05E8, U+FB28->U+05EA, U+FB2A->U+05E9, U+FB2B->U+05E9, U+FB2C->U+05E9, U+FB2D->U+05E9, U+FB2E->U+05D0,
+U+FB2F->U+05D0, U+FB30->U+05D0, U+FB31->U+05D1, U+FB32->U+05D2, U+FB33->U+05D3, U+FB34->U+05D4, U+FB35->U+05D5, U+FB36->U+05D6,
+U+FB38->U+05D8, U+FB39->U+05D9, U+FB3A->U+05DA, U+FB3B->U+05DB, U+FB3C->U+05DC, U+FB3E->U+05DE, U+FB40->U+05E0, U+FB41->U+05E1,
+U+FB43->U+05E3, U+FB44->U+05E4, U+FB46->U+05E6, U+FB47->U+05E7, U+FB48->U+05E8, U+FB49->U+05E9, U+FB4A->U+05EA, U+FB4B->U+05D5,
+U+FB4C->U+05D1, U+FB4D->U+05DB, U+FB4E->U+05E4, U+FB4F->U+05D0, U+05D0..U+05F2
+
+#################################################
+# Kannada
+U+0C85..U+0C8C, U+0C8E..U+0C90, U+0C92..U+0CA8, U+0CAA..U+0CB3, U+0CB5..U+0CB9, U+0CE0, U+0CE1, U+0CE6..U+0CEF
+
+#################################################
+# Limbu
+U+1900..U+191C, U+1930..U+1938, U+1946..U+194F
+
+#################################################
+# Malayalam
+U+0D05..U+0D0C, U+0D0E..U+0D10, U+0D12..U+0D28, U+0D2A..U+0D39, U+0D60, U+0D61, U+0D66..U+0D6F
+
+#################################################
+# Tamil
+U+0B94->U+0B92, U+0B85..U+0B8A, U+0B8E..U+0B90, U+0B92, U+0B93, U+0B95, U+0B99, U+0B9A, U+0B9C, U+0B9E, U+0B9F, U+0BA3, U+0BA4,
+U+0BA8..U+0BAA, U+0BAE..U+0BB9, U+0BE6..U+0BEF
+
+#################################################
+# Thai
+U+0E01..U+0E30, U+0E32, U+0E33, U+0E40..U+0E46, U+0E50..U+0E5B
+
+##################################################
+# Common
+U+FF10..U+FF19->0..9, U+FF21..U+FF3A->a..z, U+FF41..U+FF5A->a..z, 0..9, A..Z->a..z, a..z
+"""
+
+# The expected value format is a comma-separated list of mappings.
+# The two simplest mappings declare a character as valid, and map a single character
+# to another single character, respectively. But specifying the whole table in such
+# form would result in bloated and barely manageable specifications. So there are
+# several syntax shortcuts that let you map ranges of characters at once. The complete
+# list is as follows:
+#
+# A->a
+#     Single char mapping, declares source char 'A' as allowed to occur within keywords
+#     and maps it to destination char 'a' (but does not declare 'a' as allowed).
+# A..Z->a..z
+#     Range mapping, declares all chars in source range as allowed and maps them to
+#     the destination range. Does not declare destination range as allowed. Also checks
+#     ranges' lengths (the lengths must be equal).
+# a
+#     Stray char mapping, declares a character as allowed and maps it to itself.
+#     Equivalent to a->a single char mapping.
+# a..z
+#     Stray range mapping, declares all characters in range as allowed and maps them to
+#     themselves. Equivalent to a..z->a..z range mapping.
+# A..Z/2
+#     Checkerboard range map. Maps every pair of chars to the second char.
+#     More formally, declares odd characters in range as allowed and maps them to the
+#     even ones; also declares even characters as allowed and maps them to themselves.
+#     For instance, A..Z/2 is equivalent to A->B, B->B, C->D, D->D, ..., Y->Z, Z->Z.
+#     This mapping shortcut is helpful for a number of Unicode blocks where uppercase
+#     and lowercase letters go in such interleaved order instead of contiguous chunks.
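+
+# A minimal usage sketch (not part of the original table; it assumes the compat
+# helpers such as u() imported at the top of this module). The parser defined
+# below turns a charset table like the one above into the ordinal -> character
+# mapping that unicode.translate() expects, roughly like this:
+#
+#     table = charset_table_to_dict(u("U+00C0->a, U+0041..U+005A->a..z, a..z, 0..9"))
+#     table[0x00C0]        # u('a')   single char mapping
+#     table[ord('B')]      # u('b')   range mapping A..Z -> a..z
+#     table[ord('b')]      # u('b')   stray range: the char maps to itself
+#     table.get(ord('!'))  # None     '!' is not declared a word character
+#
+#     u("\u00C0bc").translate(table)   # u("abc")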
+
+_dewhite = re.compile(r"\s")
+_char = r"((?:U\+[0-9A-Fa-f]{4,6})|.)"
+_char_map = re.compile("^" + _char + "->" + _char + "$")
+_range_map = re.compile("^" + _char + r"\.\." + _char + "->" + _char + ".." + _char + "$")
+_stray_char = re.compile("^" + _char + "$")
+_stray_range = re.compile("^" + _char + r"\.\." + _char + "$")
+_checker_range = re.compile("^" + _char + r"\.\." + _char + "/2$")
+
+
+def charspec_to_int(string):
+    # Converts a character specification of the form 'A' or 'U+23BC'
+    # to an integer.
+    if string.startswith("U+"):
+        return int(string[2:], 16)
+    elif len(string) == 1:
+        return ord(string)
+    else:
+        raise Exception("Can't convert charspec: %r" % string)
+
+
+def charset_table_to_dict(tablestring):
+    """Takes a string with the contents of a Sphinx charset table file and
+    returns a mapping of the kind expected by the unicode.translate() method:
+    that is, it maps a character number to a unicode character, or to None if
+    the character is not a valid word character.
+
+    The Sphinx charset table format is described at
+    http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
+    """
+
+    map = defaultdict(lambda: None)
+    for line in tablestring.split("\n"):
+        if not line or line.startswith("#"):
+            continue
+        line = _dewhite.sub("", line)
+        for item in line.split(","):
+            if not item:
+                continue
+            match = _range_map.match(item)
+            if match:
+                start1 = charspec_to_int(match.group(1))
+                end1 = charspec_to_int(match.group(2))
+                start2 = charspec_to_int(match.group(3))
+                end2 = charspec_to_int(match.group(4))
+                assert (end1 - start1) == (end2 - start2)
+                try:
+                    for fromord, tooord in izip(xrange(start1, end1 + 1),
+                                                xrange(start2, end2 + 1)):
+                        map[fromord] = unichr(tooord)
+                except ValueError:
+                    pass
+                continue
+
+            match = _char_map.match(item)
+            if match:
+                fromord = charspec_to_int(match.group(1))
+                toord = charspec_to_int(match.group(2))
+                try:
+                    map[fromord] = unichr(toord)
+                except ValueError:
+                    pass
+                continue
+
+            match = _stray_char.match(item)
+            if match:
+                ord = charspec_to_int(match.group(0))
+                try:
+                    map[ord] = unichr(ord)
+                except ValueError:
+                    pass
+                continue
+
+            match = _stray_range.match(item)
+            if match:
+                start = charspec_to_int(match.group(1))
+                end = charspec_to_int(match.group(2))
+                try:
+                    for ord in xrange(start, end + 1):
+                        map[ord] = unichr(ord)
+                except ValueError:
+                    pass
+                continue
+
+            match = _checker_range.match(item)
+            if match:
+                fromord = charspec_to_int(match.group(1))
+                toord = charspec_to_int(match.group(2))
+                # A checkerboard range covers odd/even pairs, so its length
+                # must be even (that is, toord - fromord must be odd).
+                assert (toord - fromord) % 2 == 1
+                for ord in xrange(fromord, toord + 1, 2):
+                    try:
+                        map[ord] = unichr(ord + 1)
+                        map[ord + 1] = unichr(ord + 1)
+                    except ValueError:
+                        pass
+                continue
+
+            raise Exception("Don't know what to do with %r" % item)
+    return dict(map)
diff --git a/src/whoosh/support/levenshtein.py b/src/whoosh/support/levenshtein.py
new file mode 100644
index 0000000..dbe476c
--- /dev/null
+++ b/src/whoosh/support/levenshtein.py
@@ -0,0 +1,70 @@
+"""
+Contains functions implementing edit distance algorithms.
+"""
+
+from whoosh.compat import xrange
+
+
+def levenshtein(seq1, seq2, limit=None):
+    """Returns the Levenshtein edit distance between two strings.
+    """
+
+    oneago = None
+    thisrow = list(range(1, len(seq2) + 1)) + [0]
+    for x in xrange(len(seq1)):
+        # Python lists wrap around for negative indices, so put the
+        # leftmost column at the *end* of the list. This matches with
+        # the zero-indexed strings and saves extra calculation.
+ oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1] + for y in xrange(len(seq2)): + delcost = oneago[y] + 1 + addcost = thisrow[y - 1] + 1 + subcost = oneago[y - 1] + (seq1[x] != seq2[y]) + thisrow[y] = min(delcost, addcost, subcost) + + if limit and x > limit and min(thisrow) > limit: + return limit + 1 + + return thisrow[len(seq2) - 1] + + +def damerau_levenshtein(seq1, seq2, limit=None): + """Returns the Damerau-Levenshtein edit distance between two strings. + """ + + oneago = None + thisrow = list(range(1, len(seq2) + 1)) + [0] + for x in xrange(len(seq1)): + # Python lists wrap around for negative indices, so put the + # leftmost column at the *end* of the list. This matches with + # the zero-indexed strings and saves extra calculation. + twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] + for y in xrange(len(seq2)): + delcost = oneago[y] + 1 + addcost = thisrow[y - 1] + 1 + subcost = oneago[y - 1] + (seq1[x] != seq2[y]) + thisrow[y] = min(delcost, addcost, subcost) + # This block deals with transpositions + if (x > 0 and y > 0 and seq1[x] == seq2[y - 1] + and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]): + thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) + + if limit and x > limit and min(thisrow) > limit: + return limit + 1 + + return thisrow[len(seq2) - 1] + + +def relative(a, b): + """Returns the relative distance between two strings, in the range + [0-1] where 1 means total equality. + """ + + d = distance(a, b) + longer = float(max((len(a), len(b)))) + shorter = float(min((len(a), len(b)))) + r = ((longer - d) / longer) * (shorter / longer) + return r + + +distance = damerau_levenshtein diff --git a/src/whoosh/support/relativedelta.py b/src/whoosh/support/relativedelta.py new file mode 100644 index 0000000..23ca7ee --- /dev/null +++ b/src/whoosh/support/relativedelta.py @@ -0,0 +1,437 @@ +""" +Copyright (c) 2003-2010 Gustavo Niemeyer + +This module offers extensions to the standard python 2.3+ +datetime module. +""" +__author__ = "Gustavo Niemeyer " +__license__ = "PSF License" + +import datetime +import calendar + +__all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"] + + +class weekday(object): + __slots__ = ["weekday", "n"] + + def __init__(self, weekday, n=None): + self.weekday = weekday + self.n = n + + def __call__(self, n): + if n == self.n: + return self + else: + return self.__class__(self.weekday, n) + + def __eq__(self, other): + try: + if self.weekday != other.weekday or self.n != other.n: + return False + except AttributeError: + return False + return True + + def __repr__(self): + s = ("MO", "TU", "WE", "TH", "FR", "SA", "SU")[self.weekday] + if not self.n: + return s + else: + return "%s(%+d)" % (s, self.n) + +MO, TU, WE, TH, FR, SA, SU = weekdays = tuple([weekday(x) for x in range(7)]) + + +class relativedelta: + """ +The relativedelta type is based on the specification of the excellent +work done by M.-A. Lemburg in his mx.DateTime extension. However, +notice that this type does *NOT* implement the same algorithm as +his work. Do *NOT* expect it to behave like mx.DateTime's counterpart. + +There's two different ways to build a relativedelta instance. The +first one is passing it two date/datetime classes: + + relativedelta(datetime1, datetime2) + +And the other way is to use the following keyword arguments: + + year, month, day, hour, minute, second, microsecond: + Absolute information. + + years, months, weeks, days, hours, minutes, seconds, microseconds: + Relative information, may be negative. 
+ + weekday: + One of the weekday instances (MO, TU, etc). These instances may + receive a parameter N, specifying the Nth weekday, which could + be positive or negative (like MO(+1) or MO(-2). Not specifying + it is the same as specifying +1. You can also use an integer, + where 0=MO. + + leapdays: + Will add given days to the date found, if year is a leap + year, and the date found is post 28 of february. + + yearday, nlyearday: + Set the yearday or the non-leap year day (jump leap days). + These are converted to day/month/leapdays information. + +Here is the behavior of operations with relativedelta: + +1) Calculate the absolute year, using the 'year' argument, or the + original datetime year, if the argument is not present. + +2) Add the relative 'years' argument to the absolute year. + +3) Do steps 1 and 2 for month/months. + +4) Calculate the absolute day, using the 'day' argument, or the + original datetime day, if the argument is not present. Then, + subtract from the day until it fits in the year and month + found after their operations. + +5) Add the relative 'days' argument to the absolute day. Notice + that the 'weeks' argument is multiplied by 7 and added to + 'days'. + +6) Do steps 1 and 2 for hour/hours, minute/minutes, second/seconds, + microsecond/microseconds. + +7) If the 'weekday' argument is present, calculate the weekday, + with the given (wday, nth) tuple. wday is the index of the + weekday (0-6, 0=Mon), and nth is the number of weeks to add + forward or backward, depending on its signal. Notice that if + the calculated date is already Monday, for example, using + (0, 1) or (0, -1) won't change the day. + """ + + def __init__(self, dt1=None, dt2=None, + years=0, months=0, days=0, leapdays=0, weeks=0, + hours=0, minutes=0, seconds=0, microseconds=0, + year=None, month=None, day=None, weekday=None, + yearday=None, nlyearday=None, + hour=None, minute=None, second=None, microsecond=None): + if dt1 and dt2: + if not isinstance(dt1, datetime.date) or \ + not isinstance(dt2, datetime.date): + raise TypeError("relativedelta only diffs datetime/date") + if type(dt1) is not type(dt2): + if not isinstance(dt1, datetime.datetime): + dt1 = datetime.datetime.fromordinal(dt1.toordinal()) + elif not isinstance(dt2, datetime.datetime): + dt2 = datetime.datetime.fromordinal(dt2.toordinal()) + self.years = 0 + self.months = 0 + self.days = 0 + self.leapdays = 0 + self.hours = 0 + self.minutes = 0 + self.seconds = 0 + self.microseconds = 0 + self.year = None + self.month = None + self.day = None + self.weekday = None + self.hour = None + self.minute = None + self.second = None + self.microsecond = None + self._has_time = 0 + + months = (dt1.year * 12 + dt1.month) - (dt2.year * 12 + dt2.month) + self._set_months(months) + dtm = self.__radd__(dt2) + if dt1 < dt2: + while dt1 > dtm: + months += 1 + self._set_months(months) + dtm = self.__radd__(dt2) + else: + while dt1 < dtm: + months -= 1 + self._set_months(months) + dtm = self.__radd__(dt2) + delta = dt1 - dtm + self.seconds = delta.seconds + delta.days * 86400 + self.microseconds = delta.microseconds + else: + self.years = years + self.months = months + self.days = days + weeks * 7 + self.leapdays = leapdays + self.hours = hours + self.minutes = minutes + self.seconds = seconds + self.microseconds = microseconds + self.year = year + self.month = month + self.day = day + self.hour = hour + self.minute = minute + self.second = second + self.microsecond = microsecond + + if type(weekday) is int: + self.weekday = weekdays[weekday] + else: + 
self.weekday = weekday + + yday = 0 + if nlyearday: + yday = nlyearday + elif yearday: + yday = yearday + if yearday > 59: + self.leapdays = -1 + if yday: + ydayidx = [31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, + 366] + for idx, ydays in enumerate(ydayidx): + if yday <= ydays: + self.month = idx + 1 + if idx == 0: + self.day = yday + else: + self.day = yday - ydayidx[idx - 1] + break + else: + raise ValueError("invalid year day (%d)" % yday) + + self._fix() + + def _fix(self): + if abs(self.microseconds) > 999999: + s = self.microseconds // abs(self.microseconds) + div, mod = divmod(self.microseconds * s, 1000000) + self.microseconds = mod * s + self.seconds += div * s + if abs(self.seconds) > 59: + s = self.seconds // abs(self.seconds) + div, mod = divmod(self.seconds * s, 60) + self.seconds = mod * s + self.minutes += div * s + if abs(self.minutes) > 59: + s = self.minutes // abs(self.minutes) + div, mod = divmod(self.minutes * s, 60) + self.minutes = mod * s + self.hours += div * s + if abs(self.hours) > 23: + s = self.hours // abs(self.hours) + div, mod = divmod(self.hours * s, 24) + self.hours = mod * s + self.days += div * s + if abs(self.months) > 11: + s = self.months // abs(self.months) + div, mod = divmod(self.months * s, 12) + self.months = mod * s + self.years += div * s + if (self.hours or self.minutes or self.seconds or self.microseconds or + self.hour is not None or self.minute is not None or + self.second is not None or self.microsecond is not None): + self._has_time = 1 + else: + self._has_time = 0 + + def _set_months(self, months): + self.months = months + if abs(self.months) > 11: + s = self.months // abs(self.months) + div, mod = divmod(self.months * s, 12) + self.months = mod * s + self.years = div * s + else: + self.years = 0 + + def __radd__(self, other): + if not isinstance(other, datetime.date): + raise TypeError("unsupported type for add operation") + elif self._has_time and not isinstance(other, datetime.datetime): + other = datetime.datetime.fromordinal(other.toordinal()) + year = (self.year or other.year) + self.years + month = self.month or other.month + if self.months: + assert 1 <= abs(self.months) <= 12 + month += self.months + if month > 12: + year += 1 + month -= 12 + elif month < 1: + year -= 1 + month += 12 + day = min(calendar.monthrange(year, month)[1], + self.day or other.day) + repl = {"year": year, "month": month, "day": day} + for attr in ["hour", "minute", "second", "microsecond"]: + value = getattr(self, attr) + if value is not None: + repl[attr] = value + days = self.days + if self.leapdays and month > 2 and calendar.isleap(year): + days += self.leapdays + ret = (other.replace(**repl) + + datetime.timedelta(days=days, + hours=self.hours, + minutes=self.minutes, + seconds=self.seconds, + microseconds=self.microseconds)) + if self.weekday: + weekday, nth = self.weekday.weekday, self.weekday.n or 1 + jumpdays = (abs(nth) - 1) * 7 + if nth > 0: + jumpdays += (7 - ret.weekday() + weekday) % 7 + else: + jumpdays += (ret.weekday() - weekday) % 7 + jumpdays *= -1 + ret += datetime.timedelta(days=jumpdays) + return ret + + def __rsub__(self, other): + return self.__neg__().__radd__(other) + + def __add__(self, other): + if not isinstance(other, relativedelta): + raise TypeError("unsupported type for add operation") + return relativedelta(years=other.years + self.years, + months=other.months + self.months, + days=other.days + self.days, + hours=other.hours + self.hours, + minutes=other.minutes + self.minutes, + seconds=other.seconds + 
self.seconds, + microseconds=other.microseconds + self.microseconds, + leapdays=other.leapdays or self.leapdays, + year=other.year or self.year, + month=other.month or self.month, + day=other.day or self.day, + weekday=other.weekday or self.weekday, + hour=other.hour or self.hour, + minute=other.minute or self.minute, + second=other.second or self.second, + microsecond=other.second or self.microsecond) + + def __sub__(self, other): + if not isinstance(other, relativedelta): + raise TypeError("unsupported type for sub operation") + return relativedelta(years=other.years - self.years, + months=other.months - self.months, + days=other.days - self.days, + hours=other.hours - self.hours, + minutes=other.minutes - self.minutes, + seconds=other.seconds - self.seconds, + microseconds=other.microseconds - self.microseconds, + leapdays=other.leapdays or self.leapdays, + year=other.year or self.year, + month=other.month or self.month, + day=other.day or self.day, + weekday=other.weekday or self.weekday, + hour=other.hour or self.hour, + minute=other.minute or self.minute, + second=other.second or self.second, + microsecond=other.second or self.microsecond) + + def __neg__(self): + return relativedelta(years= -self.years, + months= -self.months, + days= -self.days, + hours= -self.hours, + minutes= -self.minutes, + seconds= -self.seconds, + microseconds= -self.microseconds, + leapdays=self.leapdays, + year=self.year, + month=self.month, + day=self.day, + weekday=self.weekday, + hour=self.hour, + minute=self.minute, + second=self.second, + microsecond=self.microsecond) + + def __nonzero__(self): + return not (not self.years and + not self.months and + not self.days and + not self.hours and + not self.minutes and + not self.seconds and + not self.microseconds and + not self.leapdays and + self.year is None and + self.month is None and + self.day is None and + self.weekday is None and + self.hour is None and + self.minute is None and + self.second is None and + self.microsecond is None) + + __bool__ = __nonzero__ + + def __mul__(self, other): + f = float(other) + return relativedelta(years=self.years * f, + months=self.months * f, + days=self.days * f, + hours=self.hours * f, + minutes=self.minutes * f, + seconds=self.seconds * f, + microseconds=self.microseconds * f, + leapdays=self.leapdays, + year=self.year, + month=self.month, + day=self.day, + weekday=self.weekday, + hour=self.hour, + minute=self.minute, + second=self.second, + microsecond=self.microsecond) + + def __eq__(self, other): + if not isinstance(other, relativedelta): + return False + if self.weekday or other.weekday: + if not self.weekday or not other.weekday: + return False + if self.weekday.weekday != other.weekday.weekday: + return False + n1, n2 = self.weekday.n, other.weekday.n + if n1 != n2 and not ((not n1 or n1 == 1) and (not n2 or n2 == 1)): + return False + return (self.years == other.years and + self.months == other.months and + self.days == other.days and + self.hours == other.hours and + self.minutes == other.minutes and + self.seconds == other.seconds and + self.leapdays == other.leapdays and + self.year == other.year and + self.month == other.month and + self.day == other.day and + self.hour == other.hour and + self.minute == other.minute and + self.second == other.second and + self.microsecond == other.microsecond) + + def __ne__(self, other): + return not self.__eq__(other) + + def __div__(self, other): + return self.__mul__(1 / float(other)) + + def __repr__(self): + l = [] + for attr in ["years", "months", "days", 
"leapdays", + "hours", "minutes", "seconds", "microseconds"]: + value = getattr(self, attr) + if value: + l.append("%s=%+d" % (attr, value)) + for attr in ["year", "month", "day", "weekday", + "hour", "minute", "second", "microsecond"]: + value = getattr(self, attr) + if value is not None: + l.append("%s=%s" % (attr, repr(value))) + return "%s(%s)" % (self.__class__.__name__, ", ".join(l)) + +# vim:ts=4:sw=4:et diff --git a/src/whoosh/support/unicode.py b/src/whoosh/support/unicode.py new file mode 100644 index 0000000..7d42f56 --- /dev/null +++ b/src/whoosh/support/unicode.py @@ -0,0 +1,527 @@ +import re +from bisect import bisect_right + +from whoosh.compat import text_type, u + + +# http://unicode.org/Public/UNIDATA/Blocks.txt +_blockdata = ''' +# Blocks-5.1.0.txt +# Date: 2008-03-20, 17:41:00 PDT [KW] +# +# Unicode Character Database +# Copyright (c) 1991-2008 Unicode, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# For documentation, see UCD.html +# +# Note: The casing of block names is not normative. +# For example, "Basic Latin" and "BASIC LATIN" are equivalent. +# +# Format: +# Start Code..End Code; Block Name + +# ================================================ + +# Note: When comparing block names, casing, whitespace, hyphens, +# and underbars are ignored. +# For example, "Latin Extended-A" and "latin extended a" are equivalent +# For more information on the comparison of property values, +# see UCD.html. +# +# All code points not explicitly listed for Block +# have the value No_Block. + +# Property: Block +# +# @missing: 0000..10FFFF; No_Block + +0000..007F; Basic Latin +0080..00FF; Latin-1 Supplement +0100..017F; Latin Extended-A +0180..024F; Latin Extended-B +0250..02AF; IPA Extensions +02B0..02FF; Spacing Modifier Letters +0300..036F; Combining Diacritical Marks +0370..03FF; Greek and Coptic +0400..04FF; Cyrillic +0500..052F; Cyrillic Supplement +0530..058F; Armenian +0590..05FF; Hebrew +0600..06FF; Arabic +0700..074F; Syriac +0750..077F; Arabic Supplement +0780..07BF; Thaana +07C0..07FF; NKo +0900..097F; Devanagari +0980..09FF; Bengali +0A00..0A7F; Gurmukhi +0A80..0AFF; Gujarati +0B00..0B7F; Oriya +0B80..0BFF; Tamil +0C00..0C7F; Telugu +0C80..0CFF; Kannada +0D00..0D7F; Malayalam +0D80..0DFF; Sinhala +0E00..0E7F; Thai +0E80..0EFF; Lao +0F00..0FFF; Tibetan +1000..109F; Myanmar +10A0..10FF; Georgian +1100..11FF; Hangul Jamo +1200..137F; Ethiopic +1380..139F; Ethiopic Supplement +13A0..13FF; Cherokee +1400..167F; Unified Canadian Aboriginal Syllabics +1680..169F; Ogham +16A0..16FF; Runic +1700..171F; Tagalog +1720..173F; Hanunoo +1740..175F; Buhid +1760..177F; Tagbanwa +1780..17FF; Khmer +1800..18AF; Mongolian +1900..194F; Limbu +1950..197F; Tai Le +1980..19DF; New Tai Lue +19E0..19FF; Khmer Symbols +1A00..1A1F; Buginese +1B00..1B7F; Balinese +1B80..1BBF; Sundanese +1C00..1C4F; Lepcha +1C50..1C7F; Ol Chiki +1D00..1D7F; Phonetic Extensions +1D80..1DBF; Phonetic Extensions Supplement +1DC0..1DFF; Combining Diacritical Marks Supplement +1E00..1EFF; Latin Extended Additional +1F00..1FFF; Greek Extended +2000..206F; General Punctuation +2070..209F; Superscripts and Subscripts +20A0..20CF; Currency Symbols +20D0..20FF; Combining Diacritical Marks for Symbols +2100..214F; Letterlike Symbols +2150..218F; Number Forms +2190..21FF; Arrows +2200..22FF; Mathematical Operators +2300..23FF; Miscellaneous Technical +2400..243F; Control Pictures +2440..245F; Optical Character Recognition +2460..24FF; Enclosed Alphanumerics +2500..257F; Box Drawing +2580..259F; Block 
Elements +25A0..25FF; Geometric Shapes +2600..26FF; Miscellaneous Symbols +2700..27BF; Dingbats +27C0..27EF; Miscellaneous Mathematical Symbols-A +27F0..27FF; Supplemental Arrows-A +2800..28FF; Braille Patterns +2900..297F; Supplemental Arrows-B +2980..29FF; Miscellaneous Mathematical Symbols-B +2A00..2AFF; Supplemental Mathematical Operators +2B00..2BFF; Miscellaneous Symbols and Arrows +2C00..2C5F; Glagolitic +2C60..2C7F; Latin Extended-C +2C80..2CFF; Coptic +2D00..2D2F; Georgian Supplement +2D30..2D7F; Tifinagh +2D80..2DDF; Ethiopic Extended +2DE0..2DFF; Cyrillic Extended-A +2E00..2E7F; Supplemental Punctuation +2E80..2EFF; CJK Radicals Supplement +2F00..2FDF; Kangxi Radicals +2FF0..2FFF; Ideographic Description Characters +3000..303F; CJK Symbols and Punctuation +3040..309F; Hiragana +30A0..30FF; Katakana +3100..312F; Bopomofo +3130..318F; Hangul Compatibility Jamo +3190..319F; Kanbun +31A0..31BF; Bopomofo Extended +31C0..31EF; CJK Strokes +31F0..31FF; Katakana Phonetic Extensions +3200..32FF; Enclosed CJK Letters and Months +3300..33FF; CJK Compatibility +3400..4DBF; CJK Unified Ideographs Extension A +4DC0..4DFF; Yijing Hexagram Symbols +4E00..9FFF; CJK Unified Ideographs +A000..A48F; Yi Syllables +A490..A4CF; Yi Radicals +A500..A63F; Vai +A640..A69F; Cyrillic Extended-B +A700..A71F; Modifier Tone Letters +A720..A7FF; Latin Extended-D +A800..A82F; Syloti Nagri +A840..A87F; Phags-pa +A880..A8DF; Saurashtra +A900..A92F; Kayah Li +A930..A95F; Rejang +AA00..AA5F; Cham +AC00..D7AF; Hangul Syllables +D800..DB7F; High Surrogates +DB80..DBFF; High Private Use Surrogates +DC00..DFFF; Low Surrogates +E000..F8FF; Private Use Area +F900..FAFF; CJK Compatibility Ideographs +FB00..FB4F; Alphabetic Presentation Forms +FB50..FDFF; Arabic Presentation Forms-A +FE00..FE0F; Variation Selectors +FE10..FE1F; Vertical Forms +FE20..FE2F; Combining Half Marks +FE30..FE4F; CJK Compatibility Forms +FE50..FE6F; Small Form Variants +FE70..FEFF; Arabic Presentation Forms-B +FF00..FFEF; Halfwidth and Fullwidth Forms +FFF0..FFFF; Specials +10000..1007F; Linear B Syllabary +10080..100FF; Linear B Ideograms +10100..1013F; Aegean Numbers +10140..1018F; Ancient Greek Numbers +10190..101CF; Ancient Symbols +101D0..101FF; Phaistos Disc +10280..1029F; Lycian +102A0..102DF; Carian +10300..1032F; Old Italic +10330..1034F; Gothic +10380..1039F; Ugaritic +103A0..103DF; Old Persian +10400..1044F; Deseret +10450..1047F; Shavian +10480..104AF; Osmanya +10800..1083F; Cypriot Syllabary +10900..1091F; Phoenician +10920..1093F; Lydian +10A00..10A5F; Kharoshthi +12000..123FF; Cuneiform +12400..1247F; Cuneiform Numbers and Punctuation +1D000..1D0FF; Byzantine Musical Symbols +1D100..1D1FF; Musical Symbols +1D200..1D24F; Ancient Greek Musical Notation +1D300..1D35F; Tai Xuan Jing Symbols +1D360..1D37F; Counting Rod Numerals +1D400..1D7FF; Mathematical Alphanumeric Symbols +1F000..1F02F; Mahjong Tiles +1F030..1F09F; Domino Tiles +20000..2A6DF; CJK Unified Ideographs Extension B +2F800..2FA1F; CJK Compatibility Ideographs Supplement +E0000..E007F; Tags +E0100..E01EF; Variation Selectors Supplement +F0000..FFFFF; Supplementary Private Use Area-A +100000..10FFFF; Supplementary Private Use Area-B + +# EOF +''' + + +pattern = re.compile(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)') +_starts = [] +_ends = [] +_names = [] + + +class blocks(object): + pass + + +def _init(): + count = 0 + for line in _blockdata.splitlines(): + m = pattern.match(line) + if m: + start, end, name = m.groups() + _starts.append(int(start, 16)) + _ends.append(int(end, 
16)) + _names.append(name) + setattr(blocks, name.replace(" ", "_"), count) + count += 1 +_init() + + +def blockname(ch): + """Return the Unicode block name for ch, or None if ch has no block. + + >>> blockname(u'a') + 'Basic Latin' + >>> blockname(unichr(0x0b80)) + 'Tamil' + >>> block(unichr(2048)) + None + """ + + assert isinstance(ch, text_type) and len(ch) == 1, repr(ch) + cp = ord(ch) + i = bisect_right(_starts, cp) - 1 + end = _ends[i] + if cp > end: + return None + return _names[i] + + +def blocknum(ch): + """Returns the unicode block number for ch, or None if ch has no block. + + >>> blocknum(u'a') + 0 + >>> blocknum(unichr(0x0b80)) + 22 + >>> blocknum(unichr(2048)) + None + """ + + cp = ord(ch) + i = bisect_right(_starts, cp) - 1 + end = _ends[i] + if cp > end: + return None + return i + + +digits = u('0123456789\xb2\xb3\xb9\u0660\u0661\u0662\u0663\u0664\u0665\u0666' + '\u0667\u0668\u0669\u06f0\u06f1\u06f2\u06f3\u06f4\u06f5\u06f6\u06f7' + '\u06f8\u06f9\u07c0\u07c1\u07c2\u07c3\u07c4\u07c5\u07c6\u07c7\u07c8' + '\u07c9\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f' + '\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef\u0a66' + '\u0a67\u0a68\u0a69\u0a6a\u0a6b\u0a6c\u0a6d\u0a6e\u0a6f\u0ae6\u0ae7' + '\u0ae8\u0ae9\u0aea\u0aeb\u0aec\u0aed\u0aee\u0aef\u0b66\u0b67\u0b68' + '\u0b69\u0b6a\u0b6b\u0b6c\u0b6d\u0b6e\u0b6f\u0be6\u0be7\u0be8\u0be9' + '\u0bea\u0beb\u0bec\u0bed\u0bee\u0bef\u0c66\u0c67\u0c68\u0c69\u0c6a' + '\u0c6b\u0c6c\u0c6d\u0c6e\u0c6f\u0ce6\u0ce7\u0ce8\u0ce9\u0cea\u0ceb' + '\u0cec\u0ced\u0cee\u0cef\u0d66\u0d67\u0d68\u0d69\u0d6a\u0d6b\u0d6c' + '\u0d6d\u0d6e\u0d6f\u0e50\u0e51\u0e52\u0e53\u0e54\u0e55\u0e56\u0e57' + '\u0e58\u0e59\u0ed0\u0ed1\u0ed2\u0ed3\u0ed4\u0ed5\u0ed6\u0ed7\u0ed8' + '\u0ed9\u0f20\u0f21\u0f22\u0f23\u0f24\u0f25\u0f26\u0f27\u0f28\u0f29' + '\u1040\u1041\u1042\u1043\u1044\u1045\u1046\u1047\u1048\u1049\u1090' + '\u1091\u1092\u1093\u1094\u1095\u1096\u1097\u1098\u1099\u1369\u136a' + '\u136b\u136c\u136d\u136e\u136f\u1370\u1371\u17e0\u17e1\u17e2\u17e3' + '\u17e4\u17e5\u17e6\u17e7\u17e8\u17e9\u1810\u1811\u1812\u1813\u1814' + '\u1815\u1816\u1817\u1818\u1819\u1946\u1947\u1948\u1949\u194a\u194b' + '\u194c\u194d\u194e\u194f\u19d0\u19d1\u19d2\u19d3\u19d4\u19d5\u19d6' + '\u19d7\u19d8\u19d9\u19da\u1a80\u1a81\u1a82\u1a83\u1a84\u1a85\u1a86' + '\u1a87\u1a88\u1a89\u1a90\u1a91\u1a92\u1a93\u1a94\u1a95\u1a96\u1a97' + '\u1a98\u1a99\u1b50\u1b51\u1b52\u1b53\u1b54\u1b55\u1b56\u1b57\u1b58' + '\u1b59\u1bb0\u1bb1\u1bb2\u1bb3\u1bb4\u1bb5\u1bb6\u1bb7\u1bb8\u1bb9' + '\u1c40\u1c41\u1c42\u1c43\u1c44\u1c45\u1c46\u1c47\u1c48\u1c49\u1c50' + '\u1c51\u1c52\u1c53\u1c54\u1c55\u1c56\u1c57\u1c58\u1c59\u2070\u2074' + '\u2075\u2076\u2077\u2078\u2079\u2080\u2081\u2082\u2083\u2084\u2085' + '\u2086\u2087\u2088\u2089\u2460\u2461\u2462\u2463\u2464\u2465\u2466' + '\u2467\u2468\u2474\u2475\u2476\u2477\u2478\u2479\u247a\u247b\u247c' + '\u2488\u2489\u248a\u248b\u248c\u248d\u248e\u248f\u2490\u24ea\u24f5' + '\u24f6\u24f7\u24f8\u24f9\u24fa\u24fb\u24fc\u24fd\u24ff\u2776\u2777' + '\u2778\u2779\u277a\u277b\u277c\u277d\u277e\u2780\u2781\u2782\u2783' + '\u2784\u2785\u2786\u2787\u2788\u278a\u278b\u278c\u278d\u278e\u278f' + '\u2790\u2791\u2792\ua620\ua621\ua622\ua623\ua624\ua625\ua626\ua627' + '\ua628\ua629\ua8d0\ua8d1\ua8d2\ua8d3\ua8d4\ua8d5\ua8d6\ua8d7\ua8d8' + '\ua8d9\ua900\ua901\ua902\ua903\ua904\ua905\ua906\ua907\ua908\ua909' + '\ua9d0\ua9d1\ua9d2\ua9d3\ua9d4\ua9d5\ua9d6\ua9d7\ua9d8\ua9d9\uaa50' + '\uaa51\uaa52\uaa53\uaa54\uaa55\uaa56\uaa57\uaa58\uaa59\uabf0\uabf1' + 
'\uabf2\uabf3\uabf4\uabf5\uabf6\uabf7\uabf8\uabf9\uff10\uff11\uff12' + '\uff13\uff14\uff15\uff16\uff17\uff18\uff19') +lowercase = u('abcdefghijklmnopqrstuvwxyz\xaa\xb5\xba\xdf\xe0\xe1\xe2\xe3\xe4' + '\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3' + '\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\u0101\u0103\u0105' + '\u0107\u0109\u010b\u010d\u010f\u0111\u0113\u0115\u0117\u0119' + '\u011b\u011d\u011f\u0121\u0123\u0125\u0127\u0129\u012b\u012d' + '\u012f\u0131\u0133\u0135\u0137\u0138\u013a\u013c\u013e\u0140' + '\u0142\u0144\u0146\u0148\u0149\u014b\u014d\u014f\u0151\u0153' + '\u0155\u0157\u0159\u015b\u015d\u015f\u0161\u0163\u0165\u0167' + '\u0169\u016b\u016d\u016f\u0171\u0173\u0175\u0177\u017a\u017c' + '\u017e\u017f\u0180\u0183\u0185\u0188\u018c\u018d\u0192\u0195' + '\u0199\u019a\u019b\u019e\u01a1\u01a3\u01a5\u01a8\u01aa\u01ab' + '\u01ad\u01b0\u01b4\u01b6\u01b9\u01ba\u01bd\u01be\u01bf\u01c6' + '\u01c9\u01cc\u01ce\u01d0\u01d2\u01d4\u01d6\u01d8\u01da\u01dc' + '\u01dd\u01df\u01e1\u01e3\u01e5\u01e7\u01e9\u01eb\u01ed\u01ef' + '\u01f0\u01f3\u01f5\u01f9\u01fb\u01fd\u01ff\u0201\u0203\u0205' + '\u0207\u0209\u020b\u020d\u020f\u0211\u0213\u0215\u0217\u0219' + '\u021b\u021d\u021f\u0221\u0223\u0225\u0227\u0229\u022b\u022d' + '\u022f\u0231\u0233\u0234\u0235\u0236\u0237\u0238\u0239\u023c' + '\u023f\u0240\u0242\u0247\u0249\u024b\u024d\u024f\u0250\u0251' + '\u0252\u0253\u0254\u0255\u0256\u0257\u0258\u0259\u025a\u025b' + '\u025c\u025d\u025e\u025f\u0260\u0261\u0262\u0263\u0264\u0265' + '\u0266\u0267\u0268\u0269\u026a\u026b\u026c\u026d\u026e\u026f' + '\u0270\u0271\u0272\u0273\u0274\u0275\u0276\u0277\u0278\u0279' + '\u027a\u027b\u027c\u027d\u027e\u027f\u0280\u0281\u0282\u0283' + '\u0284\u0285\u0286\u0287\u0288\u0289\u028a\u028b\u028c\u028d' + '\u028e\u028f\u0290\u0291\u0292\u0293\u0295\u0296\u0297\u0298' + '\u0299\u029a\u029b\u029c\u029d\u029e\u029f\u02a0\u02a1\u02a2' + '\u02a3\u02a4\u02a5\u02a6\u02a7\u02a8\u02a9\u02aa\u02ab\u02ac' + '\u02ad\u02ae\u02af\u0371\u0373\u0377\u037b\u037c\u037d\u0390' + '\u03ac\u03ad\u03ae\u03af\u03b0\u03b1\u03b2\u03b3\u03b4\u03b5' + '\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf' + '\u03c0\u03c1\u03c2\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9' + '\u03ca\u03cb\u03cc\u03cd\u03ce\u03d0\u03d1\u03d5\u03d6\u03d7' + '\u03d9\u03db\u03dd\u03df\u03e1\u03e3\u03e5\u03e7\u03e9\u03eb' + '\u03ed\u03ef\u03f0\u03f1\u03f2\u03f3\u03f5\u03f8\u03fb\u03fc' + '\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439' + '\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443' + '\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d' + '\u044e\u044f\u0450\u0451\u0452\u0453\u0454\u0455\u0456\u0457' + '\u0458\u0459\u045a\u045b\u045c\u045d\u045e\u045f\u0461\u0463' + '\u0465\u0467\u0469\u046b\u046d\u046f\u0471\u0473\u0475\u0477' + '\u0479\u047b\u047d\u047f\u0481\u048b\u048d\u048f\u0491\u0493' + '\u0495\u0497\u0499\u049b\u049d\u049f\u04a1\u04a3\u04a5\u04a7' + '\u04a9\u04ab\u04ad\u04af\u04b1\u04b3\u04b5\u04b7\u04b9\u04bb' + '\u04bd\u04bf\u04c2\u04c4\u04c6\u04c8\u04ca\u04cc\u04ce\u04cf' + '\u04d1\u04d3\u04d5\u04d7\u04d9\u04db\u04dd\u04df\u04e1\u04e3' + '\u04e5\u04e7\u04e9\u04eb\u04ed\u04ef\u04f1\u04f3\u04f5\u04f7' + '\u04f9\u04fb\u04fd\u04ff\u0501\u0503\u0505\u0507\u0509\u050b' + '\u050d\u050f\u0511\u0513\u0515\u0517\u0519\u051b\u051d\u051f' + '\u0521\u0523\u0525\u0561\u0562\u0563\u0564\u0565\u0566\u0567' + '\u0568\u0569\u056a\u056b\u056c\u056d\u056e\u056f\u0570\u0571' + '\u0572\u0573\u0574\u0575\u0576\u0577\u0578\u0579\u057a\u057b' + 
'\u057c\u057d\u057e\u057f\u0580\u0581\u0582\u0583\u0584\u0585' + '\u0586\u0587\u1d00\u1d01\u1d02\u1d03\u1d04\u1d05\u1d06\u1d07' + '\u1d08\u1d09\u1d0a\u1d0b\u1d0c\u1d0d\u1d0e\u1d0f\u1d10\u1d11' + '\u1d12\u1d13\u1d14\u1d15\u1d16\u1d17\u1d18\u1d19\u1d1a\u1d1b' + '\u1d1c\u1d1d\u1d1e\u1d1f\u1d20\u1d21\u1d22\u1d23\u1d24\u1d25' + '\u1d26\u1d27\u1d28\u1d29\u1d2a\u1d2b\u1d62\u1d63\u1d64\u1d65' + '\u1d66\u1d67\u1d68\u1d69\u1d6a\u1d6b\u1d6c\u1d6d\u1d6e\u1d6f' + '\u1d70\u1d71\u1d72\u1d73\u1d74\u1d75\u1d76\u1d77\u1d79\u1d7a' + '\u1d7b\u1d7c\u1d7d\u1d7e\u1d7f\u1d80\u1d81\u1d82\u1d83\u1d84' + '\u1d85\u1d86\u1d87\u1d88\u1d89\u1d8a\u1d8b\u1d8c\u1d8d\u1d8e' + '\u1d8f\u1d90\u1d91\u1d92\u1d93\u1d94\u1d95\u1d96\u1d97\u1d98' + '\u1d99\u1d9a\u1e01\u1e03\u1e05\u1e07\u1e09\u1e0b\u1e0d\u1e0f' + '\u1e11\u1e13\u1e15\u1e17\u1e19\u1e1b\u1e1d\u1e1f\u1e21\u1e23' + '\u1e25\u1e27\u1e29\u1e2b\u1e2d\u1e2f\u1e31\u1e33\u1e35\u1e37' + '\u1e39\u1e3b\u1e3d\u1e3f\u1e41\u1e43\u1e45\u1e47\u1e49\u1e4b' + '\u1e4d\u1e4f\u1e51\u1e53\u1e55\u1e57\u1e59\u1e5b\u1e5d\u1e5f' + '\u1e61\u1e63\u1e65\u1e67\u1e69\u1e6b\u1e6d\u1e6f\u1e71\u1e73' + '\u1e75\u1e77\u1e79\u1e7b\u1e7d\u1e7f\u1e81\u1e83\u1e85\u1e87' + '\u1e89\u1e8b\u1e8d\u1e8f\u1e91\u1e93\u1e95\u1e96\u1e97\u1e98' + '\u1e99\u1e9a\u1e9b\u1e9c\u1e9d\u1e9f\u1ea1\u1ea3\u1ea5\u1ea7' + '\u1ea9\u1eab\u1ead\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u1eb9\u1ebb' + '\u1ebd\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u1ec9\u1ecb\u1ecd\u1ecf' + '\u1ed1\u1ed3\u1ed5\u1ed7\u1ed9\u1edb\u1edd\u1edf\u1ee1\u1ee3' + '\u1ee5\u1ee7\u1ee9\u1eeb\u1eed\u1eef\u1ef1\u1ef3\u1ef5\u1ef7' + '\u1ef9\u1efb\u1efd\u1eff\u1f00\u1f01\u1f02\u1f03\u1f04\u1f05' + '\u1f06\u1f07\u1f10\u1f11\u1f12\u1f13\u1f14\u1f15\u1f20\u1f21' + '\u1f22\u1f23\u1f24\u1f25\u1f26\u1f27\u1f30\u1f31\u1f32\u1f33' + '\u1f34\u1f35\u1f36\u1f37\u1f40\u1f41\u1f42\u1f43\u1f44\u1f45' + '\u1f50\u1f51\u1f52\u1f53\u1f54\u1f55\u1f56\u1f57\u1f60\u1f61' + '\u1f62\u1f63\u1f64\u1f65\u1f66\u1f67\u1f70\u1f71\u1f72\u1f73' + '\u1f74\u1f75\u1f76\u1f77\u1f78\u1f79\u1f7a\u1f7b\u1f7c\u1f7d' + '\u1f80\u1f81\u1f82\u1f83\u1f84\u1f85\u1f86\u1f87\u1f90\u1f91' + '\u1f92\u1f93\u1f94\u1f95\u1f96\u1f97\u1fa0\u1fa1\u1fa2\u1fa3' + '\u1fa4\u1fa5\u1fa6\u1fa7\u1fb0\u1fb1\u1fb2\u1fb3\u1fb4\u1fb6' + '\u1fb7\u1fbe\u1fc2\u1fc3\u1fc4\u1fc6\u1fc7\u1fd0\u1fd1\u1fd2' + '\u1fd3\u1fd6\u1fd7\u1fe0\u1fe1\u1fe2\u1fe3\u1fe4\u1fe5\u1fe6' + '\u1fe7\u1ff2\u1ff3\u1ff4\u1ff6\u1ff7\u210a\u210e\u210f\u2113' + '\u212f\u2134\u2139\u213c\u213d\u2146\u2147\u2148\u2149\u214e' + '\u2184\u2c30\u2c31\u2c32\u2c33\u2c34\u2c35\u2c36\u2c37\u2c38' + '\u2c39\u2c3a\u2c3b\u2c3c\u2c3d\u2c3e\u2c3f\u2c40\u2c41\u2c42' + '\u2c43\u2c44\u2c45\u2c46\u2c47\u2c48\u2c49\u2c4a\u2c4b\u2c4c' + '\u2c4d\u2c4e\u2c4f\u2c50\u2c51\u2c52\u2c53\u2c54\u2c55\u2c56' + '\u2c57\u2c58\u2c59\u2c5a\u2c5b\u2c5c\u2c5d\u2c5e\u2c61\u2c65' + '\u2c66\u2c68\u2c6a\u2c6c\u2c71\u2c73\u2c74\u2c76\u2c77\u2c78' + '\u2c79\u2c7a\u2c7b\u2c7c\u2c81\u2c83\u2c85\u2c87\u2c89\u2c8b' + '\u2c8d\u2c8f\u2c91\u2c93\u2c95\u2c97\u2c99\u2c9b\u2c9d\u2c9f' + '\u2ca1\u2ca3\u2ca5\u2ca7\u2ca9\u2cab\u2cad\u2caf\u2cb1\u2cb3' + '\u2cb5\u2cb7\u2cb9\u2cbb\u2cbd\u2cbf\u2cc1\u2cc3\u2cc5\u2cc7' + '\u2cc9\u2ccb\u2ccd\u2ccf\u2cd1\u2cd3\u2cd5\u2cd7\u2cd9\u2cdb' + '\u2cdd\u2cdf\u2ce1\u2ce3\u2ce4\u2cec\u2cee\u2d00\u2d01\u2d02' + '\u2d03\u2d04\u2d05\u2d06\u2d07\u2d08\u2d09\u2d0a\u2d0b\u2d0c' + '\u2d0d\u2d0e\u2d0f\u2d10\u2d11\u2d12\u2d13\u2d14\u2d15\u2d16' + '\u2d17\u2d18\u2d19\u2d1a\u2d1b\u2d1c\u2d1d\u2d1e\u2d1f\u2d20' + '\u2d21\u2d22\u2d23\u2d24\u2d25\ua641\ua643\ua645\ua647\ua649' + 
'\ua64b\ua64d\ua64f\ua651\ua653\ua655\ua657\ua659\ua65b\ua65d' + '\ua65f\ua663\ua665\ua667\ua669\ua66b\ua66d\ua681\ua683\ua685' + '\ua687\ua689\ua68b\ua68d\ua68f\ua691\ua693\ua695\ua697\ua723' + '\ua725\ua727\ua729\ua72b\ua72d\ua72f\ua730\ua731\ua733\ua735' + '\ua737\ua739\ua73b\ua73d\ua73f\ua741\ua743\ua745\ua747\ua749' + '\ua74b\ua74d\ua74f\ua751\ua753\ua755\ua757\ua759\ua75b\ua75d' + '\ua75f\ua761\ua763\ua765\ua767\ua769\ua76b\ua76d\ua76f\ua771' + '\ua772\ua773\ua774\ua775\ua776\ua777\ua778\ua77a\ua77c\ua77f' + '\ua781\ua783\ua785\ua787\ua78c\ufb00\ufb01\ufb02\ufb03\ufb04' + '\ufb05\ufb06\ufb13\ufb14\ufb15\ufb16\ufb17\uff41\uff42\uff43' + '\uff44\uff45\uff46\uff47\uff48\uff49\uff4a\uff4b\uff4c\uff4d' + '\uff4e\uff4f\uff50\uff51\uff52\uff53\uff54\uff55\uff56\uff57' + '\uff58\uff59\uff5a') +uppercase = u('ABCDEFGHIJKLMNOPQRSTUVWXYZ\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8' + '\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd8' + '\xd9\xda\xdb\xdc\xdd\xde\u0100\u0102\u0104\u0106\u0108\u010a' + '\u010c\u010e\u0110\u0112\u0114\u0116\u0118\u011a\u011c\u011e' + '\u0120\u0122\u0124\u0126\u0128\u012a\u012c\u012e\u0130\u0132' + '\u0134\u0136\u0139\u013b\u013d\u013f\u0141\u0143\u0145\u0147' + '\u014a\u014c\u014e\u0150\u0152\u0154\u0156\u0158\u015a\u015c' + '\u015e\u0160\u0162\u0164\u0166\u0168\u016a\u016c\u016e\u0170' + '\u0172\u0174\u0176\u0178\u0179\u017b\u017d\u0181\u0182\u0184' + '\u0186\u0187\u0189\u018a\u018b\u018e\u018f\u0190\u0191\u0193' + '\u0194\u0196\u0197\u0198\u019c\u019d\u019f\u01a0\u01a2\u01a4' + '\u01a6\u01a7\u01a9\u01ac\u01ae\u01af\u01b1\u01b2\u01b3\u01b5' + '\u01b7\u01b8\u01bc\u01c4\u01c7\u01ca\u01cd\u01cf\u01d1\u01d3' + '\u01d5\u01d7\u01d9\u01db\u01de\u01e0\u01e2\u01e4\u01e6\u01e8' + '\u01ea\u01ec\u01ee\u01f1\u01f4\u01f6\u01f7\u01f8\u01fa\u01fc' + '\u01fe\u0200\u0202\u0204\u0206\u0208\u020a\u020c\u020e\u0210' + '\u0212\u0214\u0216\u0218\u021a\u021c\u021e\u0220\u0222\u0224' + '\u0226\u0228\u022a\u022c\u022e\u0230\u0232\u023a\u023b\u023d' + '\u023e\u0241\u0243\u0244\u0245\u0246\u0248\u024a\u024c\u024e' + '\u0370\u0372\u0376\u0386\u0388\u0389\u038a\u038c\u038e\u038f' + '\u0391\u0392\u0393\u0394\u0395\u0396\u0397\u0398\u0399\u039a' + '\u039b\u039c\u039d\u039e\u039f\u03a0\u03a1\u03a3\u03a4\u03a5' + '\u03a6\u03a7\u03a8\u03a9\u03aa\u03ab\u03cf\u03d2\u03d3\u03d4' + '\u03d8\u03da\u03dc\u03de\u03e0\u03e2\u03e4\u03e6\u03e8\u03ea' + '\u03ec\u03ee\u03f4\u03f7\u03f9\u03fa\u03fd\u03fe\u03ff\u0400' + '\u0401\u0402\u0403\u0404\u0405\u0406\u0407\u0408\u0409\u040a' + '\u040b\u040c\u040d\u040e\u040f\u0410\u0411\u0412\u0413\u0414' + '\u0415\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e' + '\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428' + '\u0429\u042a\u042b\u042c\u042d\u042e\u042f\u0460\u0462\u0464' + '\u0466\u0468\u046a\u046c\u046e\u0470\u0472\u0474\u0476\u0478' + '\u047a\u047c\u047e\u0480\u048a\u048c\u048e\u0490\u0492\u0494' + '\u0496\u0498\u049a\u049c\u049e\u04a0\u04a2\u04a4\u04a6\u04a8' + '\u04aa\u04ac\u04ae\u04b0\u04b2\u04b4\u04b6\u04b8\u04ba\u04bc' + '\u04be\u04c0\u04c1\u04c3\u04c5\u04c7\u04c9\u04cb\u04cd\u04d0' + '\u04d2\u04d4\u04d6\u04d8\u04da\u04dc\u04de\u04e0\u04e2\u04e4' + '\u04e6\u04e8\u04ea\u04ec\u04ee\u04f0\u04f2\u04f4\u04f6\u04f8' + '\u04fa\u04fc\u04fe\u0500\u0502\u0504\u0506\u0508\u050a\u050c' + '\u050e\u0510\u0512\u0514\u0516\u0518\u051a\u051c\u051e\u0520' + '\u0522\u0524\u0531\u0532\u0533\u0534\u0535\u0536\u0537\u0538' + '\u0539\u053a\u053b\u053c\u053d\u053e\u053f\u0540\u0541\u0542' + '\u0543\u0544\u0545\u0546\u0547\u0548\u0549\u054a\u054b\u054c' + 
'\u054d\u054e\u054f\u0550\u0551\u0552\u0553\u0554\u0555\u0556' + '\u10a0\u10a1\u10a2\u10a3\u10a4\u10a5\u10a6\u10a7\u10a8\u10a9' + '\u10aa\u10ab\u10ac\u10ad\u10ae\u10af\u10b0\u10b1\u10b2\u10b3' + '\u10b4\u10b5\u10b6\u10b7\u10b8\u10b9\u10ba\u10bb\u10bc\u10bd' + '\u10be\u10bf\u10c0\u10c1\u10c2\u10c3\u10c4\u10c5\u1e00\u1e02' + '\u1e04\u1e06\u1e08\u1e0a\u1e0c\u1e0e\u1e10\u1e12\u1e14\u1e16' + '\u1e18\u1e1a\u1e1c\u1e1e\u1e20\u1e22\u1e24\u1e26\u1e28\u1e2a' + '\u1e2c\u1e2e\u1e30\u1e32\u1e34\u1e36\u1e38\u1e3a\u1e3c\u1e3e' + '\u1e40\u1e42\u1e44\u1e46\u1e48\u1e4a\u1e4c\u1e4e\u1e50\u1e52' + '\u1e54\u1e56\u1e58\u1e5a\u1e5c\u1e5e\u1e60\u1e62\u1e64\u1e66' + '\u1e68\u1e6a\u1e6c\u1e6e\u1e70\u1e72\u1e74\u1e76\u1e78\u1e7a' + '\u1e7c\u1e7e\u1e80\u1e82\u1e84\u1e86\u1e88\u1e8a\u1e8c\u1e8e' + '\u1e90\u1e92\u1e94\u1e9e\u1ea0\u1ea2\u1ea4\u1ea6\u1ea8\u1eaa' + '\u1eac\u1eae\u1eb0\u1eb2\u1eb4\u1eb6\u1eb8\u1eba\u1ebc\u1ebe' + '\u1ec0\u1ec2\u1ec4\u1ec6\u1ec8\u1eca\u1ecc\u1ece\u1ed0\u1ed2' + '\u1ed4\u1ed6\u1ed8\u1eda\u1edc\u1ede\u1ee0\u1ee2\u1ee4\u1ee6' + '\u1ee8\u1eea\u1eec\u1eee\u1ef0\u1ef2\u1ef4\u1ef6\u1ef8\u1efa' + '\u1efc\u1efe\u1f08\u1f09\u1f0a\u1f0b\u1f0c\u1f0d\u1f0e\u1f0f' + '\u1f18\u1f19\u1f1a\u1f1b\u1f1c\u1f1d\u1f28\u1f29\u1f2a\u1f2b' + '\u1f2c\u1f2d\u1f2e\u1f2f\u1f38\u1f39\u1f3a\u1f3b\u1f3c\u1f3d' + '\u1f3e\u1f3f\u1f48\u1f49\u1f4a\u1f4b\u1f4c\u1f4d\u1f59\u1f5b' + '\u1f5d\u1f5f\u1f68\u1f69\u1f6a\u1f6b\u1f6c\u1f6d\u1f6e\u1f6f' + '\u1fb8\u1fb9\u1fba\u1fbb\u1fc8\u1fc9\u1fca\u1fcb\u1fd8\u1fd9' + '\u1fda\u1fdb\u1fe8\u1fe9\u1fea\u1feb\u1fec\u1ff8\u1ff9\u1ffa' + '\u1ffb\u2102\u2107\u210b\u210c\u210d\u2110\u2111\u2112\u2115' + '\u2119\u211a\u211b\u211c\u211d\u2124\u2126\u2128\u212a\u212b' + '\u212c\u212d\u2130\u2131\u2132\u2133\u213e\u213f\u2145\u2183' + '\u2c00\u2c01\u2c02\u2c03\u2c04\u2c05\u2c06\u2c07\u2c08\u2c09' + '\u2c0a\u2c0b\u2c0c\u2c0d\u2c0e\u2c0f\u2c10\u2c11\u2c12\u2c13' + '\u2c14\u2c15\u2c16\u2c17\u2c18\u2c19\u2c1a\u2c1b\u2c1c\u2c1d' + '\u2c1e\u2c1f\u2c20\u2c21\u2c22\u2c23\u2c24\u2c25\u2c26\u2c27' + '\u2c28\u2c29\u2c2a\u2c2b\u2c2c\u2c2d\u2c2e\u2c60\u2c62\u2c63' + '\u2c64\u2c67\u2c69\u2c6b\u2c6d\u2c6e\u2c6f\u2c70\u2c72\u2c75' + '\u2c7e\u2c7f\u2c80\u2c82\u2c84\u2c86\u2c88\u2c8a\u2c8c\u2c8e' + '\u2c90\u2c92\u2c94\u2c96\u2c98\u2c9a\u2c9c\u2c9e\u2ca0\u2ca2' + '\u2ca4\u2ca6\u2ca8\u2caa\u2cac\u2cae\u2cb0\u2cb2\u2cb4\u2cb6' + '\u2cb8\u2cba\u2cbc\u2cbe\u2cc0\u2cc2\u2cc4\u2cc6\u2cc8\u2cca' + '\u2ccc\u2cce\u2cd0\u2cd2\u2cd4\u2cd6\u2cd8\u2cda\u2cdc\u2cde' + '\u2ce0\u2ce2\u2ceb\u2ced\ua640\ua642\ua644\ua646\ua648\ua64a' + '\ua64c\ua64e\ua650\ua652\ua654\ua656\ua658\ua65a\ua65c\ua65e' + '\ua662\ua664\ua666\ua668\ua66a\ua66c\ua680\ua682\ua684\ua686' + '\ua688\ua68a\ua68c\ua68e\ua690\ua692\ua694\ua696\ua722\ua724' + '\ua726\ua728\ua72a\ua72c\ua72e\ua732\ua734\ua736\ua738\ua73a' + '\ua73c\ua73e\ua740\ua742\ua744\ua746\ua748\ua74a\ua74c\ua74e' + '\ua750\ua752\ua754\ua756\ua758\ua75a\ua75c\ua75e\ua760\ua762' + '\ua764\ua766\ua768\ua76a\ua76c\ua76e\ua779\ua77b\ua77d\ua77e' + '\ua780\ua782\ua784\ua786\ua78b\uff21\uff22\uff23\uff24\uff25' + '\uff26\uff27\uff28\uff29\uff2a\uff2b\uff2c\uff2d\uff2e\uff2f' + '\uff30\uff31\uff32\uff33\uff34\uff35\uff36\uff37\uff38\uff39' + '\uff3a') diff --git a/src/whoosh/system.py b/src/whoosh/system.py new file mode 100644 index 0000000..2bdce1b --- /dev/null +++ b/src/whoosh/system.py @@ -0,0 +1,79 @@ +# Copyright 2007 Matt Chaput. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +import sys +from struct import Struct, calcsize + + +IS_LITTLE = sys.byteorder == "little" + +_INT_SIZE = calcsize("!i") +_SHORT_SIZE = calcsize("!H") +_LONG_SIZE = calcsize("!Q") +_FLOAT_SIZE = calcsize("!f") +_DOUBLE_SIZE = calcsize("!d") + +_byte_struct = Struct("!B") +_sbyte_struct = Struct("!b") +_ushort_struct = Struct("!H") +_int_struct = Struct("!i") +_uint_struct = Struct("!I") +_long_struct = Struct("!q") +_ulong_struct = Struct("!Q") +_float_struct = Struct("!f") +_double_struct = Struct("!d") +_ushort_le_struct = Struct("= 3: + return bytes(gen) + else: + return array("B", gen).tostring() + + +def make_binary_tree(fn, args, **kwargs): + """Takes a function/class that takes two positional arguments and a list of + arguments and returns a binary tree of results/instances. + + >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3]) + UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3)) + + Any keyword arguments given to this function are passed to the class + initializer. + """ + + count = len(args) + if not count: + raise ValueError("Called make_binary_tree with empty list") + elif count == 1: + return args[0] + + half = count // 2 + return fn(make_binary_tree(fn, args[:half], **kwargs), + make_binary_tree(fn, args[half:], **kwargs), **kwargs) + + +def make_weighted_tree(fn, ls, **kwargs): + """Takes a function/class that takes two positional arguments and a list of + (weight, argument) tuples and returns a huffman-like weighted tree of + results/instances. + """ + + if not ls: + raise ValueError("Called make_weighted_tree with empty list") + + ls.sort() + while len(ls) > 1: + a = ls.pop(0) + b = ls.pop(0) + insort(ls, (a[0] + b[0], fn(a[1], b[1]))) + return ls[0][1] + + +# Fibonacci function + +_fib_cache = {} + + +def fib(n): + """Returns the nth value in the Fibonacci sequence. 
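+
+    Because the base case returns ``n`` for ``n <= 2``, this is a shifted
+    variant of the textbook sequence (``fib(2) == 2``), for example::
+
+        >>> fib(6)
+        13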
+ """ + + if n <= 2: + return n + if n in _fib_cache: + return _fib_cache[n] + result = fib(n - 1) + fib(n - 2) + _fib_cache[n] = result + return result + + +# Decorators + +def synchronized(func): + """Decorator for storage-access methods, which synchronizes on a threading + lock. The parent object must have 'is_closed' and '_sync_lock' attributes. + """ + + @wraps(func) + def synchronized_wrapper(self, *args, **kwargs): + with self._sync_lock: + return func(self, *args, **kwargs) + + return synchronized_wrapper + + +def unclosed(method): + """ + Decorator to check if the object is closed. + """ + + @wraps(method) + def unclosed_wrapper(self, *args, **kwargs): + if self.closed: + raise ValueError("Operation on a closed object") + return method(self, *args, **kwargs) + return unclosed_wrapper diff --git a/src/whoosh/util/cache.py b/src/whoosh/util/cache.py new file mode 100644 index 0000000..064129e --- /dev/null +++ b/src/whoosh/util/cache.py @@ -0,0 +1,375 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import with_statement +import functools, random +from array import array +from heapq import nsmallest +from operator import itemgetter +from threading import Lock +from time import time + +from whoosh.compat import iteritems, xrange + + +try: + from collections import Counter +except ImportError: + class Counter(dict): + def __missing__(self, key): + return 0 + + +def unbound_cache(func): + """Caching decorator with an unbounded cache size. + """ + + cache = {} + + @functools.wraps(func) + def caching_wrapper(*args): + try: + return cache[args] + except KeyError: + result = func(*args) + cache[args] = result + return result + + return caching_wrapper + + +def lru_cache(maxsize=100): + """A simple cache that, when the cache is full, deletes the least recently + used 10% of the cached values. + + This function duplicates (more-or-less) the protocol of the + ``functools.lru_cache`` decorator in the Python 3.2 standard library. + + Arguments to the cached function must be hashable. 
+ + View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` + with f.cache_info(). Clear the cache and statistics with f.cache_clear(). + Access the underlying function with f.__wrapped__. + """ + + def decorating_function(user_function): + stats = [0, 0] # Hits, misses + data = {} + lastused = {} + + @functools.wraps(user_function) + def wrapper(*args): + try: + result = data[args] + stats[0] += 1 # Hit + except KeyError: + stats[1] += 1 # Miss + if len(data) == maxsize: + for k, _ in nsmallest(maxsize // 10 or 1, + iteritems(lastused), + key=itemgetter(1)): + del data[k] + del lastused[k] + data[args] = user_function(*args) + result = data[args] + finally: + lastused[args] = time() + return result + + def cache_info(): + return stats[0], stats[1], maxsize, len(data) + + def cache_clear(): + data.clear() + lastused.clear() + stats[0] = stats[1] = 0 + + wrapper.cache_info = cache_info + wrapper.cache_clear = cache_clear + return wrapper + return decorating_function + + +def lfu_cache(maxsize=100): + """A simple cache that, when the cache is full, deletes the least frequently + used 10% of the cached values. + + This function duplicates (more-or-less) the protocol of the + ``functools.lru_cache`` decorator in the Python 3.2 standard library. + + Arguments to the cached function must be hashable. + + View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` + with f.cache_info(). Clear the cache and statistics with f.cache_clear(). + Access the underlying function with f.__wrapped__. + """ + + def decorating_function(user_function): + stats = [0, 0] # Hits, misses + data = {} + usecount = Counter() + + @functools.wraps(user_function) + def wrapper(*args): + try: + result = data[args] + stats[0] += 1 # Hit + except KeyError: + stats[1] += 1 # Miss + if len(data) == maxsize: + for k, _ in nsmallest(maxsize // 10 or 1, + iteritems(usecount), + key=itemgetter(1)): + del data[k] + del usecount[k] + data[args] = user_function(*args) + result = data[args] + finally: + usecount[args] += 1 + return result + + def cache_info(): + return stats[0], stats[1], maxsize, len(data) + + def cache_clear(): + data.clear() + usecount.clear() + + wrapper.cache_info = cache_info + wrapper.cache_clear = cache_clear + return wrapper + return decorating_function + + +def random_cache(maxsize=100): + """A very simple cache that, when the cache is filled, deletes 10% of the + cached values AT RANDOM. + + This function duplicates (more-or-less) the protocol of the + ``functools.lru_cache`` decorator in the Python 3.2 standard library. + + Arguments to the cached function must be hashable. + + View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` + with f.cache_info(). Clear the cache and statistics with f.cache_clear(). + Access the underlying function with f.__wrapped__. 
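+
+    Usage mirrors ``lru_cache`` above; only the eviction policy differs
+    (sketch, with a hypothetical ``lookup`` function)::
+
+        @random_cache(maxsize=100)
+        def lookup(key):
+            return key.upper()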
+ """ + + def decorating_function(user_function): + stats = [0, 0] # hits, misses + data = {} + + @functools.wraps(user_function) + def wrapper(*args): + try: + result = data[args] + stats[0] += 1 # Hit + except KeyError: + stats[1] += 1 # Miss + if len(data) == maxsize: + keys = data.keys() + for i in xrange(maxsize // 10 or 1): + n = random.randint(0, len(keys) - 1) + k = keys.pop(n) + del data[k] + data[args] = user_function(*args) + result = data[args] + return result + + def cache_info(): + return stats[0], stats[1], maxsize, len(data) + + def cache_clear(): + data.clear() + + wrapper.cache_info = cache_info + wrapper.cache_clear = cache_clear + return wrapper + return decorating_function + + +def db_lru_cache(maxsize=100): + """Double-barrel least-recently-used cache decorator. This is a simple + LRU algorithm that keeps a primary and secondary dict. Keys are checked + in the primary dict, and then the secondary. Once the primary dict fills + up, the secondary dict is cleared and the two dicts are swapped. + + This function duplicates (more-or-less) the protocol of the + ``functools.lru_cache`` decorator in the Python 3.2 standard library. + + Arguments to the cached function must be hashable. + + View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` + with f.cache_info(). Clear the cache and statistics with f.cache_clear(). + Access the underlying function with f.__wrapped__. + """ + + def decorating_function(user_function): + # Cache1, Cache2, Pointer, Hits, Misses + stats = [{}, {}, 0, 0, 0] + + @functools.wraps(user_function) + def wrapper(*args): + ptr = stats[2] + a = stats[ptr] + b = stats[not ptr] + key = args + + if key in a: + stats[3] += 1 # Hit + return a[key] + elif key in b: + stats[3] += 1 # Hit + return b[key] + else: + stats[4] += 1 # Miss + result = user_function(*args) + a[key] = result + if len(a) >= maxsize: + stats[2] = not ptr + b.clear() + return result + + def cache_info(): + return stats[3], stats[4], maxsize, len(stats[0]) + len(stats[1]) + + def cache_clear(): + """Clear the cache and cache statistics""" + stats[0].clear() + stats[1].clear() + stats[3] = stats[4] = 0 + + wrapper.cache_info = cache_info + wrapper.cache_clear = cache_clear + + return wrapper + return decorating_function + + +def clockface_lru_cache(maxsize=100): + """Least-recently-used cache decorator. + + This function duplicates (more-or-less) the protocol of the + ``functools.lru_cache`` decorator in the Python 3.2 standard library, but + uses the clock face LRU algorithm instead of an ordered dictionary. + + If *maxsize* is set to None, the LRU features are disabled and the cache + can grow without bound. + + Arguments to the cached function must be hashable. + + View the cache statistics named tuple (hits, misses, maxsize, currsize) + with f.cache_info(). Clear the cache and statistics with f.cache_clear(). + Access the underlying function with f.__wrapped__. + """ + + def decorating_function(user_function): + stats = [0, 0, 0] # hits, misses, hand + data = {} + + if maxsize: + # The keys at each point on the clock face + clock_keys = [None] * maxsize + # The "referenced" bits at each point on the clock face + clock_refs = array("B", (0 for _ in xrange(maxsize))) + lock = Lock() + + @functools.wraps(user_function) + def wrapper(*args): + key = args + try: + with lock: + pos, result = data[key] + # The key is in the cache. 
Set the key's reference bit + clock_refs[pos] = 1 + # Record a cache hit + stats[0] += 1 + except KeyError: + # Compute the value + result = user_function(*args) + with lock: + # Current position of the clock hand + hand = stats[2] + # Remember to stop here after a full revolution + end = hand + # Sweep around the clock looking for a position with + # the reference bit off + while True: + hand = (hand + 1) % maxsize + current_ref = clock_refs[hand] + if current_ref: + # This position's "referenced" bit is set. Turn + # the bit off and move on. + clock_refs[hand] = 0 + elif not current_ref or hand == end: + # We've either found a position with the + # "reference" bit off or reached the end of the + # circular cache. So we'll replace this + # position with the new key + current_key = clock_keys[hand] + if current_key in data: + del data[current_key] + clock_keys[hand] = key + clock_refs[hand] = 1 + break + # Put the key and result in the cache + data[key] = (hand, result) + # Save the new hand position + stats[2] = hand + # Record a cache miss + stats[1] += 1 + return result + + else: + @functools.wraps(user_function) + def wrapper(*args): + key = args + try: + result = data[key] + stats[0] += 1 + except KeyError: + result = user_function(*args) + data[key] = result + stats[1] += 1 + return result + + def cache_info(): + return stats[0], stats[1], maxsize, len(data) + + def cache_clear(): + """Clear the cache and cache statistics""" + data.clear() + stats[0] = stats[1] = stats[2] = 0 + for i in xrange(maxsize): + clock_keys[i] = None + clock_refs[i] = 0 + + wrapper.cache_info = cache_info + wrapper.cache_clear = cache_clear + return wrapper + return decorating_function + diff --git a/src/whoosh/util/filelock.py b/src/whoosh/util/filelock.py new file mode 100644 index 0000000..382c40d --- /dev/null +++ b/src/whoosh/util/filelock.py @@ -0,0 +1,163 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +""" +This module contains classes implementing exclusive locks for platforms with +fcntl (UNIX and Mac OS X) and Windows. 
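+
+A rough usage sketch (the lock file name here is hypothetical) via the
+``FileLock`` alias defined at the bottom of this module:
+
+    lock = FileLock("WRITELOCK")
+    if lock.acquire():
+        try:
+            pass  # do the locked work
+        finally:
+            lock.release()
+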
Whoosh originally used directory +creation as a locking method, but it had the problem that if the program +crashed the lock directory was left behind and would keep the index locked +until it was cleaned up. Using OS-level file locks fixes this. +""" + +import errno +import os +import sys +import time + + +def try_for(fn, timeout=5.0, delay=0.1): + """Calls ``fn`` every ``delay`` seconds until it returns True or + ``timeout`` seconds elapse. Returns True if the lock was acquired, or False + if the timeout was reached. + + :param timeout: Length of time (in seconds) to keep retrying to acquire the + lock. 0 means return immediately. Only used when blocking is False. + :param delay: How often (in seconds) to retry acquiring the lock during + the timeout period. Only used when blocking is False and timeout > 0. + """ + + until = time.time() + timeout + v = fn() + while not v and time.time() < until: + time.sleep(delay) + v = fn() + return v + + +class LockBase(object): + """Base class for file locks. + """ + + def __init__(self, filename): + self.fd = None + self.filename = filename + self.locked = False + + def __del__(self): + if hasattr(self, "fd") and self.fd: + try: + self.release() + except: + pass + + def acquire(self, blocking=False): + """Acquire the lock. Returns True if the lock was acquired. + + :param blocking: if True, call blocks until the lock is acquired. + This may not be available on all platforms. On Windows, this is + actually just a delay of 10 seconds, rechecking every second. + """ + pass + + def release(self): + pass + + +class FcntlLock(LockBase): + """File lock based on UNIX-only fcntl module. + """ + + def acquire(self, blocking=False): + import fcntl # @UnresolvedImport + + flags = os.O_CREAT | os.O_WRONLY + self.fd = os.open(self.filename, flags) + + mode = fcntl.LOCK_EX + if not blocking: + mode |= fcntl.LOCK_NB + + try: + fcntl.flock(self.fd, mode) + self.locked = True + return True + except IOError: + e = sys.exc_info()[1] + if e.errno not in (errno.EAGAIN, errno.EACCES): + raise + os.close(self.fd) + self.fd = None + return False + + def release(self): + if self.fd is None: + raise Exception("Lock was not acquired") + + import fcntl # @UnresolvedImport + fcntl.flock(self.fd, fcntl.LOCK_UN) + os.close(self.fd) + self.fd = None + + +class MsvcrtLock(LockBase): + """File lock based on Windows-only msvcrt module. + """ + + def acquire(self, blocking=False): + import msvcrt # @UnresolvedImport + + flags = os.O_CREAT | os.O_WRONLY + mode = msvcrt.LK_NBLCK + if blocking: + mode = msvcrt.LK_LOCK + + self.fd = os.open(self.filename, flags) + try: + msvcrt.locking(self.fd, mode, 1) + return True + except IOError: + e = sys.exc_info()[1] + if e.errno not in (errno.EAGAIN, errno.EACCES, errno.EDEADLK): + raise + os.close(self.fd) + self.fd = None + return False + + def release(self): + import msvcrt # @UnresolvedImport + + if self.fd is None: + raise Exception("Lock was not acquired") + msvcrt.locking(self.fd, msvcrt.LK_UNLCK, 1) + os.close(self.fd) + self.fd = None + + +if os.name == "nt": + FileLock = MsvcrtLock +else: + FileLock = FcntlLock diff --git a/src/whoosh/util/loading.py b/src/whoosh/util/loading.py new file mode 100644 index 0000000..1de00f7 --- /dev/null +++ b/src/whoosh/util/loading.py @@ -0,0 +1,84 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +import pickle + + +class RenamingUnpickler(pickle.Unpickler): + """Subclasses ``pickle.Unpickler`` to allow remapping of class names before + loading them. + """ + + def __init__(self, f, objmap, shortcuts=None): + pickle.Unpickler.__init__(self, f) + + if shortcuts: + objmap = dict((k % shortcuts, v % shortcuts) + for k, v in objmap.items()) + self._objmap = objmap + + def find_class(self, modulename, objname): + fqname = "%s.%s" % (modulename, objname) + if fqname in self._objmap: + fqname = self._objmap[fqname] + try: + obj = find_object(fqname) + except ImportError: + raise ImportError("Couldn't find %r" % fqname) + return obj + + +def find_object(name, blacklist=None, whitelist=None): + """Imports and returns an object given a fully qualified name. + + >>> find_object("whoosh.analysis.StopFilter") + + """ + + if blacklist: + for pre in blacklist: + if name.startswith(pre): + raise TypeError("%r: can't instantiate names starting with %r" + % (name, pre)) + if whitelist: + passes = False + for pre in whitelist: + if name.startswith(pre): + passes = True + break + if not passes: + raise TypeError("Can't instantiate %r" % name) + + lastdot = name.rfind(".") + + assert lastdot > -1, "Name %r must be fully qualified" % name + modname = name[:lastdot] + clsname = name[lastdot + 1:] + + mod = __import__(modname, fromlist=[clsname]) + cls = getattr(mod, clsname) + return cls diff --git a/src/whoosh/util/numeric.py b/src/whoosh/util/numeric.py new file mode 100644 index 0000000..42291cc --- /dev/null +++ b/src/whoosh/util/numeric.py @@ -0,0 +1,317 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +import math, struct +from array import array +from bisect import bisect_left +from struct import pack, unpack + +from whoosh.compat import b, long_type +from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort +from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint +from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong +from whoosh.system import pack_float, unpack_float, pack_double, unpack_double + + +NaN = struct.unpack("i") +_qstruct = struct.Struct(">q") +_dstruct = struct.Struct(">d") +_ipack, _iunpack = _istruct.pack, _istruct.unpack +_qpack, _qunpack = _qstruct.pack, _qstruct.unpack +_dpack, _dunpack = _dstruct.pack, _dstruct.unpack + + +def to_sortable(numtype, intsize, signed, x): + if numtype is int or numtype is long_type: + if signed: + x += (1 << intsize - 1) + return x + else: + return float_to_sortable_long(x, signed) + + +def from_sortable(numtype, intsize, signed, x): + if numtype is int or numtype is long_type: + if signed: + x -= (1 << intsize - 1) + return x + else: + return sortable_long_to_float(x, signed) + + +def float_to_sortable_long(x, signed): + x = _qunpack(_dpack(x))[0] + if x < 0: + x ^= 0x7fffffffffffffff + if signed: + x += 1 << 63 + assert x >= 0 + return x + + +def sortable_long_to_float(x, signed): + if signed: + x -= 1 << 63 + if x < 0: + x ^= 0x7fffffffffffffff + x = _dunpack(_qpack(x))[0] + return x + + +# Functions for generating tiered ranges + +def split_ranges(intsize, step, start, end): + """Splits a range of numbers (from ``start`` to ``end``, inclusive) + into a sequence of trie ranges of the form ``(start, end, shift)``. The + consumer of these tuples is expected to shift the ``start`` and ``end`` + right by ``shift``. + + This is used for generating term ranges for a numeric field. The queries + for the edges of the range are generated at high precision and large blocks + in the middle are generated at low precision. 
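+
+    For example (values worked out by hand from the algorithm below)::
+
+        >>> list(split_ranges(8, 4, 10, 200))
+        [(10, 15, 0), (192, 200, 0), (16, 191, 4)]
+
+    i.e. 10-15 and 192-200 are covered at full precision, while 16-191 is
+    covered by a single low-precision block (16 >> 4 to 191 >> 4).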
+ """ + + shift = 0 + while True: + diff = 1 << (shift + step) + mask = ((1 << step) - 1) << shift + setbits = lambda x: x | ((1 << shift) - 1) + + haslower = (start & mask) != 0 + hasupper = (end & mask) != mask + + not_mask = ~mask & ((1 << intsize + 1) - 1) + nextstart = (start + diff if haslower else start) & not_mask + nextend = (end - diff if hasupper else end) & not_mask + + if shift + step >= intsize or nextstart > nextend: + yield (start, setbits(end), shift) + break + + if haslower: + yield (start, setbits(start | mask), shift) + if hasupper: + yield (end & not_mask, setbits(end), shift) + + start = nextstart + end = nextend + shift += step + + +def tiered_ranges(numtype, intsize, signed, start, end, shift_step, + startexcl, endexcl): + assert numtype in (int, float) + assert intsize in (8, 16, 32, 64) + + # Convert start and end values to sortable ints + if start is None: + start = 0 + else: + start = to_sortable(numtype, intsize, signed, start) + if startexcl: + start += 1 + + if end is None: + end = 2 ** intsize - 1 + else: + end = to_sortable(numtype, intsize, signed, end) + if endexcl: + end -= 1 + + if not shift_step: + return ((start, end, 0),) + + # Yield (rstart, rend, shift) ranges for the different resolutions + return split_ranges(intsize, shift_step, start, end) + + +# Float-to-byte encoding/decoding + +def float_to_byte(value, mantissabits=5, zeroexp=2): + """Encodes a floating point number in a single byte. + """ + + # Assume int size == float size + + fzero = (63 - zeroexp) << mantissabits + bits = unpack("i", pack("f", value))[0] + smallfloat = bits >> (24 - mantissabits) + if smallfloat < fzero: + # Map negative numbers and 0 to 0 + # Map underflow to next smallest non-zero number + if bits <= 0: + result = chr(0) + else: + result = chr(1) + elif smallfloat >= fzero + 0x100: + # Map overflow to largest number + result = chr(255) + else: + result = chr(smallfloat - fzero) + return b(result) + + +def byte_to_float(b, mantissabits=5, zeroexp=2): + """Decodes a floating point number stored in a single byte. + """ + if type(b) is not int: + b = ord(b) + if b == 0: + return 0.0 + + bits = (b & 0xff) << (24 - mantissabits) + bits += (63 - zeroexp) << 24 + return unpack("f", pack("i", bits))[0] + + +# Length-to-byte approximation functions + +# Old implementation: + +#def length_to_byte(length): +# """Returns a logarithmic approximation of the given number, in the range +# 0-255. The approximation has high precision at the low end (e.g. +# 1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers +# equal to or greater than 108116 all approximate to 255. +# +# This is useful for storing field lengths, where the general case is small +# documents and very large documents are more rare. +# """ +# +# # This encoding formula works up to 108116 -> 255, so if the length is +# # equal to or greater than that limit, just return 255. +# if length >= 108116: +# return 255 +# +# # The parameters of this formula where chosen heuristically so that low +# # numbers would approximate closely, and the byte range 0-255 would cover +# # a decent range of document lengths (i.e. 1 to ~100000). 
+# return int(round(log((length / 27.0) + 1, 1.033))) +#def _byte_to_length(n): +# return int(round((pow(1.033, n) - 1) * 27)) +#_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256))) +#byte_to_length = _b2l_cache.__getitem__ + +# New implementation + +# Instead of computing the actual formula to get the byte for any given length, +# precompute the length associated with each byte, and use bisect to find the +# nearest value. This gives quite a large speed-up. +# +# Note that this does not give all the same answers as the old, "real" +# implementation since this implementation always "rounds down" (thanks to the +# bisect_left) while the old implementation would "round up" or "round down" +# depending on the input. Since this is a fairly gross approximation anyway, +# I don't think it matters much. + +# Values generated using the formula from the "old" implementation above +_length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, +16, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52, +54, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114, +119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211, +219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373, +386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645, +667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066, +1103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641, +1696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516, +2600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852, +3980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889, +6084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995, +9293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868, +13294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403, +19011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314, +27183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620, +38863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780, +55556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876, +79414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374]) + + +def length_to_byte(length): + if length is None: + return 0 + if length >= 106374: + return 255 + else: + return bisect_left(_length_byte_cache, length) + +byte_to_length = _length_byte_cache.__getitem__ diff --git a/src/whoosh/util/numlists.py b/src/whoosh/util/numlists.py new file mode 100644 index 0000000..8f3d84d --- /dev/null +++ b/src/whoosh/util/numlists.py @@ -0,0 +1,373 @@ +from array import array + +from whoosh.compat import xrange +from whoosh.system import emptybytes +from whoosh.system import pack_byte, unpack_byte +from whoosh.system import pack_ushort_le, unpack_ushort_le +from whoosh.system import pack_uint_le, unpack_uint_le + + +def delta_encode(nums): + base = 0 + for n in nums: + yield n - base + base = n + + +def delta_decode(nums): + base = 0 + for n in nums: + base += n + yield base + + +class GrowableArray(object): + def __init__(self, inittype="B", allow_longs=True): + self.array = array(inittype) + self._allow_longs = allow_longs + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self.array) + + def __len__(self): + return len(self.array) + + def __iter__(self): + return iter(self.array) + + def 
_retype(self, maxnum): + if maxnum < 2 ** 16: + newtype = "H" + elif maxnum < 2 ** 31: + newtype = "i" + elif maxnum < 2 ** 32: + newtype = "I" + elif self._allow_longs: + newtype = "q" + else: + raise OverflowError("%r is too big to fit in an array" % maxnum) + + try: + self.array = array(newtype, iter(self.array)) + except ValueError: + self.array = list(self.array) + + def append(self, n): + try: + self.array.append(n) + except OverflowError: + self._retype(n) + self.array.append(n) + + def extend(self, ns): + append = self.append + for n in ns: + append(n) + + @property + def typecode(self): + if isinstance(self.array, array): + return self.array.typecode + else: + return "q" + + def to_file(self, dbfile): + if isinstance(self.array, array): + dbfile.write_array(self.array) + else: + write_long = dbfile.write_long + for n in self.array: + write_long(n) + + +# Number list encoding base class + +class NumberEncoding(object): + maxint = None + + def write_nums(self, f, numbers): + raise NotImplementedError + + def read_nums(self, f, n): + raise NotImplementedError + + def write_deltas(self, f, numbers): + return self.write_nums(f, list(delta_encode(numbers))) + + def read_deltas(self, f, n): + return delta_decode(self.read_nums(f, n)) + + def get(self, f, pos, i): + f.seek(pos) + n = None + for n in self.read_nums(f, i + 1): + pass + return n + + +# Fixed width encodings + +class FixedEncoding(NumberEncoding): + _encode = None + _decode = None + size = None + + def write_nums(self, f, numbers): + _encode = self._encode + + for n in numbers: + f.write(_encode(n)) + + def read_nums(self, f, n): + _decode = self._decode + + for _ in xrange(n): + yield _decode(f.read(self.size)) + + def get(self, f, pos, i): + f.seek(pos + i * self.size) + return self._decode(f.read(self.size)) + + +class ByteEncoding(FixedEncoding): + size = 1 + maxint = 255 + _encode = pack_byte + _decode = unpack_byte + + +class UShortEncoding(FixedEncoding): + size = 2 + maxint = 2 ** 16 - 1 + _encode = pack_ushort_le + _decode = unpack_ushort_le + + +class UIntEncoding(FixedEncoding): + size = 4 + maxint = 2 ** 32 - 1 + _encode = pack_uint_le + _decode = unpack_uint_le + + +# High-bit encoded variable-length integer + +class Varints(NumberEncoding): + maxint = None + + def write_nums(self, f, numbers): + for n in numbers: + f.write_varint(n) + + def read_nums(self, f, n): + for _ in xrange(n): + yield f.read_varint() + + +# Simple16 algorithm for storing arrays of positive integers (usually delta +# encoded lists of sorted integers) +# +# 1. http://www2008.org/papers/pdf/p387-zhangA.pdf +# 2. http://www2009.org/proceedings/pdf/p401.pdf + +class Simple16(NumberEncoding): + # The maximum possible integer value Simple16 can encode is < 2^28. + # Therefore, in order to use Simple16, the application must have its own + # code to encode numbers in the range of [2^28, 2^32). A simple way is just + # write those numbers as 32-bit integers (that is, no compression for very + # big numbers). 
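+
+    # For illustration (worked out by hand from the tables below): a run of
+    # 28 one-bit deltas such as [1] * 28 packs into a single 32-bit word
+    # using selector key 0, while a lone large value such as 100000 falls
+    # through to selector key 15 (one 28-bit number per word).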
+ _numsize = 16 + _bitsize = 28 + maxint = 2 ** _bitsize - 1 + + # Number of stored numbers per code + _num = [28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1] + # Number of bits for each number per code + _bits = [ + (1,) * 28, + (2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2), + (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), + (4, 3, 3, 3, 3, 3, 3, 3, 3), + (3, 4, 4, 4, 4, 3, 3, 3), + (4, 4, 4, 4, 4, 4, 4), + (5, 5, 5, 5, 4, 4), + (4, 4, 5, 5, 5, 5), + (6, 6, 6, 5, 5), + (5, 5, 6, 6, 6), + (7, 7, 7, 7), + (10, 9, 9), + (14, 14), + (28,), + ] + + def write_nums(self, f, numbers): + _compress = self._compress + + i = 0 + while i < len(numbers): + value, taken = _compress(numbers, i, len(numbers) - i) + f.write_uint_le(value) + i += taken + + def _compress(self, inarray, inoffset, n): + _numsize = self._numsize + _bitsize = self._bitsize + _num = self._num + _bits = self._bits + + for key in xrange(_numsize): + value = key << _bitsize + num = _num[key] if _num[key] < n else n + bits = 0 + + j = 0 + while j < num and inarray[inoffset + j] < (1 << _bits[key][j]): + x = inarray[inoffset + j] + value |= x << bits + bits += _bits[key][j] + j += 1 + + if j == num: + return value, num + + raise Exception + + def read_nums(self, f, n): + _decompress = self._decompress + + i = 0 + while i < n: + value = unpack_uint_le(f.read(4))[0] + for v in _decompress(value, n - i): + yield v + i += 1 + + def _decompress(self, value, n): + _numsize = self._numsize + _bitsize = self._bitsize + _num = self._num + _bits = self._bits + + key = value >> _bitsize + num = _num[key] if _num[key] < n else n + bits = 0 + for j in xrange(num): + v = value >> bits + yield v & (0xffffffff >> (32 - _bits[key][j])) + bits += _bits[key][j] + + def get(self, f, pos, i): + f.seek(pos) + base = 0 + value = unpack_uint_le(f.read(4)) + key = value >> self._bitsize + num = self._num[key] + while i > base + num: + base += num + value = unpack_uint_le(f.read(4)) + key = value >> self._bitsize + num = self._num[key] + + offset = i - base + if offset: + value = value >> sum(self._bits[key][:offset]) + return value & (2 ** self._bits[key][offset] - 1) + + +# Google Packed Ints algorithm: a set of four numbers is preceded by a "key" +# byte, which encodes how many bytes each of the next four integers use +# (stored in the byte as four 2-bit numbers) + +class GInts(NumberEncoding): + maxint = 2 ** 32 - 1 + + # Number of future bytes to expect after a "key" byte value of N -- used to + # skip ahead from a key byte + _lens = array("B", [4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6, + 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, + 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, + 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, + 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, + 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, + 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, + 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, + 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, + 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, + 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, + 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 
10, 11, 12, 13, 11, + 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16]) + + def key_to_sizes(self, key): + """Returns a list of the sizes of the next four numbers given a key + byte. + """ + + return [(key >> (i * 2) & 3) + 1 for i in xrange(4)] + + def write_nums(self, f, numbers): + buf = emptybytes + count = 0 + key = 0 + for v in numbers: + shift = count * 2 + if v < 256: + buf += pack_byte(v) + elif v < 65536: + key |= 1 << shift + buf += pack_ushort_le(v) + elif v < 16777216: + key |= 2 << shift + buf += pack_uint_le(v)[:3] + else: + key |= 3 << shift + buf += pack_uint_le(v) + + count += 1 + if count == 4: + f.write_byte(key) + f.write(buf) + count = 0 + key = 0 + buf = emptybytes # Clear the buffer + + # Write out leftovers in the buffer + if count: + f.write_byte(key) + f.write(buf) + + def read_nums(self, f, n): + """Read N integers from the bytes stream dbfile. Expects that the file + is positioned at a key byte. + """ + + count = 0 + key = None + for _ in xrange(n): + if count == 0: + key = f.read_byte() + code = key >> (count * 2) & 3 + if code == 0: + yield f.read_byte() + elif code == 1: + yield f.read_ushort_le() + elif code == 2: + yield unpack_uint_le(f.read(3) + "\x00")[0] + else: + yield f.read_uint_le() + + count = (count + 1) % 4 + +# def get(self, f, pos, i): +# f.seek(pos) +# base = 0 +# key = f.read_byte() +# while i > base + 4: +# base += 4 +# f.seek(self._lens[key], 1) +# key = f.read_byte() +# +# for n in self.read_nums(f, (i + 1) - base): +# pass +# return n diff --git a/src/whoosh/util/testing.py b/src/whoosh/util/testing.py new file mode 100644 index 0000000..358bfc3 --- /dev/null +++ b/src/whoosh/util/testing.py @@ -0,0 +1,130 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
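+
+# A typical (illustrative) way the helpers defined below are used in a test,
+# assuming ``Schema`` and ``TEXT`` from whoosh.fields:
+#
+#     with TempIndex(Schema(content=TEXT), "example") as ix:
+#         with ix.writer() as w:
+#             w.add_document(content=u"hello world")
+#         with ix.searcher() as s:
+#             assert s.doc_count() == 1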
+ +import os.path +import random +import shutil +import sys +import tempfile +from contextlib import contextmanager + +from whoosh.filedb.filestore import FileStorage +from whoosh.util import now, random_name + + +class TempDir(object): + def __init__(self, basename="", parentdir=None, ext=".whoosh", + suppress=frozenset(), keepdir=False): + self.basename = basename or random_name(8) + self.parentdir = parentdir + + dirname = parentdir or tempfile.mkdtemp(ext, self.basename) + self.dir = os.path.abspath(dirname) + self.suppress = suppress + self.keepdir = keepdir + + def __enter__(self): + if not os.path.exists(self.dir): + os.makedirs(self.dir) + return self.dir + + def cleanup(self): + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + self.cleanup() + if not self.keepdir: + try: + shutil.rmtree(self.dir) + except OSError: + e = sys.exc_info()[1] + #sys.stderr.write("Can't remove temp dir: " + str(e) + "\n") + #if exc_type is None: + # raise + + if exc_type is not None: + if self.keepdir: + sys.stderr.write("Temp dir=" + self.dir + "\n") + if exc_type not in self.suppress: + return False + + +class TempStorage(TempDir): + def __init__(self, debug=False, **kwargs): + TempDir.__init__(self, **kwargs) + self._debug = debug + + def cleanup(self): + self.store.close() + + def __enter__(self): + dirpath = TempDir.__enter__(self) + self.store = FileStorage(dirpath, debug=self._debug) + return self.store + + +class TempIndex(TempStorage): + def __init__(self, schema, ixname='', storage_debug=False, **kwargs): + TempStorage.__init__(self, basename=ixname, debug=storage_debug, + **kwargs) + self.schema = schema + + def __enter__(self): + fstore = TempStorage.__enter__(self) + return fstore.create_index(self.schema, indexname=self.basename) + + +def is_abstract_method(attr): + """Returns True if the given object has __isabstractmethod__ == True. + """ + + return (hasattr(attr, "__isabstractmethod__") + and getattr(attr, "__isabstractmethod__")) + + +def check_abstract_methods(base, subclass): + """Raises AssertionError if ``subclass`` does not override a method on + ``base`` that is marked as an abstract method. + """ + + for attrname in dir(base): + if attrname.startswith("_"): + continue + attr = getattr(base, attrname) + if is_abstract_method(attr): + oattr = getattr(subclass, attrname) + if is_abstract_method(oattr): + raise Exception("%s.%s not overridden" + % (subclass.__name__, attrname)) + + +@contextmanager +def timing(name=None): + t = now() + yield + t = now() - t + print("%s: %0.06f s" % (name or '', t)) diff --git a/src/whoosh/util/text.py b/src/whoosh/util/text.py new file mode 100644 index 0000000..d524aa1 --- /dev/null +++ b/src/whoosh/util/text.py @@ -0,0 +1,132 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +import codecs, re + +from whoosh.compat import string_type, u, byte + + +# Note: these functions return a tuple of (text, length), so when you call +# them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0] + +utf8encode = codecs.getencoder("utf-8") +utf8decode = codecs.getdecoder("utf-8") + + +# Prefix encoding functions + +def first_diff(a, b): + """ + Returns the position of the first differing character in the sequences a + and b. For example, first_diff('render', 'rending') == 4. This function + limits the return value to 255 so the difference can be encoded in a single + byte. + """ + + i = 0 + while i <= 255 and i < len(a) and i < len(b) and a[i] == b[i]: + i += 1 + return i + + +def prefix_encode(a, b): + """ + Compresses bytestring b as a byte representing the prefix it shares with a, + followed by the suffix bytes. + """ + + i = first_diff(a, b) + return byte(i) + b[i:] + + +def prefix_encode_all(ls): + """Compresses the given list of (unicode) strings by storing each string + (except the first one) as an integer (encoded in a byte) representing + the prefix it shares with its predecessor, followed by the suffix encoded + as UTF-8. + """ + + last = u('') + for w in ls: + i = first_diff(last, w) + yield chr(i) + w[i:].encode("utf-8") + last = w + + +def prefix_decode_all(ls): + """Decompresses a list of strings compressed by prefix_encode(). + """ + + last = u('') + for w in ls: + i = ord(w[0]) + decoded = last[:i] + w[1:].decode("utf-8") + yield decoded + last = decoded + + +# Natural key sorting function + +_nkre = re.compile(r"\D+|\d+", re.UNICODE) + + +def _nkconv(i): + try: + return int(i) + except ValueError: + return i.lower() + + +def natural_key(s): + """Converts string ``s`` into a tuple that will sort "naturally" (i.e., + ``name5`` will come before ``name10`` and ``1`` will come before ``A``). + This function is designed to be used as the ``key`` argument to sorting + functions. + + :param s: the str/unicode string to convert. + :rtype: tuple + """ + + # Use _nkre to split the input string into a sequence of + # digit runs and non-digit runs. Then use _nkconv() to convert + # the digit runs into ints and the non-digit runs to lowercase. + return tuple(_nkconv(m) for m in _nkre.findall(s)) + + +# Regular expression functions + +def rcompile(pattern, flags=0, verbose=False): + """A wrapper for re.compile that checks whether "pattern" is a regex object + or a string to be compiled, and automatically adds the re.UNICODE flag. 
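+
+    For example, a pattern string is compiled with ``re.UNICODE`` added,
+    while an already-compiled pattern is returned unchanged::
+
+        >>> import re
+        >>> bool(rcompile("[a-z]+").match("abc"))
+        True
+        >>> p = re.compile("x")
+        >>> rcompile(p) is p
+        True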
+ """ + + if not isinstance(pattern, string_type): + # If it's not a string, assume it's already a compiled pattern + return pattern + if verbose: + flags |= re.VERBOSE + return re.compile(pattern, re.UNICODE | flags) diff --git a/src/whoosh/util/times.py b/src/whoosh/util/times.py new file mode 100644 index 0000000..e72448c --- /dev/null +++ b/src/whoosh/util/times.py @@ -0,0 +1,467 @@ +# Copyright 2010 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +import calendar +import copy +from datetime import date, datetime, timedelta + +from whoosh.compat import iteritems + + +class TimeError(Exception): + pass + + +def relative_days(current_wday, wday, dir): + """Returns the number of days (positive or negative) to the "next" or + "last" of a certain weekday. ``current_wday`` and ``wday`` are numbers, + i.e. 0 = monday, 1 = tuesday, 2 = wednesday, etc. + + >>> # Get the number of days to the next tuesday, if today is Sunday + >>> relative_days(6, 1, 1) + 2 + + :param current_wday: the number of the current weekday. + :param wday: the target weekday. + :param dir: -1 for the "last" (past) weekday, 1 for the "next" (future) + weekday. + """ + + if current_wday == wday: + return 7 * dir + + if dir == 1: + return (wday + 7 - current_wday) % 7 + else: + return (current_wday + 7 - wday) % 7 * -1 + + +def timedelta_to_usecs(td): + total = td.days * 86400000000 # Microseconds in a day + total += td.seconds * 1000000 # Microseconds in a second + total += td.microseconds + return total + + +def datetime_to_long(dt): + """Converts a datetime object to a long integer representing the number + of microseconds since ``datetime.min``. + """ + + return timedelta_to_usecs(dt.replace(tzinfo=None) - dt.min) + + +def long_to_datetime(x): + """Converts a long integer representing the number of microseconds since + ``datetime.min`` to a datetime object. 
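+
+    This is the inverse of ``datetime_to_long``; an illustrative round trip::
+
+        >>> from datetime import datetime
+        >>> dt = datetime(2010, 6, 15, 12, 30, 45, 123456)
+        >>> long_to_datetime(datetime_to_long(dt)) == dt
+        True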
+ """ + + days = x // 86400000000 # Microseconds in a day + x -= days * 86400000000 + + seconds = x // 1000000 # Microseconds in a second + x -= seconds * 1000000 + + return datetime.min + timedelta(days=days, seconds=seconds, microseconds=x) + + +# Ambiguous datetime object + +class adatetime(object): + """An "ambiguous" datetime object. This object acts like a + ``datetime.datetime`` object but can have any of its attributes set to + None, meaning unspecified. + """ + + units = frozenset(("year", "month", "day", "hour", "minute", "second", + "microsecond")) + + def __init__(self, year=None, month=None, day=None, hour=None, minute=None, + second=None, microsecond=None): + if isinstance(year, datetime): + dt = year + self.year, self.month, self.day = dt.year, dt.month, dt.day + self.hour, self.minute, self.second = dt.hour, dt.minute, dt.second + self.microsecond = dt.microsecond + else: + if month is not None and (month < 1 or month > 12): + raise TimeError("month must be in 1..12") + + if day is not None and day < 1: + raise TimeError("day must be greater than 1") + if (year is not None and month is not None and day is not None + and day > calendar.monthrange(year, month)[1]): + raise TimeError("day is out of range for month") + + if hour is not None and (hour < 0 or hour > 23): + raise TimeError("hour must be in 0..23") + if minute is not None and (minute < 0 or minute > 59): + raise TimeError("minute must be in 0..59") + if second is not None and (second < 0 or second > 59): + raise TimeError("second must be in 0..59") + if microsecond is not None and (microsecond < 0 + or microsecond > 999999): + raise TimeError("microsecond must be in 0..999999") + + self.year, self.month, self.day = year, month, day + self.hour, self.minute, self.second = hour, minute, second + self.microsecond = microsecond + + def __eq__(self, other): + if not other.__class__ is self.__class__: + if not is_ambiguous(self) and isinstance(other, datetime): + return fix(self) == other + else: + return False + return all(getattr(self, unit) == getattr(other, unit) + for unit in self.units) + + def __repr__(self): + return "%s%r" % (self.__class__.__name__, self.tuple()) + + def tuple(self): + """Returns the attributes of the ``adatetime`` object as a tuple of + ``(year, month, day, hour, minute, second, microsecond)``. + """ + + return (self.year, self.month, self.day, self.hour, self.minute, + self.second, self.microsecond) + + def date(self): + return date(self.year, self.month, self.day) + + def copy(self): + return adatetime(year=self.year, month=self.month, day=self.day, + hour=self.hour, minute=self.minute, second=self.second, + microsecond=self.microsecond) + + def replace(self, **kwargs): + """Returns a copy of this object with the attributes given as keyword + arguments replaced. + + >>> adt = adatetime(year=2009, month=10, day=31) + >>> adt.replace(year=2010) + (2010, 10, 31, None, None, None, None) + """ + + newadatetime = self.copy() + for key, value in iteritems(kwargs): + if key in self.units: + setattr(newadatetime, key, value) + else: + raise KeyError("Unknown argument %r" % key) + return newadatetime + + def floor(self): + """Returns a ``datetime`` version of this object with all unspecified + (None) attributes replaced by their lowest values. + + This method raises an error if the ``adatetime`` object has no year. 
+ + >>> adt = adatetime(year=2009, month=5) + >>> adt.floor() + datetime.datetime(2009, 5, 1, 0, 0, 0, 0) + """ + + y, m, d, h, mn, s, ms = (self.year, self.month, self.day, self.hour, + self.minute, self.second, self.microsecond) + + if y is None: + raise ValueError("Date has no year") + + if m is None: + m = 1 + if d is None: + d = 1 + if h is None: + h = 0 + if mn is None: + mn = 0 + if s is None: + s = 0 + if ms is None: + ms = 0 + return datetime(y, m, d, h, mn, s, ms) + + def ceil(self): + """Returns a ``datetime`` version of this object with all unspecified + (None) attributes replaced by their highest values. + + This method raises an error if the ``adatetime`` object has no year. + + >>> adt = adatetime(year=2009, month=5) + >>> adt.floor() + datetime.datetime(2009, 5, 30, 23, 59, 59, 999999) + """ + + y, m, d, h, mn, s, ms = (self.year, self.month, self.day, self.hour, + self.minute, self.second, self.microsecond) + + if y is None: + raise ValueError("Date has no year") + + if m is None: + m = 12 + if d is None: + d = calendar.monthrange(y, m)[1] + if h is None: + h = 23 + if mn is None: + mn = 59 + if s is None: + s = 59 + if ms is None: + ms = 999999 + return datetime(y, m, d, h, mn, s, ms) + + def disambiguated(self, basedate): + """Returns either a ``datetime`` or unambiguous ``timespan`` version + of this object. + + Unless this ``adatetime`` object is full specified down to the + microsecond, this method will return a timespan built from the "floor" + and "ceil" of this object. + + This method raises an error if the ``adatetime`` object has no year. + + >>> adt = adatetime(year=2009, month=10, day=31) + >>> adt.disambiguated() + timespan(datetime(2009, 10, 31, 0, 0, 0, 0), datetime(2009, 10, 31, 23, 59 ,59, 999999) + """ + + dt = self + if not is_ambiguous(dt): + return fix(dt) + return timespan(dt, dt).disambiguated(basedate) + + +# Time span class + +class timespan(object): + """A span of time between two ``datetime`` or ``adatetime`` objects. + """ + + def __init__(self, start, end): + """ + :param start: a ``datetime`` or ``adatetime`` object representing the + start of the time span. + :param end: a ``datetime`` or ``adatetime`` object representing the + end of the time span. + """ + + if not isinstance(start, (datetime, adatetime)): + raise TimeError("%r is not a datetime object" % start) + if not isinstance(end, (datetime, adatetime)): + raise TimeError("%r is not a datetime object" % end) + + self.start = copy.copy(start) + self.end = copy.copy(end) + + def __eq__(self, other): + if not other.__class__ is self.__class__: + return False + return self.start == other.start and self.end == other.end + + def __repr__(self): + return "%s(%r, %r)" % (self.__class__.__name__, self.start, self.end) + + def disambiguated(self, basedate, debug=0): + """Returns an unambiguous version of this object. + + >>> start = adatetime(year=2009, month=2) + >>> end = adatetime(year=2009, month=10) + >>> ts = timespan(start, end) + >>> ts + timespan(adatetime(2009, 2, None, None, None, None, None), adatetime(2009, 10, None, None, None, None, None)) + >>> td.disambiguated(datetime.now()) + timespan(datetime(2009, 2, 28, 0, 0, 0, 0), datetime(2009, 10, 31, 23, 59 ,59, 999999) + """ + + #- If year is in start but not end, use basedate.year for end + #-- If year is in start but not end, but startdate is > basedate, + # use "next " to get end month/year + #- If year is in end but not start, copy year from end to start + #- Support "next february", "last april", etc. 
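+
+        # For example (worked by hand): with basedate datetime(2010, 10, 1),
+        # timespan(adatetime(hour=3), adatetime(hour=5)) disambiguates to
+        # 2010-10-01 03:00:00 .. 2010-10-01 05:59:59.999999, taking the date
+        # from the basedate because neither endpoint specifies one.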
+ + start, end = copy.copy(self.start), copy.copy(self.end) + start_year_was_amb = start.year is None + end_year_was_amb = end.year is None + + if has_no_date(start) and has_no_date(end): + # The start and end points are just times, so use the basedate + # for the date information. + by, bm, bd = basedate.year, basedate.month, basedate.day + start = start.replace(year=by, month=bm, day=bd) + end = end.replace(year=by, month=bm, day=bd) + else: + # If one side has a year and the other doesn't, the decision + # of what year to assign to the ambiguous side is kind of + # arbitrary. I've used a heuristic here based on how the range + # "reads", but it may only be reasonable in English. And maybe + # even just to me. + + if start.year is None and end.year is None: + # No year on either side, use the basedate + start.year = end.year = basedate.year + elif start.year is None: + # No year in the start, use the year from the end + start.year = end.year + elif end.year is None: + end.year = max(start.year, basedate.year) + + if start.year == end.year: + # Once again, if one side has a month and day but the other side + # doesn't, the disambiguation is arbitrary. Does "3 am to 5 am + # tomorrow" mean 3 AM today to 5 AM tomorrow, or 3am tomorrow to + # 5 am tomorrow? What I picked is similar to the year: if the + # end has a month+day and the start doesn't, copy the month+day + # from the end to the start UNLESS that would make the end come + # before the start on that day, in which case use the basedate + # instead. If the start has a month+day and the end doesn't, use + # the basedate. + start_dm = not (start.month is None and start.day is None) + end_dm = not (end.month is None and end.day is None) + if end_dm and not start_dm: + if start.floor().time() > end.ceil().time(): + start.month = basedate.month + start.day = basedate.day + else: + start.month = end.month + start.day = end.day + elif start_dm and not end_dm: + end.month = basedate.month + end.day = basedate.day + + if floor(start).date() > ceil(end).date(): + # If the disambiguated dates are out of order: + # - If no start year was given, reduce the start year to put the + # start before the end + # - If no end year was given, increase the end year to put the end + # after the start + # - If a year was specified for both, just swap the start and end + if start_year_was_amb: + start.year = end.year - 1 + elif end_year_was_amb: + end.year = start.year + 1 + else: + start, end = end, start + + start = floor(start) + end = ceil(end) + + if start.date() == end.date() and start.time() > end.time(): + # If the start and end are on the same day, but the start time + # is after the end time, move the end time to the next day + end += timedelta(days=1) + + return timespan(start, end) + + +# Functions for working with datetime/adatetime objects + +def floor(at): + if isinstance(at, datetime): + return at + return at.floor() + + +def ceil(at): + if isinstance(at, datetime): + return at + return at.ceil() + + +def fill_in(at, basedate, units=adatetime.units): + """Returns a copy of ``at`` with any unspecified (None) units filled in + with values from ``basedate``. + """ + + if isinstance(at, datetime): + return at + + args = {} + for unit in units: + v = getattr(at, unit) + if v is None: + v = getattr(basedate, unit) + args[unit] = v + return fix(adatetime(**args)) + + +def has_no_date(at): + """Returns True if the given object is an ``adatetime`` where ``year``, + ``month``, and ``day`` are all None. 
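+
+    For example::
+
+        >>> has_no_date(adatetime(hour=3))
+        True
+        >>> has_no_date(adatetime(year=2009, hour=3))
+        False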
+ """ + + if isinstance(at, datetime): + return False + return at.year is None and at.month is None and at.day is None + + +def has_no_time(at): + """Returns True if the given object is an ``adatetime`` where ``hour``, + ``minute``, ``second`` and ``microsecond`` are all None. + """ + + if isinstance(at, datetime): + return False + return (at.hour is None and at.minute is None and at.second is None + and at.microsecond is None) + + +def is_ambiguous(at): + """Returns True if the given object is an ``adatetime`` with any of its + attributes equal to None. + """ + + if isinstance(at, datetime): + return False + return any((getattr(at, attr) is None) for attr in adatetime.units) + + +def is_void(at): + """Returns True if the given object is an ``adatetime`` with all of its + attributes equal to None. + """ + + if isinstance(at, datetime): + return False + return all((getattr(at, attr) is None) for attr in adatetime.units) + + +def fix(at): + """If the given object is an ``adatetime`` that is unambiguous (because + all its attributes are specified, that is, not equal to None), returns a + ``datetime`` version of it. Otherwise returns the ``adatetime`` object + unchanged. + """ + + if is_ambiguous(at) or isinstance(at, datetime): + return at + return datetime(year=at.year, month=at.month, day=at.day, hour=at.hour, + minute=at.minute, second=at.second, + microsecond=at.microsecond) diff --git a/src/whoosh/util/varints.py b/src/whoosh/util/varints.py new file mode 100644 index 0000000..148c141 --- /dev/null +++ b/src/whoosh/util/varints.py @@ -0,0 +1,110 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from array import array + +from whoosh.compat import array_tobytes, xrange + + +# Varint cache + +# Build a cache of the varint byte sequences for the first N integers, so we +# don't have to constantly recalculate them on the fly. This makes a small but +# noticeable difference. 
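+#
+# For example (worked out by hand), varint(300) produces the two bytes
+# 0xAC 0x02: the low seven bits (0101100) with the continuation bit set,
+# followed by the remaining bits (0000010).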
+ +def _varint(i): + a = array("B") + while (i & ~0x7F) != 0: + a.append((i & 0x7F) | 0x80) + i = i >> 7 + a.append(i) + return array_tobytes(a) + + +_varint_cache_size = 512 +_varint_cache = [] +for i in xrange(0, _varint_cache_size): + _varint_cache.append(_varint(i)) +_varint_cache = tuple(_varint_cache) + + +def varint(i): + """Encodes the given integer into a string of the minimum number of bytes. + """ + if i < len(_varint_cache): + return _varint_cache[i] + return _varint(i) + + +def varint_to_int(vi): + b = ord(vi[0]) + p = 1 + i = b & 0x7f + shift = 7 + while b & 0x80 != 0: + b = ord(vi[p]) + p += 1 + i |= (b & 0x7F) << shift + shift += 7 + return i + + +def signed_varint(i): + """Zig-zag encodes a signed integer into a varint. + """ + + if i >= 0: + return varint(i << 1) + return varint((i << 1) ^ (~0)) + + +def decode_signed_varint(i): + """Zig-zag decodes an integer value. + """ + + if not i & 1: + return i >> 1 + return (i >> 1) ^ (~0) + + +def read_varint(readfn): + """ + Reads a variable-length encoded integer. + + :param readfn: a callable that reads a given number of bytes, + like file.read(). + """ + + b = ord(readfn(1)) + i = b & 0x7F + + shift = 7 + while b & 0x80 != 0: + b = ord(readfn(1)) + i |= (b & 0x7F) << shift + shift += 7 + return i diff --git a/src/whoosh/util/versions.py b/src/whoosh/util/versions.py new file mode 100644 index 0000000..1fcfe9c --- /dev/null +++ b/src/whoosh/util/versions.py @@ -0,0 +1,165 @@ +# Copyright 2012 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. 
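+
+# Note: the version-parsing regular expression below uses named groups
+# matching the entries of ``_parts`` (major, minor, release, ex, exnum).
+# An illustrative parse, worked out by hand:
+#
+#     >>> SimpleVersion.parse("1.2b3").tuple()
+#     (1, 2, 0, 'b', 3)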
+ +from whoosh.util.text import rcompile + + +class BaseVersion(object): + @classmethod + def parse(cls, text): + obj = cls() + match = cls._version_exp.match(text) + if match: + groupdict = match.groupdict() + for groupname, typ in cls._parts: + v = groupdict.get(groupname) + if v is not None: + setattr(obj, groupname, typ(v)) + return obj + + def __repr__(self): + vs = ", ".join(repr(getattr(self, slot)) for slot in self.__slots__) + return "%s(%s)" % (self.__class__.__name__, vs) + + def tuple(self): + return tuple(getattr(self, slot) for slot in self.__slots__) + + def __eq__(self, other): + if not hasattr(other, "tuple"): + raise ValueError("Can't compare %r with %r" % (self, other)) + return self.tuple() == other.tuple() + + def __lt__(self, other): + if not hasattr(other, "tuple"): + raise ValueError("Can't compare %r with %r" % (self, other)) + return self.tuple() < other.tuple() + + # It's dumb that you have to define these + + def __gt__(self, other): + if not hasattr(other, "tuple"): + raise ValueError("Can't compare %r with %r" % (self, other)) + return self.tuple() > other.tuple() + + def __ge__(self, other): + if not hasattr(other, "tuple"): + raise ValueError("Can't compare %r with %r" % (self, other)) + return self.tuple() >= other.tuple() + + def __le__(self, other): + if not hasattr(other, "tuple"): + raise ValueError("Can't compare %r with %r" % (self, other)) + return self.tuple() <= other.tuple() + + def __ne__(self, other): + if not hasattr(other, "tuple"): + raise ValueError("Can't compare %r with %r" % (self, other)) + return self.tuple() != other.tuple() + + +class SimpleVersion(BaseVersion): + """An object that parses version numbers such as:: + + 12.2.5b + + The filter supports a limited subset of PEP 386 versions including:: + + 1 + 1.2 + 1.2c + 1.2c3 + 1.2.3 + 1.2.3a + 1.2.3b4 + 10.7.5rc1 + 999.999.999c999 + """ + + _version_exp = rcompile(r""" + ^ + (?P\d{1,4}) + ( + [.](?P\d{1,4}) + ( + [.](?P\d{1,4}) + )? + ( + (?P[abc]|rc) + (?P\d{1,4})? + )? + )? + $ + """, verbose=True) + + # (groupid, method, skippable, default) + _parts = [("major", int), + ("minor", int), + ("release", int), + ("ex", str), + ("exnum", int), + ] + + _ex_bits = {"a": 0, "b": 1, "c": 2, "rc": 10, "z": 15} + _bits_ex = dict((v, k) for k, v in _ex_bits.items()) + + __slots__ = ("major", "minor", "release", "ex", "exnum") + + def __init__(self, major=1, minor=0, release=0, ex="z", exnum=0): + self.major = major + self.minor = minor + self.release = release + self.ex = ex + self.exnum = exnum + + def to_int(self): + assert self.major < 1024 + n = self.major << 34 + + assert self.minor < 1024 + n |= self.minor << 24 + + assert self.release < 1024 + n |= self.release << 14 + + exbits = self._ex_bits.get(self.ex, 15) + n |= exbits << 10 + + assert self.exnum < 1024 + n |= self.exnum + + return n + + @classmethod + def from_int(cls, n): + major = (n & (1023 << 34)) >> 34 + minor = (n & (1023 << 24)) >> 24 + release = (n & (1023 << 14)) >> 14 + exbits = (n & (7 << 10)) >> 10 + ex = cls._bits_ex.get(exbits, "z") + exnum = n & 1023 + + return cls(major, minor, release, ex, exnum) diff --git a/src/whoosh/writing.py b/src/whoosh/writing.py new file mode 100644 index 0000000..cc0ad34 --- /dev/null +++ b/src/whoosh/writing.py @@ -0,0 +1,1272 @@ +# Copyright 2007 Matt Chaput. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# The views and conclusions contained in the software and documentation are +# those of the authors and should not be interpreted as representing official +# policies, either expressed or implied, of Matt Chaput. + +from __future__ import with_statement +import threading, time +from bisect import bisect_right +from contextlib import contextmanager + +from whoosh import columns +from whoosh.compat import abstractmethod, bytes_type +from whoosh.externalsort import SortingPool +from whoosh.fields import UnknownFieldError +from whoosh.index import LockError +from whoosh.system import emptybytes +from whoosh.util import fib, random_name +from whoosh.util.filelock import try_for +from whoosh.util.text import utf8encode + + +# Exceptions + +class IndexingError(Exception): + pass + + +# Document grouping context manager + +@contextmanager +def groupmanager(writer): + writer.start_group() + yield + writer.end_group() + + +# Merge policies + +# A merge policy is a callable that takes the Index object, the SegmentWriter +# object, and the current segment list (not including the segment being +# written), and returns an updated segment list (not including the segment +# being written). + +def NO_MERGE(writer, segments): + """This policy does not merge any existing segments. + """ + return segments + + +def MERGE_SMALL(writer, segments): + """This policy merges small segments, where "small" is defined using a + heuristic based on the fibonacci sequence. + """ + + from whoosh.reading import SegmentReader + + unchanged_segments = [] + segments_to_merge = [] + + sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all()) + total_docs = 0 + + merge_point_found = False + for i, seg in enumerate(sorted_segment_list): + count = seg.doc_count_all() + if count > 0: + total_docs += count + + if merge_point_found: # append the remaining to unchanged + unchanged_segments.append(seg) + else: # look for a merge point + segments_to_merge.append((seg, i)) # merge every segment up to the merge point + if i > 3 and total_docs < fib(i + 5): + merge_point_found = True + + if merge_point_found and len(segments_to_merge) > 1: + for seg, i in segments_to_merge: + reader = SegmentReader(writer.storage, writer.schema, seg) + writer.add_reader(reader) + reader.close() + return unchanged_segments + else: + return segments + + +def OPTIMIZE(writer, segments): + """This policy merges all existing segments. 
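+
+    A merge policy is normally handed to the writer when committing; for
+    example (an illustrative call, assuming the writer's ``mergetype``
+    keyword argument)::
+
+        writer.commit(mergetype=OPTIMIZE)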
+ """ + + from whoosh.reading import SegmentReader + + for seg in segments: + reader = SegmentReader(writer.storage, writer.schema, seg) + writer.add_reader(reader) + reader.close() + return [] + + +def CLEAR(writer, segments): + """This policy DELETES all existing segments and only writes the new + segment. + """ + + return [] + + +# Customized sorting pool for postings + +class PostingPool(SortingPool): + # Subclass whoosh.externalsort.SortingPool to use knowledge of + # postings to set run size in bytes instead of items + + namechars = "abcdefghijklmnopqrstuvwxyz0123456789" + + def __init__(self, tempstore, segment, limitmb=128, **kwargs): + SortingPool.__init__(self, **kwargs) + self.tempstore = tempstore + self.segment = segment + self.limit = limitmb * 1024 * 1024 + self.currentsize = 0 + self.fieldnames = set() + + def _new_run(self): + path = "%s.run" % random_name() + f = self.tempstore.create_file(path).raw_file() + return path, f + + def _open_run(self, path): + return self.tempstore.open_file(path).raw_file() + + def _remove_run(self, path): + return self.tempstore.delete_file(path) + + def add(self, item): + # item = (fieldname, tbytes, docnum, weight, vbytes) + assert isinstance(item[1], bytes_type), "tbytes=%r" % item[1] + if item[4] is not None: + assert isinstance(item[4], bytes_type), "vbytes=%r" % item[4] + self.fieldnames.add(item[0]) + size = (28 + 4 * 5 # tuple = 28 + 4 * length + + 21 + len(item[0]) # fieldname = str = 21 + length + + 26 + len(item[1]) * 2 # text = unicode = 26 + 2 * length + + 18 # docnum = long = 18 + + 16 # weight = float = 16 + + 21 + len(item[4] or '')) # valuestring + self.currentsize += size + if self.currentsize > self.limit: + self.save() + self.current.append(item) + + def iter_postings(self): + # This is just an alias for items() to be consistent with the + # iter_postings()/add_postings() interface of a lot of other classes + return self.items() + + def save(self): + SortingPool.save(self) + self.currentsize = 0 + + +# Writer base class + +class IndexWriter(object): + """High-level object for writing to an index. + + To get a writer for a particular index, call + :meth:`~whoosh.index.Index.writer` on the Index object. + + >>> writer = myindex.writer() + + You can use this object as a context manager. If an exception is thrown + from within the context it calls :meth:`~IndexWriter.cancel` to clean up + temporary files, otherwise it calls :meth:`~IndexWriter.commit` when the + context exits. + + >>> with myindex.writer() as w: + ... w.add_document(title="First document", content="Hello there.") + ... 
w.add_document(title="Second document", content="This is easy!") + """ + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type: + self.cancel() + else: + self.commit() + + def group(self): + """Returns a context manager that calls + :meth:`~IndexWriter.start_group` and :meth:`~IndexWriter.end_group` for + you, allowing you to use a ``with`` statement to group hierarchical + documents:: + + with myindex.writer() as w: + with w.group(): + w.add_document(kind="class", name="Accumulator") + w.add_document(kind="method", name="add") + w.add_document(kind="method", name="get_result") + w.add_document(kind="method", name="close") + + with w.group(): + w.add_document(kind="class", name="Calculator") + w.add_document(kind="method", name="add") + w.add_document(kind="method", name="multiply") + w.add_document(kind="method", name="get_result") + w.add_document(kind="method", name="close") + """ + + return groupmanager(self) + + def start_group(self): + """Start indexing a group of hierarchical documents. The backend should + ensure that these documents are all added to the same segment:: + + with myindex.writer() as w: + w.start_group() + w.add_document(kind="class", name="Accumulator") + w.add_document(kind="method", name="add") + w.add_document(kind="method", name="get_result") + w.add_document(kind="method", name="close") + w.end_group() + + w.start_group() + w.add_document(kind="class", name="Calculator") + w.add_document(kind="method", name="add") + w.add_document(kind="method", name="multiply") + w.add_document(kind="method", name="get_result") + w.add_document(kind="method", name="close") + w.end_group() + + A more convenient way to group documents is to use the + :meth:`~IndexWriter.group` method and the ``with`` statement. + """ + + pass + + def end_group(self): + """Finish indexing a group of hierarchical documents. See + :meth:`~IndexWriter.start_group`. + """ + + pass + + def add_field(self, fieldname, fieldtype, **kwargs): + """Adds a field to the index's schema. + + :param fieldname: the name of the field to add. + :param fieldtype: an instantiated :class:`whoosh.fields.FieldType` + object. + """ + + self.schema.add(fieldname, fieldtype, **kwargs) + + def remove_field(self, fieldname, **kwargs): + """Removes the named field from the index's schema. Depending on the + backend implementation, this may or may not actually remove existing + data for the field from the index. Optimizing the index should always + clear out existing data for a removed field. + """ + + self.schema.remove(fieldname, **kwargs) + + @abstractmethod + def reader(self, **kwargs): + """Returns a reader for the existing index. + """ + + raise NotImplementedError + + def searcher(self, **kwargs): + from whoosh.searching import Searcher + + return Searcher(self.reader(), **kwargs) + + def delete_by_term(self, fieldname, text, searcher=None): + """Deletes any documents containing "term" in the "fieldname" field. + This is useful when you have an indexed field containing a unique ID + (such as "pathname") for each document. + + :returns: the number of documents deleted. + """ + + from whoosh.query import Term + + q = Term(fieldname, text) + return self.delete_by_query(q, searcher=searcher) + + def delete_by_query(self, q, searcher=None): + """Deletes any documents matching a query object. + + :returns: the number of documents deleted. 
+ """ + + if searcher: + s = searcher + else: + s = self.searcher() + + try: + count = 0 + for docnum in s.docs_for_query(q, for_deletion=True): + self.delete_document(docnum) + count += 1 + finally: + if not searcher: + s.close() + + return count + + @abstractmethod + def delete_document(self, docnum, delete=True): + """Deletes a document by number. + """ + raise NotImplementedError + + @abstractmethod + def add_document(self, **fields): + """The keyword arguments map field names to the values to index/store:: + + w = myindex.writer() + w.add_document(path=u"/a", title=u"First doc", text=u"Hello") + w.commit() + + Depending on the field type, some fields may take objects other than + unicode strings. For example, NUMERIC fields take numbers, and DATETIME + fields take ``datetime.datetime`` objects:: + + from datetime import datetime, timedelta + from whoosh import index + from whoosh.fields import * + + schema = Schema(date=DATETIME, size=NUMERIC(float), content=TEXT) + myindex = index.create_in("indexdir", schema) + + w = myindex.writer() + w.add_document(date=datetime.now(), size=5.5, content=u"Hello") + w.commit() + + Instead of a single object (i.e., unicode string, number, or datetime), + you can supply a list or tuple of objects. For unicode strings, this + bypasses the field's analyzer. For numbers and dates, this lets you add + multiple values for the given field:: + + date1 = datetime.now() + date2 = datetime(2005, 12, 25) + date3 = datetime(1999, 1, 1) + w.add_document(date=[date1, date2, date3], size=[9.5, 10], + content=[u"alfa", u"bravo", u"charlie"]) + + For fields that are both indexed and stored, you can specify an + alternate value to store using a keyword argument in the form + "_stored_". For example, if you have a field named "title" + and you want to index the text "a b c" but store the text "e f g", use + keyword arguments like this:: + + writer.add_document(title=u"a b c", _stored_title=u"e f g") + + You can boost the weight of all terms in a certain field by specifying + a ``__boost`` keyword argument. For example, if you have a + field named "content", you can double the weight of this document for + searches in the "content" field like this:: + + writer.add_document(content="a b c", _title_boost=2.0) + + You can boost every field at once using the ``_boost`` keyword. For + example, to boost fields "a" and "b" by 2.0, and field "c" by 3.0:: + + writer.add_document(a="alfa", b="bravo", c="charlie", + _boost=2.0, _c_boost=3.0) + + Note that some scoring algroithms, including Whoosh's default BM25F, + do not work with term weights less than 1, so you should generally not + use a boost factor less than 1. + + See also :meth:`Writer.update_document`. + """ + + raise NotImplementedError + + @abstractmethod + def add_reader(self, reader): + raise NotImplementedError + + def _doc_boost(self, fields, default=1.0): + if "_boost" in fields: + return float(fields["_boost"]) + else: + return default + + def _field_boost(self, fields, fieldname, default=1.0): + boostkw = "_%s_boost" % fieldname + if boostkw in fields: + return float(fields[boostkw]) + else: + return default + + def _unique_fields(self, fields): + # Check which of the supplied fields are unique + unique_fields = [name for name, field in self.schema.items() + if name in fields and field.unique] + return unique_fields + + def update_document(self, **fields): + """The keyword arguments map field names to the values to index/store. 
+
+        This method adds a new document to the index, and automatically deletes
+        any documents with the same values in any fields marked "unique" in the
+        schema::
+
+            schema = fields.Schema(path=fields.ID(unique=True, stored=True),
+                                   content=fields.TEXT)
+            myindex = index.create_in("index", schema)
+
+            w = myindex.writer()
+            w.add_document(path=u"/", content=u"Mary had a lamb")
+            w.commit()
+
+            w = myindex.writer()
+            w.update_document(path=u"/", content=u"Mary had a little lamb")
+            w.commit()
+
+            assert myindex.doc_count() == 1
+
+        It is safe to use ``update_document`` in place of ``add_document``; if
+        there is no existing document to replace, it simply does an add.
+
+        You cannot currently pass a list or tuple of values to a "unique"
+        field.
+
+        Because this method has to search for documents with the same unique
+        fields and delete them before adding the new document, it is slower
+        than using ``add_document``.
+
+        * Marking more fields "unique" in the schema will make each
+          ``update_document`` call slightly slower.
+
+        * When you are updating multiple documents, it is faster to batch
+          delete all changed documents and then use ``add_document`` to add
+          the replacements instead of using ``update_document``.
+
+        Note that this method will only replace a *committed* document;
+        currently it cannot replace documents you've added to the IndexWriter
+        but haven't yet committed. For example, if you do this:
+
+        >>> writer.update_document(unique_id=u"1", content=u"Replace me")
+        >>> writer.update_document(unique_id=u"1", content=u"Replacement")
+
+        ...this will add two documents with the same value of ``unique_id``,
+        instead of the second document replacing the first.
+
+        See :meth:`Writer.add_document` for information on
+        ``_stored_<fieldname>``, ``_<fieldname>_boost``, and ``_boost`` keyword
+        arguments.
+        """
+
+        # Delete the set of documents matching the unique terms
+        unique_fields = self._unique_fields(fields)
+        if unique_fields:
+            with self.searcher() as s:
+                uniqueterms = [(name, fields[name]) for name in unique_fields]
+                docs = s._find_unique(uniqueterms)
+                for docnum in docs:
+                    self.delete_document(docnum)
+
+        # Add the given fields
+        self.add_document(**fields)
+
+    def commit(self):
+        """Finishes writing and unlocks the index.
+        """
+        pass
+
+    def cancel(self):
+        """Cancels any documents/deletions added by this object
+        and unlocks the index.
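+
+        This is the method the writer's context manager calls when an
+        exception is raised inside a ``with`` block; on a normal exit the
+        context manager calls :meth:`~IndexWriter.commit` instead.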
+ """ + pass + + +# Codec-based writer + +class SegmentWriter(IndexWriter): + def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True, + limitmb=128, docbase=0, codec=None, compound=True, **kwargs): + # Lock the index + self.writelock = None + if _lk: + self.writelock = ix.lock("WRITELOCK") + if not try_for(self.writelock.acquire, timeout=timeout, + delay=delay): + raise LockError + + if codec is None: + from whoosh.codec import default_codec + codec = default_codec() + self.codec = codec + + # Get info from the index + self.storage = ix.storage + self.indexname = ix.indexname + info = ix._read_toc() + self.generation = info.generation + 1 + self.schema = info.schema + self.segments = info.segments + self.docnum = self.docbase = docbase + self._setup_doc_offsets() + + # Internals + self._tempstorage = self.storage.temp_storage("%s.tmp" % self.indexname) + newsegment = codec.new_segment(self.storage, self.indexname) + self.newsegment = newsegment + self.compound = compound and newsegment.should_assemble() + self.is_closed = False + self._added = False + self.pool = PostingPool(self._tempstorage, self.newsegment, + limitmb=limitmb) + + # Set up writers + self.perdocwriter = codec.per_document_writer(self.storage, newsegment) + self.fieldwriter = codec.field_writer(self.storage, newsegment) + + self.merge = True + self.optimize = False + self.mergetype = None + + def __repr__(self): + return "<%s %r>" % (self.__class__.__name__, self.newsegment) + + def _check_state(self): + if self.is_closed: + raise IndexingError("This writer is closed") + + def _setup_doc_offsets(self): + self._doc_offsets = [] + base = 0 + for s in self.segments: + self._doc_offsets.append(base) + base += s.doc_count_all() + + def _document_segment(self, docnum): + #Returns the index.Segment object containing the given document + #number. + offsets = self._doc_offsets + if len(offsets) == 1: + return 0 + return bisect_right(offsets, docnum) - 1 + + def _segment_and_docnum(self, docnum): + #Returns an (index.Segment, segment_docnum) pair for the segment + #containing the given document number. + + segmentnum = self._document_segment(docnum) + offset = self._doc_offsets[segmentnum] + segment = self.segments[segmentnum] + return segment, docnum - offset + + def _process_posts(self, items, startdoc, docmap): + schema = self.schema + for fieldname, text, docnum, weight, vbytes in items: + if fieldname not in schema: + continue + if docmap is not None: + newdoc = docmap[docnum] + else: + newdoc = startdoc + docnum + + yield (fieldname, text, newdoc, weight, vbytes) + + def temp_storage(self): + return self._tempstorage + + def add_field(self, fieldname, fieldspec, **kwargs): + self._check_state() + if self._added: + raise Exception("Can't modify schema after adding data to writer") + super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs) + + def remove_field(self, fieldname): + self._check_state() + if self._added: + raise Exception("Can't modify schema after adding data to writer") + super(SegmentWriter, self).remove_field(fieldname) + + def has_deletions(self): + """ + Returns True if the current index has documents that are marked deleted + but haven't been optimized out of the index yet. 
+ """ + + return any(s.has_deletions() for s in self.segments) + + def delete_document(self, docnum, delete=True): + self._check_state() + if docnum >= sum(seg.doc_count_all() for seg in self.segments): + raise IndexingError("No document ID %r in this index" % docnum) + segment, segdocnum = self._segment_and_docnum(docnum) + segment.delete_document(segdocnum, delete=delete) + + def deleted_count(self): + """ + :returns: the total number of deleted documents in the index. + """ + + return sum(s.deleted_count() for s in self.segments) + + def is_deleted(self, docnum): + segment, segdocnum = self._segment_and_docnum(docnum) + return segment.is_deleted(segdocnum) + + def reader(self, reuse=None): + from whoosh.index import FileIndex + + self._check_state() + return FileIndex._reader(self.storage, self.schema, self.segments, + self.generation, reuse=reuse) + + def iter_postings(self): + return self.pool.iter_postings() + + def add_postings_to_pool(self, reader, startdoc, docmap): + items = self._process_posts(reader.iter_postings(), startdoc, docmap) + add_post = self.pool.add + for item in items: + add_post(item) + + def write_postings(self, lengths, items, startdoc, docmap): + items = self._process_posts(items, startdoc, docmap) + self.fieldwriter.add_postings(self.schema, lengths, items) + + def write_per_doc(self, fieldnames, reader): + # Very bad hack: reader should be an IndexReader, but may be a + # PerDocumentReader if this is called from multiproc, where the code + # tries to be efficient by merging per-doc and terms separately. + # TODO: fix this! + + schema = self.schema + if reader.has_deletions(): + docmap = {} + else: + docmap = None + + pdw = self.perdocwriter + # Open all column readers + cols = {} + for fieldname in fieldnames: + fieldobj = schema[fieldname] + coltype = fieldobj.column_type + if coltype and reader.has_column(fieldname): + creader = reader.column_reader(fieldname, coltype) + if isinstance(creader, columns.TranslatingColumnReader): + creader = creader.raw_column() + cols[fieldname] = creader + + for docnum, stored in reader.iter_docs(): + if docmap is not None: + docmap[docnum] = self.docnum + + pdw.start_doc(self.docnum) + for fieldname in fieldnames: + fieldobj = schema[fieldname] + length = reader.doc_field_length(docnum, fieldname) + pdw.add_field(fieldname, fieldobj, + stored.get(fieldname), length) + + if fieldobj.vector and reader.has_vector(docnum, fieldname): + v = reader.vector(docnum, fieldname, fieldobj.vector) + pdw.add_vector_matcher(fieldname, fieldobj, v) + + if fieldname in cols: + cv = cols[fieldname][docnum] + pdw.add_column_value(fieldname, fieldobj.column_type, cv) + + pdw.finish_doc() + self.docnum += 1 + + return docmap + + def add_reader(self, reader): + self._check_state() + basedoc = self.docnum + ndxnames = set(fname for fname in reader.indexed_field_names() + if fname in self.schema) + fieldnames = set(self.schema.names()) | ndxnames + + docmap = self.write_per_doc(fieldnames, reader) + self.add_postings_to_pool(reader, basedoc, docmap) + self._added = True + + def _check_fields(self, schema, fieldnames): + # Check if the caller gave us a bogus field + for name in fieldnames: + if name not in schema: + raise UnknownFieldError("No field named %r in %s" + % (name, schema)) + + def add_document(self, **fields): + self._check_state() + perdocwriter = self.perdocwriter + schema = self.schema + docnum = self.docnum + add_post = self.pool.add + + docboost = self._doc_boost(fields) + fieldnames = sorted([name for name in fields.keys() + if not 
name.startswith("_")]) + self._check_fields(schema, fieldnames) + + perdocwriter.start_doc(docnum) + for fieldname in fieldnames: + value = fields.get(fieldname) + if value is None: + continue + field = schema[fieldname] + + length = 0 + if field.indexed: + # TODO: Method for adding progressive field values, ie + # setting start_pos/start_char? + fieldboost = self._field_boost(fields, fieldname, docboost) + # Ask the field to return a list of (text, weight, vbytes) + # tuples + items = field.index(value) + # Only store the length if the field is marked scorable + scorable = field.scorable + # Add the terms to the pool + for tbytes, freq, weight, vbytes in items: + weight *= fieldboost + if scorable: + length += freq + add_post((fieldname, tbytes, docnum, weight, vbytes)) + + if field.separate_spelling(): + spellfield = field.spelling_fieldname(fieldname) + for word in field.spellable_words(value): + word = utf8encode(word)[0] + # item = (fieldname, tbytes, docnum, weight, vbytes) + add_post((spellfield, word, 0, 1, vbytes)) + + vformat = field.vector + if vformat: + analyzer = field.analyzer + # Call the format's word_values method to get posting values + vitems = vformat.word_values(value, analyzer, mode="index") + # Remove unused frequency field from the tuple + vitems = sorted((text, weight, vbytes) + for text, _, weight, vbytes in vitems) + perdocwriter.add_vector_items(fieldname, field, vitems) + + # Allow a custom value for stored field/column + customval = fields.get("_stored_%s" % fieldname, value) + + # Add the stored value and length for this field to the per- + # document writer + sv = customval if field.stored else None + perdocwriter.add_field(fieldname, field, sv, length) + + column = field.column_type + if column and customval is not None: + cv = field.to_column_value(customval) + perdocwriter.add_column_value(fieldname, column, cv) + + perdocwriter.finish_doc() + self._added = True + self.docnum += 1 + + def doc_count(self): + return self.docnum - self.docbase + + def get_segment(self): + newsegment = self.newsegment + newsegment.set_doc_count(self.docnum) + return newsegment + + def per_document_reader(self): + if not self.perdocwriter.is_closed: + raise Exception("Per-doc writer is still open") + return self.codec.per_document_reader(self.storage, self.get_segment()) + + # The following methods break out the commit functionality into smaller + # pieces to allow MpWriter to call them individually + + def _merge_segments(self, mergetype, optimize, merge): + # The writer supports two ways of setting mergetype/optimize/merge: + # as attributes or as keyword arguments to commit(). Originally there + # were just the keyword arguments, but then I added the ability to use + # the writer as a context manager using "with", so the user no longer + # explicitly called commit(), hence the attributes + mergetype = mergetype if mergetype is not None else self.mergetype + optimize = optimize if optimize is not None else self.optimize + merge = merge if merge is not None else self.merge + + if mergetype: + pass + elif optimize: + mergetype = OPTIMIZE + elif not merge: + mergetype = NO_MERGE + else: + mergetype = MERGE_SMALL + + # Call the merge policy function. 
The policy may choose to merge + # other segments into this writer's pool + return mergetype(self, self.segments) + + def _flush_segment(self): + self.perdocwriter.close() + if self.codec.length_stats: + pdr = self.per_document_reader() + else: + pdr = None + postings = self.pool.iter_postings() + self.fieldwriter.add_postings(self.schema, pdr, postings) + self.fieldwriter.close() + if pdr: + pdr.close() + + def _close_segment(self): + if not self.perdocwriter.is_closed: + self.perdocwriter.close() + if not self.fieldwriter.is_closed: + self.fieldwriter.close() + self.pool.cleanup() + + def _assemble_segment(self): + if self.compound: + # Assemble the segment files into a compound file + newsegment = self.get_segment() + newsegment.create_compound_file(self.storage) + newsegment.compound = True + + def _partial_segment(self): + # For use by a parent multiprocessing writer: Closes out the segment + # but leaves the pool files intact so the parent can access them + self._check_state() + self.perdocwriter.close() + self.fieldwriter.close() + # Don't call self.pool.cleanup()! We want to grab the pool files. + return self.get_segment() + + def _finalize_segment(self): + # Finish writing segment + self._flush_segment() + # Close segment files + self._close_segment() + # Assemble compound segment if necessary + self._assemble_segment() + + return self.get_segment() + + def _commit_toc(self, segments): + from whoosh.index import TOC, clean_files + + # Write a new TOC with the new segment list (and delete old files) + toc = TOC(self.schema, segments, self.generation) + toc.write(self.storage, self.indexname) + # Delete leftover files + clean_files(self.storage, self.indexname, self.generation, segments) + + def _finish(self): + self._tempstorage.destroy() + if self.writelock: + self.writelock.release() + self.is_closed = True + #self.storage.close() + + # Finalization methods + + def commit(self, mergetype=None, optimize=None, merge=None): + """Finishes writing and saves all additions and changes to disk. + + There are four possible ways to use this method:: + + # Merge small segments but leave large segments, trying to + # balance fast commits with fast searching: + writer.commit() + + # Merge all segments into a single segment: + writer.commit(optimize=True) + + # Don't merge any existing segments: + writer.commit(merge=False) + + # Use a custom merge function + writer.commit(mergetype=my_merge_function) + + :param mergetype: a custom merge function taking a Writer object and + segment list as arguments, and returning a new segment list. If you + supply a ``mergetype`` function, the values of the ``optimize`` and + ``merge`` arguments are ignored. + :param optimize: if True, all existing segments are merged with the + documents you've added to this writer (and the value of the + ``merge`` argument is ignored). + :param merge: if False, do not merge small segments. 
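+
+        Note that committing closes this writer; attempting to add more
+        documents afterwards raises :class:`IndexingError`. Open a new writer
+        from the index to make further changes.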
+ """ + + self._check_state() + # Merge old segments if necessary + finalsegments = self._merge_segments(mergetype, optimize, merge) + if self._added: + # Flush the current segment being written and add it to the + # list of remaining segments returned by the merge policy + # function + finalsegments.append(self._finalize_segment()) + else: + # Close segment files + self._close_segment() + # Write TOC + self._commit_toc(finalsegments) + + # Final cleanup + self._finish() + + def cancel(self): + self._check_state() + self._close_segment() + self._finish() + + +# Writer wrappers + +class AsyncWriter(threading.Thread, IndexWriter): + """Convenience wrapper for a writer object that might fail due to locking + (i.e. the ``filedb`` writer). This object will attempt once to obtain the + underlying writer, and if it's successful, will simply pass method calls on + to it. + + If this object *can't* obtain a writer immediately, it will *buffer* + delete, add, and update method calls in memory until you call ``commit()``. + At that point, this object will start running in a separate thread, trying + to obtain the writer over and over, and once it obtains it, "replay" all + the buffered method calls on it. + + In a typical scenario where you're adding a single or a few documents to + the index as the result of a Web transaction, this lets you just create the + writer, add, and commit, without having to worry about index locks, + retries, etc. + + For example, to get an aynchronous writer, instead of this: + + >>> writer = myindex.writer() + + Do this: + + >>> from whoosh.writing import AsyncWriter + >>> writer = AsyncWriter(myindex) + """ + + def __init__(self, index, delay=0.25, writerargs=None): + """ + :param index: the :class:`whoosh.index.Index` to write to. + :param delay: the delay (in seconds) between attempts to instantiate + the actual writer. + :param writerargs: an optional dictionary specifying keyword arguments + to to be passed to the index's ``writer()`` method. 
+ """ + + threading.Thread.__init__(self) + self.running = False + self.index = index + self.writerargs = writerargs or {} + self.delay = delay + self.events = [] + try: + self.writer = self.index.writer(**self.writerargs) + except LockError: + self.writer = None + + def reader(self): + return self.index.reader() + + def searcher(self, **kwargs): + from whoosh.searching import Searcher + return Searcher(self.reader(), fromindex=self.index, **kwargs) + + def _record(self, method, args, kwargs): + if self.writer: + getattr(self.writer, method)(*args, **kwargs) + else: + self.events.append((method, args, kwargs)) + + def run(self): + self.running = True + writer = self.writer + while writer is None: + try: + writer = self.index.writer(**self.writerargs) + except LockError: + time.sleep(self.delay) + for method, args, kwargs in self.events: + getattr(writer, method)(*args, **kwargs) + writer.commit(*self.commitargs, **self.commitkwargs) + + def delete_document(self, *args, **kwargs): + self._record("delete_document", args, kwargs) + + def add_document(self, *args, **kwargs): + self._record("add_document", args, kwargs) + + def update_document(self, *args, **kwargs): + self._record("update_document", args, kwargs) + + def add_field(self, *args, **kwargs): + self._record("add_field", args, kwargs) + + def remove_field(self, *args, **kwargs): + self._record("remove_field", args, kwargs) + + def delete_by_term(self, *args, **kwargs): + self._record("delete_by_term", args, kwargs) + + def commit(self, *args, **kwargs): + if self.writer: + self.writer.commit(*args, **kwargs) + else: + self.commitargs, self.commitkwargs = args, kwargs + self.start() + + def cancel(self, *args, **kwargs): + if self.writer: + self.writer.cancel(*args, **kwargs) + + +# Ex post factor functions + +def add_spelling(ix, fieldnames, commit=True): + """Adds spelling files to an existing index that was created without + them, and modifies the schema so the given fields have the ``spelling`` + attribute. Only works on filedb indexes. + + >>> ix = index.open_dir("testindex") + >>> add_spelling(ix, ["content", "tags"]) + + :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object. + :param fieldnames: a list of field names to create word graphs for. + :param force: if True, overwrites existing word graph files. This is only + useful for debugging. + """ + + from whoosh.automata import fst + from whoosh.reading import SegmentReader + + writer = ix.writer() + storage = writer.storage + schema = writer.schema + segments = writer.segments + + for segment in segments: + ext = segment.codec().FST_EXT + + r = SegmentReader(storage, schema, segment) + f = segment.create_file(storage, ext) + gw = fst.GraphWriter(f) + for fieldname in fieldnames: + gw.start_field(fieldname) + for word in r.lexicon(fieldname): + gw.insert(word) + gw.finish_field() + gw.close() + + for fieldname in fieldnames: + schema[fieldname].spelling = True + + if commit: + writer.commit(merge=False) + + +# Buffered writer class + +class BufferedWriter(IndexWriter): + """Convenience class that acts like a writer but buffers added documents + before dumping the buffered documents as a batch into the actual index. + + In scenarios where you are continuously adding single documents very + rapidly (for example a web application where lots of users are adding + content simultaneously), using a BufferedWriter is *much* faster than + opening and committing a writer for each document you add. 
If you're adding + batches of documents at a time, you can just use a regular writer. + + (This class may also be useful for batches of ``update_document`` calls. In + a normal writer, ``update_document`` calls cannot update documents you've + added *in that writer*. With ``BufferedWriter``, this will work.) + + To use this class, create it from your index and *keep it open*, sharing + it between threads. + + >>> from whoosh.writing import BufferedWriter + >>> writer = BufferedWriter(myindex, period=120, limit=20) + >>> # Then you can use the writer to add and update documents + >>> writer.add_document(...) + >>> writer.add_document(...) + >>> writer.add_document(...) + >>> # Before the writer goes out of scope, call close() on it + >>> writer.close() + + .. note:: + This object stores documents in memory and may keep an underlying + writer open, so you must explicitly call the + :meth:`~BufferedWriter.close` method on this object before it goes out + of scope to release the write lock and make sure any uncommitted + changes are saved. + + You can read/search the combination of the on-disk index and the + buffered documents in memory by calling ``BufferedWriter.reader()`` or + ``BufferedWriter.searcher()``. This allows quasi-real-time search, where + documents are available for searching as soon as they are buffered in + memory, before they are committed to disk. + + .. tip:: + By using a searcher from the shared writer, multiple *threads* can + search the buffered documents. Of course, other *processes* will only + see the documents that have been written to disk. If you want indexed + documents to become available to other processes as soon as possible, + you have to use a traditional writer instead of a ``BufferedWriter``. + + You can control how often the ``BufferedWriter`` flushes the in-memory + index to disk using the ``period`` and ``limit`` arguments. ``period`` is + the maximum number of seconds between commits. ``limit`` is the maximum + number of additions to buffer between commits. + + You don't need to call ``commit()`` on the ``BufferedWriter`` manually. + Doing so will just flush the buffered documents to disk early. You can + continue to make changes after calling ``commit()``, and you can call + ``commit()`` multiple times. + """ + + def __init__(self, index, period=60, limit=10, writerargs=None, + commitargs=None): + """ + :param index: the :class:`whoosh.index.Index` to write to. + :param period: the maximum amount of time (in seconds) between commits. + Set this to ``0`` or ``None`` to not use a timer. Do not set this + any lower than a few seconds. + :param limit: the maximum number of documents to buffer before + committing. + :param writerargs: dictionary specifying keyword arguments to be passed + to the index's ``writer()`` method when creating a writer. 
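+        :param commitargs: an optional dictionary specifying keyword arguments
+            to be passed to the underlying writer's ``commit()`` method when
+            the buffered documents are flushed to disk.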
+ """ + + self.index = index + self.period = period + self.limit = limit + self.writerargs = writerargs or {} + self.commitargs = commitargs or {} + + self.lock = threading.RLock() + self.writer = self.index.writer(**self.writerargs) + + self._make_ram_index() + self.bufferedcount = 0 + + # Start timer + if self.period: + self.timer = threading.Timer(self.period, self.commit) + self.timer.start() + + def _make_ram_index(self): + from whoosh.codec.memory import MemoryCodec + + self.codec = MemoryCodec() + + def _get_ram_reader(self): + return self.codec.reader(self.schema) + + @property + def schema(self): + return self.writer.schema + + def reader(self, **kwargs): + from whoosh.reading import MultiReader + + reader = self.writer.reader() + with self.lock: + ramreader = self._get_ram_reader() + + # If there are in-memory docs, combine the readers + if ramreader.doc_count(): + if reader.is_atomic(): + reader = MultiReader([reader, ramreader]) + else: + reader.add_reader(ramreader) + + return reader + + def searcher(self, **kwargs): + from whoosh.searching import Searcher + + return Searcher(self.reader(), fromindex=self.index, **kwargs) + + def close(self): + self.commit(restart=False) + + def commit(self, restart=True): + if self.period: + self.timer.cancel() + + with self.lock: + ramreader = self._get_ram_reader() + self._make_ram_index() + + if self.bufferedcount: + self.writer.add_reader(ramreader) + self.writer.commit(**self.commitargs) + self.bufferedcount = 0 + + if restart: + self.writer = self.index.writer(**self.writerargs) + if self.period: + self.timer = threading.Timer(self.period, self.commit) + self.timer.start() + + def add_reader(self, reader): + # Pass through to the underlying on-disk index + self.writer.add_reader(reader) + self.commit() + + def add_document(self, **fields): + with self.lock: + # Hijack a writer to make the calls into the codec + with self.codec.writer(self.writer.schema) as w: + w.add_document(**fields) + + self.bufferedcount += 1 + if self.bufferedcount >= self.limit: + self.commit() + + def update_document(self, **fields): + with self.lock: + IndexWriter.update_document(self, **fields) + + def delete_document(self, docnum, delete=True): + with self.lock: + base = self.index.doc_count_all() + if docnum < base: + self.writer.delete_document(docnum, delete=delete) + else: + ramsegment = self.codec.segment + ramsegment.delete_document(docnum - base, delete=delete) + + def is_deleted(self, docnum): + base = self.index.doc_count_all() + if docnum < base: + return self.writer.is_deleted(docnum) + else: + return self._get_ram_reader().is_deleted(docnum - base) + + +# Backwards compatibility with old name +BatchWriter = BufferedWriter diff --git a/tests/test_analysis.py b/tests/test_analysis.py new file mode 100644 index 0000000..c46a70d --- /dev/null +++ b/tests/test_analysis.py @@ -0,0 +1,532 @@ +# coding=utf-8 + +from __future__ import with_statement + +import pytest + +from whoosh import analysis, fields, qparser +from whoosh.compat import b, u, unichr +from whoosh.compat import dumps +from whoosh.filedb.filestore import RamStorage + + +def test_regextokenizer(): + value = u("AAAaaaBBBbbbCCCcccDDDddd") + + rex = analysis.RegexTokenizer("[A-Z]+") + assert [t.text for t in rex(value)] == ["AAA", "BBB", "CCC", "DDD"] + + rex = analysis.RegexTokenizer("[A-Z]+", gaps=True) + assert [t.text for t in rex(value)] == ["aaa", "bbb", "ccc", "ddd"] + + +def test_path_tokenizer(): + value = u("/alfa/bravo/charlie/delta/") + pt = analysis.PathTokenizer() + assert [t.text 
for t in pt(value)] == ["/alfa", "/alfa/bravo", + "/alfa/bravo/charlie", + "/alfa/bravo/charlie/delta"] + + +def test_path_tokenizer2(): + path_field = fields.TEXT(analyzer=analysis.PathTokenizer()) + st = RamStorage() + schema = fields.Schema(path=path_field) + index = st.create_index(schema) + + with index.writer() as writer: + writer.add_document(path=u('/alfa/brvo/charlie/delta/')) + writer.add_document(path=u('/home/user/file.txt')) + assert not index.is_empty() + + with index.reader() as reader: + items = list(reader.all_terms()) + assert 'path' in [field for field, value in items] + assert b('/alfa') in [value for field, value in items] + + +def test_composition1(): + ca = analysis.RegexTokenizer() | analysis.LowercaseFilter() + assert ca.__class__.__name__ == "CompositeAnalyzer" + assert ca[0].__class__.__name__ == "RegexTokenizer" + assert ca[1].__class__.__name__ == "LowercaseFilter" + assert [t.text for t in ca(u("ABC 123"))] == ["abc", "123"] + + +def test_composition2(): + ca = analysis.RegexTokenizer() | analysis.LowercaseFilter() + sa = ca | analysis.StopFilter() + assert len(sa), 3 + assert sa.__class__.__name__ == "CompositeAnalyzer" + assert sa[0].__class__.__name__ == "RegexTokenizer" + assert sa[1].__class__.__name__ == "LowercaseFilter" + assert sa[2].__class__.__name__ == "StopFilter" + assert [t.text for t in sa(u("The ABC 123"))], ["abc", "123"] + + +def test_composition3(): + sa = analysis.RegexTokenizer() | analysis.StopFilter() + assert sa.__class__.__name__ == "CompositeAnalyzer" + + +def test_composing_functions(): + tokenizer = analysis.RegexTokenizer() + + def filter(tokens): + for t in tokens: + t.text = t.text.upper() + yield t + + with pytest.raises(TypeError): + tokenizer | filter + + +def test_shared_composition(): + shared = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter() + + ana1 = shared | analysis.NgramFilter(3) + ana2 = shared | analysis.DoubleMetaphoneFilter() + + assert [t.text for t in ana1(u("hello"))] == ["hel", "ell", "llo"] + assert [t.text for t in ana2(u("hello"))] == ["HL"] + + +def test_multifilter(): + f1 = analysis.LowercaseFilter() + f2 = analysis.PassFilter() + mf = analysis.MultiFilter(a=f1, b=f2) + ana = analysis.RegexTokenizer(r"\S+") | mf + text = u("ALFA BRAVO CHARLIE") + assert [t.text for t in ana(text, mode="a")] == ["alfa", "bravo", "charlie"] + assert [t.text for t in ana(text, mode="b")] == ["ALFA", "BRAVO", "CHARLIE"] + + +def test_tee_filter(): + target = u("Alfa Bravo Charlie") + f1 = analysis.LowercaseFilter() + f2 = analysis.ReverseTextFilter() + ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2) + result = " ".join([t.text for t in ana(target)]) + assert result == "alfa aflA bravo ovarB charlie eilrahC" + + class ucfilter(analysis.Filter): + def __call__(self, tokens): + for t in tokens: + t.text = t.text.upper() + yield t + + f2 = analysis.ReverseTextFilter() | ucfilter() + ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2) + result = " ".join([t.text for t in ana(target)]) + assert result == "alfa AFLA bravo OVARB charlie EILRAHC" + + f1 = analysis.PassFilter() + f2 = analysis.BiWordFilter() + ana = (analysis.RegexTokenizer(r"\S+") + | analysis.TeeFilter(f1, f2) + | analysis.LowercaseFilter()) + result = " ".join([t.text for t in ana(target)]) + assert result == "alfa alfa-bravo bravo bravo-charlie charlie" + + +def test_intraword(): + iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True) + ana = analysis.RegexTokenizer(r"\S+") | iwf + + def check(text, ls): + 
assert [(t.pos, t.text) for t in ana(text)] == ls + + check(u("PowerShot)"), [(0, "Power"), (1, "Shot"), (1, "PowerShot")]) + check(u("A's+B's&C's"), [(0, "A"), (1, "B"), (2, "C"), (2, "ABC")]) + check(u("Super-Duper-XL500-42-AutoCoder!"), + [(0, "Super"), (1, "Duper"), (2, "XL"), (2, "SuperDuperXL"), + (3, "500"), (4, "42"), (4, "50042"), (5, "Auto"), (6, "Coder"), + (6, "AutoCoder")]) + + +def test_intraword_chars(): + iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True) + ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter() + + target = u("WiKiWo-rd") + tokens = [(t.text, t.startchar, t.endchar) + for t in ana(target, chars=True)] + assert tokens == [("wi", 0, 2), ("ki", 2, 4), ("wo", 4, 6), + ("rd", 7, 9), ("wikiword", 0, 9)] + + target = u("Zo WiKiWo-rd") + tokens = [(t.text, t.startchar, t.endchar) + for t in ana(target, chars=True)] + assert tokens == [("zo", 0, 2), ("wi", 3, 5), ("ki", 5, 7), + ("wo", 7, 9), ("rd", 10, 12), ("wikiword", 3, 12)] + + +def test_intraword_possessive(): + iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True) + ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter() + + target = u("O'Malley's-Bar") + tokens = [(t.text, t.startchar, t.endchar) + for t in ana(target, chars=True)] + assert tokens == [("o", 0, 1), ("malley", 2, 8), ("bar", 11, 14), + ("omalleybar", 0, 14)] + + +def test_word_segments(): + wordset = set(u("alfa bravo charlie delta").split()) + + cwf = analysis.CompoundWordFilter(wordset, keep_compound=True) + ana = analysis.RegexTokenizer(r"\S+") | cwf + target = u("alfacharlie bravodelta delto bravo subalfa") + tokens = [t.text for t in ana(target)] + assert tokens == ["alfacharlie", "alfa", "charlie", "bravodelta", + "bravo", "delta", "delto", "bravo", "subalfa"] + + cwf = analysis.CompoundWordFilter(wordset, keep_compound=False) + ana = analysis.RegexTokenizer(r"\S+") | cwf + target = u("alfacharlie bravodelta delto bravo subalfa") + tokens = [t.text for t in ana(target)] + assert tokens == ["alfa", "charlie", "bravo", "delta", "delto", "bravo", + "subalfa"] + + +def test_biword(): + ana = analysis.RegexTokenizer(r"\w+") | analysis.BiWordFilter() + result = [t.copy() for t + in ana(u("the sign of four"), chars=True, positions=True)] + assert ["the-sign", "sign-of", "of-four"] == [t.text for t in result] + assert [(0, 8), (4, 11), (9, 16)] == [(t.startchar, t.endchar) + for t in result] + assert [0, 1, 2] == [t.pos for t in result] + + result = [t.copy() for t in ana(u("single"))] + assert len(result) == 1 + assert result[0].text == "single" + + +def test_shingles(): + ana = analysis.RegexTokenizer(r"\w+") | analysis.ShingleFilter(3, " ") + source = u("better a witty fool than a foolish wit") + results = [t.copy() for t in ana(source, positions=True, chars=True)] + assert [t.text for t in results] == [u('better a witty'), u('a witty fool'), + u('witty fool than'), u('fool than a'), + u('than a foolish'), + u('a foolish wit')] + assert [t.pos for t in results] == list(range(len(results))) + for t in results: + assert t.text == source[t.startchar:t.endchar] + + +def test_unicode_blocks(): + from whoosh.support.unicode import blocks, blockname, blocknum + + assert blockname(u('a')) == 'Basic Latin' + assert blockname(unichr(0x0b80)) == 'Tamil' + assert blockname(unichr(2048)) is None + assert blocknum(u('a')) == 0 + assert blocknum(unichr(0x0b80)) == 22 + assert blocknum(unichr(2048)) is None + assert blocknum(u('a')) == blocks.Basic_Latin # @UndefinedVariable + assert 
blocknum(unichr(0x0b80)) == blocks.Tamil # @UndefinedVariable + + +def test_double_metaphone(): + from whoosh.lang.dmetaphone import double_metaphone + + names = {'maurice': ('MRS', None), + 'aubrey': ('APR', None), + 'cambrillo': ('KMPRL', 'KMPR'), + 'heidi': ('HT', None), + 'katherine': ('K0RN', 'KTRN'), + 'Thumbail': ('0MPL', 'TMPL'), + 'catherine': ('K0RN', 'KTRN'), + 'richard': ('RXRT', 'RKRT'), + 'bob': ('PP', None), + 'eric': ('ARK', None), + 'geoff': ('JF', 'KF'), + 'Through': ('0R', 'TR'), + 'Schwein': ('XN', 'XFN'), + 'dave': ('TF', None), + 'ray': ('R', None), + 'steven': ('STFN', None), + 'bryce': ('PRS', None), + 'randy': ('RNT', None), + 'bryan': ('PRN', None), + 'Rapelje': ('RPL', None), + 'brian': ('PRN', None), + 'otto': ('AT', None), + 'auto': ('AT', None), + 'Dallas': ('TLS', None), + 'maisey': ('MS', None), + 'zhang': ('JNK', None), + 'Chile': ('XL', None), + 'Jose': ('HS', None), + 'Arnow': ('ARN', 'ARNF'), + 'solilijs': ('SLLS', None), + 'Parachute': ('PRKT', None), + 'Nowhere': ('NR', None), + 'Tux': ('TKS', None)} + + dmn = name = None + for name in names.keys(): + dmn = double_metaphone(name) + assert dmn == names[name] + + mf = (analysis.RegexTokenizer() + | analysis.LowercaseFilter() + | analysis.DoubleMetaphoneFilter()) + results = [(t.text, t.boost) for t in mf(u("Spruce View"))] + assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)] + + mf = (analysis.RegexTokenizer() + | analysis.LowercaseFilter() + | analysis.DoubleMetaphoneFilter(combine=True)) + results = [(t.text, t.boost) for t in mf(u("Spruce View"))] + assert results == [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0), + ('F', 1.0), ('FF', 0.5)] + + namefield = fields.TEXT(analyzer=mf) + texts = list(namefield.process_text(u("Spruce View"), mode="query")) + assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF'] + + +def test_substitution(): + mf = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("-", "") + assert ([t.text for t in mf(u("one-two th-re-ee four"))] + == ["onetwo", "threee", "four"]) + + mf = (analysis.RegexTokenizer(r"\S+") + | analysis.SubstitutionFilter("([^=]*)=(.*)", r"\2=\1")) + assert [t.text for t in mf(u("a=b c=d ef"))] == ["b=a", "d=c", "ef"] + + +def test_delimited_attribute(): + ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() + results = [(t.text, t.boost) for t in ana(u("image render^2 file^0.5"))] + assert results == [("image", 1.0), ("render", 2.0), ("file", 0.5)] + + +def test_porter2(): + from whoosh.lang.porter2 import stem + + plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', + 'died', 'agreed', 'owned', 'humbled', 'sized', + 'meeting', 'stating', 'siezing', 'itemization', + 'sensational', 'traditional', 'reference', 'colonizer', + 'plotted'] + singles = [stem(w) for w in plurals] + + assert singles == ['caress', 'fli', 'die', 'mule', 'deni', 'die', + 'agre', 'own', 'humbl', 'size', 'meet', 'state', + 'siez', 'item', 'sensat', 'tradit', 'refer', + 'colon', 'plot'] + assert stem("bill's") == "bill" + assert stem("y's") == "y" + + +#def test_pystemmer(): +# Stemmer = pytest.importorskip("Stemmer") +# +# ana = (analysis.RegexTokenizer() +# | analysis.LowercaseFilter() +# | analysis.PyStemmerFilter()) +# schema = fields.Schema(text=fields.TEXT(analyzer=ana)) +# st = RamStorage() +# +# ix = st.create_index(schema) +# with ix.writer() as w: +# w.add_document(text=u("rains falling strangely")) +# +# ix = st.open_index() +# with ix.writer() as w: +# w.add_document(text=u("pains stalling strongly")) +# +# ix = 
st.open_index() +# with ix.reader() as r: +# assert (list(r.field_terms("text")) +# == ["fall", "pain", "rain", "stall", "strang", "strong"]) + + +def test_url(): + sample = u("Visit http://bitbucket.org/mchaput/whoosh or " + + "urn:isbn:5930502 or http://www.apple.com/.") + + anas = [analysis.SimpleAnalyzer(analysis.url_pattern), + analysis.StandardAnalyzer(analysis.url_pattern, stoplist=None)] + for ana in anas: + ts = [t.text for t in ana(sample)] + assert ts == [u('visit'), u('http://bitbucket.org/mchaput/whoosh'), + u('or'), u('urn:isbn:5930502'), u('or'), + u('http://www.apple.com/')] + + +def test_name_field(): + ana = (analysis.RegexTokenizer(r"\S+") + | analysis.LowercaseFilter() + | analysis.DoubleMetaphoneFilter(combine=True)) + namefield = fields.TEXT(analyzer=ana, multitoken_query="or") + schema = fields.Schema(id=fields.STORED, name=namefield) + + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=u("one"), name=u("Leif Ericson")) + w.commit() + + s = ix.searcher() + qp = qparser.QueryParser("name", schema) + q = qp.parse(u("leaf eriksen"), normalize=False) + r = s.search(q) + assert len(r) == 1 + + +def test_start_pos(): + from whoosh import formats + ana = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter() + kw = {"positions": True} + tks = formats.tokens(u("alfa bravo charlie delta"), ana, kw) + assert [t.pos for t in tks] == [0, 1, 2, 3] + + kw["start_pos"] = 3 + ts = [t.copy() for t in formats.tokens(u("A B C D").split(), ana, kw)] + assert " ".join([t.text for t in ts]) == "A B C D" + assert [t.pos for t in ts] == [3, 4, 5, 6] + + +def test_frowny_face(): + # See https://bitbucket.org/mchaput/whoosh/issue/166/ + ana = analysis.RegexTokenizer(r"\S+") | analysis.IntraWordFilter() + # text is all delimiters + tokens = [t.text for t in ana(u(":-("))] + assert tokens == [] + + # text has consecutive delimiters + tokens = [t.text for t in ana(u("LOL:)"))] + assert tokens == ["LOL"] + + +def test_ngrams(): + s = u("abcdefg h ij klm") + tk = analysis.RegexTokenizer(r"\S+") + + def dotest(f): + ana = tk | f + tokens = ana(s, positions=True, chars=True) + return "/".join(t.text for t in tokens) + + f = analysis.NgramFilter(3, 4) + assert dotest(f) == "abc/abcd/bcd/bcde/cde/cdef/def/defg/efg/klm" + + f = analysis.NgramFilter(3, 4, at="start") + assert dotest(f) == "abc/abcd/klm" + + f = analysis.NgramFilter(3, 4, at="end") + assert dotest(f) == "defg/efg/klm" + + ana = tk | analysis.NgramFilter(2, 5, at="end") + tokens = [(t.text, t.startchar, t.endchar) for t in ana(s, chars=True)] + assert tokens == [("cdefg", 2, 7), ("defg", 3, 7), ("efg", 4, 7), + ("fg", 5, 7), ("ij", 10, 12), ("klm", 13, 16), + ("lm", 14, 16)] + + +@pytest.mark.skipif("sys.version_info < (2,6)") +def test_language_analyzer(): + domain = [("da", u("Jeg gik mig over s\xf8 og land"), + [u('gik'), u('s\xf8'), u('land')]), + + ("nl", u("Daar komt een muisje aangelopen"), + [u('komt'), u('muisj'), u('aangelop')]), + + ("de", u("Berlin war ihm zu gro\xdf, da baut' er sich ein Schlo\xdf."), + [u('berlin'), u('gross'), u('baut'), u('schloss')]), + + ("es", u("Por el mar corren las liebres"), + ['mar', 'corr', 'liebr']), + ] + + for lang, source, target in domain: + ana = analysis.LanguageAnalyzer(lang) + words = [t.text for t in ana(source)] + assert words == target + + +@pytest.mark.skipif("sys.version_info < (2,6)") +def test_la_pickleability(): + ana = analysis.LanguageAnalyzer("en") + _ = dumps(ana, -1) + + +def test_charset_pickeability(): + from whoosh.support import charset + 
charmap = charset.charset_table_to_dict(charset.default_charset) + ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap) + _ = dumps(ana, -1) + + ana = analysis.CharsetTokenizer(charmap) + _ = dumps(ana, -1) + + +def test_shingle_stopwords(): + # Note that the stop list is None here + ana = (analysis.RegexTokenizer() + | analysis.StopFilter(stoplist=None, minsize=3) + | analysis.ShingleFilter(size=3)) + + texts = [t.text for t + in ana(u("some other stuff and then some things To Check "))] + assert texts == ["some-other-stuff", "other-stuff-and", "stuff-and-then", + "and-then-some", "then-some-things", "some-things-Check"] + + # Use a stop list here + ana = (analysis.RegexTokenizer() + | analysis.LowercaseFilter() + | analysis.StopFilter() + | analysis.ShingleFilter(size=3)) + + texts = [t.text for t + in ana(u("some other stuff and then some things To Check "))] + assert texts == ["some-other-stuff", "other-stuff-then", "stuff-then-some", + "then-some-things", "some-things-check"] + + +def test_biword_stopwords(): + # Note that the stop list is None here + ana = (analysis.RegexTokenizer() + | analysis.StopFilter(stoplist=None, minsize=3) + | analysis.BiWordFilter()) + + texts = [t.text for t in ana(u("stuff and then some"))] + assert texts == ["stuff-and", "and-then", "then-some"] + + # Use a stop list here + ana = (analysis.RegexTokenizer() + | analysis.LowercaseFilter() + | analysis.StopFilter() + | analysis.BiWordFilter()) + + texts = [t.text for t in ana(u("stuff and then some"))] + assert texts == ["stuff-then", "then-some"] + + +@pytest.mark.skipif("sys.version_info < (2,6)") +def test_stop_lang(): + stopper = analysis.RegexTokenizer() | analysis.StopFilter() + ls = [token.text for token in stopper(u("this is a test"))] + assert ls == [u("test")] + + es_stopper = analysis.RegexTokenizer() | analysis.StopFilter(lang="es") + ls = [token.text for token in es_stopper(u("el lapiz es en la mesa"))] + assert ls == ["lapiz", "mesa"] + + +def test_issue358(): + t = analysis.RegexTokenizer("\w+") + with pytest.raises(analysis.CompositionError): + _ = t | analysis.StandardAnalyzer() + + +def test_ngramwords_tokenizer(): + tk = analysis.CommaSeparatedTokenizer() + tags = fields.NGRAMWORDS(minsize=3, maxsize=50, tokenizer=tk, stored=True, + queryor=True) + schema = fields.Schema(tags=tags) diff --git a/tests/test_automata.py b/tests/test_automata.py new file mode 100644 index 0000000..daab96c --- /dev/null +++ b/tests/test_automata.py @@ -0,0 +1,372 @@ +import gzip +import os.path +from bisect import bisect_left + +import pytest + +from whoosh.compat import permutations +from whoosh.compat import xrange +from whoosh.automata import fsa, glob, lev, reg +from whoosh.support.levenshtein import levenshtein +from whoosh.util import random_bytes + + +def test_nfa(): + nfa = fsa.NFA(0) + nfa.add_transition(0, "a", 1) + nfa.add_transition(0, fsa.EPSILON, 4) + nfa.add_transition(0, "b", 1) + nfa.add_transition(1, "c", 4) + nfa.add_final_state(4) + + assert nfa.accept("") + assert nfa.accept("ac") + assert nfa.accept("bc") + assert not nfa.accept("c") + + +def test_empty_string(): + nfa = fsa.NFA(1) + nfa.add_final_state(1) + + assert nfa.accept("") + assert not nfa.accept("a") + + dfa = nfa.to_dfa() + assert dfa.accept("") + assert not dfa.accept("a") + + +def test_nfa2(): + nfa = fsa.NFA(1) + nfa.add_transition(1, "a", 2) + nfa.add_transition(1, "c", 4) + nfa.add_transition(2, "b", 3) + nfa.add_transition(2, fsa.EPSILON, 1) + nfa.add_transition(3, "a", 2) + nfa.add_transition(4, "c", 3) + 
nfa.add_transition(4, fsa.EPSILON, 3) + nfa.add_final_state(3) + + assert nfa.accept("ab") + assert nfa.accept("abab") + assert nfa.accept("cc") + assert nfa.accept("c") + assert nfa.accept("ccab") + assert nfa.accept("ccacc") + assert nfa.accept("ccac") + assert nfa.accept("abacab") + + assert not nfa.accept("b") + assert not nfa.accept("a") + assert not nfa.accept("cb") + assert not nfa.accept("caa") + + dfa = nfa.to_dfa() + assert dfa.accept("ab") + assert dfa.accept("abab") + assert dfa.accept("cc") + assert dfa.accept("c") + assert dfa.accept("ccab") + assert dfa.accept("ccacc") + assert dfa.accept("ccac") + assert dfa.accept("abacab") + + assert not dfa.accept("b") + assert not dfa.accept("a") + assert not dfa.accept("cb") + assert not dfa.accept("caa") + + +def test_insert(): + nfa1 = fsa.NFA(1) + nfa1.add_transition(1, "a", 2) + nfa1.add_transition(2, "b", 3) + nfa1.add_final_state(3) + + nfa2 = fsa.NFA(4) + nfa2.add_transition(4, "x", 5) + nfa2.add_transition(4, "y", 5) + nfa2.insert(4, nfa1, 5) + nfa2.add_final_state(5) + + assert nfa2.accept("x") + assert nfa2.accept("y") + assert nfa2.accept("ab") + assert not nfa2.accept("a") + + +def test_to_dfa(): + nfa = fsa.NFA(0) + nfa.add_transition(0, "a", 1) + nfa.add_transition(0, fsa.EPSILON, 4) + nfa.add_transition(0, "b", 1) + nfa.add_transition(1, "c", 4) + nfa.add_final_state(4) + + assert nfa.accept("") + + dfa = nfa.to_dfa() + assert dfa.accept("") + assert dfa.accept("ac") + assert dfa.accept("bc") + assert not dfa.accept("c") + + +def test_glob_star(): + nfa = glob.glob_automaton("a*c") + assert not nfa.accept("a") + assert not nfa.accept("c") + assert nfa.accept("ac") + assert nfa.accept("abc") + assert nfa.accept("abcc") + assert nfa.accept("abcac") + assert nfa.accept("aaaaaaaaaac") + assert not nfa.accept("abb") + + dfa = nfa.to_dfa() + assert not dfa.accept("a") + assert not dfa.accept("c") + assert dfa.accept("ac") + assert dfa.accept("abc") + assert dfa.accept("abcc") + assert dfa.accept("abcac") + assert not dfa.accept("abb") + + +def test_glob_question(): + nfa = glob.glob_automaton("?") + assert not nfa.accept("") + assert nfa.accept("a") + assert not nfa.accept("aa") + + nfa = glob.glob_automaton("a?c") + assert not nfa.accept("a") + assert not nfa.accept("ac") + assert nfa.accept("abc") + assert not nfa.accept("aba") + + +def test_glob_range(): + nfa = glob.glob_automaton("[ab][cd]") + assert not nfa.accept("") + assert not nfa.accept("a") + assert not nfa.accept("c") + assert nfa.accept("ac") + assert nfa.accept("bc") + assert nfa.accept("ad") + assert nfa.accept("bd") + assert not nfa.accept("acc") + + +# def test_glob_negate_range(): +# nfa = glob.glob_automaton("a[!ab]a") +# assert not nfa.accept("aaa") +# assert not nfa.accept("aba") +# assert nfa.accept("aca") +# assert not nfa.accept("bcb") + + +class Skipper(object): + def __init__(self, data): + self.data = data + self.i = 0 + + def __call__(self, w): + if self.data[self.i] == w: + return w + self.i += 1 + pos = bisect_left(self.data, w, self.i) + if pos < len(self.data): + return self.data[pos] + else: + return None + + +def test_levenshtein(): + path = os.path.join(os.path.dirname(__file__), "english-words.10.gz") + wordfile = gzip.open(path, "rb") + words = sorted(line.decode("latin1").strip().lower() for line in wordfile) + + def find_brute(target, k): + for w in words: + if levenshtein(w, target, k) <= k: + yield w + + def find_auto(target, k): + dfa = lev.levenshtein_automaton(target, k).to_dfa() + sk = Skipper(words) + return 
fsa.find_all_matches(dfa, sk) + + assert set(find_brute("look", 2)) == set(find_auto("look", 2)) + assert set(find_brute("bend", 1)) == set(find_auto("bend", 1)) + assert set(find_brute("puck", 1)) == set(find_auto("puck", 1)) + assert set(find_brute("zero", 1)) == set(find_auto("zero", 1)) + + +def test_levenshtein_prefix(): + path = os.path.join(os.path.dirname(__file__), "english-words.10.gz") + wordfile = gzip.open(path, "rb") + words = sorted(line.decode("latin1").strip().lower() for line in wordfile) + prefixlen = 1 + + def find_brute(target, k): + for w in words: + d = levenshtein(w, target, k) + if d <= k and w[:prefixlen] == target[:prefixlen]: + yield w + + def find_auto(target, k): + dfa = lev.levenshtein_automaton(target, k, prefix=prefixlen).to_dfa() + sk = Skipper(words) + return fsa.find_all_matches(dfa, sk) + + assert set(find_brute("look", 2)) == set(find_auto("look", 2)) + assert set(find_brute("bend", 1)) == set(find_auto("bend", 1)) + assert set(find_brute("puck", 1)) == set(find_auto("puck", 1)) + assert set(find_brute("zero", 1)) == set(find_auto("zero", 1)) + + +def test_basics(): + n = fsa.epsilon_nfa() + assert n.accept("") + assert not n.accept("a") + + n = fsa.basic_nfa("a") + assert not n.accept("") + assert n.accept("a") + assert not n.accept("b") + + n = fsa.dot_nfa() + assert not n.accept("") + assert n.accept("a") + assert n.accept("b") + + +def test_concat(): + n = fsa.concat_nfa(fsa.basic_nfa("a"), fsa.basic_nfa("b")) + assert not n.accept("") + assert not n.accept("a") + assert not n.accept("aa") + assert not n.accept("b") + assert not n.accept("bb") + assert not n.accept("ba") + assert not n.accept("abc") + assert n.accept("ab") + + +def test_choice(): + n = fsa.choice_nfa(fsa.basic_nfa("a"), + fsa.choice_nfa(fsa.basic_nfa("b"), + fsa.basic_nfa("c"))) + assert not n.accept("") + assert n.accept("a") + assert n.accept("b") + assert n.accept("c") + assert not n.accept("d") + assert not n.accept("aa") + assert not n.accept("ab") + assert not n.accept("abc") + + +def test_star(): + n = fsa.star_nfa(fsa.basic_nfa("a")) + assert n.accept("") + assert n.accept("a") + assert n.accept("aaaaaa") + assert not n.accept("b") + assert not n.accept("ab") + + +def test_optional(): + n = fsa.concat_nfa(fsa.basic_nfa("a"), fsa.optional_nfa(fsa.basic_nfa("b"))) + assert n.accept("a") + assert n.accept("ab") + assert not n.accept("") + assert not n.accept("b") + assert not n.accept("ba") + assert not n.accept("bab") + + +def test_reverse_nfa(): + n = fsa.concat_nfa(fsa.basic_nfa("a"), fsa.basic_nfa("b")) + + r = fsa.reverse_nfa(n) + assert not r.accept("") + assert not r.accept("a") + assert not r.accept("aa") + assert not r.accept("b") + assert not r.accept("bb") + assert not r.accept("ab") + assert not r.accept("abc") + assert r.accept("ba") + + +def test_regular(): + ex = fsa.star_nfa(fsa.choice_nfa(fsa.basic_nfa("a"), fsa.basic_nfa("b"))) + + assert ex.accept("") + assert ex.accept("a") + assert ex.accept("aaaa") + assert ex.accept("b") + assert ex.accept("bbbb") + assert ex.accept("abab") + assert ex.accept("babb") + + ex = fsa.concat_nfa( + fsa.basic_nfa("a"), + fsa.concat_nfa( + fsa.optional_nfa(fsa.basic_nfa("b")), + fsa.basic_nfa("c") + ) + ) + + assert ex.accept("ac") + assert ex.accept("abc") + assert not ex.accept("ab") + assert not ex.accept("bc") + + +def test_minimize_dfa(): + # Example from www.cs.odu.edu/~toida/nerzic/390teched/regular/fa/min-fa.html + + dfa = fsa.DFA(1) + dfa.add_transition(1, "a", 3) + dfa.add_transition(1, "b", 2) + dfa.add_transition(2, 
"a", 4) + dfa.add_transition(2, "b", 1) + dfa.add_transition(3, "a", 5) + dfa.add_transition(3, "b", 4) + dfa.add_transition(4, "a", 4) + dfa.add_transition(4, "b", 4) + dfa.add_transition(5, "a", 3) + dfa.add_transition(5, "b", 2) + dfa.add_final_state(1) + dfa.add_final_state(5) + + good = fsa.DFA(1) + good.add_transition(1, "a", 3) + good.add_transition(1, "b", 2) + good.add_transition(2, "b", 1) + good.add_transition(3, "a", 1) + good.add_final_state(1) + + dfa.minimize() + assert dfa == good + + +def test_strings_dfa(): + strings = "able alfa alpha apple bar bear beat boom boot".split() + dfa = fsa.strings_dfa(strings) + output = list(dfa.generate_all()) + assert output == strings + + domain = "abcd" + words = set() + for i in xrange(1, len(domain) + 1): + words.update("".join(p) for p in permutations(domain[:i])) + words = sorted(words) + dfa = fsa.strings_dfa(words) + assert list(dfa.generate_all()) == words + + diff --git a/tests/test_bits.py b/tests/test_bits.py new file mode 100644 index 0000000..e265ddf --- /dev/null +++ b/tests/test_bits.py @@ -0,0 +1,185 @@ +from whoosh.filedb.filestore import RamStorage +from whoosh.idsets import BitSet, OnDiskBitSet, SortedIntSet + + +def test_bit_basics(c=BitSet): + b = c() + assert not b + assert 12 not in b + + b.update([0, 2, 4, 6, 7]) + assert b + assert ([(n in b) for n in range(10)] == + [True, False, True, False, True, False, True, True, False, False]) + + b.add(9) + assert 9 in b + assert len(b) == 6 + + assert list(b.invert(10)) == [1, 3, 5, 8] + + b.discard(6) + assert list(b) == [0, 2, 4, 7, 9] + assert len(b) == 5 + + +def test_len(c=BitSet): + b = c() + b.add(3) + b.add(5) + b.add(1024) + assert len(b) == 3 + b.add(5) + assert len(b) == 3 + b.discard(1000) + assert len(b) == 3 + b.discard(5) + assert len(b) == 2 + + +def test_union(c=BitSet): + assert c([2, 4, 5]) | c([3, 9]) == c([2, 3, 4, 5, 9]) + b = c([2, 4, 5]) + b.update([3, 9]) + assert list(b) == [2, 3, 4, 5, 9] + b = c([2, 4, 5]) + b.update(c([3, 9])) + assert list(b) == [2, 3, 4, 5, 9] + b = c([1, 2]) + b.update([1, 5, 9]) + assert list(b) == [1, 2, 5, 9] + + +def test_intersection(c=BitSet): + assert c([2, 4, 5]) & c([3, 9]) == c() + assert c([2, 4, 5]) & c([4, 5, 9]) == c([4, 5]) + b = c([2, 4, 5]) + assert b.intersection([4, 5, 9]) == c([4, 5]) + b.intersection_update([4, 5, 9]) + assert list(b) == [4, 5] + b = c([2, 4, 5]) + b.intersection_update(c([4, 5, 9])) + assert list(b) == [4, 5] + + +def test_difference(c=BitSet): + assert c([1, 3, 50, 72]) - c([3, 72]) == c([1, 50]) + assert list(c([1, 3, 50, 72]).difference([3, 72])) == [1, 50] + b = c([1, 3, 50, 72]) + b.difference_update(c([3, 72])) + assert list(b) == [1, 50] + b = c([1, 3, 50, 72]) + b.difference_update([3, 72]) + assert list(b) == [1, 50] + + +def test_copy(c=BitSet): + b = c([1, 5, 100, 60]) + assert b == b.copy() + + +def test_clear(c=BitSet): + b = c([1, 5, 100, 60]) + b.clear() + assert list(b) == [] + + +def test_isdisjoint(c=BitSet): + b = c([1, 7, 20, 100]) + assert b.isdisjoint(c([2, 8, 25])) + assert b.isdisjoint([2, 8, 25]) + assert not b.isdisjoint(c([2, 7, 25])) + assert not b.isdisjoint([1, 8, 25]) + + +def test_before_after(c=BitSet): + b = c([10, 11, 30, 50, 80]) + assert b.after(0) == 10 + assert b.after(7) == 10 + assert b.after(8) == 10 + assert b.after(10) == 11 + assert b.after(11) == 30 + assert b.after(30) == 50 + assert b.after(33) == 50 + assert b.after(38) == 50 + assert b.after(41) == 50 + assert b.after(42) == 50 + assert b.after(45) == 50 + assert b.after(47) == 50 + 
assert b.after(50) == 80 + assert b.after(80) is None + + assert b.before(0) is None + assert b.before(99) == 80 + assert b.before(81) == 80 + assert b.before(80) == 50 + assert b.before(50) == 30 + assert b.before(48) == 30 + assert b.before(46) == 30 + assert b.before(45) == 30 + assert b.before(44) == 30 + assert b.before(42) == 30 + assert b.before(38) == 30 + assert b.before(36) == 30 + assert b.before(34) == 30 + assert b.before(33) == 30 + assert b.before(32) == 30 + assert b.before(30) == 11 + assert b.before(11) == 10 + assert b.before(10) is None + + b = c([7]) + assert b.after(0) == 7 + b = c([8]) + assert b.after(0) == 8 + b = c([9]) + assert b.after(0) == 9 + + b = c([7]) + assert b.before(16) == 7 + b = c([8]) + assert b.before(16) == 8 + b = c([9]) + assert b.before(16) == 9 + + b = c([49]) + assert b.after(0) == 49 + + +def test_sortedintset(): + test_bit_basics(SortedIntSet) + test_len(SortedIntSet) + test_union(SortedIntSet) + test_intersection(SortedIntSet) + test_difference(SortedIntSet) + test_copy(SortedIntSet) + test_clear(SortedIntSet) + test_isdisjoint(SortedIntSet) + test_before_after(SortedIntSet) + + +def test_ondisk(): + bs = BitSet([10, 11, 30, 50, 80]) + + st = RamStorage() + f = st.create_file("test") + size = bs.to_disk(f) + f.close() + + f = st.open_file("test") + b = OnDiskBitSet(f, 0, size) + assert list(b) == list(bs) + + assert b.after(0) == 10 + assert b.after(10) == 11 + assert b.after(80) is None + assert b.after(99) is None + + assert b.before(0) is None + assert b.before(99) == 80 + assert b.before(80) == 50 + assert b.before(10) is None + + f.seek(0) + b = BitSet.from_disk(f, size) + assert list(b) == list(bs) diff --git a/tests/test_classify.py b/tests/test_classify.py new file mode 100644 index 0000000..38d3fc3 --- /dev/null +++ b/tests/test_classify.py @@ -0,0 +1,132 @@ +from __future__ import with_statement + +from whoosh import analysis, classify, fields, formats, query +from whoosh.compat import u, text_type +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +domain = [u("A volume that is a signed distance field used for collision calculations. The turbulence is damped near the collision object to prevent particles from passing through."), + u("When particles cross the SDF boundary they have their velocities reversed according to the SDF normal and are pushed outside of the SDF."), + u("The distance at which the particles start to slow down due to a collision object."), + u("There are several different ways to update a particle system in response to an external velocity field. They are broadly categorized as Force, Velocity, and Position updates."), + u("Instead of applying a force in the direction of the velocity field, the force is applied relative to the difference between the particle's velocity and the velocity field. This effectively adds an implicit drag that causes the particles to match the velocity field."), + u("In Velocity Blend mode, the amount to mix in the field velocity every timestep."), + u("In Velocity Blend mode, the amount to add the curlnoise velocity to the particle's velocity. 
This can be useful in addition to advectbyvolume to layer turbulence on a velocity field."), + ] + +text = u("How do I use a velocity field for particles") + + +def create_index(): + analyzer = analysis.StandardAnalyzer() + vector_format = formats.Frequency() + schema = fields.Schema(path=fields.ID(stored=True), + content=fields.TEXT(analyzer=analyzer, + vector=vector_format)) + + ix = RamStorage().create_index(schema) + + w = ix.writer() + from string import ascii_lowercase + for letter, content in zip(ascii_lowercase, domain): + w.add_document(path=u("/%s") % letter, content=content) + w.commit() + + return ix + + +def test_add_text(): + ix = create_index() + with ix.reader() as r: + exp = classify.Expander(r, "content") + exp.add_text(text) + assert ([t[0] for t in exp.expanded_terms(3)] + == ["particles", "velocity", "field"]) + + +def test_keyterms(): + ix = create_index() + with ix.searcher() as s: + docnum = s.document_number(path="/a") + keys = list(s.key_terms([docnum], "content", numterms=3)) + assert ([t[0] for t in keys] + == [u("collision"), u("calculations"), u("damped")]) + + +def test_keyterms_from_text(): + ix = create_index() + with ix.searcher() as s: + keys = list(s.key_terms_from_text("content", text)) + assert [t[0] for t in keys] == ["particles", "velocity", "field"] + + +def test_more_like_this(): + docs = [u("alfa bravo charlie delta echo foxtrot golf"), + u("delta echo foxtrot golf hotel india juliet"), + u("echo foxtrot golf hotel india juliet kilo"), + u("foxtrot golf hotel india juliet kilo lima"), + u("golf hotel india juliet kilo lima mike"), + u("foxtrot golf hotel india alfa bravo charlie")] + + def _check(schema, **kwargs): + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for i, text in enumerate(docs): + w.add_document(id=text_type(i + 1), text=text) + + with ix.searcher() as s: + docnum = s.document_number(id=u("1")) + r = s.more_like(docnum, "text", **kwargs) + assert [hit["id"] for hit in r] == ["6", "2", "3"] + + schema = fields.Schema(id=fields.ID(stored=True), + text=fields.TEXT(stored=True)) + _check(schema) + + ana = analysis.StandardAnalyzer() + schema = fields.Schema(id=fields.ID(stored=True), + text=fields.TEXT(analyzer=ana, + vector=formats.Frequency())) + _check(schema) + + schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) + _check(schema, text=docs[0]) + + +def test_more_like(): + schema = fields.Schema(id=fields.ID(stored=True), + text=fields.TEXT(stored=True)) + with TempIndex(schema, "morelike") as ix: + with ix.writer() as w: + w.add_document(id=u("1"), text=u("alfa bravo charlie")) + w.add_document(id=u("2"), text=u("bravo charlie delta")) + w.add_document(id=u("3"), text=u("echo")) + w.add_document(id=u("4"), text=u("delta echo foxtrot")) + w.add_document(id=u("5"), text=u("echo echo echo")) + w.add_document(id=u("6"), text=u("foxtrot golf hotel")) + w.add_document(id=u("7"), text=u("golf hotel india")) + + with ix.searcher() as s: + docnum = s.document_number(id="3") + r = s.more_like(docnum, "text") + assert [hit["id"] for hit in r] == ["5", "4"] + + +def test_empty_more_like(): + schema = fields.Schema(text=fields.TEXT) + with TempIndex(schema, "emptymore") as ix: + with ix.searcher() as s: + assert s.doc_count() == 0 + q = query.Term("a", u("b")) + r = s.search(q) + assert r.scored_length() == 0 + assert r.key_terms("text") == [] + + ex = classify.Expander(s.reader(), "text") + assert ex.expanded_terms(1) == [] + + + + + diff --git a/tests/test_codecs.py b/tests/test_codecs.py new file mode 
100644
index 0000000..5f362c3
--- /dev/null
+++ b/tests/test_codecs.py
@@ -0,0 +1,621 @@
+from __future__ import with_statement
+import random
+from array import array
+
+import pytest
+
+from whoosh import analysis, fields, formats, query
+from whoosh.compat import u, b, text_type
+from whoosh.compat import array_tobytes, xrange
+from whoosh.codec import default_codec
+from whoosh.filedb.filestore import RamStorage
+from whoosh.util.numeric import byte_to_length, length_to_byte
+from whoosh.util.testing import TempStorage
+
+
+def _make_codec(**kwargs):
+    st = RamStorage()
+    codec = default_codec(**kwargs)
+    seg = codec.new_segment(st, "test")
+    return st, codec, seg
+
+
+class FakeLengths(object):
+    def __init__(self, **lens):
+        self.lens = lens
+
+    def doc_field_length(self, docnum, fieldname):
+        if fieldname in self.lens:
+            if docnum < len(self.lens[fieldname]):
+                return self.lens[fieldname][docnum]
+        return 1
+
+
+def test_termkey():
+    st, codec, seg = _make_codec()
+    tw = codec.field_writer(st, seg)
+    fieldobj = fields.TEXT()
+    tw.start_field("alfa", fieldobj)
+    tw.start_term(b("bravo"))
+    tw.add(0, 1.0, b(""), 3)
+    tw.finish_term()
+    tw.start_term(b('\xc3\xa6\xc3\xaf\xc5\xc3\xba'))
+    tw.add(0, 4.0, b(""), 3)
+    tw.finish_term()
+    tw.finish_field()
+    tw.start_field("text", fieldobj)
+    tw.start_term(b('\xe6\xa5\xe6\xac\xe8\xaa'))
+    tw.add(0, 7.0, b(""), 9)
+    tw.finish_term()
+    tw.finish_field()
+    tw.close()
+
+    tr = codec.terms_reader(st, seg)
+    assert ("alfa", b("bravo")) in tr
+    assert ("alfa", b('\xc3\xa6\xc3\xaf\xc5\xc3\xba')) in tr
+    assert ("text", b('\xe6\xa5\xe6\xac\xe8\xaa')) in tr
+    tr.close()
+
+
+def test_random_termkeys():
+    def random_fieldname():
+        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))
+
+    def random_btext():
+        a = array("H", (random.randint(0, 0xd7ff) for _ in xrange(1, 20)))
+        return array_tobytes(a).decode("utf-16")
+
+    domain = sorted(set([(random_fieldname(), random_btext().encode("utf-8"))
+                         for _ in xrange(1000)]))
+
+    st, codec, seg = _make_codec()
+    fieldobj = fields.TEXT()
+    tw = codec.field_writer(st, seg)
+    # Stupid ultra-low-level hand-adding of postings just to check handling of
+    # random fieldnames and term texts
+    lastfield = None
+    for fieldname, text in domain:
+        if lastfield and fieldname != lastfield:
+            tw.finish_field()
+            lastfield = None
+        if lastfield is None:
+            tw.start_field(fieldname, fieldobj)
+            lastfield = fieldname
+        tw.start_term(text)
+        tw.add(0, 1.0, b(""), 1)
+        tw.finish_term()
+    if lastfield:
+        tw.finish_field()
+    tw.close()
+
+    tr = codec.terms_reader(st, seg)
+    for term in domain:
+        assert term in tr
+
+
+def test_stored_fields():
+    codec = default_codec()
+    fieldobj = fields.TEXT(stored=True)
+    with TempStorage("storedfields") as st:
+        seg = codec.new_segment(st, "test")
+
+        dw = codec.per_document_writer(st, seg)
+        dw.start_doc(0)
+        dw.add_field("a", fieldobj, "hello", 1)
+        dw.add_field("b", fieldobj, "there", 1)
+        dw.finish_doc()
+
+        dw.start_doc(1)
+        dw.add_field("a", fieldobj, "one", 1)
+        dw.add_field("b", fieldobj, "two", 1)
+        dw.add_field("c", fieldobj, "three", 1)
+        dw.finish_doc()
+
+        dw.start_doc(2)
+        dw.finish_doc()
+
+        dw.start_doc(3)
+        dw.add_field("a", fieldobj, "alfa", 1)
+        dw.add_field("b", fieldobj, "bravo", 1)
+        dw.finish_doc()
+
+        dw.close()
+        seg.set_doc_count(4)
+
+        pdr = codec.per_document_reader(st, seg)
+        assert pdr.doc_count_all() == 4
+        assert pdr.stored_fields(0) == {"a": "hello", "b": "there"}
+        # Note: access out of order
+        assert pdr.stored_fields(3) == {"a": "alfa", "b":
"bravo"} + assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"} + + sfs = list(pdr.all_stored_fields()) + assert len(sfs) == 4 + assert sfs == [{"a": "hello", "b": "there"}, + {"a": "one", "b": "two", "c": "three"}, + {}, + {"a": "alfa", "b": "bravo"}, + ] + pdr.close() + + +def test_termindex(): + terms = [("a", "alfa"), ("a", "bravo"), ("a", "charlie"), ("a", "delta"), + ("b", "able"), ("b", "baker"), ("b", "dog"), ("b", "easy")] + st, codec, seg = _make_codec() + schema = fields.Schema(a=fields.TEXT, b=fields.TEXT) + + tw = codec.field_writer(st, seg) + postings = ((fname, b(text), 0, i, b("")) for (i, (fname, text)) + in enumerate(terms)) + tw.add_postings(schema, FakeLengths(), postings) + tw.close() + + tr = codec.terms_reader(st, seg) + for i, (fieldname, text) in enumerate(terms): + assert (fieldname, b(text)) in tr + ti = tr.term_info(fieldname, b(text)) + assert ti.weight() == i + assert ti.doc_frequency() == 1 + + +def test_docwriter_one(): + field = fields.TEXT(stored=True) + st, codec, seg = _make_codec() + dw = codec.per_document_writer(st, seg) + dw.start_doc(0) + dw.add_field("text", field, "Testing one two three", 4) + dw.finish_doc() + dw.close() + seg.set_doc_count(1) + + pdr = codec.per_document_reader(st, seg) + assert pdr.doc_field_length(0, "text") == 4 + assert pdr.stored_fields(0) == {"text": "Testing one two three"} + + +def test_docwriter_two(): + field = fields.TEXT(stored=True) + st, codec, seg = _make_codec() + dw = codec.per_document_writer(st, seg) + dw.start_doc(0) + dw.add_field("title", field, ("a", "b"), 2) + dw.add_field("text", field, "Testing one two three", 4) + dw.finish_doc() + dw.start_doc(1) + dw.add_field("title", field, "The second document", 3) + dw.add_field("text", field, 500, 1) + dw.finish_doc() + dw.close() + seg.set_doc_count(2) + + pdr = codec.per_document_reader(st, seg) + assert pdr.doc_field_length(0, "title") == 2 + assert pdr.doc_field_length(0, "text") == 4 + assert pdr.doc_field_length(1, "title") == 3 + assert pdr.doc_field_length(1, "text") == 1 + + assert (pdr.stored_fields(0) + == {"title": ("a", "b"), "text": "Testing one two three"}) + assert (pdr.stored_fields(1) + == {"title": "The second document", "text": 500}) + + +def test_vector(): + field = fields.TEXT(vector=True) + st, codec, seg = _make_codec() + dw = codec.per_document_writer(st, seg) + dw.start_doc(0) + dw.add_field("title", field, None, 1) + dw.add_vector_items("title", field, [(u("alfa"), 1.0, b("t1")), + (u("bravo"), 2.0, b("t2"))]) + dw.finish_doc() + dw.close() + seg.set_doc_count(1) + + pdr = codec.per_document_reader(st, seg) + assert pdr.stored_fields(0) == {} + + m = pdr.vector(0, "title", field.vector) + assert m.is_active() + ps = [] + while m.is_active(): + ps.append((m.id(), m.weight(), m.value())) + m.next() + assert ps == [(u("alfa"), 1.0, b("t1")), (u("bravo"), 2.0, b("t2"))] + + +def test_vector_values(): + field = fields.TEXT(vector=formats.Frequency()) + st, codec, seg = _make_codec() + content = u("alfa bravo charlie alfa") + + dw = codec.per_document_writer(st, seg) + dw.start_doc(0) + vals = ((t, w, v) for t, _, w, v + in sorted(field.vector.word_values(content, field.analyzer))) + dw.add_vector_items("f1", field, vals) + dw.finish_doc() + dw.close() + + vr = codec.per_document_reader(st, seg) + m = vr.vector(0, "f1", field.vector) + assert (list(m.items_as("frequency")) + == [("alfa", 2), ("bravo", 1), ("charlie", 1)]) + + +def test_no_lengths(): + f1 = fields.ID() + st, codec, seg = _make_codec() + dw = 
codec.per_document_writer(st, seg) + dw.start_doc(0) + dw.add_field("name", f1, None, None) + dw.finish_doc() + dw.start_doc(1) + dw.add_field("name", f1, None, None) + dw.finish_doc() + dw.start_doc(2) + dw.add_field("name", f1, None, None) + dw.finish_doc() + dw.close() + seg.set_doc_count(3) + + pdr = codec.per_document_reader(st, seg) + assert pdr.doc_field_length(0, "name") == 0 + assert pdr.doc_field_length(1, "name") == 0 + assert pdr.doc_field_length(2, "name") == 0 + + +def test_store_zero(): + f1 = fields.ID(stored=True) + st, codec, seg = _make_codec() + dw = codec.per_document_writer(st, seg) + dw.start_doc(0) + dw.add_field("name", f1, 0, None) + dw.finish_doc() + dw.close() + seg.set_doc_count(1) + + sr = codec.per_document_reader(st, seg) + assert sr.stored_fields(0) == {"name": 0} + + +def test_fieldwriter_single_term(): + field = fields.TEXT() + st, codec, seg = _make_codec() + + fw = codec.field_writer(st, seg) + fw.start_field("text", field) + fw.start_term(b("alfa")) + fw.add(0, 1.5, b("test"), 1) + fw.finish_term() + fw.finish_field() + fw.close() + + tr = codec.terms_reader(st, seg) + assert ("text", b("alfa")) in tr + ti = tr.term_info("text", b("alfa")) + assert ti.weight() == 1.5 + assert ti.doc_frequency() == 1 + assert ti.min_length() == 1 + assert ti.max_length() == 1 + assert ti.max_weight() == 1.5 + assert ti.min_id() == 0 + assert ti.max_id() == 0 + + +def test_fieldwriter_two_terms(): + field = fields.TEXT() + st, codec, seg = _make_codec() + + fw = codec.field_writer(st, seg) + fw.start_field("text", field) + fw.start_term(b("alfa")) + fw.add(0, 2.0, b("test1"), 2) + fw.add(1, 1.0, b("test2"), 1) + fw.finish_term() + fw.start_term(b("bravo")) + fw.add(0, 3.0, b("test3"), 3) + fw.add(2, 2.0, b("test4"), 2) + fw.finish_term() + fw.finish_field() + fw.close() + + tr = codec.terms_reader(st, seg) + assert ("text", b("alfa")) in tr + ti = tr.term_info("text", b("alfa")) + assert ti.weight() == 3.0 + assert ti.doc_frequency() == 2 + assert ti.min_length() == 1 + assert ti.max_length() == 2 + assert ti.max_weight() == 2.0 + assert ti.min_id() == 0 + assert ti.max_id() == 1 + assert ("text", b("bravo")) in tr + ti = tr.term_info("text", b("bravo")) + assert ti.weight() == 5.0 + assert ti.doc_frequency() == 2 + assert ti.min_length() == 2 + assert ti.max_length() == 3 + assert ti.max_weight() == 3.0 + assert ti.min_id() == 0 + assert ti.max_id() == 2 + + m = tr.matcher("text", b("bravo"), field.format) + assert list(m.all_ids()) == [0, 2] + + +def test_fieldwriter_multiblock(): + field = fields.TEXT() + st, codec, seg = _make_codec(blocklimit=2) + + fw = codec.field_writer(st, seg) + fw.start_field("text", field) + fw.start_term(b("alfa")) + fw.add(0, 2.0, b("test1"), 2) + fw.add(1, 5.0, b("test2"), 5) + fw.add(2, 3.0, b("test3"), 3) + fw.add(3, 4.0, b("test4"), 4) + fw.add(4, 1.0, b("test5"), 1) + fw.finish_term() + fw.finish_field() + fw.close() + + tr = codec.terms_reader(st, seg) + ti = tr.term_info("text", b("alfa")) + assert ti.weight() == 15.0 + assert ti.doc_frequency() == 5 + assert ti.min_length() == 1 + assert ti.max_length() == 5 + assert ti.max_weight() == 5.0 + assert ti.min_id() == 0 + assert ti.max_id() == 4 + + ps = [] + m = tr.matcher("text", b("alfa"), field.format) + while m.is_active(): + ps.append((m.id(), m.weight(), m.value())) + m.next() + assert ps == [(0, 2.0, b("test1")), (1, 5.0, b("test2")), + (2, 3.0, b("test3")), (3, 4.0, b("test4")), + (4, 1.0, b("test5"))] + + +def test_term_values(): + field = fields.TEXT(phrase=False) + st, codec, 
seg = _make_codec() + content = u("alfa bravo charlie alfa") + + fw = codec.field_writer(st, seg) + fw.start_field("f1", field) + for text, freq, weight, val in sorted(field.index(content)): + fw.start_term(text) + fw.add(0, weight, val, freq) + fw.finish_term() + fw.finish_field() + fw.close() + + tr = codec.terms_reader(st, seg) + ps = [(term, ti.weight(), ti.doc_frequency()) for term, ti in tr.items()] + assert ps == [(("f1", b("alfa")), 2.0, 1), (("f1", b("bravo")), 1.0, 1), + (("f1", b("charlie")), 1.0, 1)] + + +def test_skip(): + _docnums = [1, 3, 12, 34, 43, 67, 68, 102, 145, 212, 283, 291, 412, 900, + 905, 1024, 1800, 2048, 15000] + st, codec, seg = _make_codec() + fieldobj = fields.TEXT() + fw = codec.field_writer(st, seg) + fw.start_field("f1", fieldobj) + fw.start_term(b("test")) + for n in _docnums: + fw.add(n, 1.0, b(''), None) + fw.finish_term() + fw.finish_field() + fw.close() + + tr = codec.terms_reader(st, seg) + m = tr.matcher("f1", b("test"), fieldobj.format) + assert m.id() == 1 + m.skip_to(220) + assert m.id() == 283 + m.skip_to(1) + assert m.id() == 283 + m.skip_to(1000) + assert m.id() == 1024 + m.skip_to(1800) + assert m.id() == 1800 + + +# def test_spelled_field(): +# field = fields.TEXT(spelling=True) +# st, codec, seg = _make_codec() +# +# fw = codec.field_writer(st, seg) +# fw.start_field("text", field) +# fw.start_term(b("special")) +# fw.add(0, 1.0, b("test1"), 1) +# fw.finish_term() +# fw.start_term(b("specific")) +# fw.add(1, 1.0, b("test2"), 1) +# fw.finish_term() +# fw.finish_field() +# fw.close() +# +# gr = codec.graph_reader(st, seg) +# assert gr.has_root("text") +# cur = gr.cursor("text") +# strings = list(cur.flatten_strings()) +# assert type(strings[0]) == text_type +# assert strings == ["special", "specific"] +# +# +# def test_special_spelled_field(): +# from whoosh.analysis import StemmingAnalyzer +# +# field = fields.TEXT(analyzer=StemmingAnalyzer(), spelling=True) +# st, codec, seg = _make_codec() +# +# fw = codec.field_writer(st, seg) +# fw.start_field("text", field) +# fw.start_term(b("special")) +# fw.add(0, 1.0, b("test1"), 1) +# fw.finish_term() +# fw.start_term(b("specific")) +# fw.add(1, 1.0, b("test2"), 1) +# fw.finish_term() +# fw.add_spell_word("text", u("specials")) +# fw.add_spell_word("text", u("specifically")) +# fw.finish_field() +# fw.close() +# +# tr = codec.terms_reader(st, seg) +# assert list(tr.terms()) == [("text", b("special")), ("text", b("specific"))] +# +# cur = codec.graph_reader(st, seg).cursor("text") +# assert list(cur.flatten_strings()) == ["specials", "specifically"] + + +def test_plaintext_codec(): + pytest.importorskip("ast") + from whoosh.codec.plaintext import PlainTextCodec + from whoosh.codec.whoosh3 import W3Codec + + ana = analysis.StemmingAnalyzer() + schema = fields.Schema(a=fields.TEXT(vector=True, sortable=True), + b=fields.STORED, + c=fields.NUMERIC(stored=True, sortable=True), + d=fields.TEXT(analyzer=ana, spelling=True)) + + st = RamStorage() + ix = st.create_index(schema) + with ix.writer(codec=W3Codec()) as w: + w.add_document(a=u("alfa bravo charlie"), b="hello", c=100, + d=u("quelling whining echoing")) + w.add_document(a=u("bravo charlie delta"), b=1000, c=200, + d=u("rolling timing yelling")) + w.add_document(a=u("charlie delta echo"), b=5.5, c=300, + d=u("using opening pulling")) + w.add_document(a=u("delta echo foxtrot"), b=True, c=-100, + d=u("aching selling dipping")) + w.add_document(a=u("echo foxtrot india"), b=None, c=-200, + d=u("filling going hopping")) + + with ix.reader() as r: + 
        assert r.has_column("a")
+        c = r.column_reader("a")
+        assert c[2] == u("charlie delta echo")
+
+    w = ix.writer(codec=PlainTextCodec())
+    w.commit(optimize=True)
+
+    with ix.searcher() as s:
+        reader = s.reader()
+        assert isinstance(reader.codec(), PlainTextCodec)
+
+        r = s.search(query.Term("a", "delta"))
+        assert len(r) == 3
+        assert [hit["b"] for hit in r] == [1000, 5.5, True]
+
+        assert (" ".join(s.field_terms("a"))
+                == "alfa bravo charlie delta echo foxtrot india")
+
+        storage = ix.storage
+        for fname in storage.list():
+            if fname.endswith(".dcs"):
+                f = storage.open_file(fname)
+                # print(f.read().decode("utf8"))
+
+        assert reader.doc_field_length(0, "a") == 3
+        assert reader.doc_field_length(2, "a") == 3
+
+        cfield = schema["c"]
+        assert type(cfield) == fields.NUMERIC
+        sortables = list(cfield.sortable_terms(reader, "c"))
+        assert sortables
+        assert ([cfield.from_bytes(t) for t in sortables]
+                == [-200, -100, 100, 200, 300])
+
+        assert reader.has_column("a")
+        c = reader.column_reader("a")
+        assert c[2] == u("charlie delta echo")
+
+        assert reader.has_column("c")
+        c = reader.column_reader("c")
+        assert list(c) == [100, 200, 300, -100, -200]
+
+        assert s.has_vector(2, "a")
+        v = s.vector(2, "a")
+        assert " ".join(v.all_ids()) == "charlie delta echo"
+
+
+def test_memory_codec():
+    from whoosh.codec import memory
+    from whoosh.searching import Searcher
+
+    ana = analysis.StemmingAnalyzer()
+    schema = fields.Schema(a=fields.TEXT(vector=True),
+                           b=fields.STORED,
+                           c=fields.NUMERIC(stored=True, sortable=True),
+                           d=fields.TEXT(analyzer=ana, spelling=True))
+
+    codec = memory.MemoryCodec()
+    with codec.writer(schema) as w:
+        w.add_document(a=u("alfa bravo charlie"), b="hello", c=100,
+                       d=u("quelling whining echoing"))
+        w.add_document(a=u("bravo charlie delta"), b=1000, c=200,
+                       d=u("rolling timing yelling"))
+        w.add_document(a=u("charlie delta echo"), b=5.5, c=300,
+                       d=u("using opening pulling"))
+        w.add_document(a=u("delta echo foxtrot"), b=True, c=-100,
+                       d=u("aching selling dipping"))
+        w.add_document(a=u("echo foxtrot india"), b=None, c=-200,
+                       d=u("filling going hopping"))
+
+    reader = codec.reader(schema)
+    s = Searcher(reader)
+
+    assert ("a", "delta") in reader
+    q = query.Term("a", "delta")
+    r = s.search(q)
+    assert len(r) == 3
+    assert [hit["b"] for hit in r] == [1000, 5.5, True]
+
+    assert (" ".join(s.field_terms("a"))
+            == "alfa bravo charlie delta echo foxtrot india")
+
+    cfield = schema["c"]
+    c_sortables = cfield.sortable_terms(reader, "c")
+    c_values = [cfield.from_bytes(t) for t in c_sortables]
+    assert c_values == [-200, -100, 100, 200, 300]
+
+    assert reader.has_column("c")
+    c_values = list(reader.column_reader("c"))
+    assert c_values == [100, 200, 300, -100, -200]
+
+    assert s.has_vector(2, "a")
+    v = s.vector(2, "a")
+    assert " ".join(v.all_ids()) == "charlie delta echo"
+
+
+def test_memory_multiwrite():
+    from whoosh.codec import memory
+
+    domain = ["alfa bravo charlie delta",
+              "bravo charlie delta echo",
+              "charlie delta echo foxtrot",
+              "delta echo foxtrot india",
+              "echo foxtrot india juliet"]
+
+    schema = fields.Schema(line=fields.TEXT(stored=True))
+    codec = memory.MemoryCodec()
+
+    for line in domain:
+        with codec.writer(schema) as w:
+            w.add_document(line=u(line))
+
+    reader = codec.reader(schema)
+    assert [sf["line"] for sf in reader.all_stored_fields()] == domain
+    assert (" ".join(reader.field_terms("line"))
+            == "alfa bravo charlie delta echo foxtrot india juliet")
diff --git a/tests/test_collector.py b/tests/test_collector.py
new file mode 100644
index
0000000..a42faab --- /dev/null +++ b/tests/test_collector.py @@ -0,0 +1,229 @@ +from __future__ import with_statement + +import pytest + +from whoosh import collectors, fields, query, searching +from whoosh.compat import b, u, xrange +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +def test_add(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, text=u("alfa bravo charlie")) + w.add_document(id=2, text=u("alfa bravo delta")) + w.add_document(id=3, text=u("alfa charlie echo")) + w.commit() + + with ix.searcher() as s: + assert s.doc_frequency("text", u("charlie")) == 2 + r = s.search(query.Term("text", u("charlie"))) + assert [hit["id"] for hit in r] == [1, 3] + assert len(r) == 2 + + +def test_filter_that_matches_no_document(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, text=u("alfa bravo charlie")) + w.add_document(id=2, text=u("alfa bravo delta")) + w.commit() + + with ix.searcher() as s: + r = s.search( + query.Every(), + filter=query.Term("text", u("echo"))) + assert [hit["id"] for hit in r] == [] + assert len(r) == 0 + + +def test_timelimit(): + schema = fields.Schema(text=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + for _ in xrange(50): + w.add_document(text=u("alfa")) + w.commit() + + import time + from whoosh import collectors, matching + + class SlowMatcher(matching.WrappingMatcher): + def next(self): + time.sleep(0.02) + self.child.next() + + class SlowQuery(query.WrappingQuery): + def matcher(self, searcher, context=None): + return SlowMatcher(self.child.matcher(searcher, context)) + + with ix.searcher() as s: + oq = query.Term("text", u("alfa")) + sq = SlowQuery(oq) + + col = collectors.TimeLimitCollector(s.collector(limit=None), + timelimit=0.1) + with pytest.raises(searching.TimeLimit): + s.search_with_collector(sq, col) + + col = collectors.TimeLimitCollector(s.collector(limit=40), + timelimit=0.1) + with pytest.raises(collectors.TimeLimit): + s.search_with_collector(sq, col) + + col = collectors.TimeLimitCollector(s.collector(limit=None), + timelimit=0.25) + try: + s.search_with_collector(sq, col) + assert False # Shouldn't get here + except collectors.TimeLimit: + r = col.results() + assert r.scored_length() > 0 + + col = collectors.TimeLimitCollector(s.collector(limit=None), + timelimit=0.5) + s.search_with_collector(oq, col) + assert col.results().runtime < 0.5 + + +@pytest.mark.skipif("not hasattr(__import__('signal'), 'SIGALRM')") +def test_timelimit_alarm(): + import time + from whoosh import matching + + class SlowMatcher(matching.Matcher): + def __init__(self): + self._id = 0 + + def id(self): + return self._id + + def is_active(self): + return self._id == 0 + + def next(self): + time.sleep(10) + self._id = 1 + + def score(self): + return 1.0 + + class SlowQuery(query.Query): + def matcher(self, searcher, context=None): + return SlowMatcher() + + schema = fields.Schema(text=fields.TEXT) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(text=u("Hello")) + + with ix.searcher() as s: + q = SlowQuery() + + t = time.time() + c = s.collector() + c = collectors.TimeLimitCollector(c, 0.2) + with pytest.raises(searching.TimeLimit): + _ = s.search_with_collector(q, c) + assert time.time() - t < 0.5 + + +def test_reverse_collapse(): + from whoosh import sorting + + schema = 
fields.Schema(title=fields.TEXT(stored=True), + content=fields.TEXT, + path=fields.ID(stored=True), + tags=fields.KEYWORD, + order=fields.NUMERIC(stored=True)) + + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(title=u"First document", + content=u"This is my document!", + path=u"/a", tags=u"first", order=20.0) + w.add_document(title=u"Second document", + content=u"This is the second example.", + path=u"/b", tags=u"second", order=12.0) + w.add_document(title=u"Third document", + content=u"Examples are many.", + path=u"/c", tags=u"third", order=15.0) + w.add_document(title=u"Thirdish document", + content=u"Examples are too many.", + path=u"/d", tags=u"third", order=25.0) + + with ix.searcher() as s: + q = query.Every('content') + r = s.search(q) + assert [hit["path"] for hit in r] == ["/a", "/b", "/c", "/d"] + + q = query.Or([query.Term("title", "document"), + query.Term("content", "document"), + query.Term("tags", "document")]) + cf = sorting.FieldFacet("tags") + of = sorting.FieldFacet("order", reverse=True) + r = s.search(q, collapse=cf, collapse_order=of, terms=True) + assert [hit["path"] for hit in r] == ["/a", "/b", "/d"] + + +def test_termdocs(): + schema = fields.Schema(key=fields.TEXT, city=fields.ID) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(key=u"ant", city=u"london") + w.add_document(key=u"anteater", city=u"roma") + w.add_document(key=u"bear", city=u"london") + w.add_document(key=u"bees", city=u"roma") + w.add_document(key=u"anorak", city=u"london") + w.add_document(key=u"antimatter", city=u"roma") + w.add_document(key=u"angora", city=u"london") + w.add_document(key=u"angels", city=u"roma") + + with ix.searcher() as s: + cond_q = query.Term("city", u"london") + pref_q = query.Prefix("key", u"an") + q = query.And([cond_q, pref_q]).normalize() + r = s.search(q, scored=False, terms=True) + + field = s.schema["key"] + terms = [field.from_bytes(term) for fieldname, term in r.termdocs + if fieldname == "key"] + assert sorted(terms) == [u"angora", u"anorak", u"ant"] + +def test_termdocs2(): + schema = fields.Schema(key=fields.TEXT, city=fields.ID) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(key=u"ant", city=u"london") + w.add_document(key=u"anteater", city=u"roma") + w.add_document(key=u"bear", city=u"london") + w.add_document(key=u"bees", city=u"roma") + w.add_document(key=u"anorak", city=u"london") + w.add_document(key=u"antimatter", city=u"roma") + w.add_document(key=u"angora", city=u"london") + w.add_document(key=u"angels", city=u"roma") + + with ix.searcher() as s: + # A query that matches the applicable documents + cond_q = query.Term("city", "london") + # Get a list of the documents that match the condition(s) + cond_docnums = set(cond_q.docs(s)) + # Grab the suggestion field for later + field = s.schema["key"] + + terms = [] + # Expand the prefix + for term in s.reader().expand_prefix("key", "an"): + # Get the documents the term is in + for docnum in s.document_numbers(key=term): + # Check if it's in the set matching the condition(s) + if docnum in cond_docnums: + # If so, decode the term from bytes and add it to the list, + # then move on to the next term + terms.append(field.from_bytes(term)) + break + assert terms == ["angora", "anorak", "ant"] + diff --git a/tests/test_columns.py b/tests/test_columns.py new file mode 100644 index 0000000..d5b8b93 --- /dev/null +++ b/tests/test_columns.py @@ -0,0 +1,280 @@ +from __future__ import with_statement +import inspect, 
random, sys + +from whoosh import columns, fields, query +from whoosh.codec.whoosh3 import W3Codec +from whoosh.compat import b, u, BytesIO, bytes_type, text_type +from whoosh.compat import izip, xrange, dumps, loads +from whoosh.filedb import compound +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex, TempStorage + + +def test_pickleability(): + # Ignore base classes + ignore = (columns.Column, columns.WrappedColumn, columns.ListColumn) + # Required arguments + init_args = {"ClampedNumericColumn": (columns.NumericColumn("B"),), + "FixedBytesColumn": (5,), + "FixedBytesListColumn": (5,), + "NumericColumn": ("i",), + "PickleColumn": (columns.VarBytesColumn(),), + "StructColumn": ("=if", (0, 0.0)), + } + + coltypes = [c for _, c in inspect.getmembers(columns, inspect.isclass) + if issubclass(c, columns.Column) and not c in ignore] + + for coltype in coltypes: + args = init_args.get(coltype.__name__, ()) + try: + inst = coltype(*args) + except TypeError: + e = sys.exc_info()[1] + raise TypeError("Error instantiating %r: %s" % (coltype, e)) + _ = loads(dumps(inst, -1)) + + +def test_multistream(): + domain = [("a", "12345"), ("b", "abc"), ("c", "AaBbC"), + ("a", "678"), ("c", "cDdEeF"), ("b", "defgh"), + ("b", "ijk"), ("c", "fGgHh"), ("a", "9abc")] + + st = RamStorage() + msw = compound.CompoundWriter(st) + files = dict((name, msw.create_file(name)) for name in "abc") + for name, data in domain: + files[name].write(b(data)) + f = st.create_file("test") + msw.save_as_compound(f) + + f = st.open_file("test") + msr = compound.CompoundStorage(f) + assert msr.open_file("a").read() == b("123456789abc") + assert msr.open_file("b").read() == b("abcdefghijk") + assert msr.open_file("c").read() == b("AaBbCcDdEeFfGgHh") + + +def test_random_multistream(): + letters = "abcdefghijklmnopqrstuvwxyz" + + def randstring(n): + s = "".join(random.choice(letters) for _ in xrange(n)) + return s.encode("latin1") + + domain = {} + for _ in xrange(100): + name = randstring(random.randint(5, 10)) + value = randstring(2500) + domain[name] = value + + outfiles = dict((name, BytesIO(value)) for name, value in domain.items()) + + with TempStorage() as st: + msw = compound.CompoundWriter(st, buffersize=1024) + mfiles = {} + for name in domain: + mfiles[name] = msw.create_file(name) + while outfiles: + name = random.choice(list(outfiles.keys())) + v = outfiles[name].read(1000) + mfiles[name].write(v) + if len(v) < 1000: + del outfiles[name] + f = st.create_file("test") + msw.save_as_compound(f) + + f = st.open_file("test") + msr = compound.CompoundStorage(f) + for name, value in domain.items(): + assert msr.open_file(name).read() == value + msr.close() + + +def _rt(c, values, default): + # Continuous + st = RamStorage() + f = st.create_file("test1") + f.write(b("hello")) + w = c.writer(f) + for docnum, v in enumerate(values): + w.add(docnum, v) + w.finish(len(values)) + length = f.tell() - 5 + f.close() + + f = st.open_file("test1") + r = c.reader(f, 5, length, len(values)) + assert values == list(r) + for x in range(len(values)): + assert values[x] == r[x] + f.close() + + # Sparse + doccount = len(values) * 7 + 15 + target = [default] * doccount + + f = st.create_file("test2") + f.write(b("hello")) + w = c.writer(f) + for docnum, v in izip(xrange(10, doccount, 7), values): + target[docnum] = v + w.add(docnum, v) + w.finish(doccount) + length = f.tell() - 5 + f.close() + + f = st.open_file("test2") + r = c.reader(f, 5, length, doccount) + assert target == list(r) + for x in 
range(doccount): + assert target[x] == r[x] + + lr = r.load() + assert target == list(lr) + f.close() + + +def test_roundtrip(): + _rt(columns.VarBytesColumn(), + [b("a"), b("ccc"), b("bbb"), b("e"), b("dd")], b("")) + _rt(columns.FixedBytesColumn(5), + [b("aaaaa"), b("eeeee"), b("ccccc"), b("bbbbb"), b("eeeee")], + b("\x00") * 5) + _rt(columns.RefBytesColumn(), + [b("a"), b("ccc"), b("bb"), b("ccc"), b("a"), b("bb")], b("")) + _rt(columns.RefBytesColumn(3), + [b("aaa"), b("bbb"), b("ccc"), b("aaa"), b("bbb"), b("ccc")], + b("\x00") * 3) + _rt(columns.StructColumn("ifH", (0, 0.0, 0)), + [(100, 1.5, 15000), (-100, -5.0, 0), (5820, 6.5, 462), + (-57829, -1.5, 6), (0, 0, 0)], + (0, 0.0, 0)) + + numcol = columns.NumericColumn + _rt(numcol("b"), [10, -20, 30, -25, 15], 0) + _rt(numcol("B"), [10, 20, 30, 25, 15], 0) + _rt(numcol("h"), [1000, -2000, 3000, -15000, 32000], 0) + _rt(numcol("H"), [1000, 2000, 3000, 15000, 50000], 0) + _rt(numcol("i"), [2 ** 16, -(2 ** 20), 2 ** 24, -(2 ** 28), 2 ** 30], 0) + _rt(numcol("I"), [2 ** 16, 2 ** 20, 2 ** 24, 2 ** 28, 2 ** 31 & 0xFFFFFFFF], 0) + _rt(numcol("q"), [10, -20, 30, -25, 15], 0) + _rt(numcol("Q"), [2 ** 35, 2 ** 40, 2 ** 48, 2 ** 52, 2 ** 63], 0) + _rt(numcol("f"), [1.5, -2.5, 3.5, -4.5, 1.25], 0) + _rt(numcol("d"), [1.5, -2.5, 3.5, -4.5, 1.25], 0) + + c = columns.BitColumn(compress_at=10) + _rt(c, [bool(random.randint(0, 1)) for _ in xrange(70)], False) + _rt(c, [bool(random.randint(0, 1)) for _ in xrange(90)], False) + + c = columns.PickleColumn(columns.VarBytesColumn()) + _rt(c, [None, True, False, 100, -7, "hello"], None) + + +def test_multivalue(): + schema = fields.Schema(s=fields.TEXT(sortable=True), + n=fields.NUMERIC(sortable=True)) + ix = RamStorage().create_index(schema) + with ix.writer(codec=W3Codec()) as w: + w.add_document(s=u("alfa foxtrot charlie").split(), n=[100, 200, 300]) + w.add_document(s=u("juliet bravo india").split(), n=[10, 20, 30]) + + with ix.reader() as r: + scr = r.column_reader("s") + assert list(scr) == ["alfa", "juliet"] + + ncr = r.column_reader("n") + assert list(ncr) == [100, 10] + + +def test_column_field(): + schema = fields.Schema(a=fields.TEXT(sortable=True), + b=fields.COLUMN(columns.RefBytesColumn())) + with TempIndex(schema, "columnfield") as ix: + with ix.writer(codec=W3Codec()) as w: + w.add_document(a=u("alfa bravo"), b=b("charlie delta")) + w.add_document(a=u("bravo charlie"), b=b("delta echo")) + w.add_document(a=u("charlie delta"), b=b("echo foxtrot")) + + with ix.reader() as r: + assert r.has_column("a") + assert r.has_column("b") + + cra = r.column_reader("a") + assert cra[0] == u("alfa bravo") + assert type(cra[0]) == text_type + + crb = r.column_reader("b") + assert crb[0] == b("charlie delta") + assert type(crb[0]) == bytes_type + + +def test_column_query(): + schema = fields.Schema(id=fields.STORED, + a=fields.ID(sortable=True), + b=fields.NUMERIC(sortable=True)) + with TempIndex(schema, "columnquery") as ix: + with ix.writer(codec=W3Codec()) as w: + w.add_document(id=1, a=u("alfa"), b=10) + w.add_document(id=2, a=u("bravo"), b=20) + w.add_document(id=3, a=u("charlie"), b=30) + w.add_document(id=4, a=u("delta"), b=40) + w.add_document(id=5, a=u("echo"), b=50) + w.add_document(id=6, a=u("foxtrot"), b=60) + + with ix.searcher() as s: + def check(q): + return [s.stored_fields(docnum)["id"] for docnum in q.docs(s)] + + q = query.ColumnQuery("a", u("bravo")) + assert check(q) == [2] + + q = query.ColumnQuery("b", 30) + assert check(q) == [3] + + q = query.ColumnQuery("a", lambda v: v != 
u("delta")) + assert check(q) == [1, 2, 3, 5, 6] + + q = query.ColumnQuery("b", lambda v: v > 30) + assert check(q) == [4, 5, 6] + + +def test_ref_switch(): + import warnings + + col = columns.RefBytesColumn() + + def rw(size): + st = RamStorage() + + f = st.create_file("test") + cw = col.writer(f) + for i in xrange(size): + cw.add(i, hex(i).encode("latin1")) + cw.finish(size) + length = f.tell() + f.close() + + f = st.open_file("test") + cr = col.reader(f, 0, length, size) + for i in xrange(size): + v = cr[i] + # Column ignores additional unique values after 65535 + if i <= 65535 - 1: + assert v == hex(i).encode("latin1") + else: + assert v == b('') + f.close() + + rw(255) + + # warnings.catch_warnings is not available in Python 2.5 + if hasattr(warnings, "catch_warnings"): + # Column warns on additional unique values after 65535 + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + rw(65537) + + assert len(w) == 2 + assert issubclass(w[-1].category, UserWarning) diff --git a/tests/test_compound.py b/tests/test_compound.py new file mode 100644 index 0000000..032b375 --- /dev/null +++ b/tests/test_compound.py @@ -0,0 +1,65 @@ +from __future__ import with_statement + +from whoosh.compat import b +from whoosh.filedb.compound import CompoundStorage +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempStorage + + +def _test_simple_compound(st): + alist = [1, 2, 3, 5, -5, -4, -3, -2] + blist = [1, 12, 67, 8, 2, 1023] + clist = [100, -100, 200, -200] + + with st.create_file("a") as af: + for x in alist: + af.write_int(x) + with st.create_file("b") as bf: + for x in blist: + bf.write_varint(x) + with st.create_file("c") as cf: + for x in clist: + cf.write_int(x) + + f = st.create_file("f") + CompoundStorage.assemble(f, st, ["a", "b", "c"]) + + f = CompoundStorage(st.open_file("f")) + with f.open_file("a") as af: + for x in alist: + assert x == af.read_int() + assert af.read() == b('') + + with f.open_file("b") as bf: + for x in blist: + assert x == bf.read_varint() + assert bf.read() == b('') + + with f.open_file("c") as cf: + for x in clist: + assert x == cf.read_int() + assert cf.read() == b('') + + +def test_simple_compound_mmap(): + with TempStorage("compound") as st: + assert st.supports_mmap + _test_simple_compound(st) + + +def test_simple_compound_nomap(): + st = RamStorage() + _test_simple_compound(st) + + +#def test_unclosed_mmap(): +# with TempStorage("unclosed") as st: +# assert st.supports_mmap +# with st.create_file("a") as af: +# af.write("alfa") +# with st.create_file("b") as bf: +# bf.write("bravo") +# f = st.create_file("f") +# CompoundStorage.assemble(f, st, ["a", "b"]) +# +# f = CompoundStorage(st, "f") diff --git a/tests/test_dateparse.py b/tests/test_dateparse.py new file mode 100644 index 0000000..427ba93 --- /dev/null +++ b/tests/test_dateparse.py @@ -0,0 +1,356 @@ +from whoosh.qparser.dateparse import * + + +basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) +english = English() + + +def assert_adatetime(at, **kwargs): + assert at.__class__ is adatetime + for key in adatetime.units: + val = getattr(at, key) + target = kwargs.get(key) + assert val == target, "at.%s=%r not %r in %r" % (key, val, target, at) + + +def assert_timespan(ts, sargs, eargs): + assert_adatetime(ts.start, **sargs) + + +def assert_unamb(ts, **kwargs): + assert_unamb_span(ts, kwargs, kwargs) + + +def assert_unamb_span(ts, sargs, eargs): + startdt = adatetime(**sargs).floor() + enddt = 
adatetime(**eargs).ceil() + assert ts.start == startdt, "start %s != %s" % (ts.start, startdt) + assert ts.end == enddt, "end %s != %s" % (ts.end, enddt) + + +def assert_datespan(ts, startdate, enddate): + assert ts.__class__ is timespan + assert ts.start == startdate + assert ts.end == enddate + + +# + +def test_simple_dateparse(t=english.simple): + assert_adatetime(t.date_from("2005", basedate), year=2005) + assert_adatetime(t.date_from("200505", basedate), year=2005, month=5) + assert_adatetime(t.date_from("20050510", basedate), year=2005, month=5, + day=10) + assert_adatetime(t.date_from("2005051001", basedate), + year=2005, month=5, day=10, hour=1) + assert_adatetime(t.date_from("200505100108", basedate), + year=2005, month=5, day=10, hour=1, minute=8) + assert_adatetime(t.date_from("20050510010835", basedate), + year=2005, month=5, day=10, hour=1, minute=8, second=35) + + assert_adatetime(t.date_from("2005-05", basedate), + year=2005, month=5) + assert_adatetime(t.date_from("2005 05 10", basedate), + year=2005, month=5, day=10) + assert_adatetime(t.date_from("2005.05.10.01", basedate), + year=2005, month=5, day=10, hour=1) + assert_adatetime(t.date_from("2005/05/10 01:08", basedate), + year=2005, month=5, day=10, hour=1, minute=8) + assert_adatetime(t.date_from("2005.05.10 01:08:35", basedate), + year=2005, month=5, day=10, hour=1, minute=8, second=35) + + assert t.date_from("2005 02 31", basedate) is None + assert t.date_from("2005-13-32", basedate) is None + + +def test_time(t=english.time): + assert_adatetime(t.date_from("13:05", basedate), hour=13, minute=5) + assert t.date_from("28:91", basedate) is None + + assert_adatetime(t.date_from("3pm", basedate), hour=15) + assert_adatetime(t.date_from("3 pm", basedate), hour=15) + assert_adatetime(t.date_from("10pm", basedate), hour=22) + assert_adatetime(t.date_from("10 pm", basedate), hour=22) + assert_adatetime(t.date_from("3am", basedate), hour=3) + assert_adatetime(t.date_from("3:15 am", basedate), hour=3, minute=15) + assert_adatetime(t.date_from("5:10pm", basedate), hour=17, minute=10) + assert_adatetime(t.date_from("12:45am", basedate), hour=0, minute=45) + assert_adatetime(t.date_from("12:45pm", basedate), hour=12, minute=45) + assert_adatetime(t.date_from("5:45:05 pm", basedate), + hour=17, minute=45, second=5) + + assert_adatetime(t.date_from("noon", basedate), + hour=12, minute=0, second=0, microsecond=0) + assert_adatetime(t.date_from("midnight", basedate), + hour=0, minute=0, second=0, microsecond=0) + + assert t.date_from("15 am", basedate) is None + assert t.date_from("24:00", basedate) is None + assert t.date_from("12:65", basedate) is None + + +def test_dmy(d=english.dmy): + assert_adatetime(d.date_from("25 may 2011", basedate), + year=2011, month=5, day=25) + assert_adatetime(d.date_from("may 2 2011", basedate), + year=2011, month=5, day=2) + assert_adatetime(d.date_from("2011 25 may", basedate), + year=2011, month=5, day=25) + assert_adatetime(d.date_from("2011 may 5", basedate), + year=2011, month=5, day=5) + + assert_adatetime(d.date_from("apr", basedate), month=4) + assert_adatetime(d.date_from("september", basedate), month=9) + assert_adatetime(d.date_from("2001", basedate), year=2001) + assert_adatetime(d.date_from("july 2525", basedate), year=2525, month=7) + assert_adatetime(d.date_from("nov 30", basedate), month=11, day=30) + assert d.date_from("25 2525", basedate) is None + + assert_adatetime(d.date_from("25 may, 2011", basedate), + year=2011, month=5, day=25) + assert_adatetime(d.date_from("may 2nd, 2011", 
basedate), + year=2011, month=5, day=2) + assert_adatetime(d.date_from("2011, 25 may", basedate), + year=2011, month=5, day=25) + assert_adatetime(d.date_from("2011, may 5th", basedate), + year=2011, month=5, day=5) + + assert_adatetime(d.date_from("today", basedate), + year=2010, month=9, day=20) + assert_adatetime(d.date_from("tomorrow", basedate), + year=2010, month=9, day=21) + assert_adatetime(d.date_from("yesterday", basedate), + year=2010, month=9, day=19) + assert_adatetime(d.date_from("this month", basedate), year=2010, month=9) + assert_adatetime(d.date_from("this year", basedate), year=2010) + + assert d.date_from("now", basedate) == basedate + + +def test_plustime(rt=english.plusdate): + assert rt.date_from("+1hr", basedate) == basedate + timedelta(hours=1) + assert rt.date_from("+5mins", basedate) == basedate + timedelta(minutes=5) + assert rt.date_from("+20s", basedate) == basedate + timedelta(seconds=20) + + assert rt.date_from("- 2 h", basedate) == basedate + timedelta(hours=-2) + assert rt.date_from("- 25 minutes", basedate) == basedate + timedelta(minutes=-25) + assert rt.date_from("-400 secs", basedate) == basedate + timedelta(seconds=-400) + + assert rt.date_from("+1hr 5m", basedate) == basedate + timedelta(hours=1, minutes=5) + assert rt.date_from("-8hr 12m", basedate) == basedate + timedelta(hours=-8, minutes=-12) + assert rt.date_from("+1hr 5s", basedate) == basedate + timedelta(hours=1, seconds=5) + assert rt.date_from("+1hr 12m 5s", basedate) == basedate + timedelta(hours=1, minutes=12, seconds=5) + assert rt.date_from("-1hr 5s", basedate) == basedate + timedelta(hours=-1, seconds=-5) + assert rt.date_from("-1hr 12m 5s", basedate) == basedate + timedelta(hours=-1, minutes=-12, seconds=-5) + + +def test_relative_days(): + # "next monday" on monday + assert relative_days(0, 0, 1) == 7 + # "last monday" on monday + assert relative_days(0, 0, -1) == -7 + # "next tuesday" on wednesday + assert relative_days(2, 1, 1) == 6 + # "last tuesday" on wednesay + assert relative_days(2, 1, -1) == -1 + # "last monday" on sunday + assert relative_days(6, 0, -1) == -6 + # "next monday" on sunday + assert relative_days(6, 0, 1) == 1 + # "next wednesday" on tuesday + assert relative_days(1, 2, 1) == 1 + # "last wednesday" on tuesday + assert relative_days(1, 2, -1) == -6 + # "last wednesday" on thursday + assert relative_days(3, 2, -1) == -1 + # "next wednesday" on thursday + assert relative_days(3, 2, 1) == 6 + # "last wednesday" on tuesday + assert relative_days(1, 2, -1) == -6 + # "next wednesday" on tuesday + assert relative_days(1, 2, 1) == 1 + + +def test_dayname(p=english.dayname): + assert_adatetime(p.date_from("next tuesday", basedate), + year=2010, month=9, day=21) + assert_adatetime(p.date_from("last tuesday", basedate), + year=2010, month=9, day=14) + assert_adatetime(p.date_from("next sunday", basedate), + year=2010, month=9, day=26) + assert_adatetime(p.date_from("last sun", basedate), + year=2010, month=9, day=19) + assert_adatetime(p.date_from("next th", basedate), + year=2010, month=9, day=23) + + +def test_reldate(p=english.plusdate): + assert p.date_from("+1y", basedate) == basedate + relativedelta(years=1) + assert p.date_from("+2mo", basedate) == basedate + relativedelta(months=2) + assert p.date_from("+3w", basedate) == basedate + relativedelta(weeks=3) + assert p.date_from("+5d", basedate) == basedate + relativedelta(days=5) + assert p.date_from("+5days", basedate) == basedate + relativedelta(days=5) + + assert p.date_from("-6yr", basedate) == basedate + 
relativedelta(years=-6) + assert p.date_from("- 7 mons", basedate) == basedate + relativedelta(months=-7) + assert p.date_from("-8 wks", basedate) == basedate + relativedelta(weeks=-8) + assert p.date_from("- 9 dy", basedate) == basedate + relativedelta(days=-9) + + assert p.date_from("+1y 12mo 400d", basedate) == basedate + relativedelta(years=1, months=12, days=400) + assert p.date_from("-7mo 8d", basedate) == basedate + relativedelta(months=-7, days=-8) + assert p.date_from("+5wks 2d", basedate) == basedate + relativedelta(weeks=5, days=2) + assert p.date_from("-1y 1w", basedate) == basedate + relativedelta(years=-1, weeks=-1) + + assert p.date_from("+1y 2d 5h 12s", basedate) == basedate + relativedelta(years=1, days=2, hours=5, seconds=12) + + +def test_bundle_subs(p=english.bundle): + test_time(p) + test_dmy(p) + test_plustime(p) + test_dayname(p) + test_reldate(p) + + +def test_bundle(p=english.bundle): + assert_adatetime(p.date_from("mar 29 1972 2:45am", basedate), + year=1972, month=3, day=29, hour=2, minute=45) + assert_adatetime(p.date_from("16:10:45 14 February 2005", basedate), + year=2005, month=2, day=14, hour=16, minute=10, second=45) + assert_adatetime(p.date_from("1985 sept 12 12:01", basedate), + year=1985, month=9, day=12, hour=12, minute=1) + assert_adatetime(p.date_from("5pm 21st oct 2005", basedate), + year=2005, month=10, day=21, hour=17) + assert_adatetime(p.date_from("5:59:59pm next thur", basedate), + year=2010, month=9, day=23, hour=17, minute=59, second=59) + + +def test_ranges(p=english.torange): + assert_timespan(p.date_from("last tuesday to next tuesday", basedate), + dict(year=2010, month=9, day=14), + dict(year=2010, month=9, day=21)) + assert_timespan(p.date_from("last monday to dec 25", basedate), + dict(year=2010, month=9, day=13), + dict(year=None, month=12, day=25)) + assert_timespan(p.date_from("oct 25 to feb 14", basedate), + dict(year=None, month=10, day=25), + dict(year=None, month=2, day=14)) + assert_timespan(p.date_from("3am oct 12 to 5pm", basedate), + dict(year=None, month=10, day=12, hour=3), + dict(year=None, month=None, day=None, hour=17)) + assert_timespan(p.date_from("3am feb 12 to 5pm today", basedate), + dict(year=None, month=2, day=12, hour=3), + dict(year=2010, month=9, day=20, hour=17)) + assert_timespan(p.date_from("feb to oct", basedate), + dict(year=None, month=2), + dict(year=None, month=10)) + assert_timespan(p.date_from("oct 25 2005 11am to 5pm tomorrow", basedate), + dict(year=2005, month=10, day=25, hour=11), + dict(year=2010, month=9, day=21, hour=17)) + assert_timespan(p.date_from("oct 5 2005 to november 20", basedate), + dict(year=2005, month=10, day=5), + dict(year=None, month=11, day=20)) + assert_timespan(p.date_from("2007 to 2010", basedate), + dict(year=2007, month=None, day=None), + dict(year=2010, month=None, day=None)) + assert_timespan(p.date_from("2007 to oct 12", basedate), + dict(year=2007, month=None, day=None), + dict(year=None, month=10, day=12)) + + assert_datespan(p.date_from("-2d to +1w", basedate), + basedate + relativedelta(days=-2), + basedate + relativedelta(weeks=1)) + + +def test_all(): + p = english.all + test_bundle_subs(p) + test_bundle(p) + test_ranges(p) + + +def test_final_dates(p=english): + assert_unamb(p.date_from("5:10pm", basedate), + year=2010, month=9, day=20, hour=17, minute=10) + + assert p.date_from("may 32 2005", basedate) is None + assert p.date_from("2005 may 32", basedate) is None + assert p.date_from("2005-13-32", basedate) is None + + +def test_final_ranges(p=english): + 
assert_unamb_span(p.date_from("feb to nov", basedate), + dict(year=2010, month=2), + dict(year=2010, month=11)) + + # 2005 to 10 oct 2009 -> jan 1 2005 to oct 31 2009 + assert_unamb_span(p.date_from("2005 to 10 oct 2009", basedate), + dict(year=2005), + dict(year=2009, month=10, day=10)) + + # jan 12 to oct 10 2009 -> jan 12 2009 to oct 10 2009 + assert_unamb_span(p.date_from("jan 12 to oct 10 2009", basedate), + dict(year=2009, month=1, day=12), + dict(year=2009, month=10, day=10)) + + # jan to oct 2009 -> jan 1 2009 to oct 31 2009 + assert_unamb_span(p.date_from("jan to oct 2009", basedate), + dict(year=2009, month=1), + dict(year=2009, month=10, day=31)) + + # mar 2005 to oct -> mar 1 2005 to oct 31 basedate.year + assert_unamb_span(p.date_from("mar 2005 to oct", basedate), + dict(year=2005, month=3), + dict(year=2010, month=10, day=31)) + + # jan 10 to jan 25 -> jan 10 basedate.year to jan 25 basedate.year + assert_unamb_span(p.date_from("jan 10 to jan 25", basedate), + dict(year=2010, month=1, day=10), + dict(year=2010, month=1, day=25)) + + # jan 2005 to feb 2009 -> jan 1 2005 to feb 28 2009 + assert_unamb_span(p.date_from("jan 2005 to feb 2009", basedate), + dict(year=2005, month=1), + dict(year=2009, month=2)) + + # jan 5000 to mar -> jan 1 5000 to mar 5000 + assert_unamb_span(p.date_from("jan 5000 to mar", basedate), + dict(year=5000, month=1), + dict(year=5000, month=3)) + + # jun 5000 to jan -> jun 1 5000 to jan 31 5001 + assert_unamb_span(p.date_from("jun 5000 to jan", basedate), + dict(year=5000, month=6), + dict(year=5001, month=1)) + + # oct 2010 to feb -> oct 1 2010 to feb 28 2011 + assert_unamb_span(p.date_from("oct 2010 to feb", basedate), + dict(year=2010, month=10), + dict(year=2011, month=2)) + + assert_unamb_span(p.date_from("5pm to 3am", basedate), + dict(year=2010, month=9, day=20, hour=17), + dict(year=2010, month=9, day=21, hour=3)) + + assert_unamb_span(p.date_from("5am to 3 am tomorrow", basedate), + dict(year=2010, month=9, day=20, hour=5), + dict(year=2010, month=9, day=21, hour=3)) + + assert_unamb_span(p.date_from("3am to 5 pm tomorrow", basedate), + dict(year=2010, month=9, day=21, hour=3), + dict(year=2010, month=9, day=21, hour=17)) + + assert_unamb_span(p.date_from("-2hrs to +20min", basedate), + dict(year=2010, month=9, day=20, hour=13, minute=16, + second=6, microsecond=454000), + dict(year=2010, month=9, day=20, hour=15, minute=36, + second=6, microsecond=454000)) + + # Swap + assert_unamb_span(p.date_from("oct 25 2009 to feb 14 2008", basedate), + dict(year=2008, month=2, day=14), + dict(year=2009, month=10, day=25)) + + assert_unamb_span(p.date_from("oct 25 5000 to tomorrow", basedate), + dict(year=2010, month=9, day=21), + dict(year=5000, month=10, day=25)) diff --git a/tests/test_fields.py b/tests/test_fields.py new file mode 100644 index 0000000..7b66e8b --- /dev/null +++ b/tests/test_fields.py @@ -0,0 +1,597 @@ +from __future__ import with_statement +from datetime import datetime, timedelta + +import pytest + +from whoosh import fields, qparser, query +from whoosh.compat import long_type, u, b, xrange +from whoosh.filedb.filestore import RamStorage +from whoosh.util import times + + +def test_schema_eq(): + a = fields.Schema() + b = fields.Schema() + assert a == b + + a = fields.Schema(id=fields.ID) + b = a.copy() + assert a["id"] == b["id"] + assert a == b + + c = fields.Schema(id=fields.TEXT) + assert a != c + + +def test_creation1(): + s = fields.Schema() + s.add("content", fields.TEXT(phrase=True)) + s.add("title", fields.TEXT(stored=True)) 
+ s.add("path", fields.ID(stored=True)) + s.add("tags", fields.KEYWORD(stored=True)) + s.add("quick", fields.NGRAM) + s.add("note", fields.STORED) + + assert s.names() == ["content", "note", "path", "quick", "tags", "title"] + assert "content" in s + assert "buzz" not in s + assert isinstance(s["tags"], fields.KEYWORD) + + +def test_creation2(): + s = fields.Schema(a=fields.ID(stored=True), + b=fields.ID, + c=fields.KEYWORD(scorable=True)) + + assert s.names() == ["a", "b", "c"] + assert "a" in s + assert "b" in s + assert "c" in s + + +def test_declarative(): + class MySchema(fields.SchemaClass): + content = fields.TEXT + title = fields.TEXT + path = fields.ID + date = fields.DATETIME + + ix = RamStorage().create_index(MySchema) + assert ix.schema.names() == ["content", "date", "path", "title"] + + ix = RamStorage().create_index(MySchema()) + assert ix.schema.names() == ["content", "date", "path", "title"] + + with pytest.raises(fields.FieldConfigurationError): + RamStorage().create_index(object()) + + +def test_declarative_inherit(): + class Parent(fields.SchemaClass): + path = fields.ID + date = fields.DATETIME + + class Child(Parent): + content = fields.TEXT + + class Grandchild(Child): + title = fields.TEXT + + s = Grandchild() + assert s.names() == ["content", "date", "path", "title"] + + +def test_badnames(): + s = fields.Schema() + with pytest.raises(fields.FieldConfigurationError): + s.add("_test", fields.ID) + with pytest.raises(fields.FieldConfigurationError): + s.add("a f", fields.ID) + + +#def test_numeric_support(): +# intf = fields.NUMERIC(int, shift_step=0) +# longf = fields.NUMERIC(int, bits=64, shift_step=0) +# floatf = fields.NUMERIC(float, shift_step=0) +# +# def roundtrip(obj, num): +# assert obj.from_bytes(obj.to_bytes(num)), num) +# +# roundtrip(intf, 0) +# roundtrip(intf, 12345) +# roundtrip(intf, -12345) +# roundtrip(longf, 0) +# roundtrip(longf, 85020450482) +# roundtrip(longf, -85020450482) +# roundtrip(floatf, 0) +# roundtrip(floatf, 582.592) +# roundtrip(floatf, -582.592) +# roundtrip(floatf, -99.42) +# +# from random import shuffle +# +# def roundtrip_sort(obj, start, end, step): +# count = start +# rng = [] +# while count < end: +# rng.append(count) +# count += step +# +# scrabled = list(rng) +# shuffle(scrabled) +# round = [obj.from_text(t) for t +# in sorted([obj.to_text(n) for n in scrabled])] +# assert round, rng) +# +# roundtrip_sort(intf, -100, 100, 1) +# roundtrip_sort(longf, -58902, 58249, 43) +# roundtrip_sort(floatf, -99.42, 99.83, 2.38) + + +def test_index_numeric(): + schema = fields.Schema(a=fields.NUMERIC(int, 32, signed=False), + b=fields.NUMERIC(int, 32, signed=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(a=1, b=1) + with ix.searcher() as s: + assert list(s.lexicon("a")) == \ + [b('\x00\x00\x00\x00\x01'), b('\x04\x00\x00\x00\x00'), + b('\x08\x00\x00\x00\x00'), b('\x0c\x00\x00\x00\x00'), + b('\x10\x00\x00\x00\x00'), b('\x14\x00\x00\x00\x00'), + b('\x18\x00\x00\x00\x00'), b('\x1c\x00\x00\x00\x00')] + assert list(s.lexicon("b")) == \ + [b('\x00\x80\x00\x00\x01'), b('\x04\x08\x00\x00\x00'), + b('\x08\x00\x80\x00\x00'), b('\x0c\x00\x08\x00\x00'), + b('\x10\x00\x00\x80\x00'), b('\x14\x00\x00\x08\x00'), + b('\x18\x00\x00\x00\x80'), b('\x1c\x00\x00\x00\x08')] + + +def test_numeric(): + schema = fields.Schema(id=fields.ID(stored=True), + integer=fields.NUMERIC(int), + floating=fields.NUMERIC(float)) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("a"), integer=5820, 
floating=1.2) + w.add_document(id=u("b"), integer=22, floating=2.3) + w.add_document(id=u("c"), integer=78, floating=3.4) + w.add_document(id=u("d"), integer=13, floating=4.5) + w.add_document(id=u("e"), integer=9, floating=5.6) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("integer", schema) + + q = qp.parse(u("5820")) + r = s.search(q) + assert len(r) == 1 + assert r[0]["id"] == "a" + + with ix.searcher() as s: + r = s.search(qp.parse("floating:4.5")) + assert len(r) == 1 + assert r[0]["id"] == "d" + + q = qp.parse("integer:*") + assert q.__class__ == query.Every + assert q.field() == "integer" + + q = qp.parse("integer:5?6") + assert q == query.NullQuery + + +def test_decimal_numeric(): + from decimal import Decimal + + f = fields.NUMERIC(int, decimal_places=4) + schema = fields.Schema(id=fields.ID(stored=True), deci=f) + ix = RamStorage().create_index(schema) + + # assert f.from_text(f.to_text(Decimal("123.56"))), Decimal("123.56")) + + w = ix.writer() + w.add_document(id=u("a"), deci=Decimal("123.56")) + w.add_document(id=u("b"), deci=Decimal("0.536255")) + w.add_document(id=u("c"), deci=Decimal("2.5255")) + w.add_document(id=u("d"), deci=Decimal("58")) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("deci", schema) + q = qp.parse(u("123.56")) + r = s.search(q) + assert len(r) == 1 + assert r[0]["id"] == "a" + + r = s.search(qp.parse(u("0.536255"))) + assert len(r) == 1 + assert r[0]["id"] == "b" + + +def test_numeric_parsing(): + schema = fields.Schema(id=fields.ID(stored=True), number=fields.NUMERIC) + + qp = qparser.QueryParser("number", schema) + q = qp.parse(u("[10 to *]")) + assert q == query.NullQuery + + q = qp.parse(u("[to 400]")) + assert q.__class__ is query.NumericRange + assert q.start is None + assert q.end == 400 + + q = qp.parse(u("[10 to]")) + assert q.__class__ is query.NumericRange + assert q.start == 10 + assert q.end is None + + q = qp.parse(u("[10 to 400]")) + assert q.__class__ is query.NumericRange + assert q.start == 10 + assert q.end == 400 + + +def test_numeric_ranges(): + schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC) + ix = RamStorage().create_index(schema) + w = ix.writer() + + for i in xrange(400): + w.add_document(id=i, num=i) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("num", schema) + + def check(qs, target): + q = qp.parse(qs) + result = [s.stored_fields(d)["id"] for d in q.docs(s)] + assert result == target + + # Note that range() is always inclusive-exclusive + check("[10 to 390]", list(range(10, 390 + 1))) + check("[100 to]", list(range(100, 400))) + check("[to 350]", list(range(0, 350 + 1))) + check("[16 to 255]", list(range(16, 255 + 1))) + check("{10 to 390]", list(range(11, 390 + 1))) + check("[10 to 390}", list(range(10, 390))) + check("{10 to 390}", list(range(11, 390))) + check("{16 to 255}", list(range(17, 255))) + + +def test_numeric_ranges_unsigned(): + values = [1, 10, 100, 1000, 2, 20, 200, 2000, 9, 90, 900, 9000] + schema = fields.Schema(num2=fields.NUMERIC(stored=True, signed=False)) + + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for v in values: + w.add_document(num2=v) + + with ix.searcher() as s: + q = query.NumericRange("num2", 55, None, True, False) + r = s.search(q, limit=None) + for hit in r: + assert int(hit["num2"]) >= 55 + + +def test_decimal_ranges(): + from decimal import Decimal + + schema = fields.Schema(id=fields.STORED, + num=fields.NUMERIC(int, decimal_places=2)) + ix = RamStorage().create_index(schema) + w = 
ix.writer() + count = Decimal("0.0") + inc = Decimal("0.2") + for _ in xrange(500): + w.add_document(id=str(count), num=count) + count += inc + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("num", schema) + + def check(qs, start, end): + q = qp.parse(qs) + result = [s.stored_fields(d)["id"] for d in q.docs(s)] + + target = [] + count = Decimal(start) + limit = Decimal(end) + while count <= limit: + target.append(str(count)) + count += inc + + assert result == target + + check("[10.2 to 80.8]", "10.2", "80.8") + check("{10.2 to 80.8]", "10.4", "80.8") + check("[10.2 to 80.8}", "10.2", "80.6") + check("{10.2 to 80.8}", "10.4", "80.6") + + +def test_numeric_errors(): + f = fields.NUMERIC(int, bits=16, signed=True) + schema = fields.Schema(f=f) + + with pytest.raises(ValueError): + list(f.index(-32769)) + with pytest.raises(ValueError): + list(f.index(32768)) + + +def test_nontext_document(): + schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC, + date=fields.DATETIME, even=fields.BOOLEAN) + ix = RamStorage().create_index(schema) + + dt = datetime.now() + w = ix.writer() + for i in xrange(50): + w.add_document(id=i, num=i, date=dt + timedelta(days=i), + even=not(i % 2)) + w.commit() + + with ix.searcher() as s: + def check(kwargs, target): + result = [d['id'] for d in s.documents(**kwargs)] + assert result == target + + check({"num": 49}, [49]) + check({"date": dt + timedelta(days=30)}, [30]) + check({"even": True}, list(range(0, 50, 2))) + + +def test_nontext_update(): + schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC(unique=True), + date=fields.DATETIME(unique=True)) + ix = RamStorage().create_index(schema) + + dt = datetime.now() + w = ix.writer() + for i in xrange(10): + w.add_document(id=i, num=i, date=dt + timedelta(days=i)) + w.commit() + + w = ix.writer() + w.update_document(num=8, id="a") + w.update_document(num=2, id="b") + w.update_document(num=4, id="c") + w.update_document(date=dt + timedelta(days=5), id="d") + w.update_document(date=dt + timedelta(days=1), id="e") + w.update_document(date=dt + timedelta(days=7), id="f") + w.commit() + + +def test_datetime(): + dtf = fields.DATETIME(stored=True) + schema = fields.Schema(id=fields.ID(stored=True), date=dtf) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + for month in xrange(1, 12): + for day in xrange(1, 28): + w.add_document(id=u("%s-%s") % (month, day), + date=datetime(2010, month, day, 14, 0, 0)) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("id", schema) + + r = s.search(qp.parse("date:20100523")) + assert len(r) == 1 + assert r[0]["id"] == "5-23" + assert r[0]["date"].__class__ is datetime + assert r[0]["date"].month == 5 + assert r[0]["date"].day == 23 + + r = s.search(qp.parse("date:'2010 02'")) + assert len(r) == 27 + + q = qp.parse(u("date:[2010-05 to 2010-08]")) + startdt = datetime(2010, 5, 1, 0, 0, 0, 0) + enddt = datetime(2010, 8, 31, 23, 59, 59, 999999) + assert q.__class__ is query.NumericRange + assert q.start == times.datetime_to_long(startdt) + assert q.end == times.datetime_to_long(enddt) + + +def test_boolean(): + schema = fields.Schema(id=fields.ID(stored=True), + done=fields.BOOLEAN(stored=True)) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("a"), done=True) + w.add_document(id=u("b"), done=False) + w.add_document(id=u("c"), done=True) + w.add_document(id=u("d"), done=False) + w.add_document(id=u("e"), done=True) + w.commit() + + with ix.searcher() as s: + qp = 
qparser.QueryParser("id", schema) + + r = s.search(qp.parse("done:true")) + assert sorted([d["id"] for d in r]) == ["a", "c", "e"] + assert all(d["done"] for d in r) + + r = s.search(qp.parse("done:yes")) + assert sorted([d["id"] for d in r]) == ["a", "c", "e"] + assert all(d["done"] for d in r) + + q = qp.parse("done:false") + assert q.__class__ == query.Term + assert q.text is False + assert schema["done"].to_bytes(False) == b("f") + r = s.search(q) + assert sorted([d["id"] for d in r]) == ["b", "d"] + assert not any(d["done"] for d in r) + + r = s.search(qp.parse("done:no")) + assert sorted([d["id"] for d in r]) == ["b", "d"] + assert not any(d["done"] for d in r) + + +def test_boolean2(): + schema = fields.Schema(t=fields.TEXT(stored=True), + b=fields.BOOLEAN(stored=True)) + ix = RamStorage().create_index(schema) + writer = ix.writer() + writer.add_document(t=u('some kind of text'), b=False) + writer.add_document(t=u('some other kind of text'), b=False) + writer.add_document(t=u('some more text'), b=False) + writer.add_document(t=u('some again'), b=True) + writer.commit() + + with ix.searcher() as s: + qf = qparser.QueryParser('b', None).parse(u('f')) + qt = qparser.QueryParser('b', None).parse(u('t')) + r = s.search(qf) + assert len(r) == 3 + + assert [d["b"] for d in s.search(qt)] == [True] + assert [d["b"] for d in s.search(qf)] == [False] * 3 + + +def test_boolean3(): + schema = fields.Schema(t=fields.TEXT(stored=True, field_boost=5), + b=fields.BOOLEAN(stored=True), + c=fields.TEXT) + ix = RamStorage().create_index(schema) + + with ix.writer() as w: + w.add_document(t=u("with hardcopy"), b=True, c=u("alfa")) + w.add_document(t=u("no hardcopy"), b=False, c=u("bravo")) + + with ix.searcher() as s: + q = query.Term("b", schema["b"].to_bytes(True)) + ts = [hit["t"] for hit in s.search(q)] + assert ts == ["with hardcopy"] + + +def test_boolean_strings(): + schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(i=0, b="true") + w.add_document(i=1, b="True") + w.add_document(i=2, b="false") + w.add_document(i=3, b="False") + w.add_document(i=4, b=u("true")) + w.add_document(i=5, b=u("True")) + w.add_document(i=6, b=u("false")) + w.add_document(i=7, b=u("False")) + + with ix.searcher() as s: + qp = qparser.QueryParser("b", ix.schema) + + def check(qs, nums): + q = qp.parse(qs) + r = s.search(q, limit=None) + assert [hit["i"] for hit in r] == nums + + trues = [0, 1, 4, 5] + falses = [2, 3, 6, 7] + check("true", trues) + check("True", trues) + check("false", falses) + check("False", falses) + check("t", trues) + check("f", falses) + + +def test_boolean_find_deleted(): + # "Random" string of ones and zeros representing deleted and undeleted + domain = "1110001010001110010101000101001011101010001011111101000101010101" + + schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True)) + ix = RamStorage().create_index(schema) + count = 0 + # Create multiple segments just in case + for _ in xrange(5): + w = ix.writer() + for c in domain: + w.add_document(i=count, b=(c == "1")) + w.commit(merge=False) + + # Delete documents where "b" is True + with ix.writer() as w: + w.delete_by_term("b", "t") + + with ix.searcher() as s: + # Double check that documents with b=True are all deleted + reader = s.reader() + for docnum in xrange(s.doc_count_all()): + b = s.stored_fields(docnum)["b"] + assert b == reader.is_deleted(docnum) + + # Try doing a search for documents where b=True + qp = 
qparser.QueryParser("b", ix.schema) + q = qp.parse("b:t") + r = s.search(q, limit=None) + assert len(r) == 0 + + # Make sure Every query doesn't match deleted docs + r = s.search(qp.parse("*"), limit=None) + assert not any(hit["b"] for hit in r) + assert not any(reader.is_deleted(hit.docnum) for hit in r) + + r = s.search(qp.parse("*:*"), limit=None) + assert not any(hit["b"] for hit in r) + assert not any(reader.is_deleted(hit.docnum) for hit in r) + + # Make sure Not query doesn't match deleted docs + q = qp.parse("NOT b:t") + r = s.search(q, limit=None) + assert not any(hit["b"] for hit in r) + assert not any(reader.is_deleted(hit.docnum) for hit in r) + + r = s.search(q, limit=5) + assert not any(hit["b"] for hit in r) + assert not any(reader.is_deleted(hit.docnum) for hit in r) + + +def test_boolean_multifield(): + schema = fields.Schema(name=fields.TEXT(stored=True), + bit=fields.BOOLEAN(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(name=u('audi'), bit=True) + w.add_document(name=u('vw'), bit=False) + w.add_document(name=u('porsche'), bit=False) + w.add_document(name=u('ferrari'), bit=True) + w.add_document(name=u('citroen'), bit=False) + + with ix.searcher() as s: + qp = qparser.MultifieldParser(["name", "bit"], schema) + q = qp.parse(u("boop")) + + r = s.search(q) + assert sorted(hit["name"] for hit in r) == ["audi", "ferrari"] + assert len(r) == 2 + + +def test_missing_field(): + schema = fields.Schema() + ix = RamStorage().create_index(schema) + + with ix.searcher() as s: + with pytest.raises(KeyError): + s.document_numbers(id=u("test")) + + +def test_token_boost(): + from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter + ana = RegexTokenizer() | DoubleMetaphoneFilter() + field = fields.TEXT(analyzer=ana, phrase=False) + results = sorted(field.index(u("spruce view"))) + assert results == [(b('F'), 1, 1.0, b('\x00\x00\x00\x01')), + (b('FF'), 1, 0.5, b('\x00\x00\x00\x01')), + (b('SPRS'), 1, 1.0, b('\x00\x00\x00\x01')), + ] + diff --git a/tests/test_flexible.py b/tests/test_flexible.py new file mode 100644 index 0000000..446aedf --- /dev/null +++ b/tests/test_flexible.py @@ -0,0 +1,104 @@ +from __future__ import with_statement + +from whoosh import fields +from whoosh.compat import u, b +from whoosh.util.testing import TempIndex + + +def test_addfield(): + schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) + with TempIndex(schema, "addfield") as ix: + w = ix.writer() + w.add_document(id=u("a"), content=u("alfa")) + w.add_document(id=u("b"), content=u("bravo")) + w.add_document(id=u("c"), content=u("charlie")) + w.commit() + + ix.add_field("added", fields.KEYWORD(stored=True)) + + w = ix.writer() + w.add_document(id=u("d"), content=u("delta"), added=u("fourth")) + w.add_document(id=u("e"), content=u("echo"), added=u("fifth")) + w.commit(merge=False) + + with ix.searcher() as s: + assert ("id", "d") in s.reader() + assert s.document(id="d") == {"id": "d", "added": "fourth"} + assert s.document(id="b") == {"id": "b"} + + +def test_addfield_spelling(): + schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) + with TempIndex(schema, "addfield") as ix: + w = ix.writer() + w.add_document(id=u("a"), content=u("alfa")) + w.add_document(id=u("b"), content=u("bravo")) + w.add_document(id=u("c"), content=u("charlie")) + w.commit() + + ix.add_field("added", fields.KEYWORD(stored=True)) + + w = ix.writer() + w.add_document(id=u("d"), content=u("delta"), added=u("fourth")) + 
w.add_document(id=u("e"), content=u("echo"), added=u("fifth")) + w.commit(merge=False) + + with ix.searcher() as s: + assert s.document(id=u("d")) == {"id": "d", "added": "fourth"} + assert s.document(id=u("b")) == {"id": "b"} + + +def test_removefield(): + schema = fields.Schema(id=fields.ID(stored=True), + content=fields.TEXT, + city=fields.KEYWORD(stored=True)) + with TempIndex(schema, "removefield") as ix: + w = ix.writer() + w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad")) + w.add_document(id=u("c"), content=u("charlie"), city=u("cairo")) + w.add_document(id=u("d"), content=u("delta"), city=u("dakar")) + w.commit() + + with ix.searcher() as s: + assert s.document(id=u("c")) == {"id": "c", "city": "cairo"} + + w = ix.writer() + w.remove_field("content") + w.remove_field("city") + w.commit() + + ixschema = ix._current_schema() + assert ixschema.names() == ["id"] + assert ixschema.stored_names() == ["id"] + + with ix.searcher() as s: + assert ("content", b("charlie")) not in s.reader() + assert s.document(id=u("c")) == {"id": u("c")} + + +def test_optimize_away(): + schema = fields.Schema(id=fields.ID(stored=True), + content=fields.TEXT, + city=fields.KEYWORD(stored=True)) + with TempIndex(schema, "optimizeaway") as ix: + w = ix.writer() + w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad")) + w.add_document(id=u("c"), content=u("charlie"), city=u("cairo")) + w.add_document(id=u("d"), content=u("delta"), city=u("dakar")) + w.commit() + + with ix.searcher() as s: + assert s.document(id=u("c")) == {"id": "c", "city": "cairo"} + + w = ix.writer() + w.remove_field("content") + w.remove_field("city") + w.commit(optimize=True) + + with ix.searcher() as s: + assert ("content", u("charlie")) not in s.reader() + assert s.document(id=u("c")) == {"id": u("c")} + + +if __name__ == "__main__": + test_addfield() diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py new file mode 100644 index 0000000..4cb7877 --- /dev/null +++ b/tests/test_highlighting.py @@ -0,0 +1,282 @@ +# coding: utf-8 + +from __future__ import with_statement + +import pytest + +from whoosh import analysis, highlight, fields, qparser, query +from whoosh.compat import u +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +_doc = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet " + + "kilo lima") + + +def test_null_fragment(): + terms = frozenset(("bravo", "india")) + sa = analysis.StandardAnalyzer() + nf = highlight.WholeFragmenter() + uc = highlight.UppercaseFormatter() + htext = highlight.highlight(_doc, terms, sa, nf, uc) + assert htext == "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima" + + +def test_sentence_fragment(): + text = u("This is the first sentence. This one doesn't have the word. " + + "This sentence is the second. 
Third sentence here.") + terms = ("sentence",) + sa = analysis.StandardAnalyzer(stoplist=None) + sf = highlight.SentenceFragmenter() + uc = highlight.UppercaseFormatter() + htext = highlight.highlight(text, terms, sa, sf, uc) + assert htext == "This is the first SENTENCE...This SENTENCE is the second...Third SENTENCE here" + + +def test_context_fragment(): + terms = frozenset(("bravo", "india")) + sa = analysis.StandardAnalyzer() + cf = highlight.ContextFragmenter(surround=6) + uc = highlight.UppercaseFormatter() + htext = highlight.highlight(_doc, terms, sa, cf, uc) + assert htext == "alfa BRAVO charlie...hotel INDIA juliet" + + +def test_context_at_start(): + terms = frozenset(["alfa"]) + sa = analysis.StandardAnalyzer() + cf = highlight.ContextFragmenter(surround=15) + uc = highlight.UppercaseFormatter() + htext = highlight.highlight(_doc, terms, sa, cf, uc) + assert htext == "ALFA bravo charlie delta echo foxtrot" + + +def test_html_format(): + terms = frozenset(("bravo", "india")) + sa = analysis.StandardAnalyzer() + cf = highlight.ContextFragmenter(surround=6) + hf = highlight.HtmlFormatter() + htext = highlight.highlight(_doc, terms, sa, cf, hf) + assert htext == 'alfa bravo charlie...hotel india juliet' + + +def test_html_escape(): + terms = frozenset(["bravo"]) + sa = analysis.StandardAnalyzer() + wf = highlight.WholeFragmenter() + hf = highlight.HtmlFormatter() + htext = highlight.highlight(u('alfa delta'), terms, sa, + wf, hf) + assert htext == 'alfa <bravo "charlie"> delta' + + +def test_maxclasses(): + terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo")) + sa = analysis.StandardAnalyzer() + cf = highlight.ContextFragmenter(surround=6) + hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2) + htext = highlight.highlight(_doc, terms, sa, cf, hf) + assert htext == 'alfa bravo charlie...delta echo foxtrot' + + +def test_workflow_easy(): + schema = fields.Schema(id=fields.ID(stored=True), + title=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), title=u("The man who wasn't there")) + w.add_document(id=u("2"), title=u("The dog who barked at midnight")) + w.add_document(id=u("3"), title=u("The invisible man")) + w.add_document(id=u("4"), title=u("The girl with the dragon tattoo")) + w.add_document(id=u("5"), title=u("The woman who disappeared")) + w.commit() + + with ix.searcher() as s: + # Parse the user query + parser = qparser.QueryParser("title", schema=ix.schema) + q = parser.parse(u("man")) + r = s.search(q, terms=True) + assert len(r) == 2 + + r.fragmenter = highlight.WholeFragmenter() + r.formatter = highlight.UppercaseFormatter() + outputs = [hit.highlights("title") for hit in r] + assert outputs == ["The invisible MAN", "The MAN who wasn't there"] + + +def test_workflow_manual(): + schema = fields.Schema(id=fields.ID(stored=True), + title=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), title=u("The man who wasn't there")) + w.add_document(id=u("2"), title=u("The dog who barked at midnight")) + w.add_document(id=u("3"), title=u("The invisible man")) + w.add_document(id=u("4"), title=u("The girl with the dragon tattoo")) + w.add_document(id=u("5"), title=u("The woman who disappeared")) + w.commit() + + with ix.searcher() as s: + # Parse the user query + parser = qparser.QueryParser("title", schema=ix.schema) + q = parser.parse(u("man")) + + # Extract the terms the user used in the field we're interested in + terms = 
[text for fieldname, text in q.all_terms() + if fieldname == "title"] + + # Perform the search + r = s.search(q) + assert len(r) == 2 + + # Use the same analyzer as the field uses. To be sure, you can + # do schema[fieldname].analyzer. Be careful not to do this + # on non-text field types such as DATETIME. + analyzer = schema["title"].analyzer + + # Since we want to highlight the full title, not extract fragments, + # we'll use WholeFragmenter. + nf = highlight.WholeFragmenter() + + # In this example we'll simply uppercase the matched terms + fmt = highlight.UppercaseFormatter() + + outputs = [] + for d in r: + text = d["title"] + outputs.append(highlight.highlight(text, terms, analyzer, nf, fmt)) + + assert outputs == ["The invisible MAN", "The MAN who wasn't there"] + + +def test_unstored(): + schema = fields.Schema(text=fields.TEXT, tags=fields.KEYWORD) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(text=u("alfa bravo charlie"), tags=u("delta echo")) + w.commit() + + hit = ix.searcher().search(query.Term("text", "bravo"))[0] + with pytest.raises(KeyError): + hit.highlights("tags") + + +def test_multifilter(): + iwf_for_index = analysis.IntraWordFilter(mergewords=True, mergenums=False) + iwf_for_query = analysis.IntraWordFilter(mergewords=False, mergenums=False) + mf = analysis.MultiFilter(index=iwf_for_index, query=iwf_for_query) + ana = analysis.RegexTokenizer() | mf | analysis.LowercaseFilter() + + schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True)) + with TempIndex(schema) as ix: + w = ix.writer() + w.add_document(text=u("Our BabbleTron5000 is great")) + w.commit() + + with ix.searcher() as s: + assert ("text", "5000") in s.reader() + hit = s.search(query.Term("text", "5000"))[0] + assert (hit.highlights("text") + == 'Our BabbleTron5000 is great') + + +def test_pinpoint(): + domain = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet " + "kilo lima mike november oskar papa quebec romeo sierra tango") + schema = fields.Schema(text=fields.TEXT(stored=True, chars=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(text=domain) + w.commit() + + assert ix.schema["text"].supports("characters") + with ix.searcher() as s: + r = s.search(query.Term("text", "juliet"), terms=True) + hit = r[0] + hi = highlight.Highlighter() + hi.formatter = highlight.UppercaseFormatter() + + assert not hi.can_load_chars(r, "text") + assert (hi.highlight_hit(hit, "text") + == "golf hotel india JULIET kilo lima mike november") + + hi.fragmenter = highlight.PinpointFragmenter() + assert hi.can_load_chars(r, "text") + assert (hi.highlight_hit(hit, "text") + == "ot golf hotel india JULIET kilo lima mike nove") + + hi.fragmenter.autotrim = True + assert (hi.highlight_hit(hit, "text") + == "golf hotel india JULIET kilo lima mike") + + +def test_highlight_wildcards(): + schema = fields.Schema(text=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(text=u("alfa bravo charlie delta cookie echo")) + + with ix.searcher() as s: + qp = qparser.QueryParser("text", ix.schema) + q = qp.parse(u("c*")) + r = s.search(q) + assert r.scored_length() == 1 + r.formatter = highlight.UppercaseFormatter() + hit = r[0] + assert hit.highlights("text") == "alfa bravo CHARLIE delta COOKIE echo" + + +def test_highlight_ngrams(): + schema = fields.Schema(text=fields.NGRAMWORDS(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(text=u("Multiplication and 
subtraction are good")) + + with ix.searcher() as s: + qp = qparser.QueryParser("text", ix.schema) + q = qp.parse(u("multiplication")) + r = s.search(q) + assert r.scored_length() == 1 + + r.fragmenter = highlight.SentenceFragmenter() + r.formatter = highlight.UppercaseFormatter() + snippet = r[0].highlights("text") + assert snippet == "MULTIPLICATIon and subtracTION are good" + + +def test_issue324(): + sa = analysis.StemmingAnalyzer() + result = highlight.highlight(u("Indexed!\n1"), [u("index")], sa, + fragmenter=highlight.ContextFragmenter(), + formatter=highlight.UppercaseFormatter()) + assert result == "INDEXED!\n1" + + +def test_whole_noterms(): + schema = fields.Schema(text=fields.TEXT(stored=True), tag=fields.KEYWORD) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(text=u("alfa bravo charlie delta echo foxtrot golf"), + tag=u("foo")) + + with ix.searcher() as s: + r = s.search(query.Term("text", u("delta"))) + assert len(r) == 1 + + r.fragmenter = highlight.WholeFragmenter() + r.formatter = highlight.UppercaseFormatter() + hi = r[0].highlights("text") + assert hi == u("alfa bravo charlie DELTA echo foxtrot golf") + + r = s.search(query.Term("tag", u("foo"))) + assert len(r) == 1 + r.fragmenter = highlight.WholeFragmenter() + r.formatter = highlight.UppercaseFormatter() + hi = r[0].highlights("text") + assert hi == u("") + + hi = r[0].highlights("text", minscore=0) + assert hi == u("alfa bravo charlie delta echo foxtrot golf") diff --git a/tests/test_indexing.py b/tests/test_indexing.py new file mode 100644 index 0000000..eecfdd2 --- /dev/null +++ b/tests/test_indexing.py @@ -0,0 +1,702 @@ +from __future__ import with_statement +import random +from collections import defaultdict +from datetime import datetime + +import pytest + +from whoosh import analysis, fields, index, qparser, query +from whoosh.compat import b, u, xrange, text_type, PY3, permutations +from whoosh.filedb.filestore import RamStorage +from whoosh.writing import IndexingError +from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.testing import TempIndex, TempStorage + + +def test_creation(): + s = fields.Schema(content=fields.TEXT(phrase=True), + title=fields.TEXT(stored=True), + path=fields.ID(stored=True), + tags=fields.KEYWORD(stored=True), + quick=fields.NGRAM, + note=fields.STORED) + st = RamStorage() + + ix = st.create_index(s) + w = ix.writer() + w.add_document(title=u("First"), content=u("This is the first document"), + path=u("/a"), tags=u("first second third"), + quick=u("First document"), + note=u("This is the first document")) + w.add_document(content=u("Let's try this again"), title=u("Second"), + path=u("/b"), tags=u("Uno Dos Tres"), + quick=u("Second document"), + note=u("This is the second document")) + w.commit() + + +def test_empty_commit(): + s = fields.Schema(id=fields.ID(stored=True)) + with TempIndex(s, "emptycommit") as ix: + w = ix.writer() + w.add_document(id=u("1")) + w.add_document(id=u("2")) + w.add_document(id=u("3")) + w.commit() + + w = ix.writer() + w.commit() + + +def test_version_in(): + from whoosh import __version__ + from whoosh import index + + with TempStorage("versionin") as st: + assert not index.exists(st) + + schema = fields.Schema(text=fields.TEXT) + ix = st.create_index(schema) + assert index.exists(st) + assert ix.is_empty() + + v = index.version(st) + assert v[0] == __version__ + assert v[1] == index._CURRENT_TOC_VERSION + + with ix.writer() as w: + w.add_document(text=u("alfa")) + + assert not 
ix.is_empty() + + +def test_simple_indexing(): + schema = fields.Schema(text=fields.TEXT, id=fields.STORED) + domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"), + u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"), + u("kilo"), u("lima"), u("mike"), u("november")) + docs = defaultdict(list) + with TempIndex(schema, "simple") as ix: + with ix.writer() as w: + for i in xrange(100): + smp = random.sample(domain, 5) + for word in smp: + docs[word].append(i) + w.add_document(text=u(" ").join(smp), id=i) + + with ix.searcher() as s: + for word in domain: + rset = sorted([hit["id"] for hit + in s.search(query.Term("text", word), + limit=None)]) + assert rset == docs[word] + + +def test_integrity(): + s = fields.Schema(name=fields.TEXT, value=fields.TEXT) + st = RamStorage() + ix = st.create_index(s) + + w = ix.writer() + w.add_document(name=u("Yellow brown"), value=u("Blue red green purple?")) + w.add_document(name=u("Alpha beta"), value=u("Gamma delta epsilon omega.")) + w.commit() + + w = ix.writer() + w.add_document(name=u("One two"), value=u("Three four five.")) + w.commit() + + tr = ix.reader() + assert ix.doc_count_all() == 3 + assert " ".join(tr.field_terms("name")) == "alpha beta brown one two yellow" + + +def test_lengths(): + s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True), + f2=fields.KEYWORD(stored=True, scorable=True)) + with TempIndex(s, "testlengths") as ix: + w = ix.writer() + items = u("ABCDEFG") + from itertools import cycle, islice + lengths = [10, 20, 2, 102, 45, 3, 420, 2] + for length in lengths: + w.add_document(f2=u(" ").join(islice(cycle(items), length))) + w.commit() + + with ix.reader() as dr: + ls1 = [dr.doc_field_length(i, "f1") + for i in xrange(0, len(lengths))] + assert ls1 == [0] * len(lengths) + ls2 = [dr.doc_field_length(i, "f2") + for i in xrange(0, len(lengths))] + assert ls2 == [byte_to_length(length_to_byte(l)) for l in lengths] + + +def test_many_lengths(): + domain = u("alfa bravo charlie delta echo").split() + schema = fields.Schema(text=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + for i, word in enumerate(domain): + length = (i + 1) ** 6 + w.add_document(text=" ".join(word for _ in xrange(length))) + w.commit() + + s = ix.searcher() + for i, word in enumerate(domain): + target = byte_to_length(length_to_byte((i + 1) ** 6)) + ti = s.term_info("text", word) + assert ti.min_length() == target + assert ti.max_length() == target + + +def test_lengths_ram(): + s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True), + f2=fields.KEYWORD(stored=True, scorable=True)) + st = RamStorage() + ix = st.create_index(s) + w = ix.writer() + w.add_document(f1=u("A B C D E"), f2=u("X Y Z")) + w.add_document(f1=u("B B B B C D D Q"), f2=u("Q R S T")) + w.add_document(f1=u("D E F"), f2=u("U V A B C D E")) + w.commit() + + dr = ix.reader() + assert dr.stored_fields(0)["f1"] == "A B C D E" + assert dr.doc_field_length(0, "f1") == 5 + assert dr.doc_field_length(1, "f1") == 8 + assert dr.doc_field_length(2, "f1") == 3 + assert dr.doc_field_length(0, "f2") == 3 + assert dr.doc_field_length(1, "f2") == 4 + assert dr.doc_field_length(2, "f2") == 7 + + assert dr.field_length("f1") == 16 + assert dr.field_length("f2") == 14 + assert dr.max_field_length("f1") == 8 + assert dr.max_field_length("f2") == 7 + + +def test_merged_lengths(): + s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True), + f2=fields.KEYWORD(stored=True, scorable=True)) + with TempIndex(s, "mergedlengths") as ix: + w = 
ix.writer() + w.add_document(f1=u("A B C"), f2=u("X")) + w.add_document(f1=u("B C D E"), f2=u("Y Z")) + w.commit() + + w = ix.writer() + w.add_document(f1=u("A"), f2=u("B C D E X Y")) + w.add_document(f1=u("B C"), f2=u("X")) + w.commit(merge=False) + + w = ix.writer() + w.add_document(f1=u("A B X Y Z"), f2=u("B C")) + w.add_document(f1=u("Y X"), f2=u("A B")) + w.commit(merge=False) + + with ix.reader() as dr: + assert dr.stored_fields(0)["f1"] == u("A B C") + assert dr.doc_field_length(0, "f1") == 3 + assert dr.doc_field_length(2, "f2") == 6 + assert dr.doc_field_length(4, "f1") == 5 + + +def test_frequency_keyword(): + s = fields.Schema(content=fields.KEYWORD) + st = RamStorage() + ix = st.create_index(s) + + w = ix.writer() + w.add_document(content=u("A B C D E")) + w.add_document(content=u("B B B B C D D")) + w.add_document(content=u("D E F")) + w.commit() + + with ix.reader() as tr: + assert tr.doc_frequency("content", u("B")) == 2 + assert tr.frequency("content", u("B")) == 5 + assert tr.doc_frequency("content", u("E")) == 2 + assert tr.frequency("content", u("E")) == 2 + assert tr.doc_frequency("content", u("A")) == 1 + assert tr.frequency("content", u("A")) == 1 + assert tr.doc_frequency("content", u("D")) == 3 + assert tr.frequency("content", u("D")) == 4 + assert tr.doc_frequency("content", u("F")) == 1 + assert tr.frequency("content", u("F")) == 1 + assert tr.doc_frequency("content", u("Z")) == 0 + assert tr.frequency("content", u("Z")) == 0 + + stats = [(fname, text, ti.doc_frequency(), ti.weight()) + for (fname, text), ti in tr] + + assert stats == [("content", b("A"), 1, 1), ("content", b("B"), 2, 5), + ("content", b("C"), 2, 2), ("content", b("D"), 3, 4), + ("content", b("E"), 2, 2), ("content", b("F"), 1, 1)] + + +def test_frequency_text(): + s = fields.Schema(content=fields.KEYWORD) + st = RamStorage() + ix = st.create_index(s) + + w = ix.writer() + w.add_document(content=u("alfa bravo charlie delta echo")) + w.add_document(content=u("bravo bravo bravo bravo charlie delta delta")) + w.add_document(content=u("delta echo foxtrot")) + w.commit() + + with ix.reader() as tr: + assert tr.doc_frequency("content", u("bravo")) == 2 + assert tr.frequency("content", u("bravo")) == 5 + assert tr.doc_frequency("content", u("echo")) == 2 + assert tr.frequency("content", u("echo")) == 2 + assert tr.doc_frequency("content", u("alfa")) == 1 + assert tr.frequency("content", u("alfa")) == 1 + assert tr.doc_frequency("content", u("delta")) == 3 + assert tr.frequency("content", u("delta")) == 4 + assert tr.doc_frequency("content", u("foxtrot")) == 1 + assert tr.frequency("content", u("foxtrot")) == 1 + assert tr.doc_frequency("content", u("zulu")) == 0 + assert tr.frequency("content", u("zulu")) == 0 + + stats = [(fname, text, ti.doc_frequency(), ti.weight()) + for (fname, text), ti in tr] + + assert stats == [("content", b("alfa"), 1, 1), + ("content", b("bravo"), 2, 5), + ("content", b("charlie"), 2, 2), + ("content", b("delta"), 3, 4), + ("content", b("echo"), 2, 2), + ("content", b("foxtrot"), 1, 1)] + + +def test_deletion(): + s = fields.Schema(key=fields.ID, name=fields.TEXT, value=fields.TEXT) + with TempIndex(s, "deletion") as ix: + w = ix.writer() + w.add_document(key=u("A"), name=u("Yellow brown"), + value=u("Blue red green purple?")) + w.add_document(key=u("B"), name=u("Alpha beta"), + value=u("Gamma delta epsilon omega.")) + w.add_document(key=u("C"), name=u("One two"), + value=u("Three four five.")) + w.commit() + + w = ix.writer() + assert w.delete_by_term("key", u("B")) == 1 + 
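+        # commit(merge=False) keeps the existing segments as they are, so the
+        # document deleted above is only marked as deleted: doc_count_all()
+        # still counts it below, while doc_count() reports live documents only.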
w.commit(merge=False) + + assert ix.doc_count_all() == 3 + assert ix.doc_count() == 2 + + w = ix.writer() + w.add_document(key=u("A"), name=u("Yellow brown"), + value=u("Blue red green purple?")) + w.add_document(key=u("B"), name=u("Alpha beta"), + value=u("Gamma delta epsilon omega.")) + w.add_document(key=u("C"), name=u("One two"), + value=u("Three four five.")) + w.commit() + + # This will match both documents with key == B, one of which is already + # deleted. This should not raise an error. + w = ix.writer() + assert w.delete_by_term("key", u("B")) == 1 + w.commit() + + ix.optimize() + assert ix.doc_count_all() == 4 + assert ix.doc_count() == 4 + + with ix.reader() as tr: + assert " ".join(tr.field_terms("name")) == "brown one two yellow" + + +def test_writer_reuse(): + s = fields.Schema(key=fields.ID) + ix = RamStorage().create_index(s) + + w = ix.writer() + w.add_document(key=u("A")) + w.add_document(key=u("B")) + w.add_document(key=u("C")) + w.commit() + + # You can't re-use a commited/canceled writer + pytest.raises(IndexingError, w.add_document, key=u("D")) + pytest.raises(IndexingError, w.update_document, key=u("B")) + pytest.raises(IndexingError, w.delete_document, 0) + pytest.raises(IndexingError, w.add_reader, None) + pytest.raises(IndexingError, w.add_field, "name", fields.ID) + pytest.raises(IndexingError, w.remove_field, "key") + pytest.raises(IndexingError, w.searcher) + + +def test_update(): + # Test update with multiple unique keys + SAMPLE_DOCS = [{"id": u("test1"), "path": u("/test/1"), + "text": u("Hello")}, + {"id": u("test2"), "path": u("/test/2"), + "text": u("There")}, + {"id": u("test3"), "path": u("/test/3"), + "text": u("Reader")}, + ] + + schema = fields.Schema(id=fields.ID(unique=True, stored=True), + path=fields.ID(unique=True, stored=True), + text=fields.TEXT) + + with TempIndex(schema, "update") as ix: + with ix.writer() as w: + for doc in SAMPLE_DOCS: + w.add_document(**doc) + + with ix.writer() as w: + w.update_document(id=u("test2"), path=u("test/1"), + text=u("Replacement")) + + +def test_update2(): + schema = fields.Schema(key=fields.ID(unique=True, stored=True), + p=fields.ID(stored=True)) + with TempIndex(schema, "update2") as ix: + nums = list(range(21)) + random.shuffle(nums) + for i, n in enumerate(nums): + w = ix.writer() + w.update_document(key=text_type(n % 10), p=text_type(i)) + w.commit() + + with ix.searcher() as s: + results = [d["key"] for _, d in s.iter_docs()] + results = " ".join(sorted(results)) + assert results == "0 1 2 3 4 5 6 7 8 9" + + +def test_update_numeric(): + schema = fields.Schema(num=fields.NUMERIC(unique=True, stored=True), + text=fields.ID(stored=True)) + with TempIndex(schema, "updatenum") as ix: + nums = list(range(5)) * 3 + random.shuffle(nums) + for num in nums: + with ix.writer() as w: + w.update_document(num=num, text=text_type(num)) + + with ix.searcher() as s: + results = [d["text"] for _, d in s.iter_docs()] + results = " ".join(sorted(results)) + assert results == "0 1 2 3 4" + + +def test_reindex(): + sample_docs = [ + {'id': u('test1'), + 'text': u('This is a document. Awesome, is it not?')}, + {'id': u('test2'), 'text': u('Another document. 
Astounding!')}, + {'id': u('test3'), + 'text': u('A fascinating article on the behavior of domestic ' + 'steak knives.')}, + ] + + schema = fields.Schema(text=fields.TEXT(stored=True), + id=fields.ID(unique=True, stored=True)) + with TempIndex(schema, "reindex") as ix: + def reindex(): + writer = ix.writer() + for doc in sample_docs: + writer.update_document(**doc) + writer.commit() + + reindex() + assert ix.doc_count() == 3 + reindex() + assert ix.doc_count() == 3 + + +def test_noscorables1(): + values = [u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"), + u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"), + u("kilo"), u("lima")] + from random import choice, sample, randint + + times = 1000 + + schema = fields.Schema(id=fields.ID, tags=fields.KEYWORD) + with TempIndex(schema, "noscorables1") as ix: + w = ix.writer() + for _ in xrange(times): + w.add_document(id=choice(values), + tags=u(" ").join(sample(values, randint(2, 7)))) + w.commit() + + with ix.searcher() as s: + s.search(query.Term("id", "bravo")) + + +def test_noscorables2(): + schema = fields.Schema(field=fields.ID) + with TempIndex(schema, "noscorables2") as ix: + writer = ix.writer() + writer.add_document(field=u('foo')) + writer.commit() + + +def test_multi(): + schema = fields.Schema(id=fields.ID(stored=True), + content=fields.KEYWORD(stored=True)) + with TempIndex(schema, "multi") as ix: + writer = ix.writer() + # Deleted 1 + writer.add_document(id=u("1"), content=u("alfa bravo charlie")) + # Deleted 1 + writer.add_document(id=u("2"), content=u("bravo charlie delta echo")) + # Deleted 2 + writer.add_document(id=u("3"), content=u("charlie delta echo foxtrot")) + writer.commit() + + writer = ix.writer() + writer.delete_by_term("id", "1") + writer.delete_by_term("id", "2") + writer.add_document(id=u("4"), content=u("apple bear cherry donut")) + writer.add_document(id=u("5"), content=u("bear cherry donut eggs")) + # Deleted 2 + writer.add_document(id=u("6"), content=u("delta echo foxtrot golf")) + # no d + writer.add_document(id=u("7"), content=u("echo foxtrot golf hotel")) + writer.commit(merge=False) + + writer = ix.writer() + writer.delete_by_term("id", "3") + writer.delete_by_term("id", "6") + writer.add_document(id=u("8"), content=u("cherry donut eggs falafel")) + writer.add_document(id=u("9"), content=u("donut eggs falafel grape")) + writer.add_document(id=u("A"), content=u(" foxtrot golf hotel india")) + writer.commit(merge=False) + + assert ix.doc_count() == 6 + + with ix.searcher() as s: + r = s.search(query.Prefix("content", u("d")), optimize=False) + assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"] + + r = s.search(query.Prefix("content", u("d"))) + assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"] + + r = s.search(query.Prefix("content", u("d")), limit=None) + assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"] + + +def test_deleteall(): + schema = fields.Schema(text=fields.TEXT) + with TempIndex(schema, "deleteall") as ix: + w = ix.writer() + domain = u("alfa bravo charlie delta echo").split() + for i, ls in enumerate(permutations(domain)): + w.add_document(text=u(" ").join(ls)) + if not i % 10: + w.commit() + w = ix.writer() + w.commit() + + # This is just a test, don't use this method to delete all docs IRL! 
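+        # delete_document() takes raw document numbers, so the loop below walks
+        # every docnum in range(doc_count_all()) to empty the index one document
+        # at a time.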
+ doccount = ix.doc_count_all() + w = ix.writer() + for docnum in xrange(doccount): + w.delete_document(docnum) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Or([query.Term("text", u("alfa")), + query.Term("text", u("bravo"))])) + assert len(r) == 0 + + ix.optimize() + assert ix.doc_count_all() == 0 + + with ix.reader() as r: + assert list(r) == [] + + +def test_simple_stored(): + schema = fields.Schema(a=fields.ID(stored=True), b=fields.ID(stored=False)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(a=u("alfa"), b=u("bravo")) + with ix.searcher() as s: + sf = s.stored_fields(0) + assert sf == {"a": "alfa"} + + +def test_single(): + schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) + with TempIndex(schema, "single") as ix: + w = ix.writer() + w.add_document(id=u("1"), text=u("alfa")) + w.commit() + + with ix.searcher() as s: + assert ("text", u("alfa")) in s.reader() + assert list(s.documents(id="1")) == [{"id": "1"}] + assert list(s.documents(text="alfa")) == [{"id": "1"}] + assert list(s.all_stored_fields()) == [{"id": "1"}] + + +def test_indentical_fields(): + schema = fields.Schema(id=fields.STORED, + f1=fields.TEXT, f2=fields.TEXT, f3=fields.TEXT) + with TempIndex(schema, "identifields") as ix: + w = ix.writer() + w.add_document(id=1, f1=u("alfa"), f2=u("alfa"), f3=u("alfa")) + w.commit() + + with ix.searcher() as s: + assert list(s.lexicon("f1")) == [b("alfa")] + assert list(s.lexicon("f2")) == [b("alfa")] + assert list(s.lexicon("f3")) == [b("alfa")] + assert list(s.documents(f1="alfa")) == [{"id": 1}] + assert list(s.documents(f2="alfa")) == [{"id": 1}] + assert list(s.documents(f3="alfa")) == [{"id": 1}] + + +def test_multivalue(): + ana = analysis.StemmingAnalyzer() + schema = fields.Schema(id=fields.STORED, date=fields.DATETIME, + num=fields.NUMERIC, + txt=fields.TEXT(analyzer=ana)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=1, date=datetime(2001, 1, 1), num=5) + w.add_document(id=2, date=[datetime(2002, 2, 2), datetime(2003, 3, 3)], + num=[1, 2, 3, 12]) + w.add_document(txt=u("a b c").split()) + + with ix.reader() as r: + assert ("num", 3) in r + assert ("date", datetime(2003, 3, 3)) in r + assert " ".join(r.field_terms("txt")) == "a b c" + + +def test_multi_language(): + # Analyzer for English + ana_eng = analysis.StemmingAnalyzer() + + # analyzer for Pig Latin + def stem_piglatin(w): + if w.endswith("ay"): + w = w[:-2] + return w + ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"], + stemfn=stem_piglatin) + + # Dictionary mapping languages to analyzers + analyzers = {"eng": ana_eng, "pig": ana_pig} + + # Fake documents + corpus = [(u("eng"), u("Such stuff as dreams are made on")), + (u("pig"), u("Otay ebay, roay otnay otay ebay"))] + + schema = fields.Schema(content=fields.TEXT(stored=True), + lang=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + + with ix.writer() as w: + for doclang, content in corpus: + ana = analyzers[doclang] + # "Pre-analyze" the field into token strings + words = [token.text for token in ana(content)] + # Note we store the original value but index the pre-analyzed words + w.add_document(lang=doclang, content=words, + _stored_content=content) + + with ix.searcher() as s: + schema = s.schema + + # Modify the schema to fake the correct analyzer for the language + # we're searching in + schema["content"].analyzer = analyzers["eng"] + + qp = qparser.QueryParser("content", schema) + q = qp.parse("dreaming") + r = 
s.search(q) + assert len(r) == 1 + assert r[0]["content"] == "Such stuff as dreams are made on" + + schema["content"].analyzer = analyzers["pig"] + qp = qparser.QueryParser("content", schema) + q = qp.parse("otnay") + r = s.search(q) + assert len(r) == 1 + assert r[0]["content"] == "Otay ebay, roay otnay otay ebay" + + +def test_doc_boost(): + schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, a=u("alfa alfa alfa"), b=u("bravo")) + w.add_document(id=1, a=u("alfa"), b=u("bear"), _a_boost=5.0) + w.add_document(id=2, a=u("alfa alfa alfa alfa"), _boost=0.5) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Term("a", "alfa")) + assert [hit["id"] for hit in r] == [1, 0, 2] + + w = ix.writer() + w.add_document(id=3, a=u("alfa"), b=u("bottle")) + w.add_document(id=4, b=u("bravo"), _b_boost=2.0) + w.commit(merge=False) + + with ix.searcher() as s: + r = s.search(query.Term("a", "alfa")) + assert [hit["id"] for hit in r] == [1, 0, 3, 2] + + +def test_globfield_length_merge(): + # Issue 343 + + schema = fields.Schema(title=fields.TEXT(stored=True), + path=fields.ID(stored=True)) + schema.add("*_text", fields.TEXT, glob=True) + + with TempIndex(schema, "globlenmerge") as ix: + with ix.writer() as w: + w.add_document(title=u("First document"), path=u("/a"), + content_text=u("This is the first document we've added!")) + + with ix.writer() as w: + w.add_document(title=u("Second document"), path=u("/b"), + content_text=u("The second document is even more interesting!")) + + with ix.searcher() as s: + docnum = s.document_number(path="/a") + assert s.doc_field_length(docnum, "content_text") is not None + + qp = qparser.QueryParser("content", schema) + q = qp.parse("content_text:document") + r = s.search(q) + paths = sorted(hit["path"] for hit in r) + assert paths == ["/a", "/b"] + + +def test_index_decimals(): + from decimal import Decimal + + schema = fields.Schema(name=fields.KEYWORD(stored=True), + num=fields.NUMERIC(int)) + ix = RamStorage().create_index(schema) + + with ix.writer() as w: + with pytest.raises(TypeError): + w.add_document(name=u("hello"), num=Decimal("3.2")) + + schema = fields.Schema(name=fields.KEYWORD(stored=True), + num=fields.NUMERIC(Decimal, decimal_places=5)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(name=u("hello"), num=Decimal("3.2")) + + + diff --git a/tests/test_matching.py b/tests/test_matching.py new file mode 100644 index 0000000..75bec7e --- /dev/null +++ b/tests/test_matching.py @@ -0,0 +1,556 @@ +from __future__ import with_statement +from random import randint, choice, sample + +from whoosh import fields, matching, qparser, query +from whoosh.compat import b, u, xrange, permutations +from whoosh.filedb.filestore import RamStorage +from whoosh.query import And, Term +from whoosh.util import make_binary_tree +from whoosh.scoring import WeightScorer + + +def _keys(searcher, docnums): + return sorted([searcher.stored_fields(docnum)['key'] + for docnum in docnums]) + + +def test_nullmatcher(): + nm = matching.NullMatcher() + assert not nm.is_active() + assert list(nm.all_ids()) == [] + + +def test_listmatcher(): + ids = [1, 2, 5, 9, 10] + + lm = matching.ListMatcher(ids) + ls = [] + while lm.is_active(): + ls.append((lm.id(), lm.score())) + lm.next() + assert ls == [(1, 1.0), (2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)] + + lm = matching.ListMatcher(ids) + assert list(lm.all_ids()) == ids + + lm = matching.ListMatcher(ids, 
position=3) + ls = [] + while lm.is_active(): + ls.append(lm.id()) + lm.next() + assert ls == [9, 10] + + lm = matching.ListMatcher(ids) + for _ in xrange(3): + lm.next() + lm = lm.copy() + ls = [] + while lm.is_active(): + ls.append(lm.id()) + lm.next() + assert ls == [9, 10] + + +def test_listmatcher_skip_to_quality_identical_scores(): + ids = [1, 2, 5, 9, 10] + lm = matching.ListMatcher(ids, scorer=WeightScorer(1.0)) + lm.skip_to_quality(0.3) + ls = [] + while lm.is_active(): + ls.append((lm.id(), lm.score())) + lm.next() + assert ls == [(1, 1.0), (2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)] + + +def test_wrapper(): + wm = matching.WrappingMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), + boost=2.0) + ls = [] + while wm.is_active(): + ls.append((wm.id(), wm.score())) + wm.next() + assert ls == [(1, 2.0), (2, 2.0), (5, 2.0), (9, 2.0), (10, 2.0)] + + ids = [1, 2, 5, 9, 10] + wm = matching.WrappingMatcher(matching.ListMatcher(ids), boost=2.0) + assert list(wm.all_ids()) == ids + + +def test_filter(): + lm = lambda: matching.ListMatcher(list(range(2, 10))) + + fm = matching.FilterMatcher(lm(), frozenset([3, 9])) + assert list(fm.all_ids()) == [3, 9] + + fm = matching.FilterMatcher(lm(), frozenset([1, 5, 9, 13])) + assert list(fm.all_ids()) == [5, 9] + + +def test_exclude(): + em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), + frozenset([2, 9]), exclude=True) + assert list(em.all_ids()) == [1, 5, 10] + + em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), + frozenset([2, 9]), exclude=True) + assert list(em.all_ids()) == [1, 5, 10] + + em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), + frozenset([2, 9]), exclude=True) + em.next() + em.next() + em = em.copy() + ls = [] + while em.is_active(): + ls.append(em.id()) + em.next() + assert ls == [10] + + +def test_simple_union(): + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + um = matching.UnionMatcher(lm1, lm2) + ls = [] + while um.is_active(): + ls.append((um.id(), um.score())) + um.next() + assert ls == [(0, 1.0), (1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + um = matching.UnionMatcher(lm1, lm2) + assert list(um.all_ids()) == [0, 1, 4, 10, 20, 90] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + um = matching.UnionMatcher(lm1, lm2) + um.next() + um.next() + um = um.copy() + ls = [] + while um.is_active(): + ls.append(um.id()) + um.next() + assert ls == [4, 10, 20, 90] + + +def test_simple_intersection(): + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + im = matching.IntersectionMatcher(lm1, lm2) + ls = [] + while im.is_active(): + ls.append((im.id(), im.score())) + im.next() + assert ls == [(4, 2.0), (20, 2.0)] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + im = matching.IntersectionMatcher(lm1, lm2) + assert list(im.all_ids()) == [4, 20] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + im = matching.IntersectionMatcher(lm1, lm2) + im.next() + im.next() + im = im.copy() + ls = [] + while im.is_active(): + ls.append(im.id()) + im.next() + assert not ls + + +def test_andnot(): + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + anm = matching.AndNotMatcher(lm1, lm2) + ls = [] + while anm.is_active(): + ls.append((anm.id(), anm.score())) + 
anm.next() + assert ls == [(1, 1.0), (10, 1.0), (90, 1.0)] + + echo_lm = matching.ListMatcher([0, 1, 2, 3, 4]) + bravo_lm = matching.ListMatcher([0, 1]) + anm = matching.AndNotMatcher(echo_lm, bravo_lm) + assert list(anm.all_ids()) == [2, 3, 4] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + anm = matching.AndNotMatcher(lm1, lm2) + assert list(anm.all_ids()) == [1, 10, 90] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + anm = matching.AndNotMatcher(lm1, lm2) + anm.next() + anm.next() + anm = anm.copy() + ls = [] + while anm.is_active(): + ls.append(anm.id()) + anm.next() + assert ls == [90] + + +def test_require(): + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + rm = matching.RequireMatcher(lm1, lm2) + ls = [] + while rm.is_active(): + ls.append((rm.id(), rm.score())) + rm.next() + assert ls == [(4, 1.0), (20, 1.0)] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + rm = matching.RequireMatcher(lm1, lm2) + assert list(rm.all_ids()) == [4, 20] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + rm = matching.RequireMatcher(lm1, lm2) + rm.next() + rm.next() + rm = rm.copy() + ls = [] + while rm.is_active(): + ls.append(rm.id()) + rm.next() + assert not ls + + +def test_andmaybe(): + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + amm = matching.AndMaybeMatcher(lm1, lm2) + ls = [] + while amm.is_active(): + ls.append((amm.id(), amm.score())) + amm.next() + assert ls == [(1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + amm = matching.AndMaybeMatcher(lm1, lm2) + assert list(amm.all_ids()) == [1, 4, 10, 20, 90] + + lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) + lm2 = matching.ListMatcher([0, 4, 20]) + amm = matching.AndMaybeMatcher(lm1, lm2) + amm.next() + amm.next() + amm = amm.copy() + ls = [] + while amm.is_active(): + ls.append(amm.id()) + amm.next() + assert ls == [10, 20, 90] + + +def test_intersection(): + schema = fields.Schema(key=fields.ID(stored=True), + value=fields.TEXT(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(key=u("a"), value=u("alpha bravo charlie delta")) + w.add_document(key=u("b"), value=u("echo foxtrot alpha bravo")) + w.add_document(key=u("c"), value=u("charlie delta golf hotel")) + w.commit() + + w = ix.writer() + w.add_document(key=u("d"), value=u("india alpha bravo charlie")) + w.add_document(key=u("e"), value=u("delta bravo india bravo")) + w.commit() + + with ix.searcher() as s: + q = And([Term("value", u("bravo")), Term("value", u("delta"))]) + m = q.matcher(s) + assert _keys(s, m.all_ids()) == ["a", "e"] + + q = And([Term("value", u("bravo")), Term("value", u("alpha"))]) + m = q.matcher(s) + assert _keys(s, m.all_ids()) == ["a", "b", "d"] + + +def test_random_intersections(): + domain = [u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"), + u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"), + u("kilo"), u("lima"), u("mike")] + segments = 5 + docsperseg = 50 + fieldlimits = (3, 10) + documents = [] + + schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + + # Create docsperseg * segments documents containing random words from + # the domain list. 
Add the documents to the index, but also keep them + # in the "documents" list for the sanity check + for i in xrange(segments): + w = ix.writer() + for j in xrange(docsperseg): + docnum = i * docsperseg + j + # Create a string of random words + doc = u(" ").join(choice(domain) + for _ in xrange(randint(*fieldlimits))) + # Add the string to the index + w.add_document(key=docnum, value=doc) + # Add a (docnum, string) tuple to the documents list + documents.append((docnum, doc)) + w.commit() + assert len(ix._segments()) != 1 + + testcount = 20 + testlimits = (2, 5) + + with ix.searcher() as s: + for i in xrange(s.doc_count_all()): + assert s.stored_fields(i).get("key") is not None + + for _ in xrange(testcount): + # Create a random list of words and manually do an intersection of + # items in "documents" that contain the words ("target"). + words = sample(domain, randint(*testlimits)) + target = [] + for docnum, doc in documents: + if all((doc.find(w) > -1) for w in words): + target.append(docnum) + target.sort() + + # Create a query from the list of words and get two matchers from + # it. + q = And([Term("value", w) for w in words]) + m1 = q.matcher(s) + m2 = q.matcher(s) + + # Try getting the list of IDs from all_ids() + ids1 = list(m1.all_ids()) + + # Try getting the list of IDs using id()/next() + ids2 = [] + while m2.is_active(): + ids2.append(m2.id()) + m2.next() + + # Check that the two methods return the same list + assert ids1 == ids2 + + # Check that the IDs match the ones we manually calculated + assert _keys(s, ids1) == target + + +def test_union(): + s1 = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8]) + s2 = matching.ListMatcher([2, 4, 8, 10, 20, 30]) + s3 = matching.ListMatcher([10, 100, 200]) + target = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 100, 200] + um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3)) + assert target == list(um.all_ids()) + + +def test_union_scores(): + s1 = matching.ListMatcher([1, 2, 3]) + s2 = matching.ListMatcher([2, 4, 8]) + s3 = matching.ListMatcher([2, 3, 8]) + target = [(1, 1.0), (2, 3.0), (3, 2.0), (4, 1.0), (8, 2.0)] + um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3)) + result = [] + while um.is_active(): + result.append((um.id(), um.score())) + um.next() + assert target == result + + +def test_random_union(): + testcount = 100 + rangelimits = (2, 10) + clauselimits = (2, 10) + + vals = list(range(100)) + + for _ in xrange(testcount): + target = set() + matchers = [] + for _ in xrange(randint(*clauselimits)): + nums = sample(vals, randint(*rangelimits)) + target = target.union(nums) + matchers.append(matching.ListMatcher(sorted(nums))) + target = sorted(target) + um = make_binary_tree(matching.UnionMatcher, matchers) + assert list(um.all_ids()) == target + + +def test_inverse(): + s = matching.ListMatcher([1, 5, 10, 11, 13]) + inv = matching.InverseMatcher(s, 15) + ids = [] + while inv.is_active(): + ids.append(inv.id()) + inv.next() + assert ids == [0, 2, 3, 4, 6, 7, 8, 9, 12, 14] + + +def test_inverse_skip(): + s = matching.ListMatcher([1, 5, 10, 11, 13]) + inv = matching.InverseMatcher(s, 15) + inv.skip_to(8) + + ids = [] + while inv.is_active(): + ids.append(inv.id()) + inv.next() + assert ids == [8, 9, 12, 14] + + +def test_empty_andnot(): + pos = matching.NullMatcher() + neg = matching.NullMatcher() + anm = matching.AndNotMatcher(pos, neg) + assert not anm.is_active() + assert not list(anm.all_ids()) + + pos = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + neg = matching.NullMatcher() + ans = 
matching.AndNotMatcher(pos, neg) + ids = list(ans.all_ids()) + assert ids == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + +def test_random_andnot(): + testcount = 100 + rangesize = 100 + + rng = list(range(rangesize)) + + for _ in xrange(testcount): + negs = sorted(sample(rng, randint(0, rangesize - 1))) + negset = frozenset(negs) + matched = [n for n in rng if n not in negset] + + pos = matching.ListMatcher(rng) + neg = matching.ListMatcher(negs) + + anm = matching.AndNotMatcher(pos, neg) + ids = list(anm.all_ids()) + assert ids == matched + + +def test_current_terms(): + domain = u("alfa bravo charlie delta").split() + schema = fields.Schema(text=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + for ls in permutations(domain, 3): + w.add_document(text=" ".join(ls), _stored_text=ls) + w.commit() + + with ix.searcher() as s: + q = query.And([query.Term("text", "alfa"), + query.Term("text", "charlie")]) + m = q.matcher(s) + + while m.is_active(): + assert sorted(m.matching_terms()) == [("text", b("alfa")), ("text", b("charlie"))] + m.next() + + +def test_exclusion(): + from datetime import datetime + + schema = fields.Schema(id=fields.ID(stored=True), date=fields.DATETIME) + ix = RamStorage().create_index(schema) + dt1 = datetime(1950, 1, 1) + dt2 = datetime(1960, 1, 1) + with ix.writer() as w: + # Make one document with date == dt1, then 39 documents with + # date == dt2 (i.e. dates != dt1). + for i in xrange(40): + w.add_document(id=u(str(i)), date=(dt2 if i >= 1 else dt1)) + + with ix.searcher() as s: + qp = qparser.QueryParser("id", schema) + # Find documents where date != dt1 + q = qp.parse("NOT (date:(19500101000000))") + + r = s.search(q, limit=None) + assert len(r) == 39 # Total number of matched documents + assert r.scored_length() == 39 # Number of docs in the results + + +def test_arrayunion(): + l1 = matching.ListMatcher([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) + l2 = matching.ListMatcher([100, 200, 300, 400, 500, 600]) + aum = matching.ArrayUnionMatcher([l1, l2], 600, partsize=5) + assert aum.id() == 10 + aum.skip_to(45) + assert aum.id() == 50 + aum.skip_to(550) + assert aum.id() == 600 + + +def test_arrayunion2(): + l1 = matching.ListMatcher([1, 2]) + l2 = matching.ListMatcher([1, 2, 10, 20]) + l3 = matching.ListMatcher([1, 5, 10, 50]) + aum = matching.ArrayUnionMatcher([l1, l2, l3], 51, partsize=2) + + assert aum.id() == 1 + assert not l1.is_active() + aum.skip_to(50) + assert aum.id() == 50 + + +def test_every_matcher(): + class MyQuery(query.Query): + def __init__(self, subqs): + self.subqs = subqs + + def estimate_min_size(self, ixreader): + return ixreader.doc_count() + + def matcher(self, searcher, context=None): + # Get matchers for the sub-queries + children = [q.matcher(searcher, context) for q in self.subqs] + # Pass the child matchers, the number of documents in the searcher, + # and a reference to the searcher's is_deleted() method to the + # matcher + return MyMatcher(children, searcher.doc_count_all(), + searcher.is_deleted) + + class MyMatcher(matching.UnionMatcher): + def __init__(self, children, doccount, is_deleted): + self.children = children + self._id = 0 + self.doccount = doccount + self.is_deleted = is_deleted + + def is_active(self): + return self._id < self.doccount + + def id(self): + return self._id + + def next(self): + self._id += 1 + while self._id < self.doccount and self.is_deleted(self._id): + self._id += 1 + + def score(self): + # Iterate through the sub-matchers + for child in self.children: + # If the matcher is on the 
current document, do something + # with its score + if child.is_active() and child.id() == self.id(): + # Something here + pass + return 0 + diff --git a/tests/test_misc.py b/tests/test_misc.py new file mode 100644 index 0000000..a2a5ffc --- /dev/null +++ b/tests/test_misc.py @@ -0,0 +1,161 @@ +from __future__ import with_statement +import os, threading, time + +from whoosh.compat import u +from whoosh.util.filelock import try_for +from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.testing import TempStorage + + +def test_now(): + from whoosh.util import now + + t1 = now() + t2 = now() + assert t1 <= t2 + + +def test_storage_creation(): + import tempfile, uuid + from whoosh import fields + from whoosh.filedb.filestore import FileStorage + + schema = fields.Schema(text=fields.TEXT) + uid = uuid.uuid4() + dirpath = os.path.join(tempfile.gettempdir(), str(uid)) + assert not os.path.exists(dirpath) + + st = FileStorage(dirpath) + st.create() + assert os.path.exists(dirpath) + + ix = st.create_index(schema) + with ix.writer() as w: + w.add_document(text=u("alfa bravo")) + w.add_document(text=u("bracho charlie")) + + st.destroy() + assert not os.path.exists(dirpath) + + +def test_ramstorage(): + from whoosh.filedb.filestore import RamStorage + + st = RamStorage() + lock = st.lock("test") + lock.acquire() + lock.release() + + +def test_filelock_simple(): + with TempStorage("simplefilelock") as st: + lock1 = st.lock("testlock") + lock2 = st.lock("testlock") + assert lock1 is not lock2 + + assert lock1.acquire() + assert st.file_exists("testlock") + assert not lock2.acquire() + lock1.release() + assert lock2.acquire() + assert not lock1.acquire() + lock2.release() + + +def test_threaded_filelock(): + with TempStorage("threadedfilelock") as st: + lock1 = st.lock("testlock") + result = [] + + # The thread function tries to acquire the lock and then quits + def fn(): + lock2 = st.lock("testlock") + gotit = try_for(lock2.acquire, 1.0, 0.1) + if gotit: + result.append(True) + lock2.release() + t = threading.Thread(target=fn) + + # Acquire the lock in this thread + lock1.acquire() + # Start the other thread trying to acquire the lock + t.start() + # Wait for a bit + time.sleep(0.15) + # Release the lock + lock1.release() + # Wait for the other thread to finish + t.join() + # If the other thread got the lock, it should have appended True to the + # "results" list. 
+ assert result == [True] + + +def test_length_byte(): + source = list(range(11)) + xform = [length_to_byte(n) for n in source] + result = [byte_to_length(n) for n in xform] + assert source == result + + +def test_clockface_lru(): + from whoosh.util.cache import clockface_lru_cache + + @clockface_lru_cache(5) + def test(n): + return n * 2 + + result = [test(n) for n in (1, 2, 3, 4, 5, 4, 3, 2, 10, 1)] + assert result == [2, 4, 6, 8, 10, 8, 6, 4, 20, 2] + assert test.cache_info() == (3, 7, 5, 5) + test.cache_clear() + assert test.cache_info() == (0, 0, 5, 0) + + +def test_double_barrel_lru(): + from whoosh.util.cache import lru_cache + + @lru_cache(5) + def test(n): + return n * 2 + + result = [test(n) for n in (1, 2, 3, 4, 5, 4, 3, 2, 10, 1)] + assert result == [2, 4, 6, 8, 10, 8, 6, 4, 20, 2] + # # hits, misses, maxsize and currsize + # assert test.cache_info() == (4, 6, 5, 5) + test.cache_clear() + # assert test.cache_info() == (0, 0, 5, 0) + + +def test_version_object(): + from whoosh.util.versions import SimpleVersion as sv + + assert sv.parse("1") == sv(1) + assert sv.parse("1.2") == sv(1, 2) + assert sv.parse("1.2b") == sv(1, 2, ex="b") + assert sv.parse("1.2rc") == sv(1, 2, ex="rc") + assert sv.parse("1.2b3") == sv(1, 2, ex="b", exnum=3) + assert sv.parse("1.2.3") == sv(1, 2, 3) + assert sv.parse("1.2.3a") == sv(1, 2, 3, "a") + assert sv.parse("1.2.3rc") == sv(1, 2, 3, "rc") + assert sv.parse("1.2.3a4") == sv(1, 2, 3, "a", 4) + assert sv.parse("1.2.3rc2") == sv(1, 2, 3, "rc", 2) + assert sv.parse("999.999.999c999") == sv(999, 999, 999, "c", 999) + + assert sv.parse("1.2") == sv.parse("1.2") + assert sv("1.2") != sv("1.3") + assert sv.parse("1.0") < sv.parse("1.1") + assert sv.parse("1.0") < sv.parse("2.0") + assert sv.parse("1.2.3a4") < sv.parse("1.2.3a5") + assert sv.parse("1.2.3a5") > sv.parse("1.2.3a4") + assert sv.parse("1.2.3c99") < sv.parse("1.2.4") + assert sv.parse("1.2.3a4") != sv.parse("1.2.3a5") + assert sv.parse("1.2.3a5") != sv.parse("1.2.3a4") + assert sv.parse("1.2.3c99") != sv.parse("1.2.4") + assert sv.parse("1.2.3a4") <= sv.parse("1.2.3a5") + assert sv.parse("1.2.3a5") >= sv.parse("1.2.3a4") + assert sv.parse("1.2.3c99") <= sv.parse("1.2.4") + assert sv.parse("1.2") <= sv.parse("1.2") + + assert sv(1, 2, 3).to_int() == 17213488128 + assert sv.from_int(17213488128) == sv(1, 2, 3) diff --git a/tests/test_mpwriter.py b/tests/test_mpwriter.py new file mode 100644 index 0000000..421bcce --- /dev/null +++ b/tests/test_mpwriter.py @@ -0,0 +1,277 @@ +from __future__ import with_statement +import random +from collections import deque + +import pytest + +from whoosh import fields, query +from whoosh.compat import u, izip, xrange, permutations +from whoosh.util.numeric import length_to_byte, byte_to_length +from whoosh.util.testing import TempIndex + + +def check_multi(): + try: + import multiprocessing + import multiprocessing.synchronize # @UnusedImport + except ImportError: + pytest.skip() + else: + try: + from multiprocessing import Queue + Queue() + except OSError: + pytest.skip() + else: + return False + + +def _byten(n): + return byte_to_length(length_to_byte(n)) + + +def _do_basic(writerclass): + # Create the domain data + + # List of individual words added to the index + words = [] + # List of string values added to the index + docs = [] + # A ring buffer for creating string values + buf = deque() + for ls in permutations(u("abcd")): + word = "".join(ls) + # Remember this word is in the index (to check lexicon) + words.append(word) + + # Add this word on to the end, 
pop the first word off to create N word + # documents where N <= 10 + buf.append(word) + if len(buf) > 10: + buf.popleft() + # Create a copy of the buffer and shuffle it to create a document value + # and add it to the list of document values + doc = list(buf) + random.shuffle(doc) + docs.append(" ".join(doc)) + # Shuffle the list of document values + random.shuffle(docs) + + schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True, + vector=True), + row=fields.NUMERIC(stored=True)) + + with TempIndex(schema, storage_debug=True) as ix: + # Add the domain data to the index + with writerclass(ix, procs=3) as w: + for i, value in enumerate(docs): + w.add_document(text=value, row=i) + + with ix.searcher() as s: + r = s.reader() + + # Check the lexicon + for word, term in izip(words, r.field_terms("text")): + assert word == term + # Check the doc count + assert r.doc_count_all() == len(docs) + + # Check there are lengths + total = sum(r.doc_field_length(docnum, "text", 0) + for docnum in xrange(r.doc_count_all())) + assert total > 0 + + # Check per-doc info + for i, value in enumerate(docs): + pieces = value.split() + docnum = s.document_number(row=i) + + # Check stored value + sv = r.stored_fields(docnum) + assert sv["text"] == value + + # Check vectors + vr = r.vector(docnum, "text") + # Get the terms and positions from the vector matcher + iv = list(vr.items_as("positions")) + # What the vector should look like + ov = sorted((text, [i]) for i, text in enumerate(pieces)) + assert iv == ov + + # Check field length + assert r.doc_field_length(docnum, "text") == len(pieces) + + +def test_basic_serial(): + check_multi() + from whoosh.multiproc import SerialMpWriter + + _do_basic(SerialMpWriter) + + +def test_basic_multi(): + check_multi() + from whoosh.multiproc import MpWriter + + _do_basic(MpWriter) + + +def test_no_add(): + check_multi() + from whoosh.multiproc import MpWriter + + schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True, + vector=True)) + with TempIndex(schema) as ix: + with ix.writer(procs=3) as w: + assert type(w) == MpWriter + + +def _do_merge(writerclass): + schema = fields.Schema(key=fields.ID(stored=True, unique=True), + value=fields.TEXT(stored=True, spelling=True, + vector=True)) + + domain = {"a": "aa", "b": "bb cc", "c": "cc dd ee", "d": "dd ee ff gg", + "e": "ee ff gg hh ii", "f": "ff gg hh ii jj kk", + "g": "gg hh ii jj kk ll mm", "h": "hh ii jj kk ll mm nn oo", + "i": "ii jj kk ll mm nn oo pp qq ww ww ww ww ww ww", + "j": "jj kk ll mm nn oo pp qq rr ss", + "k": "kk ll mm nn oo pp qq rr ss tt uu"} + + with TempIndex(schema) as ix: + w = ix.writer() + for key in "abc": + w.add_document(key=u(key), value=u(domain[key])) + w.commit() + + w = ix.writer() + for key in "def": + w.add_document(key=u(key), value=u(domain[key])) + w.commit(merge=False) + + w = writerclass(ix, procs=3) + del domain["b"] + w.delete_by_term("key", u("b")) + + domain["e"] = "xx yy zz" + w.update_document(key=u("e"), value=u(domain["e"])) + + for key in "ghijk": + w.add_document(key=u(key), value=u(domain[key])) + w.commit(optimize=True) + + assert len(ix._segments()) == 1 + + with ix.searcher() as s: + r = s.reader() + + assert s.doc_count() == len(domain) + + assert "".join(r.field_terms("key")) == "acdefghijk" + assert " ".join(r.field_terms("value")) == "aa cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu ww xx yy zz" + + for key in domain: + docnum = s.document_number(key=key) + assert docnum is not None + + length = r.doc_field_length(docnum, "value") + assert 
length + assert _byten(len(domain[key].split())) == length + + sf = r.stored_fields(docnum) + assert domain[key] == sf["value"] + + words = sorted(set((" ".join(domain.values())).split())) + assert words == list(r.field_terms("value")) + + for word in words: + hits = s.search(query.Term("value", word)) + for hit in hits: + assert word in hit["value"].split() + + +def test_merge_serial(): + check_multi() + from whoosh.multiproc import SerialMpWriter + + _do_merge(SerialMpWriter) + + +def test_merge_multi(): + check_multi() + from whoosh.multiproc import MpWriter + + _do_merge(MpWriter) + + +def test_no_score_no_store(): + check_multi() + from whoosh.multiproc import MpWriter + + schema = fields.Schema(a=fields.ID, b=fields.KEYWORD) + domain = {} + keys = list(u("abcdefghijklmnopqrstuvwx")) + random.shuffle(keys) + words = u("alfa bravo charlie delta").split() + for i, key in enumerate(keys): + domain[key] = words[i % len(words)] + + with TempIndex(schema) as ix: + with MpWriter(ix, procs=3) as w: + for key, value in domain.items(): + w.add_document(a=key, b=value) + + with ix.searcher() as s: + for word in words: + r = s.search(query.Term("b", word)) + assert len(r) == 6 + + +def test_multisegment(): + check_multi() + from whoosh.multiproc import MpWriter + + schema = fields.Schema(a=fields.TEXT(stored=True, spelling=True, + vector=True)) + words = u("alfa bravo charlie delta echo").split() + with TempIndex(schema) as ix: + with ix.writer(procs=3, multisegment=True, batchsize=10) as w: + assert w.__class__ == MpWriter + assert w.multisegment + + for ls in permutations(words, 3): + w.add_document(a=u(" ").join(ls)) + + assert len(ix._segments()) == 3 + + with ix.searcher() as s: + for word in words: + r = s.search(query.Term("a", word)) + for hit in r: + assert word in hit["a"].split() + + +def test_batchsize_eq_doccount(): + check_multi() + schema = fields.Schema(a=fields.KEYWORD(stored=True)) + with TempIndex(schema) as ix: + with ix.writer(procs=4, batchsize=10) as w: + for i in xrange(10): + w.add_document(a=u(str(i))) + + +def test_finish_segment(): + check_multi() + + from whoosh.multiproc import MpWriter + + schema = fields.Schema(a=fields.KEYWORD(stored=True)) + with TempIndex(schema) as ix: + w = MpWriter(ix, procs=2, batchsize=1, multisegment=False, + limitmb=0.00001) + + for i in range(9): + w.add_document(a=u(chr(65 + i) * 50)) + + w.commit() diff --git a/tests/test_nested.py b/tests/test_nested.py new file mode 100644 index 0000000..f91dc83 --- /dev/null +++ b/tests/test_nested.py @@ -0,0 +1,361 @@ +from __future__ import with_statement + +from whoosh import fields, qparser, query, sorting +from whoosh.compat import u +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +def test_nested_parent(): + schema = fields.Schema(name=fields.ID(stored=True), type=fields.ID, + part=fields.ID, price=fields.NUMERIC) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + with w.group(): + w.add_document(name=u("iPad"), type=u("product")) + w.add_document(part=u("screen"), price=100) + w.add_document(part=u("battery"), price=50) + w.add_document(part=u("case"), price=20) + + with w.group(): + w.add_document(name=u("iPhone"), type=u("product")) + w.add_document(part=u("screen"), price=60) + w.add_document(part=u("battery"), price=30) + w.add_document(part=u("case"), price=10) + + with w.group(): + w.add_document(name=u("Mac mini"), type=u("product")) + w.add_document(part=u("hard drive"), price=50) + w.add_document(part=u("case"), 
price=50) + + with ix.searcher() as s: + price = s.schema["price"] + + pq = query.Term("type", "product") + cq = query.Term("price", 50) + q = query.NestedParent(pq, cq) + + r = s.search(q) + assert sorted([hit["name"] for hit in r]) == ["Mac mini", "iPad"] + + +def test_scoring(): + schema = fields.Schema(kind=fields.ID, + name=fields.KEYWORD(scorable=True, stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + with w.group(): + w.add_document(kind=u("class"), name=u("Index")) + w.add_document(kind=u("method"), name=u("add document")) + w.add_document(kind=u("method"), name=u("add reader")) + w.add_document(kind=u("method"), name=u("close")) + with w.group(): + w.add_document(kind=u("class"), name=u("Accumulator")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("get result")) + with w.group(): + w.add_document(kind=u("class"), name=u("Calculator")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("add all")) + w.add_document(kind=u("method"), name=u("add some")) + w.add_document(kind=u("method"), name=u("multiply")) + w.add_document(kind=u("method"), name=u("close")) + + with ix.searcher() as s: + q = query.NestedParent(query.Term("kind", "class"), + query.Term("name", "add")) + r = s.search(q) + assert [hit["name"] for hit in r] == ["Calculator", "Index", "Accumulator"] + + +def test_missing(): + schema = fields.Schema(kind=fields.ID, + name=fields.KEYWORD(scorable=True, stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + with w.group(): + w.add_document(kind=u("class"), name=u("Index")) + w.add_document(kind=u("method"), name=u("add document")) + w.add_document(kind=u("method"), name=u("add reader")) + w.add_document(kind=u("method"), name=u("close")) + with w.group(): + w.add_document(kind=u("class"), name=u("Accumulator")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("get result")) + with w.group(): + w.add_document(kind=u("class"), name=u("Calculator")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("add all")) + w.add_document(kind=u("method"), name=u("add some")) + w.add_document(kind=u("method"), name=u("multiply")) + w.add_document(kind=u("method"), name=u("close")) + with w.group(): + w.add_document(kind=u("class"), name=u("Deleter")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("delete")) + + with ix.searcher() as s: + q = query.NestedParent(query.Term("kind", "class"), + query.Term("name", "add")) + + r = s.search(q) + assert [hit["name"] for hit in r] == ["Calculator", "Index", "Accumulator", "Deleter"] + + with ix.writer() as w: + w.delete_by_term("name", "Accumulator") + w.delete_by_term("name", "Calculator") + + with ix.searcher() as s: + pq = query.Term("kind", "class") + assert len(list(pq.docs(s))) == 2 + q = query.NestedParent(pq, query.Term("name", "add")) + r = s.search(q) + assert [hit["name"] for hit in r] == ["Index", "Deleter"] + + +def test_nested_delete(): + schema = fields.Schema(kind=fields.ID, + name=fields.KEYWORD(scorable=True, stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + with w.group(): + w.add_document(kind=u("class"), name=u("Index")) + w.add_document(kind=u("method"), name=u("add document")) + w.add_document(kind=u("method"), name=u("add reader")) + w.add_document(kind=u("method"), name=u("close")) + with w.group(): + 
w.add_document(kind=u("class"), name=u("Accumulator")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("get result")) + with w.group(): + w.add_document(kind=u("class"), name=u("Calculator")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("add all")) + w.add_document(kind=u("method"), name=u("add some")) + w.add_document(kind=u("method"), name=u("multiply")) + w.add_document(kind=u("method"), name=u("close")) + with w.group(): + w.add_document(kind=u("class"), name=u("Deleter")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("delete")) + + # Delete "Accumulator" class + with ix.writer() as w: + q = query.NestedParent(query.Term("kind", "class"), + query.Term("name", "Accumulator")) + w.delete_by_query(q) + + # Check that Accumulator AND ITS METHODS are deleted + with ix.searcher() as s: + r = s.search(query.Term("kind", "class")) + assert sorted(hit["name"] for hit in r) == ["Calculator", "Deleter", "Index"] + + names = [fs["name"] for _, fs in s.iter_docs()] + assert names == ["Index", "add document", "add reader", "close", + "Calculator", "add", "add all", "add some", + "multiply", "close", "Deleter", "add", "delete"] + + # Delete any class with a close method + with ix.writer() as w: + q = query.NestedParent(query.Term("kind", "class"), + query.Term("name", "close")) + w.delete_by_query(q) + + # Check the CLASSES AND METHODS are gone + with ix.searcher() as s: + names = [fs["name"] for _, fs in s.iter_docs()] + assert names == ["Deleter", "add", "delete"] + + +def test_all_parents_deleted(): + schema = fields.Schema(kind=fields.ID, + name=fields.KEYWORD(scorable=True, stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + with w.group(): + w.add_document(kind=u("class"), name=u("Index")) + w.add_document(kind=u("method"), name=u("add document")) + w.add_document(kind=u("method"), name=u("add reader")) + w.add_document(kind=u("method"), name=u("close")) + with w.group(): + w.add_document(kind=u("class"), name=u("Accumulator")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("get result")) + with w.group(): + w.add_document(kind=u("class"), name=u("Calculator")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("add all")) + w.add_document(kind=u("method"), name=u("add some")) + w.add_document(kind=u("method"), name=u("multiply")) + w.add_document(kind=u("method"), name=u("close")) + with w.group(): + w.add_document(kind=u("class"), name=u("Deleter")) + w.add_document(kind=u("method"), name=u("add")) + w.add_document(kind=u("method"), name=u("delete")) + + with ix.writer() as w: + w.delete_by_term("name", "Index") + w.delete_by_term("name", "Accumulator") + w.delete_by_term("name", "Calculator") + w.delete_by_term("name", "Deleter") + + with ix.searcher() as s: + q = query.NestedParent(query.Term("kind", "class"), + query.Term("name", "add")) + r = s.search(q) + assert r.is_empty() + + +def test_everything_is_a_parent(): + schema = fields.Schema(id=fields.STORED, kind=fields.ID, + name=fields.ID(stored=True)) + k = u("alfa") + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, kind=k, name=u("one")) + w.add_document(id=1, kind=k, name=u("two")) + w.add_document(id=2, kind=k, name=u("three")) + w.add_document(id=3, kind=k, name=u("four")) + w.add_document(id=4, kind=k, name=u("one")) + w.add_document(id=5, 
kind=k, name=u("two")) + w.add_document(id=6, kind=k, name=u("three")) + w.add_document(id=7, kind=k, name=u("four")) + w.add_document(id=8, kind=k, name=u("one")) + w.add_document(id=9, kind=k, name=u("two")) + w.add_document(id=10, kind=k, name=u("three")) + w.add_document(id=11, kind=k, name=u("four")) + + with ix.searcher() as s: + pq = query.Term("kind", k) + cq = query.Or([query.Term("name", "two"), query.Term("name", "four")]) + q = query.NestedParent(pq, cq) + r = s.search(q) + assert [hit["id"] for hit in r] == [1, 3, 5, 7, 9, 11] + + +def test_no_parents(): + schema = fields.Schema(id=fields.STORED, kind=fields.ID, + name=fields.ID(stored=True)) + k = u("alfa") + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, kind=k, name=u("one")) + w.add_document(id=1, kind=k, name=u("two")) + w.add_document(id=2, kind=k, name=u("three")) + w.add_document(id=3, kind=k, name=u("four")) + w.add_document(id=4, kind=k, name=u("one")) + w.add_document(id=5, kind=k, name=u("two")) + w.add_document(id=6, kind=k, name=u("three")) + w.add_document(id=7, kind=k, name=u("four")) + w.add_document(id=8, kind=k, name=u("one")) + w.add_document(id=9, kind=k, name=u("two")) + w.add_document(id=10, kind=k, name=u("three")) + w.add_document(id=11, kind=k, name=u("four")) + + with ix.searcher() as s: + pq = query.Term("kind", "bravo") + cq = query.Or([query.Term("name", "two"), query.Term("name", "four")]) + q = query.NestedParent(pq, cq) + r = s.search(q) + assert r.is_empty() + + +def test_nested_children(): + schema = fields.Schema(t=fields.ID(stored=True), + track=fields.NUMERIC(stored=True), + album_name=fields.TEXT(stored=True), + song_name=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + with w.group(): + w.add_document(t=u("album"), album_name=u("alfa bravo charlie")) + w.add_document(t=u("track"), track=1, + song_name=u("delta echo foxtrot")) + w.add_document(t=u("track"), track=2, + song_name=u("golf hotel india")) + w.add_document(t=u("track"), track=3, + song_name=u("juliet kilo lima")) + with w.group(): + w.add_document(t=u("album"), album_name=u("mike november oskar")) + w.add_document(t=u("track"), track=1, + song_name=u("papa quebec romeo")) + w.add_document(t=u("track"), track=2, + song_name=u("sierra tango ultra")) + w.add_document(t=u("track"), track=3, + song_name=u("victor whiskey xray")) + with w.group(): + w.add_document(t=u("album"), album_name=u("yankee zulu one")) + w.add_document(t=u("track"), track=1, + song_name=u("two three four")) + w.add_document(t=u("track"), track=2, + song_name=u("five six seven")) + w.add_document(t=u("track"), track=3, + song_name=u("eight nine ten")) + + with ix.searcher() as s: + pq = query.Term("t", "album") + aq = query.Term("album_name", "november") + + r = s.search(query.NestedChildren(pq, pq), limit=None) + assert len(r) == 9 + assert [str(hit["t"]) for hit in r] == ["track"] * 9 + + ncq = query.NestedChildren(pq, aq) + assert list(ncq.docs(s)) == [5, 6, 7] + r = s.search(ncq, limit=None) + assert len(r) == 3 + assert [str(hit["song_name"]) for hit in r] == ["papa quebec romeo", + "sierra tango ultra", + "victor whiskey xray"] + + zq = query.NestedChildren(pq, query.Term("album_name", "zulu")) + f = sorting.StoredFieldFacet("song_name") + r = s.search(zq, sortedby=f) + assert [hit["track"] for hit in r] == [3, 2, 1] + + +def test_nested_skip(): + schema = fields.Schema( + id=fields.ID(unique=True, stored=True), + name=fields.TEXT(stored=True), + 
name_ngrams=fields.NGRAMWORDS(minsize=4, field_boost=1.2), + type=fields.TEXT, + ) + + domain = [ + (u"book_1", u"The Dark Knight Returns", u"book"), + (u"chapter_1", u"The Dark Knight Returns", u"chapter"), + (u"chapter_2", u"The Dark Knight Triumphant", u"chapter"), + (u"chapter_3", u"Hunt the Dark Knight", u"chapter"), + (u"chapter_4", u"The Dark Knight Falls", u"chapter") + ] + + with TempIndex(schema) as ix: + with ix.writer() as w: + for id, name, typ in domain: + w.add_document(id=id, name=name, name_ngrams=name, type=typ) + + with ix.searcher() as s: + all_parents = query.Term("type", "book") + wanted_parents = query.Term("name", "dark") + children_of_wanted_parents = query.NestedChildren(all_parents, + wanted_parents) + + r1 = s.search(children_of_wanted_parents) + assert r1.scored_length() == 4 + assert [hit["id"] for hit in r1] == ["chapter_1", "chapter_2", + "chapter_3", "chapter_4"] + + wanted_children = query.And([query.Term("type", "chapter"), + query.Term("name", "hunt")]) + + r2 = s.search(wanted_children) + assert r2.scored_length() == 1 + assert [hit["id"] for hit in r2] == ["chapter_3"] + + complex_query = query.And([children_of_wanted_parents, + wanted_children]) + + r3 = s.search(complex_query) + assert r3.scored_length() == 1 + assert [hit["id"] for hit in r3] == ["chapter_3"] diff --git a/tests/test_parse_plugins.py b/tests/test_parse_plugins.py new file mode 100644 index 0000000..1d0a35f --- /dev/null +++ b/tests/test_parse_plugins.py @@ -0,0 +1,650 @@ +from __future__ import with_statement +import inspect +from datetime import datetime + +from whoosh import analysis, fields, formats, qparser, query +from whoosh.compat import u, text_type, xrange +from whoosh.filedb.filestore import RamStorage +from whoosh.qparser import dateparse, default, plugins, syntax +from whoosh.util.times import adatetime + + +def _plugin_classes(ignore): + # Get all the subclasses of Plugin in whoosh.qparser.plugins + return [c for _, c in inspect.getmembers(plugins, inspect.isclass) + if plugins.Plugin in c.__bases__ and c not in ignore] + + +def test_combos(): + qs = ('w:a "hi there"^4.2 AND x:b^2.3 OR c AND (y:d OR e) ' + + '(apple ANDNOT bear)^2.3') + + init_args = {plugins.MultifieldPlugin: (["content", "title"], + {"content": 1.0, "title": 1.2}), + plugins.FieldAliasPlugin: ({"content": ("text", "body")},), + plugins.CopyFieldPlugin: ({"name": "phone"},), + plugins.PseudoFieldPlugin: ({"name": lambda x: x}), + } + + pis = _plugin_classes(()) + for i, plugin in enumerate(pis): + try: + pis[i] = plugin(*init_args.get(plugin, ())) + except TypeError: + raise TypeError("Error instantiating %s" % plugin) + + count = 0 + for i, first in enumerate(pis): + for j in xrange(len(pis)): + if i == j: + continue + plist = [p for p in pis[:j] if p is not first] + [first] + qp = qparser.QueryParser("text", None, plugins=plist) + qp.parse(qs) + count += 1 + + +def test_field_alias(): + qp = qparser.QueryParser("content", None) + qp.add_plugin(plugins.FieldAliasPlugin({"title": ("article", "caption")})) + q = qp.parse("alfa title:bravo article:charlie caption:delta") + assert text_type(q) == u("(content:alfa AND title:bravo AND title:charlie AND title:delta)") + + +def test_dateparser(): + schema = fields.Schema(text=fields.TEXT, date=fields.DATETIME) + qp = default.QueryParser("text", schema) + + errs = [] + + def cb(arg): + errs.append(arg) + + basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) + qp.add_plugin(dateparse.DateParserPlugin(basedate, callback=cb)) + + q = qp.parse(u("hello date:'last 
tuesday'")) + assert q.__class__ == query.And + assert q[1].__class__ == query.DateRange + assert q[1].startdate == adatetime(2010, 9, 14).floor() + assert q[1].enddate == adatetime(2010, 9, 14).ceil() + + q = qp.parse(u("date:'3am to 5pm'")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2010, 9, 20, 3).floor() + assert q.enddate == adatetime(2010, 9, 20, 17).ceil() + + q = qp.parse(u("date:blah")) + assert q == query.NullQuery + assert errs[0] == "blah" + + q = qp.parse(u("hello date:blarg")) + assert q.__unicode__() == "(text:hello AND <_NullQuery>)" + assert q[1].error == "blarg" + assert errs[1] == "blarg" + + q = qp.parse(u("hello date:20055x10")) + assert q.__unicode__() == "(text:hello AND <_NullQuery>)" + assert q[1].error == "20055x10" + assert errs[2] == "20055x10" + + q = qp.parse(u("hello date:'2005 19 32'")) + assert q.__unicode__() == "(text:hello AND <_NullQuery>)" + assert q[1].error == "2005 19 32" + assert errs[3] == "2005 19 32" + + q = qp.parse(u("date:'march 24 to dec 12'")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2010, 3, 24).floor() + assert q.enddate == adatetime(2010, 12, 12).ceil() + + q = qp.parse(u("date:('30 june' OR '10 july') quick")) + assert q.__class__ == query.And + assert len(q) == 2 + assert q[0].__class__ == query.Or + assert q[0][0].__class__ == query.DateRange + assert q[0][1].__class__ == query.DateRange + + +def test_date_range(): + schema = fields.Schema(text=fields.TEXT, date=fields.DATETIME) + qp = qparser.QueryParser("text", schema) + basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) + qp.add_plugin(dateparse.DateParserPlugin(basedate)) + + q = qp.parse(u("date:['30 march' to 'next wednesday']")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2010, 3, 30).floor() + assert q.enddate == adatetime(2010, 9, 22).ceil() + + q = qp.parse(u("date:[to 'next wednesday']")) + assert q.__class__ == query.DateRange + assert q.startdate is None + assert q.enddate == adatetime(2010, 9, 22).ceil() + + q = qp.parse(u("date:['30 march' to]")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2010, 3, 30).floor() + assert q.enddate is None + + q = qp.parse(u("date:[30 march to next wednesday]")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2010, 3, 30).floor() + assert q.enddate == adatetime(2010, 9, 22).ceil() + + q = qp.parse(u("date:[to next wednesday]")) + assert q.__class__ == query.DateRange + assert q.startdate is None + assert q.enddate == adatetime(2010, 9, 22).ceil() + + q = qp.parse(u("date:[30 march to]")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2010, 3, 30).floor() + assert q.enddate is None + + +def test_daterange_multi(): + schema = fields.Schema(text=fields.TEXT, start=fields.DATETIME, + end=fields.DATETIME) + qp = qparser.QueryParser("text", schema) + basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) + qp.add_plugin(dateparse.DateParserPlugin(basedate)) + + q = qp.parse("start:[2008 to] AND end:[2011 to 2011]") + assert q.__class__ == query.And + assert q[0].__class__ == query.DateRange + assert q[1].__class__ == query.DateRange + assert q[0].startdate == adatetime(2008).floor() + assert q[0].enddate is None + assert q[1].startdate == adatetime(2011).floor() + assert q[1].enddate == adatetime(2011).ceil() + + +def test_daterange_empty_field(): + schema = fields.Schema(test=fields.DATETIME) + ix = RamStorage().create_index(schema) + + writer = ix.writer() + 
writer.add_document(test=None) + writer.commit() + + with ix.searcher() as s: + q = query.DateRange("test", datetime.fromtimestamp(0), + datetime.today()) + r = s.search(q) + assert len(r) == 0 + + +def test_free_dates(): + a = analysis.StandardAnalyzer(stoplist=None) + schema = fields.Schema(text=fields.TEXT(analyzer=a), date=fields.DATETIME) + qp = qparser.QueryParser("text", schema) + basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) + qp.add_plugin(dateparse.DateParserPlugin(basedate, free=True)) + + q = qp.parse(u("hello date:last tuesday")) + assert q.__class__ == query.And + assert len(q) == 2 + assert q[0].__class__ == query.Term + assert q[0].text == "hello" + assert q[1].__class__ == query.DateRange + assert q[1].startdate == adatetime(2010, 9, 14).floor() + assert q[1].enddate == adatetime(2010, 9, 14).ceil() + + q = qp.parse(u("date:mar 29 1972 hello")) + assert q.__class__ == query.And + assert len(q) == 2 + assert q[0].__class__ == query.DateRange + assert q[0].startdate == adatetime(1972, 3, 29).floor() + assert q[0].enddate == adatetime(1972, 3, 29).ceil() + assert q[1].__class__ == query.Term + assert q[1].text == "hello" + + q = qp.parse(u("date:2005 march 2")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2005, 3, 2).floor() + assert q.enddate == adatetime(2005, 3, 2).ceil() + + q = qp.parse(u("date:'2005' march 2")) + assert q.__class__ == query.And + assert len(q) == 3 + assert q[0].__class__ == query.DateRange + assert q[0].startdate == adatetime(2005).floor() + assert q[0].enddate == adatetime(2005).ceil() + assert q[1].__class__ == query.Term + assert q[1].fieldname == "text" + assert q[1].text == "march" + + q = qp.parse(u("date:march 24 to dec 12")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2010, 3, 24).floor() + assert q.enddate == adatetime(2010, 12, 12).ceil() + + q = qp.parse(u("date:5:10pm")) + assert q.__class__ == query.DateRange + assert q.startdate == adatetime(2010, 9, 20, 17, 10).floor() + assert q.enddate == adatetime(2010, 9, 20, 17, 10).ceil() + + q = qp.parse(u("(date:30 june OR date:10 july) quick")) + assert q.__class__ == query.And + assert len(q) == 2 + assert q[0].__class__ == query.Or + assert q[0][0].__class__ == query.DateRange + assert q[0][1].__class__ == query.DateRange + + +def test_prefix_plugin(): + schema = fields.Schema(id=fields.ID, text=fields.TEXT) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), text=u("alfa")) + w.add_document(id=u("2"), text=u("bravo")) + w.add_document(id=u("3"), text=u("buono")) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("text", schema) + qp.remove_plugin_class(plugins.WildcardPlugin) + qp.add_plugin(plugins.PrefixPlugin) + + q = qp.parse(u("b*")) + r = s.search(q, limit=None) + assert len(r) == 2 + + q = qp.parse(u("br*")) + r = s.search(q, limit=None) + assert len(r) == 1 + + +def test_custom_tokens(): + qp = qparser.QueryParser("text", None) + qp.remove_plugin_class(plugins.OperatorsPlugin) + + cp = plugins.OperatorsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~", + Not="-") + qp.add_plugin(cp) + + q = qp.parse("this | that") + assert q.__class__ == query.Or + assert q[0].__class__ == query.Term + assert q[0].text == "this" + assert q[1].__class__ == query.Term + assert q[1].text == "that" + + q = qp.parse("this&!that") + assert q.__class__ == query.AndNot + assert q.a.__class__ == query.Term + assert q.a.text == "this" + assert q.b.__class__ == query.Term + assert q.b.text 
== "that" + + q = qp.parse("alfa -bravo NOT charlie") + assert len(q) == 4 + assert q[1].__class__ == query.Not + assert q[1].query.text == "bravo" + assert q[2].text == "NOT" + + +def test_copyfield(): + qp = qparser.QueryParser("a", None) + qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, None)) + assert (text_type(qp.parse("hello b:matt")) + == "(a:hello AND b:matt AND c:matt)") + + qp = qparser.QueryParser("a", None) + qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.AndMaybeGroup)) + assert (text_type(qp.parse("hello b:matt")) + == "(a:hello AND (b:matt ANDMAYBE c:matt))") + + qp = qparser.QueryParser("a", None) + qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.RequireGroup)) + assert (text_type(qp.parse("hello (there OR b:matt)")) + == "(a:hello AND (a:there OR (b:matt REQUIRE c:matt)))") + + qp = qparser.QueryParser("a", None) + qp.add_plugin(plugins.CopyFieldPlugin({"a": "c"}, syntax.OrGroup)) + assert (text_type(qp.parse("hello there")) + == "((a:hello OR c:hello) AND (a:there OR c:there))") + + qp = qparser.QueryParser("a", None) + qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, mirror=True)) + assert (text_type(qp.parse("hello c:matt")) + == "(a:hello AND (c:matt OR b:matt))") + + qp = qparser.QueryParser("a", None) + qp.add_plugin(plugins.CopyFieldPlugin({"c": "a"}, mirror=True)) + assert (text_type(qp.parse("hello c:matt")) + == "((a:hello OR c:hello) AND (c:matt OR a:matt))") + + ana = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter() + fmt = formats.Frequency() + ft = fields.FieldType(fmt, ana, multitoken_query="or") + schema = fields.Schema(name=fields.KEYWORD, name_phone=ft) + qp = qparser.QueryParser("name", schema) + qp.add_plugin(plugins.CopyFieldPlugin({"name": "name_phone"})) + target = ("((name:spruce OR name_phone:SPRS) " + "AND (name:view OR name_phone:F OR name_phone:FF))") + assert text_type(qp.parse(u("spruce view"))) == target + + +def test_gtlt(): + schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC, + c=fields.KEYWORD, + d=fields.NUMERIC(float), e=fields.DATETIME) + qp = qparser.QueryParser("a", schema) + qp.add_plugin(plugins.GtLtPlugin()) + qp.add_plugin(dateparse.DateParserPlugin()) + + q = qp.parse(u("a:hello b:>100 c:<=z there")) + assert q.__class__ == query.And + assert len(q) == 4 + assert q[0] == query.Term("a", "hello") + assert q[1] == query.NumericRange("b", 100, None, startexcl=True) + assert q[2] == query.TermRange("c", None, 'z') + assert q[3] == query.Term("a", "there") + + q = qp.parse(u("hello e:>'29 mar 2001' there")) + assert q.__class__ == query.And + assert len(q) == 3 + assert q[0] == query.Term("a", "hello") + # As of this writing, date ranges don't support startexcl/endexcl + assert q[1] == query.DateRange("e", datetime(2001, 3, 29, 0, 0), None) + assert q[2] == query.Term("a", "there") + + q = qp.parse(u("a:> alfa c:<= bravo")) + assert text_type(q) == "(a:a: AND a:alfa AND a:c: AND a:bravo)" + + qp.remove_plugin_class(plugins.FieldsPlugin) + qp.remove_plugin_class(plugins.RangePlugin) + q = qp.parse(u("hello a:>500 there")) + assert text_type(q) == "(a:hello AND a:a: AND a:500 AND a:there)" + + +def test_regex(): + schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT) + qp = qparser.QueryParser("a", schema) + qp.add_plugin(plugins.RegexPlugin()) + + q = qp.parse(u("a:foo-bar b:foo-bar")) + assert q.__unicode__() == '(a:foo-bar AND b:foo AND b:bar)' + + q = qp.parse(u('a:r"foo-bar" b:r"foo-bar"')) + assert q.__unicode__() == '(a:r"foo-bar" AND b:r"foo-bar")' + + +def test_pseudofield(): + 
schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT) + + def regex_maker(node): + if node.has_text: + node = qparser.RegexPlugin.RegexNode(node.text) + node.set_fieldname("content") + return node + + qp = qparser.QueryParser("a", schema) + qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker})) + q = qp.parse(u("alfa regex:br.vo")) + assert q.__unicode__() == '(a:alfa AND content:r"br.vo")' + + def rev_text(node): + if node.has_text: + # Create a word node for the reversed text + revtext = node.text[::-1] # Reverse the text + rnode = qparser.WordNode(revtext) + # Duplicate the original node's start and end char + rnode.set_range(node.startchar, node.endchar) + + # Put the original node and the reversed node in an OrGroup + group = qparser.OrGroup([node, rnode]) + + # Need to set the fieldname here because the PseudoFieldPlugin + # removes the field name syntax + group.set_fieldname("reverse") + + return group + + qp = qparser.QueryParser("content", schema) + qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text})) + q = qp.parse(u("alfa reverse:bravo")) + assert q.__unicode__() == '(content:alfa AND (reverse:bravo OR reverse:ovarb))' + + +def test_fuzzy_plugin(): + ana = analysis.StandardAnalyzer("\\S+") + schema = fields.Schema(f=fields.TEXT(analyzer=ana)) + qp = default.QueryParser("f", schema) + qp.add_plugin(plugins.FuzzyTermPlugin()) + + q = qp.parse("bob~") + assert q.__class__ == query.FuzzyTerm + assert q.field() == "f" + assert q.text == "bob" + assert q.maxdist == 1 + + q = qp.parse("Alfa Bravo~ Charlie") + assert q.__class__ == query.And + assert q[0].__class__ == query.Term + assert q[0].text == "alfa" + assert q[1].__class__ == query.FuzzyTerm + assert q[1].field() == "f" + assert q[1].text == "bravo" + assert q[1].maxdist == 1 + assert q[2].__class__ == query.Term + assert q[2].text == "charlie" + + q = qp.parse("Alfa Bravo~2 Charlie") + assert q.__class__ == query.And + assert q[0].__class__ == query.Term + assert q[0].text == "alfa" + assert q[1].__class__ == query.FuzzyTerm + assert q[1].field() == "f" + assert q[1].text == "bravo" + assert q[1].maxdist == 2 + assert q[2].__class__ == query.Term + assert q[2].text == "charlie" + + q = qp.parse("alfa ~2 bravo") + assert q.__class__ == query.And + assert q[0].__class__ == query.Term + assert q[0].text == "alfa" + assert q[1].__class__ == query.Term + assert q[1].text == "~2" + assert q[2].__class__ == query.Term + assert q[2].text == "bravo" + + qp = default.QueryParser("f", None) + q = qp.parse("'bob~'") + assert q.__class__ == query.Term + assert q.field() == "f" + assert q.text == "bob~" + + +def test_fuzzy_prefix(): + from whoosh import scoring + + schema = fields.Schema(title=fields.TEXT(stored=True), + content=fields.TEXT(spelling=True)) + + ix = RamStorage().create_index(schema) + with ix.writer() as w: + # Match -> first + w.add_document(title=u("First"), + content=u("This is the first document we've added!")) + # No match + w.add_document(title=u("Second"), + content=u("The second one is even more interesting! 
filst")) + # Match -> first + w.add_document(title=u("Third"), + content=u("The world first line we've added!")) + # Match -> zeroth + w.add_document(title=u("Fourth"), + content=u("The second one is alaways comes after zeroth!")) + # Match -> fire is within 2 edits (transpose + delete) of first + w.add_document(title=u("Fifth"), + content=u("The fire is beautiful")) + + from whoosh.qparser import QueryParser, FuzzyTermPlugin + parser = QueryParser("content", ix.schema) + parser.add_plugin(FuzzyTermPlugin()) + q = parser.parse("first~2/3 OR zeroth", debug=False) + + assert isinstance(q, query.Or) + ft = q[0] + assert isinstance(ft, query.FuzzyTerm) + assert ft.maxdist == 2 + assert ft.prefixlength == 3 + + with ix.searcher(weighting=scoring.TF_IDF()) as searcher: + results = searcher.search(q) + assert len(results) == 4 + assert (" ".join(sorted(hit["title"] for hit in results)) + == "Fifth First Fourth Third") + + +def test_function_plugin(): + class FakeQuery(query.Query): + def __init__(self, children, *args, **kwargs): + self.children = children + self.args = args + self.kwargs = kwargs + self.fieldname = None + + def __hash__(self): + return hash(tuple(self.children)) ^ hash(self.args) + + def __unicode__(self): + qs = "|".join(str(q) for q in self.children) + args = ",".join(self.args) + kwargs = ",".join(sorted("%s:%s" % item for item in self.kwargs.items())) + return u("<%s %s %s>") % (qs, args, kwargs) + + __str__ = __unicode__ + + def fuzzy(qs, prefix=0, maxdist=2): + prefix = int(prefix) + maxdist = int(maxdist) + return query.FuzzyTerm(qs[0].fieldname, qs[0].text, + prefixlength=prefix, maxdist=maxdist) + + fp = plugins.FunctionPlugin({"foo": FakeQuery, "fuzzy": fuzzy}) + qp = default.QueryParser("f", None) + qp.add_plugin(fp) + + def check(qstring, target): + q = qp.parse(u(qstring), normalize=False) + assert str(q) == target + + check("alfa #foo charlie delta", + "(f:alfa AND < > AND f:charlie AND f:delta)") + + check("alfa #foo(charlie delta) echo", + "(f:alfa AND AND f:echo)") + + check("alfa #foo(charlie AND delta) echo", + "(f:alfa AND <(f:charlie AND f:delta) > AND f:echo)") + + check("alfa #foo[a] charlie delta", + "(f:alfa AND < a > AND f:charlie AND f:delta)") + + check("alfa #foo[a, b](charlie delta) echo", + "(f:alfa AND AND f:echo)") + + check("alfa #foo[a,b,c=d](charlie AND delta) echo", + "(f:alfa AND <(f:charlie AND f:delta) a,b c:d> AND f:echo)") + + check("alfa #foo[a,b,c=d]() (charlie AND delta)", + "(f:alfa AND < a,b c:d> AND ((f:charlie AND f:delta)))") + + check("alfa #foo[a=1,b=2](charlie AND delta)^2.0 echo", + "(f:alfa AND <(f:charlie AND f:delta) a:1,b:2,boost:2.0> AND f:echo)") + + check("alfa #fuzzy[maxdist=2](bravo) charlie", + "(f:alfa AND f:bravo~2 AND f:charlie)") + + +def test_function_first(): + from whoosh.query.spans import SpanFirst + + def make_first(qs): + return SpanFirst(qs[0]) + + fp = plugins.FunctionPlugin({"first": make_first}) + qp = default.QueryParser("f", None) + qp.add_plugin(fp) + + q = qp.parse("#first(apples)") + assert isinstance(q, SpanFirst) + + +def test_sequence_plugin(): + qp = default.QueryParser("f", None) + qp.remove_plugin_class(plugins.PhrasePlugin) + qp.add_plugin(plugins.FuzzyTermPlugin()) + qp.add_plugin(plugins.SequencePlugin()) + + q = qp.parse(u('alfa "bravo charlie~2 (delta OR echo)" foxtrot')) + assert q.__unicode__() == "(f:alfa AND (f:bravo NEAR f:charlie~2 NEAR (f:delta OR f:echo)) AND f:foxtrot)" + assert q[1].__class__ == query.Sequence + + q = qp.parse(u('alfa "bravo charlie~2 d?lt*')) + assert 
q[0].text == "alfa" + assert q[1].text == "bravo" + assert q[2].__class__ == query.FuzzyTerm + assert q[3].__class__ == query.Wildcard + + q = qp.parse(u('alfa "bravo charlie~2" d?lt* "[a TO z] [0 TO 9]" echo')) + assert q.__unicode__() == "(f:alfa AND (f:bravo NEAR f:charlie~2) AND f:d?lt* AND (f:[a TO z] NEAR f:[0 TO 9]) AND f:echo)" + assert q[0].text == "alfa" + assert q[1].__class__ == query.Sequence + assert q[2].__class__ == query.Wildcard + assert q[3].__class__ == query.Sequence + assert q[3][0].__class__ == query.TermRange + assert q[3][1].__class__ == query.TermRange + assert q[4].text == "echo" + + q = qp.parse(u('alfa "bravo charlie~3"~2 delta')) + assert q[1].__class__ == query.Sequence + assert q[1].slop == 2 + assert q[1][1].__class__ == query.FuzzyTerm + assert q[1][1].maxdist == 3 + + +def test_sequence_andmaybe(): + qp = default.QueryParser("f", None) + qp.remove_plugin_class(plugins.PhrasePlugin) + qp.add_plugins([plugins.FuzzyTermPlugin(), plugins.SequencePlugin()]) + + q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"')) + assert isinstance(q, query.AndMaybe) + assert q[0] == query.Term("f", u("Dahmen")) + assert q[1] == query.Sequence([query.Term("f", u("Besov")), + query.Term("f", u("Spaces"))]) + + +def test_sequence_complex(): + ana = analysis.StandardAnalyzer(stoplist=None) + schema = fields.Schema(title=fields.TEXT(stored=True), + path=fields.ID(stored=True), + content=fields.TEXT(stored=True, phrase=True, + analyzer=ana)) + ix = RamStorage().create_index(schema) + + with ix.writer() as w: + w.add_document(title=u"First document", path=u"/a", + content=u"This is the first document we've added!") + w.add_document(title=u"Second document", path=u"/b", + content=(u"In truth, he said, I would like to combine " + u"logical operators with proximity-based " + u"search in Whoosh!")) + + with ix.searcher() as s: + qp = qparser.QueryParser("content", ix.schema) + qp.remove_plugin_class(plugins.PhrasePlugin) + qp.add_plugin(plugins.SequencePlugin()) + qp.add_plugin(plugins.FuzzyTermPlugin()) + + q = qp.parse(u'"(he OR she OR we~) would*"~3') + r = s.search(q) + assert r.scored_length() + diff --git a/tests/test_parsing.py b/tests/test_parsing.py new file mode 100644 index 0000000..527773e --- /dev/null +++ b/tests/test_parsing.py @@ -0,0 +1,996 @@ +import pytest + +from whoosh import analysis, fields, query +from whoosh.compat import u, text_type +from whoosh.qparser import default +from whoosh.qparser import plugins + + +def test_whitespace(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin()]) + assert repr(p.tag("hello there amiga")) == ", < >, , < >, >" + + +def test_singlequotes(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.SingleQuotePlugin()]) + assert repr(p.process("a 'b c' d")) == ", , >" + + +def test_prefix(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.PrefixPlugin()]) + assert repr(p.process("a b* c")) == ", , >" + + +def test_range(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.RangePlugin()]) + ns = p.tag("a [b to c} d") + assert repr(ns) == ", < >, , < >, >" + + assert repr(p.process("a {b to]")) == ", >" + assert repr(p.process("[to c] d")) == ", >" + assert repr(p.process("[to]")) == ">" + + +def test_sq_range(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.SingleQuotePlugin(), + plugins.RangePlugin()]) + assert repr(p.process("['a b' to ']']")) == ">" + + +def test_phrase(): + p = default.QueryParser("t", None, 
[plugins.WhitespacePlugin(), + plugins.PhrasePlugin()]) + assert repr(p.process('a "b c"')) == ", >" + assert repr(p.process('"b c" d')) == ", >" + assert repr(p.process('"b c"')) == ">" + + q = p.parse('alfa "bravo charlie"~2 delta') + assert q[1].__class__ == query.Phrase + assert q[1].words == ["bravo", "charlie"] + assert q[1].slop == 2 + + +def test_groups(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.GroupPlugin()]) + + ns = p.process("a ((b c) d) e") + assert repr(ns) == ", , >, >, >" + + +def test_fieldnames(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.FieldsPlugin(), + plugins.GroupPlugin()]) + ns = p.process("a:b c d:(e f:(g h)) i j:") + assert repr(ns) == ", , , , <'f':'h'>>>, , >" + assert repr(p.process("a:b:")) == ">" + + +def test_operators(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.OperatorsPlugin()]) + ns = p.process("a OR b") + assert repr(ns) == ", >>" + + +def test_boost(): + p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.GroupPlugin(), + plugins.BoostPlugin()]) + ns = p.tag("a^3") + assert repr(ns) == ", <^ 3.0>>" + ns = p.filterize(ns) + assert repr(ns) == ">" + + assert repr(p.process("a (b c)^2.5")) == ", , ^2.5>>" + assert repr(p.process("a (b c)^.5 d")) == ", , ^0.5>, >" + assert repr(p.process("^2 a")) == ", >" + assert repr(p.process("a^2^3")) == ">" + + +# + +def test_empty_querystring(): + s = fields.Schema(content=fields.TEXT, title=fields.TEXT, id=fields.ID) + qp = default.QueryParser("content", s) + q = qp.parse(u("")) + assert q == query.NullQuery + + +def test_fields(): + s = fields.Schema(content=fields.TEXT, title=fields.TEXT, id=fields.ID) + qp = default.QueryParser("content", s) + q = qp.parse(u("test")) + assert q.__class__ == query.Term + assert q.fieldname == "content" + assert q.text == "test" + + mq = default.MultifieldParser(("title", "content"), s) + q = mq.parse(u("test")) + assert q.__class__ == query.Or + assert q[0].__class__ == query.Term + assert q[1].__class__ == query.Term + assert q[0].fieldname == "title" + assert q[1].fieldname == "content" + assert q[0].text == "test" + assert q[1].text == "test" + + q = mq.parse(u("title:test")) + assert q.__class__ == query.Term + assert q.fieldname == "title" + assert q.text == "test" + + +def test_multifield(): + schema = fields.Schema(content=fields.TEXT, title=fields.TEXT, + cat=fields.KEYWORD, date=fields.DATETIME) + + qs = u("a (b c cat:d) OR (b c cat:e)") + qp = default.MultifieldParser(['x', 'y'], schema) + + q = qp.parse(qs) + assert text_type(q) == "((x:a OR y:a) AND (((x:b OR y:b) AND (x:c OR y:c) AND cat:d) OR ((x:b OR y:b) AND (x:c OR y:c) AND cat:e)))" + + +def test_fieldname_chars(): + s = fields.Schema(abc123=fields.TEXT, nisbah=fields.KEYWORD) + qp = default.QueryParser("content", s) + fieldmap = {'nisbah': [u('\u0646\u0633\u0628\u0629')], + 'abc123': ['xyz']} + qp.add_plugin(plugins.FieldAliasPlugin(fieldmap)) + + q = qp.parse(u("abc123:456")) + assert q.__class__ == query.Term + assert q.fieldname == u('abc123') + assert q.text == u('456') + + q = qp.parse(u("abc123:456 def")) + assert text_type(q) == u("(abc123:456 AND content:def)") + + q = qp.parse(u('\u0646\u0633\u0628\u0629:\u0627\u0644\u0641\u0644\u0633' + '\u0637\u064a\u0646\u064a')) + assert q.__class__ == query.Term + assert q.fieldname == u('nisbah') + assert q.text == u('\u0627\u0644\u0641\u0644\u0633\u0637\u064a\u0646\u064a') + + q = qp.parse(u("abc123 (xyz:123 OR qrs)")) + assert 
text_type(q) == "(content:abc123 AND (abc123:123 OR content:qrs))" + + +def test_colonspace(): + s = fields.Schema(content=fields.TEXT, url=fields.ID) + qp = default.QueryParser("content", s) + q = qp.parse(u("url:test")) + assert q.__class__ == query.Term + assert q.fieldname == "url" + assert q.text == "test" + + q = qp.parse(u("url: test")) + assert q.__class__ == query.And + assert q[0].__class__ == query.Term + assert q[1].__class__ == query.Term + assert q[0].fieldname == "content" + assert q[1].fieldname == "content" + assert q[0].text == "url" + assert q[1].text == "test" + + q = qp.parse(u("url:")) + assert q.__class__ == query.Term + assert q.fieldname == "content" + assert q.text == "url" + + s = fields.Schema(foo=fields.KEYWORD) + qp = default.QueryParser("foo", s) + q = qp.parse(u("blah:")) + assert q.__class__ == query.Term + assert q.fieldname == "foo" + assert q.text == "blah:" + + +def test_andor(): + qp = default.QueryParser("a", None) + q = qp.parse("a AND b OR c AND d OR e AND f") + assert text_type(q) == "((a:a AND a:b) OR (a:c AND a:d) OR (a:e AND a:f))" + + q = qp.parse("aORb") + assert q == query.Term("a", "aORb") + + q = qp.parse("aOR b") + assert q == query.And([query.Term("a", "aOR"), query.Term("a", "b")]) + + q = qp.parse("a ORb") + assert q == query.And([query.Term("a", "a"), query.Term("a", "ORb")]) + + assert qp.parse("OR") == query.Term("a", "OR") + + +def test_andnot(): + qp = default.QueryParser("content", None) + q = qp.parse(u("this ANDNOT that")) + assert q.__class__ == query.AndNot + assert q.a.__class__ == query.Term + assert q.b.__class__ == query.Term + assert q.a.text == "this" + assert q.b.text == "that" + + q = qp.parse(u("foo ANDNOT bar baz")) + assert q.__class__ == query.And + assert len(q) == 2 + assert q[0].__class__ == query.AndNot + assert q[1].__class__ == query.Term + + q = qp.parse(u("foo fie ANDNOT bar baz")) + assert q.__class__ == query.And + assert len(q) == 3 + assert q[0].__class__ == query.Term + assert q[1].__class__ == query.AndNot + assert q[2].__class__ == query.Term + + q = qp.parse(u("a AND b ANDNOT c")) + assert q.__class__ == query.AndNot + assert text_type(q) == "((content:a AND content:b) ANDNOT content:c)" + + +def test_boost_query(): + qp = default.QueryParser("content", None) + q = qp.parse(u("this^3 fn:that^0.5 5.67 hi^5x")) + assert q[0].boost == 3.0 + assert q[1].boost == 0.5 + assert q[1].fieldname == "fn" + assert q[2].text == "5.67" + assert q[3].text == "hi^5x" + + q = qp.parse("alfa (bravo OR charlie)^2.5 ^3") + assert len(q) == 3 + assert q[0].boost == 1.0 + assert q[1].boost == 2.5 + assert q[2].text == "^3" + + +def test_boosts(): + qp = default.QueryParser("t", None) + q = qp.parse("alfa ((bravo^2)^3)^4 charlie") + assert q.__unicode__() == "(t:alfa AND t:bravo^24.0 AND t:charlie)" + + +def test_wild(): + qp = default.QueryParser("t", None, [plugins.WhitespacePlugin(), + plugins.WildcardPlugin()]) + assert repr(qp.process("a b*c? d")) == ", , >" + assert repr(qp.process("a * ? d")) == ", , , >" + + # + qp = default.QueryParser("content", None) + q = qp.parse(u("hello *the?e* ?star*s? test")) + assert len(q) == 4 + assert q[0].__class__ == query.Term + assert q[0].text == "hello" + assert q[1].__class__ == query.Wildcard + assert q[1].text == "*the?e*" + assert q[2].__class__ == query.Wildcard + assert q[2].text == "?star*s?" 
+ assert q[3].__class__ == query.Term + assert q[3].text == "test" + + # + qp = default.QueryParser("content", None) + q = qp.parse(u("*the?e*")) + assert q.__class__ == query.Wildcard + assert q.text == "*the?e*" + + +def test_parse_fieldname_underscores(): + s = fields.Schema(my_name=fields.ID(stored=True), my_value=fields.TEXT) + qp = default.QueryParser("my_value", schema=s) + q = qp.parse(u("my_name:Green")) + assert q.__class__ == query.Term + assert q.fieldname == "my_name" + assert q.text == "Green" + + +def test_endstar(): + qp = default.QueryParser("text", None) + q = qp.parse(u("word*")) + assert q.__class__ == query.Prefix + assert q.text == "word" + + q = qp.parse(u("first* second")) + assert q[0].__class__ == query.Prefix + assert q[0].text == "first" + + +def test_singlequotes_query(): + qp = default.QueryParser("text", None) + q = qp.parse("hell's hot 'i stab at thee'") + assert q.__class__.__name__ == 'And' + assert len(q) == 3 + assert q[0].__class__ == query.Term + assert q[1].__class__ == query.Term + assert q[2].__class__ == query.Term + assert q[0].text == "hell's" + assert q[1].text == "hot" + assert q[2].text == "i stab at thee" + + q = qp.parse("alfa zulu:'bravo charlie' delta") + assert q.__class__.__name__ == 'And' + assert len(q) == 3 + assert q[0].__class__ == query.Term + assert q[1].__class__ == query.Term + assert q[2].__class__ == query.Term + assert (q[0].fieldname, q[0].text) == ("text", "alfa") + assert (q[1].fieldname, q[1].text) == ("zulu", "bravo charlie") + assert (q[2].fieldname, q[2].text) == ("text", "delta") + + q = qp.parse("The rest 'is silence") + assert q.__class__ == query.And + assert len(q) == 4 + assert [t.text for t in q.subqueries] == ["The", "rest", "'is", "silence"] + + q = qp.parse("I don't like W's stupid face") + assert q.__class__ == query.And + assert len(q) == 6 + assert [t.text for t in q.subqueries] == ["I", "don't", "like", "W's", + "stupid", "face"] + + q = qp.parse("I forgot the drinkin' in '98") + assert q.__class__ == query.And + assert len(q) == 6 + assert [t.text for t in q.subqueries] == ["I", "forgot", "the", "drinkin'", + "in", "'98"] + +# def test_escaping(): +# qp = default.QueryParser("text", None) +# +# q = qp.parse(r'big\small') +# assert q.__class__, query.Term, q) +# assert q.text == "bigsmall" +# +# q = qp.parse(r'big\\small') +# assert q.__class__ == query.Term +# assert q.text == r'big\small' +# +# q = qp.parse(r'http\:example') +# assert q.__class__ == query.Term +# assert q.fieldname == "text" +# assert q.text == "http:example" +# +# q = qp.parse(r'hello\ there') +# assert q.__class__ == query.Term +# assert q.text == "hello there" +# +# q = qp.parse(r'\[start\ TO\ end\]') +# assert q.__class__ == query.Term +# assert q.text == "[start TO end]" +# +# schema = fields.Schema(text=fields.TEXT) +# qp = default.QueryParser("text", None) +# q = qp.parse(r"http\:\/\/www\.example\.com") +# assert q.__class__ == query.Term +# assert q.text == "http://www.example.com" +# +# q = qp.parse(u("\u005c\u005c")) +# assert q.__class__ == query.Term +# assert q.text == "\\" + +# def test_escaping_wildcards(): +# qp = default.QueryParser("text", None) +# +# q = qp.parse(u("a*b*c?d")) +# assert q.__class__ == query.Wildcard +# assert q.text == "a*b*c?d" +# +# q = qp.parse(u("a*b\u005c*c?d")) +# assert q.__class__ == query.Wildcard +# assert q.text == "a*b*c?d" +# +# q = qp.parse(u("a*b\u005c\u005c*c?d")) +# assert q.__class__ == query.Wildcard +# assert q.text, u('a*b\u005c*c?d')) +# +# q = qp.parse(u("ab*")) +# assert 
q.__class__ == query.Prefix +# assert q.text, u("ab")) +# +# q = qp.parse(u("ab\u005c\u005c*")) +# assert q.__class__ == query.Wildcard +# assert q.text, u("ab\u005c*")) + + +def test_phrase_phrase(): + qp = default.QueryParser("content", None) + q = qp.parse('"alfa bravo" "charlie delta echo"^2.2 test:"foxtrot golf"') + assert q[0].__class__ == query.Phrase + assert q[0].words == ["alfa", "bravo"] + assert q[1].__class__ == query.Phrase + assert q[1].words == ["charlie", "delta", "echo"] + assert q[1].boost == 2.2 + assert q[2].__class__ == query.Phrase + assert q[2].words == ["foxtrot", "golf"] + assert q[2].fieldname == "test" + + +def test_weird_characters(): + qp = default.QueryParser("content", None) + q = qp.parse(u(".abcd@gmail.com")) + assert q.__class__ == query.Term + assert q.text == ".abcd@gmail.com" + q = qp.parse(u("r*")) + assert q.__class__ == query.Prefix + assert q.text == "r" + q = qp.parse(u(".")) + assert q.__class__ == query.Term + assert q.text == "." + q = qp.parse(u("?")) + assert q.__class__ == query.Wildcard + assert q.text == "?" + + +def test_euro_chars(): + schema = fields.Schema(text=fields.TEXT) + qp = default.QueryParser("text", schema) + q = qp.parse(u("stra\xdfe")) + assert q.__class__ == query.Term + assert q.text == u("stra\xdfe") + + +def test_star(): + schema = fields.Schema(text=fields.TEXT(stored=True)) + qp = default.QueryParser("text", schema) + q = qp.parse(u("*")) + assert q.__class__ == query.Every + assert q.fieldname == "text" + + q = qp.parse(u("*h?ll*")) + assert q.__class__ == query.Wildcard + assert q.text == "*h?ll*" + + q = qp.parse(u("h?pe")) + assert q.__class__ == query.Wildcard + assert q.text == "h?pe" + + q = qp.parse(u("*? blah")) + assert q.__class__ == query.And + assert q[0].__class__ == query.Wildcard + assert q[0].text == "*?" 
+ assert q[1].__class__ == query.Term + assert q[1].text == "blah" + + q = qp.parse(u("*ending")) + assert q.__class__ == query.Wildcard + assert q.text == "*ending" + + q = qp.parse(u("*q")) + assert q.__class__ == query.Wildcard + assert q.text == "*q" + + +def test_star_field(): + schema = fields.Schema(text=fields.TEXT) + qp = default.QueryParser("text", schema) + + q = qp.parse(u("*:*")) + assert q.__class__ == query.Every + assert q.fieldname is None + + # This gets parsed to a term with text="*:test" which is then analyzed down + # to just "test" + q = qp.parse(u("*:test")) + assert q.__class__ == query.Term + assert q.fieldname == "text" + assert q.text == "test" + + +def test_range_query(): + schema = fields.Schema(name=fields.ID(stored=True), + text=fields.TEXT(stored=True)) + qp = default.QueryParser("text", schema) + + q = qp.parse(u("[alfa to bravo}")) + assert q.__class__ == query.TermRange + assert q.start == "alfa" + assert q.end == "bravo" + assert q.startexcl is False + assert q.endexcl is True + + q = qp.parse(u("['hello there' to 'what ever']")) + assert q.__class__ == query.TermRange + assert q.start == "hello there" + assert q.end == "what ever" + assert q.startexcl is False + assert q.endexcl is False + + q = qp.parse(u("name:{'to' to 'b'}")) + assert q.__class__ == query.TermRange + assert q.start == "to" + assert q.end == "b" + assert q.startexcl is True + assert q.endexcl is True + + q = qp.parse(u("name:{'a' to 'to']")) + assert q.__class__ == query.TermRange + assert q.start == "a" + assert q.end == "to" + assert q.startexcl is True + assert q.endexcl is False + + q = qp.parse(u("name:[a to to]")) + assert q.__class__ == query.TermRange + assert q.start == "a" + assert q.end == "to" + + q = qp.parse(u("name:[to to b]")) + assert q.__class__ == query.TermRange + assert q.start == "to" + assert q.end == "b" + + q = qp.parse(u("[alfa to alfa]")) + assert q.__class__ == query.Term + assert q.text == "alfa" + + q = qp.parse(u("Ind* AND name:[d TO]")) + assert q.__class__ == query.And + assert q[0].__class__ == query.Prefix + assert q[1].__class__ == query.TermRange + assert q[0].text == "ind" + assert q[1].start == "d" + assert q[1].fieldname == "name" + + q = qp.parse(u("name:[d TO]")) + assert q.__class__ == query.TermRange + assert q.start == "d" + assert q.fieldname == "name" + + +def test_numeric_range(): + schema = fields.Schema(id=fields.STORED, number=fields.NUMERIC) + qp = default.QueryParser("number", schema) + + teststart = 40 + testend = 100 + + q = qp.parse("[%s to *]" % teststart) + assert q == query.NullQuery + + q = qp.parse("[%s to]" % teststart) + assert q.__class__ == query.NumericRange + assert q.start == teststart + assert q.end is None + + q = qp.parse("[to %s]" % testend) + assert q.__class__ == query.NumericRange + assert q.start is None + assert q.end == testend + + q = qp.parse("[%s to %s]" % (teststart, testend)) + assert q.__class__ == query.NumericRange + assert q.start == teststart + assert q.end == testend + + +def test_regressions(): + qp = default.QueryParser("f", None) + + # From 0.3.18, these used to require escaping. Mostly good for + # regression testing. 
+ assert qp.parse(u("re-inker")) == query.Term("f", "re-inker") + assert qp.parse(u("0.7 wire")) == query.And([query.Term("f", "0.7"), + query.Term("f", "wire")]) + assert (qp.parse(u("daler-rowney pearl 'bell bronze'")) + == query.And([query.Term("f", "daler-rowney"), + query.Term("f", "pearl"), + query.Term("f", "bell bronze")])) + + q = qp.parse(u('22" BX')) + assert q, query.And([query.Term("f", '22"') == query.Term("f", "BX")]) + + +def test_empty_ranges(): + schema = fields.Schema(name=fields.TEXT, num=fields.NUMERIC, + date=fields.DATETIME) + qp = default.QueryParser("text", schema) + + for fname in ("name", "date"): + q = qp.parse(u("%s:[to]") % fname) + assert q.__class__ == query.Every + + +def test_empty_numeric_range(): + schema = fields.Schema(id=fields.ID, num=fields.NUMERIC) + qp = default.QueryParser("num", schema) + q = qp.parse("num:[to]") + assert q.__class__ == query.NumericRange + assert q.start is None + assert q.end is None + + +def test_numrange_multi(): + schema = fields.Schema(text=fields.TEXT, start=fields.NUMERIC, + end=fields.NUMERIC) + qp = default.QueryParser("text", schema) + + q = qp.parse("start:[2008 to]") + assert q.__class__ == query.NumericRange + assert q.fieldname == "start" + assert q.start == 2008 + assert q.end is None + + q = qp.parse("start:[2011 to 2012]") + assert q.__class__.__name__ == "NumericRange" + assert q.fieldname == "start" + assert q.start == 2011 + assert q.end == 2012 + + q = qp.parse("start:[2008 to] AND end:[2011 to 2012]") + assert q.__class__ == query.And + assert q[0].__class__ == query.NumericRange + assert q[1].__class__ == query.NumericRange + assert q[0].start == 2008 + assert q[0].end is None + assert q[1].start == 2011 + assert q[1].end == 2012 + + +def test_nonexistant_fieldnames(): + # Need an analyzer that won't mangle a URL + a = analysis.SimpleAnalyzer("\\S+") + schema = fields.Schema(id=fields.ID, text=fields.TEXT(analyzer=a)) + + qp = default.QueryParser("text", schema) + q = qp.parse(u("id:/code http://localhost/")) + assert q.__class__ == query.And + assert q[0].__class__ == query.Term + assert q[0].fieldname == "id" + assert q[0].text == "/code" + assert q[1].__class__ == query.Term + assert q[1].fieldname == "text" + assert q[1].text == "http://localhost/" + + +def test_stopped(): + schema = fields.Schema(text=fields.TEXT) + qp = default.QueryParser("text", schema) + q = qp.parse(u("a b")) + assert q == query.NullQuery + + +def test_analyzing_terms(): + ana = analysis.StemmingAnalyzer() + schema = fields.Schema(text=fields.TEXT(analyzer=ana)) + qp = default.QueryParser("text", schema) + q = qp.parse(u("Indexed!")) + assert q.__class__ == query.Term + assert q.text == "index" + + +def test_simple_parsing(): + parser = default.SimpleParser("x", None) + q = parser.parse(u("alfa bravo charlie delta")) + assert text_type(q) == "(x:alfa OR x:bravo OR x:charlie OR x:delta)" + + q = parser.parse(u("alfa +bravo charlie delta")) + assert text_type(q) == "(x:bravo ANDMAYBE (x:alfa OR x:charlie OR x:delta))" + + q = parser.parse(u("alfa +bravo -charlie delta")) + assert (text_type(q) + == "((x:bravo ANDMAYBE (x:alfa OR x:delta)) ANDNOT x:charlie)") + + q = parser.parse(u("- alfa +bravo + delta")) + assert text_type(q) == "((x:bravo AND x:delta) ANDNOT x:alfa)" + + +def test_dismax(): + parser = default.DisMaxParser({"body": 0.8, "title": 2.5}, None) + q = parser.parse(u("alfa bravo charlie")) + assert text_type(q) == "(DisMax(body:alfa^0.8 title:alfa^2.5) OR DisMax(body:bravo^0.8 title:bravo^2.5) OR 
DisMax(body:charlie^0.8 title:charlie^2.5))" + + q = parser.parse(u("alfa +bravo charlie")) + assert text_type(q) == "(DisMax(body:bravo^0.8 title:bravo^2.5) ANDMAYBE (DisMax(body:alfa^0.8 title:alfa^2.5) OR DisMax(body:charlie^0.8 title:charlie^2.5)))" + + q = parser.parse(u("alfa -bravo charlie")) + assert text_type(q) == "((DisMax(body:alfa^0.8 title:alfa^2.5) OR DisMax(body:charlie^0.8 title:charlie^2.5)) ANDNOT DisMax(body:bravo^0.8 title:bravo^2.5))" + + q = parser.parse(u("alfa -bravo +charlie")) + assert text_type(q) == "((DisMax(body:charlie^0.8 title:charlie^2.5) ANDMAYBE DisMax(body:alfa^0.8 title:alfa^2.5)) ANDNOT DisMax(body:bravo^0.8 title:bravo^2.5))" + + +def test_many_clauses(): + qs = "1" + (" OR 1" * 1000) + + parser = default.QueryParser("content", None) + parser.parse(qs) + + +def test_roundtrip(): + parser = default.QueryParser("a", None) + q = parser.parse(u("a OR ((b AND c AND d AND e) OR f OR g) ANDNOT h")) + assert text_type(q) == "((a:a OR (a:b AND a:c AND a:d AND a:e) OR a:f OR a:g) ANDNOT a:h)" + + +def test_ngrams(): + schema = fields.Schema(grams=fields.NGRAM) + parser = default.QueryParser('grams', schema) + parser.remove_plugin_class(plugins.WhitespacePlugin) + + q = parser.parse(u("Hello There")) + assert q.__class__ == query.And + assert len(q) == 8 + assert [sq.text for sq in q] == ["hell", "ello", "llo ", "lo t", "o th", + " the", "ther", "here"] + + +def test_ngramwords(): + schema = fields.Schema(grams=fields.NGRAMWORDS(queryor=True)) + parser = default.QueryParser('grams', schema) + + q = parser.parse(u("Hello Tom")) + assert q.__class__ == query.And + assert q[0].__class__ == query.Or + assert q[1].__class__ == query.Term + assert q[0][0].text == "hell" + assert q[0][1].text == "ello" + assert q[1].text == "tom" + + +def test_multitoken_default(): + textfield = fields.TEXT() + assert textfield.multitoken_query == "default" + schema = fields.Schema(text=textfield) + parser = default.QueryParser('text', schema) + qstring = u("chaw-bacon") + + texts = list(schema["text"].process_text(qstring)) + assert texts == ["chaw", "bacon"] + + q = parser.parse(qstring) + assert q.__class__ == query.And + assert len(q) == 2 + assert q[0].__class__ == query.Term + assert q[0].text == "chaw" + assert q[1].__class__ == query.Term + assert q[1].text == "bacon" + + +def test_multitoken_or(): + textfield = fields.TEXT() + textfield.multitoken_query = "or" + schema = fields.Schema(text=textfield) + parser = default.QueryParser('text', schema) + qstring = u("chaw-bacon") + + texts = list(schema["text"].process_text(qstring)) + assert texts == ["chaw", "bacon"] + + q = parser.parse(qstring) + assert q.__class__ == query.Or + assert len(q) == 2 + assert q[0].__class__ == query.Term + assert q[0].text == "chaw" + assert q[1].__class__ == query.Term + assert q[1].text == "bacon" + + +def test_multitoken_phrase(): + textfield = fields.TEXT() + textfield.multitoken_query = "phrase" + schema = fields.Schema(text=textfield) + parser = default.QueryParser("text", schema) + qstring = u("chaw-bacon") + + texts = list(schema["text"].process_text(qstring)) + assert texts == ["chaw", "bacon"] + + q = parser.parse(qstring) + assert q.__class__ == query.Phrase + + +def test_singlequote_multitoken(): + schema = fields.Schema(text=fields.TEXT(multitoken_query="or")) + parser = default.QueryParser("text", schema) + q = parser.parse(u("foo bar")) + assert q.__unicode__() == "(text:foo AND text:bar)" + + q = parser.parse(u("'foo bar'")) # single quotes + assert q.__unicode__() == "(text:foo 
OR text:bar)" + + +def test_operator_queries(): + qp = default.QueryParser("f", None) + + q = qp.parse("a AND b OR c AND d") + assert text_type(q) == "((f:a AND f:b) OR (f:c AND f:d))" + + q = qp.parse("a OR b OR c OR d") + assert text_type(q) == "(f:a OR f:b OR f:c OR f:d)" + + q = qp.parse("a ANDMAYBE b ANDNOT c REQUIRE d") + assert text_type(q) == "((f:a ANDMAYBE (f:b ANDNOT f:c)) REQUIRE f:d)" + + +#def test_associativity(): +# left_andmaybe = (syntax.InfixOperator("ANDMAYBE", syntax.AndMaybeGroup, True), 0) +# right_andmaybe = (syntax.InfixOperator("ANDMAYBE", syntax.AndMaybeGroup, False), 0) +# not_ = (syntax.PrefixOperator("NOT", syntax.NotGroup), 0) +# +# def make_parser(*ops): +# parser = default.QueryParser("f", None) +# parser.replace_plugin(plugins.CompoundsPlugin(ops, clean=True)) +# return parser +# +# p = make_parser(left_andmaybe) +# q = p.parse("a ANDMAYBE b ANDMAYBE c ANDMAYBE d") +# assert text_type(q), "(((f:a ANDMAYBE f:b) ANDMAYBE f:c) ANDMAYBE f:d)") +# +# p = make_parser(right_andmaybe) +# q = p.parse("a ANDMAYBE b ANDMAYBE c ANDMAYBE d") +# assert text_type(q), "(f:a ANDMAYBE (f:b ANDMAYBE (f:c ANDMAYBE f:d)))") +# +# p = make_parser(not_) +# q = p.parse("a NOT b NOT c NOT d", normalize=False) +# assert text_type(q), "(f:a AND NOT f:b AND NOT f:c AND NOT f:d)") +# +# p = make_parser(left_andmaybe) +# q = p.parse("(a ANDMAYBE b) ANDMAYBE (c ANDMAYBE d)") +# assert text_type(q), "((f:a ANDMAYBE f:b) ANDMAYBE (f:c ANDMAYBE f:d))") +# +# p = make_parser(right_andmaybe) +# q = p.parse("(a ANDMAYBE b) ANDMAYBE (c ANDMAYBE d)") +# assert text_type(q), "((f:a ANDMAYBE f:b) ANDMAYBE (f:c ANDMAYBE f:d))") + + +def test_not_assoc(): + qp = default.QueryParser("text", None) + q = qp.parse(u("a AND NOT b OR c")) + assert text_type(q) == "((text:a AND NOT text:b) OR text:c)" + + qp = default.QueryParser("text", None) + q = qp.parse(u("a NOT (b OR c)")) + assert text_type(q) == "(text:a AND NOT (text:b OR text:c))" + + +def test_fieldname_space(): + qp = default.QueryParser("a", None) + q = qp.parse("Man Ray: a retrospective") + assert text_type(q) == "(a:Man AND a:Ray: AND a:a AND a:retrospective)" + + +def test_fieldname_fieldname(): + qp = default.QueryParser("a", None) + q = qp.parse("a:b:") + assert q == query.Term("a", "b:") + + +def test_paren_fieldname(): + schema = fields.Schema(kind=fields.ID, content=fields.TEXT) + + qp = default.QueryParser("content", schema) + q = qp.parse(u("(kind:1d565 OR kind:7c584) AND (stuff)")) + assert text_type(q) == "((kind:1d565 OR kind:7c584) AND content:stuff)" + + q = qp.parse(u("kind:(1d565 OR 7c584) AND (stuff)")) + assert text_type(q) == "((kind:1d565 OR kind:7c584) AND content:stuff)" + + +def test_star_paren(): + qp = default.QueryParser("content", None) + q = qp.parse(u("(*john*) AND (title:blog)")) + + assert q.__class__ == query.And + assert q[0].__class__ == query.Wildcard + assert q[1].__class__ == query.Term + assert q[0].fieldname == "content" + assert q[1].fieldname == "title" + assert q[0].text == "*john*" + assert q[1].text == "blog" + + +def test_dash(): + ana = analysis.StandardAnalyzer("[^ \t\r\n()*?]+") + schema = fields.Schema(title=fields.TEXT(analyzer=ana), + text=fields.TEXT(analyzer=ana), + time=fields.ID) + qtext = u("*Ben-Hayden*") + + qp = default.QueryParser("text", schema) + q = qp.parse(qtext) + assert q.__class__ == query.Wildcard + assert q.fieldname == "text" + assert q.text == "*ben-hayden*" + + qp = default.MultifieldParser(["title", "text", "time"], schema) + q = qp.parse(qtext) + assert 
q.__unicode__() == "(title:*ben-hayden* OR text:*ben-hayden* OR time:*Ben-Hayden*)" + + +def test_bool_True(): + schema = fields.Schema(text=fields.TEXT, bool=fields.BOOLEAN) + qp = default.QueryParser("text", schema) + q = qp.parse("bool:True") + assert q.__class__ == query.Term + assert q.fieldname == "bool" + assert q.text is True + + +def test_not_order(): + schema = fields.Schema(id=fields.STORED, + count=fields.KEYWORD(lowercase=True), + cats=fields.KEYWORD(lowercase=True)) + qp = default.QueryParser("count", schema) + + q1 = qp.parse(u("(NOT (count:0) AND cats:1)")) + assert q1.__class__ == query.And + assert q1[0].__class__ == query.Not + assert q1[1].__class__ == query.Term + assert q1.__unicode__() == '(NOT count:0 AND cats:1)' + + q2 = qp.parse(u("(cats:1 AND NOT (count:0))")) + assert q2.__class__ == query.And + assert q2[0].__class__ == query.Term + assert q2[1].__class__ == query.Not + assert q2.__unicode__() == '(cats:1 AND NOT count:0)' + + +def test_spacespace_and(): + qp = default.QueryParser("f", None) + # one blank before/after AND + q = qp.parse("A AND B") + assert q.__class__ == query.And + assert len(q) == 2 + assert q[0] == query.Term("f", "A") + assert q[1] == query.Term("f", "B") + + # two blanks before AND + q = qp.parse("A AND B") + assert q.__class__ == query.And + assert len(q) == 2 + assert q[0] == query.Term("f", "A") + assert q[1] == query.Term("f", "B") + + +def test_unicode_num(): + schema = fields.Schema(num=fields.NUMERIC) + parser = default.QueryParser(u("num"), schema=schema) + q = parser.parse(u("num:1")) + + _ = text_type(q) + + +def test_phrase_andmaybe(): + qp = default.QueryParser("f", None) + + q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"')) + assert isinstance(q, query.AndMaybe) + assert q[0] == query.Term("f", u("Dahmen")) + assert q[1] == query.Phrase("f", [u("Besov"), u("Spaces")]) + + +def test_phrase_boost(): + qp = default.QueryParser("f", None) + q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"^9')) + assert isinstance(q, query.AndMaybe) + assert q[0] == query.Term("f", u("Dahmen")) + assert q[1] == query.Phrase("f", [u("Besov"), u("Spaces")], boost=9) + + +def test_andmaybe_none(): + schema = fields.Schema(f=fields.TEXT, year=fields.NUMERIC) + qp = default.QueryParser("f", schema) + _ = qp.parse(u("Dahmen ANDMAYBE @year:[2000 TO]")) + + +def test_quoted_prefix(): + qp = default.QueryParser("f", None) + + expr = r"(^|(?<=[ (]))(?P\w+|[*]):" + qp.replace_plugin(plugins.FieldsPlugin(expr)) + + q = qp.parse(u('foo url:http://apple.com:8080/bar* baz')) + assert isinstance(q, query.And) + assert q[0] == query.Term("f", "foo") + assert q[1] == query.Prefix("url", "http://apple.com:8080/bar") + assert q[2] == query.Term("f", "baz") + assert len(q) == 3 diff --git a/tests/test_postings.py b/tests/test_postings.py new file mode 100644 index 0000000..48af4a2 --- /dev/null +++ b/tests/test_postings.py @@ -0,0 +1,87 @@ +from __future__ import with_statement + +from whoosh import analysis, fields +from whoosh.compat import xrange, u +from whoosh.codec import default_codec +from whoosh.formats import Existence, Frequency +from whoosh.formats import Positions, PositionBoosts +from whoosh.formats import Characters, CharacterBoosts +from whoosh.util.testing import TempStorage + + +def _roundtrip(content, format_, astype, ana=None): + with TempStorage("roundtrip") as st: + codec = default_codec() + seg = codec.new_segment(st, "") + ana = ana or analysis.StandardAnalyzer() + field = fields.FieldType(format=format_, analyzer=ana) + + fw = 
codec.field_writer(st, seg) + fw.start_field("f1", field) + for text, _, weight, valuestring in sorted(field.index(content)): + fw.start_term(text) + fw.add(0, weight, valuestring, None) + fw.finish_term() + fw.finish_field() + fw.close() + + tr = codec.terms_reader(st, seg) + ps = [] + for fieldname, btext in tr.terms(): + m = tr.matcher(fieldname, btext, format_) + ps.append((field.from_bytes(btext), m.value_as(astype))) + tr.close() + return ps + + +def test_existence_postings(): + content = u("alfa bravo charlie") + assert _roundtrip(content, Existence(), "frequency") == [("alfa", 1), ("bravo", 1), ("charlie", 1)] + + +def test_frequency_postings(): + content = u("alfa bravo charlie bravo alfa alfa") + assert _roundtrip(content, Frequency(), "frequency") == [("alfa", 3), ("bravo", 2), ("charlie", 1)] + + +def test_position_postings(): + content = u("alfa bravo charlie bravo alfa alfa") + assert _roundtrip(content, Positions(), "positions") == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])] + assert _roundtrip(content, Positions(), "frequency") == [("alfa", 3), ("bravo", 2), ("charlie", 1)] + + +def test_character_postings(): + content = u("alfa bravo charlie bravo alfa alfa") + assert _roundtrip(content, Characters(), "characters") == [("alfa", [(0, 0, 4), (4, 25, 29), (5, 30, 34)]), + ("bravo", [(1, 5, 10), (3, 19, 24)]), + ("charlie", [(2, 11, 18)])] + assert _roundtrip(content, Characters(), "positions") == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])] + assert _roundtrip(content, Characters(), "frequency") == [("alfa", 3), ("bravo", 2), ("charlie", 1)] + + +def test_posboost_postings(): + pbs = PositionBoosts() + ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() + content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa") + assert _roundtrip(content, pbs, "position_boosts", ana) == [("alfa", [(0, 2), (4, 1), (5, 1)]), + ("bravo", [(1, 0.1), (3, 0.5)]), + ("charlie", [(2, 2)])] + assert _roundtrip(content, pbs, "positions", ana) == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])] + assert _roundtrip(content, pbs, "frequency", ana) == [("alfa", 3), ("bravo", 2), ("charlie", 1)] + + +def test_charboost_postings(): + cbs = CharacterBoosts() + ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() + content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa") + assert _roundtrip(content, cbs, "character_boosts", ana) == [("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]), + ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]), + ("charlie", [(2, 17, 24, 2)])] + assert _roundtrip(content, cbs, "position_boosts", ana) == [("alfa", [(0, 2), (4, 1), (5, 1)]), + ("bravo", [(1, 0.1), (3, 0.5)]), + ("charlie", [(2, 2)])] + assert _roundtrip(content, cbs, "characters", ana) == [("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]), + ("bravo", [(1, 7, 12), (3, 27, 32)]), + ("charlie", [(2, 17, 24)])] + assert _roundtrip(content, cbs, "positions", ana) == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])] + assert _roundtrip(content, cbs, "frequency", ana) == [("alfa", 3), ("bravo", 2), ("charlie", 1)] diff --git a/tests/test_quality.py b/tests/test_quality.py new file mode 100644 index 0000000..c282855 --- /dev/null +++ b/tests/test_quality.py @@ -0,0 +1,172 @@ +from __future__ import with_statement +import random + +from whoosh import fields, matching, scoring +from whoosh.compat import b, u, xrange +from whoosh.filedb.filestore import RamStorage +from whoosh.util.numeric import length_to_byte, 
byte_to_length + + +def _discreet(length): + return byte_to_length(length_to_byte(length)) + + +def test_max_field_length(): + st = RamStorage() + schema = fields.Schema(t=fields.TEXT) + ix = st.create_index(schema) + for i in xrange(1, 200, 7): + w = ix.writer() + w.add_document(t=u(" ").join(["word"] * i)) + w.commit() + + with ix.reader() as r: + assert r.max_field_length("t") == _discreet(i) + + +def test_minmax_field_length(): + st = RamStorage() + schema = fields.Schema(t=fields.TEXT) + ix = st.create_index(schema) + least = 999999 + most = 0 + for _ in xrange(1, 200, 7): + w = ix.writer() + count = random.randint(1, 100) + least = min(count, least) + most = max(count, most) + w.add_document(t=u(" ").join(["word"] * count)) + w.commit() + + with ix.reader() as r: + assert r.min_field_length("t") == _discreet(least) + assert r.max_field_length("t") == _discreet(most) + + +def test_term_stats(): + schema = fields.Schema(t=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(t=u("alfa bravo charlie delta echo")) + w.add_document(t=u("bravo charlie delta echo foxtrot")) + w.add_document(t=u("charlie delta echo foxtrot golf")) + w.add_document(t=u("delta echo foxtrot")) + w.add_document(t=u("echo foxtrot golf hotel india juliet")) + w.add_document(t=u("foxtrot alfa alfa alfa")) + w.commit() + + with ix.reader() as r: + ti = r.term_info("t", u("alfa")) + assert ti.weight() == 4.0 + assert ti.doc_frequency() == 2 + assert ti.min_length() == 4 + assert ti.max_length() == 5 + assert ti.max_weight() == 3.0 + + assert r.term_info("t", u("echo")).min_length() == 3 + + assert r.doc_field_length(3, "t") == 3 + assert r.min_field_length("t") == 3 + assert r.max_field_length("t") == 6 + + w = ix.writer() + w.add_document(t=u("alfa")) + w.add_document(t=u("bravo charlie")) + w.add_document(t=u("echo foxtrot tango bravo")) + w.add_document(t=u("golf hotel")) + w.add_document(t=u("india")) + w.add_document(t=u("juliet alfa bravo charlie delta echo foxtrot")) + w.commit(merge=False) + + with ix.reader() as r: + ti = r.term_info("t", u("alfa")) + assert ti.weight() == 6.0 + assert ti.doc_frequency() == 4 + assert ti.min_length() == 1 + assert ti.max_length() == 7 + assert ti.max_weight() == 3.0 + + assert r.term_info("t", u("echo")).min_length() == 3 + + assert r.min_field_length("t") == 1 + assert r.max_field_length("t") == 7 + + +def test_min_max_id(): + schema = fields.Schema(id=fields.STORED, t=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, t=u("alfa bravo charlie")) + w.add_document(id=1, t=u("bravo charlie delta")) + w.add_document(id=2, t=u("charlie delta echo")) + w.add_document(id=3, t=u("delta echo foxtrot")) + w.add_document(id=4, t=u("echo foxtrot golf")) + w.commit() + + with ix.reader() as r: + ti = r.term_info("t", u("delta")) + assert ti.min_id() == 1 + assert ti.max_id() == 3 + + ti = r.term_info("t", u("alfa")) + assert ti.min_id() == 0 + assert ti.max_id() == 0 + + ti = r.term_info("t", u("foxtrot")) + assert ti.min_id() == 3 + assert ti.max_id() == 4 + + w = ix.writer() + w.add_document(id=5, t=u("foxtrot golf hotel")) + w.add_document(id=6, t=u("golf hotel alfa")) + w.add_document(id=7, t=u("hotel alfa bravo")) + w.add_document(id=8, t=u("alfa bravo charlie")) + w.commit(merge=False) + + with ix.reader() as r: + ti = r.term_info("t", u("delta")) + assert ti.min_id() == 1 + assert ti.max_id() == 3 + + ti = r.term_info("t", u("alfa")) + assert ti.min_id() == 0 + assert ti.max_id() == 8 + + ti = 
r.term_info("t", u("foxtrot")) + assert ti.min_id() == 3 + assert ti.max_id() == 5 + + +def test_replacements(): + sc = scoring.WeightScorer(0.25) + a = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25], scorer=sc) + b = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25], scorer=sc) + um = matching.UnionMatcher(a, b) + + a2 = a.replace(0.5) + assert a2.__class__ == matching.NullMatcherClass + + um2 = um.replace(0.5) + assert um2.__class__ == matching.IntersectionMatcher + um2 = um.replace(0.6) + assert um2.__class__ == matching.NullMatcherClass + + wm = matching.WrappingMatcher(um, boost=2.0) + wm = wm.replace(0.5) + assert wm.__class__ == matching.WrappingMatcher + assert wm.boost == 2.0 + assert wm.child.__class__ == matching.IntersectionMatcher + + ls1 = matching.ListMatcher([1, 2, 3], [0.1, 0.1, 0.1], + scorer=scoring.WeightScorer(0.1)) + ls2 = matching.ListMatcher([1, 2, 3], [0.2, 0.2, 0.2], + scorer=scoring.WeightScorer(0.2)) + ls3 = matching.ListMatcher([1, 2, 3], [0.3, 0.3, 0.3], + scorer=scoring.WeightScorer(0.3)) + mm = matching.MultiMatcher([ls1, ls2, ls3], [0, 4, 8]) + mm = mm.replace(0.25) + assert mm.current == 2 + + dm = matching.DisjunctionMaxMatcher(ls1, ls2) + dm = dm.replace(0.15) + assert dm is ls2 diff --git a/tests/test_queries.py b/tests/test_queries.py new file mode 100644 index 0000000..90c79b0 --- /dev/null +++ b/tests/test_queries.py @@ -0,0 +1,574 @@ +from __future__ import with_statement +import copy + +import pytest + +from whoosh import fields, qparser, query +from whoosh.compat import b, u +from whoosh.filedb.filestore import RamStorage +from whoosh.qparser import QueryParser +from whoosh.query import And +from whoosh.query import AndMaybe +from whoosh.query import ConstantScoreQuery +from whoosh.query import DateRange +from whoosh.query import DisjunctionMax +from whoosh.query import Every +from whoosh.query import FuzzyTerm +from whoosh.query import Not +from whoosh.query import NullQuery +from whoosh.query import NumericRange +from whoosh.query import Or +from whoosh.query import Phrase +from whoosh.query import Prefix +from whoosh.query import Require +from whoosh.query import Term +from whoosh.query import TermRange +from whoosh.query import Variations +from whoosh.query import Wildcard +from whoosh.query.spans import SpanContains +from whoosh.query.spans import SpanFirst +from whoosh.query.spans import SpanNear +from whoosh.query.spans import SpanNot +from whoosh.query.spans import SpanOr +from whoosh.util.testing import TempIndex + + +def test_all_terms(): + q = QueryParser("a", None).parse(u('hello b:there c:"my friend"')) + ts = q.all_terms(phrases=False) + assert sorted(ts) == [("a", "hello"), ("b", "there")] + ts = q.all_terms(phrases=True) + assert sorted(ts) == [("a", "hello"), ("b", "there"), ("c", "friend"), + ("c", "my")] + + +def test_existing_terms(): + s = fields.Schema(key=fields.ID, value=fields.TEXT) + ix = RamStorage().create_index(s) + + w = ix.writer() + w.add_document(key=u("a"), value=u("alfa bravo charlie delta echo")) + w.add_document(key=u("b"), value=u("foxtrot golf hotel india juliet")) + w.commit() + + r = ix.reader() + q = QueryParser("value", None).parse(u('alfa hotel tango "sierra bravo"')) + + ts = q.existing_terms(r, phrases=False) + assert sorted(ts) == [("value", b("alfa")), ("value", b("hotel"))] + + ts = q.existing_terms(r) + assert sorted(ts) == [("value", b("alfa")), ("value", b("bravo")), ("value", b("hotel"))] + + +def test_wildcard_existing_terms(): + s = fields.Schema(key=fields.ID, value=fields.TEXT) + ix = 
RamStorage().create_index(s) + + w = ix.writer() + w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta")) + w.add_document(key=u("a"), value=u("boggle echo render rendering renders")) + w.commit() + r = ix.reader() + qp = QueryParser("value", ix.schema) + + def words(terms): + z = [] + for t in terms: + assert t[0] == "value" + z.append(t[1]) + return b(" ").join(sorted(z)) + + q = qp.parse(u("b*")) + ts = q.existing_terms(r) + assert ts == set() + ts = q.existing_terms(r, expand=True) + assert words(ts) == b("bear boggle bravo") + + q = qp.parse(u("[a TO f]")) + ts = q.existing_terms(r) + assert ts == set() + ts = q.existing_terms(r, expand=True) + assert words(ts) == b("alfa bear boggle bravo charlie delta echo") + + q = query.Variations("value", "render") + ts = q.existing_terms(r, expand=False) + assert ts == set([("value", b("render"))]) + ts = q.existing_terms(r, expand=True) + assert words(ts) == b("render rendering renders") + + +def test_replace(): + q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2), + Variations("a", "b", boost=2.0)]) + q = q.replace("a", "b", "BB") + assert q == And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2), + Variations("a", "BB", boost=2.0)]) + + +def test_apply(): + def visit(q): + if isinstance(q, (Term, Variations, FuzzyTerm)): + q.text = q.text.upper() + return q + return q.apply(visit) + + before = And([Not(Term("a", u("b"))), Variations("a", u("c")), + Not(FuzzyTerm("a", u("d")))]) + after = visit(before) + assert after == And([Not(Term("a", u("B"))), Variations("a", u("C")), + Not(FuzzyTerm("a", u("D")))]) + + def term2var(q): + if isinstance(q, Term): + return Variations(q.fieldname, q.text) + else: + return q.apply(term2var) + + q = And([Term("f", "alfa"), Or([Term("f", "bravo"), + Not(Term("f", "charlie"))])]) + q = term2var(q) + assert q == And([Variations('f', 'alfa'), + Or([Variations('f', 'bravo'), + Not(Variations('f', 'charlie'))])]) + + +def test_accept(): + def boost_phrases(q): + if isinstance(q, Phrase): + q.boost *= 2.0 + return q + + before = And([Term("a", u("b")), Or([Term("c", u("d")), + Phrase("a", [u("e"), u("f")])]), + Phrase("a", [u("g"), u("h")], boost=0.25)]) + after = before.accept(boost_phrases) + assert after == And([Term("a", u("b")), + Or([Term("c", u("d")), Phrase("a", [u("e"), u("f")], boost=2.0)]), + Phrase("a", [u("g"), u("h")], boost=0.5)]) + + before = Phrase("a", [u("b"), u("c")], boost=2.5) + after = before.accept(boost_phrases) + assert after == Phrase("a", [u("b"), u("c")], boost=5.0) + + +def test_simplify(): + s = fields.Schema(k=fields.ID, v=fields.TEXT) + ix = RamStorage().create_index(s) + + w = ix.writer() + w.add_document(k=u("1"), v=u("aardvark apple allan alfa bear bee")) + w.add_document(k=u("2"), v=u("brie glue geewhiz goop julia")) + w.commit() + + r = ix.reader() + q1 = And([Prefix("v", "b", boost=2.0), Term("v", "juliet")]) + q2 = And([Or([Term('v', 'bear', boost=2.0), + Term('v', 'bee', boost=2.0), + Term('v', 'brie', boost=2.0)]), + Term('v', 'juliet')]) + assert q1.simplify(r) == q2 + + +def test_merge_ranges(): + q = And([TermRange("f1", u("a"), None), TermRange("f1", None, u("z"))]) + assert q.normalize() == TermRange("f1", u("a"), u("z")) + + q = And([NumericRange("f1", None, u("aaaaa")), + NumericRange("f1", u("zzzzz"), None)]) + assert q.normalize() == q + + q = And([TermRange("f1", u("a"), u("z")), TermRange("f1", "b", "x")]) + assert q.normalize() == TermRange("f1", u("a"), u("z")) + + q = And([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))]) + 
assert q.normalize() == TermRange("f1", u("f"), u("m")) + + q = Or([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))]) + assert q.normalize() == TermRange("f1", u("a"), u("q")) + + q = Or([TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))]) + assert q.normalize() == Every("f1") + + q = And([Every("f1"), Term("f1", "a"), Variations("f1", "b")]) + assert q.normalize() == Every("f1") + + q = Or([Term("f1", u("q")), TermRange("f1", u("m"), None), + TermRange("f1", None, u("n"))]) + assert q.normalize() == Every("f1") + + q = And([Or([Term("f1", u("a")), Term("f1", u("b"))]), Every("f1")]) + assert q.normalize() == Every("f1") + + q = And([Term("f1", u("a")), And([Or([Every("f1")])])]) + assert q.normalize() == Every("f1") + + +def test_normalize_compound(): + def oq(): + return Or([Term("a", u("a")), Term("a", u("b"))]) + + def nq(level): + if level == 0: + return oq() + else: + return Or([nq(level - 1), nq(level - 1), nq(level - 1)]) + + q = nq(5) + q = q.normalize() + assert q == Or([Term("a", u("a")), Term("a", u("b"))]) + + +def test_duplicates(): + q = And([Term("a", u("b")), Term("a", u("b"))]) + assert q.normalize() == Term("a", u("b")) + + q = And([Prefix("a", u("b")), Prefix("a", u("b"))]) + assert q.normalize() == Prefix("a", u("b")) + + q = And([Variations("a", u("b")), And([Variations("a", u("b")), + Term("a", u("b"))])]) + assert q.normalize() == And([Variations("a", u("b")), Term("a", u("b"))]) + + q = And([Term("a", u("b")), Prefix("a", u("b")), + Term("a", u("b"), boost=1.1)]) + assert q.normalize() == q + + # Wildcard without * or ? normalizes to Term + q = And([Wildcard("a", u("b")), + And([Wildcard("a", u("b")), Term("a", u("b"))])]) + assert q.normalize() == Term("a", u("b")) + + +# TODO: FIX THIS + +def test_query_copy_hash(): + def do(q1, q2): + q1a = copy.deepcopy(q1) + assert q1 == q1a + assert hash(q1) == hash(q1a) + assert q1 != q2 + + do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5)) + do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1), + And([Term("a", u("b")), Term("c", u("d"))], boost=1.5)) + do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]), + Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5)) + do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]), + DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))], + boost=1.5)) + do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5))) + do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5)) + do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"), + boost=1.5)) + do(FuzzyTerm("a", u("b"), constantscore=True), + FuzzyTerm("a", u("b"), constantscore=False)) + do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5)) + do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d"))) + do(TermRange("a", None, u("c")), TermRange("a", None, None)) + do(TermRange("a", u("b"), u("c"), boost=1.1), + TermRange("a", u("b"), u("c"), boost=1.5)) + do(TermRange("a", u("b"), u("c"), constantscore=True), + TermRange("a", u("b"), u("c"), constantscore=False)) + do(NumericRange("a", 1, 5), NumericRange("a", 1, 6)) + do(NumericRange("a", None, 5), NumericRange("a", None, None)) + do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5)) + do(NumericRange("a", 3, 6, constantscore=True), + NumericRange("a", 3, 6, constantscore=False)) + # do(DateRange) + do(Variations("a", u("render")), Variations("a", u("renders"))) + do(Variations("a", u("render"), boost=1.1), + 
Variations("a", u("renders"), boost=1.5)) + do(Phrase("a", [u("b"), u("c"), u("d")]), + Phrase("a", [u("b"), u("c"), u("e")])) + do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1), + Phrase("a", [u("b"), u("c"), u("d")], boost=1.5)) + do(Phrase("a", [u("b"), u("c"), u("d")], slop=1), + Phrase("a", [u("b"), u("c"), u("d")], slop=2)) + # do(Ordered) + do(Every(), Every("a")) + do(Every("a"), Every("b")) + do(Every("a", boost=1.1), Every("a", boost=1.5)) + do(NullQuery, Term("a", u("b"))) + do(ConstantScoreQuery(Term("a", u("b"))), + ConstantScoreQuery(Term("a", u("c")))) + do(ConstantScoreQuery(Term("a", u("b")), score=2.0), + ConstantScoreQuery(Term("a", u("c")), score=2.1)) + do(Require(Term("a", u("b")), Term("c", u("d"))), + Require(Term("a", u("b"), boost=1.1), Term("c", u("d")))) + # do(Require) + # do(AndMaybe) + # do(AndNot) + # do(Otherwise) + + do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")), + limit=2)) + do(SpanNear(Term("a", u("b")), Term("c", u("d"))), + SpanNear(Term("a", u("b")), Term("c", u("e")))) + do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1), + SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2)) + do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1), + SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2)) + do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True), + SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False)) + do(SpanNot(Term("a", u("b")), Term("a", u("c"))), + SpanNot(Term("a", u("b")), Term("a", u("d")))) + do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]), + SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))])) + do(SpanContains(Term("a", u("b")), Term("a", u("c"))), + SpanContains(Term("a", u("b")), Term("a", u("d")))) + # do(SpanBefore) + # do(SpanCondition) + + +def test_requires(): + a = Term("f", u("a")) + b = Term("f", u("b")) + assert And([a, b]).requires() == set([a, b]) + assert Or([a, b]).requires() == set() + assert AndMaybe(a, b).requires() == set([a]) + assert a.requires() == set([a]) + + +def test_highlight_daterange(): + from datetime import datetime + + schema = fields.Schema(id=fields.ID(unique=True, stored=True), + title=fields.TEXT(stored=True), + content=fields.TEXT(stored=True), + released=fields.DATETIME(stored=True)) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.update_document( + id=u('1'), + title=u('Life Aquatic'), + content=u('A nautic film crew sets out to kill a gigantic shark.'), + released=datetime(2004, 12, 25) + ) + w.update_document( + id=u('2'), + title=u('Darjeeling Limited'), + content=u('Three brothers meet in India for a life changing train ' + + 'journey.'), + released=datetime(2007, 10, 27) + ) + w.commit() + + s = ix.searcher() + r = s.search(Term('content', u('train')), terms=True) + assert len(r) == 1 + assert r[0]["id"] == "2" + assert (r[0].highlights("content") + == 'for a life changing train journey') + + r = s.search(DateRange('released', datetime(2007, 1, 1), None)) + assert len(r) == 1 + assert r[0].highlights("content") == '' + + +def test_patterns(): + domain = u("aaron able acre adage aether after ago ahi aim ajax akimbo " + "alembic all amiga amount ampere").split() + schema = fields.Schema(word=fields.KEYWORD(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for word in domain: + w.add_document(word=word) + + with ix.reader() as r: + assert list(r.field_terms("word")) == domain + + assert list(r.expand_prefix("word", "al")) == [b("alembic"), b("all")] + 
q = query.Prefix("word", "al") + assert q.simplify(r).__unicode__() == "(word:alembic OR word:all)" + + q = query.Wildcard("word", "a*[ae]") + assert q.simplify(r).__unicode__() == "(word:able OR word:acre OR word:adage OR word:amiga OR word:ampere)" + assert q._find_prefix(q.text) == "a" + + q = query.Regex("word", "am.*[ae]") + assert q.simplify(r).__unicode__() == "(word:amiga OR word:ampere)" + assert q._find_prefix(q.text) == "am" + + q = query.Regex("word", "able|ago") + assert q.simplify(r).__unicode__() == "(word:able OR word:ago)" + assert q._find_prefix(q.text) == "" + + # special case: ? may mean "zero occurences" + q = query.Regex("word", "ah?i") + assert q.simplify(r).__unicode__() == "(word:ahi OR word:aim)" + assert q._find_prefix(q.text) == "a" + + # special case: * may mean "zero occurences" + q = query.Regex("word", "ah*i") + assert q.simplify(r).__unicode__() == "(word:ahi OR word:aim)" + assert q._find_prefix(q.text) == "a" + + +def test_or_nots1(): + # Issue #285 + schema = fields.Schema(a=fields.KEYWORD(stored=True), + b=fields.KEYWORD(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + with ix.writer() as w: + w.add_document(a=u("alfa"), b=u("charlie")) + + with ix.searcher() as s: + q = query.And([query.Term("a", "alfa"), + query.Or([query.Not(query.Term("b", "bravo")), + query.Not(query.Term("b", "charlie")) + ]) + ]) + r = s.search(q) + assert len(r) == 1 + + +def test_or_nots2(): + # Issue #286 + schema = fields.Schema(a=fields.KEYWORD(stored=True), + b=fields.KEYWORD(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + with ix.writer() as w: + w.add_document(b=u("bravo")) + + with ix.searcher() as s: + q = query.Or([query.Term("a", "alfa"), + query.Not(query.Term("b", "alfa")) + ]) + r = s.search(q) + assert len(r) == 1 + + +def test_or_nots3(): + schema = fields.Schema(title=fields.TEXT(stored=True), + itemtype=fields.ID(stored=True)) + with TempIndex(schema, "ornot") as ix: + w = ix.writer() + w.add_document(title=u("a1"), itemtype=u("a")) + w.add_document(title=u("a2"), itemtype=u("a")) + w.add_document(title=u("b1"), itemtype=u("b")) + w.commit() + + q = Term('itemtype', 'a') | Not(Term('itemtype', 'a')) + + with ix.searcher() as s: + r = " ".join([hit["title"] for hit in s.search(q)]) + assert r == "a1 a2 b1" + + +def test_ornot_andnot(): + schema = fields.Schema(id=fields.NUMERIC(stored=True), a=fields.KEYWORD()) + st = RamStorage() + ix = st.create_index(schema) + + with ix.writer() as w: + w.add_document(id=0, a=u("word1 word1")) + w.add_document(id=1, a=u("word1 word2")) + w.add_document(id=2, a=u("word1 foo")) + w.add_document(id=3, a=u("foo word2")) + w.add_document(id=4, a=u("foo bar")) + + with ix.searcher() as s: + qp = qparser.QueryParser("a", ix.schema) + q1 = qp.parse(u("NOT word1 NOT word2")) + q2 = qp.parse(u("NOT (word1 OR word2)")) + + r1 = [hit["id"] for hit in s.search(q1, sortedby="id")] + r2 = [hit["id"] for hit in s.search(q2, sortedby="id")] + + assert r1 == r2 == [4] + + +def test_none_in_compounds(): + with pytest.raises(query.QueryError): + _ = query.And([query.Term("a", "b"), None, query.Term("c", "d")]) + + +def test_issue_355(): + schema = fields.Schema(seats=fields.NUMERIC(bits=8, stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(seats=0) + w.add_document(seats=10) + w.add_document(seats=20) + + with ix.searcher() as s: + # Passing a bytestring for a numeric field + q = Term("seats", b("maker")) + r1 = [hit["seats"] for hit in s.search(q, limit=5)] + + # 
Passing a unicode string for a numeric field + q = Term("seats", u("maker")) + r2 = [hit["seats"] for hit in s.search(q, limit=5)] + + # Passing a value too large for the numeric field + q = Term("seats", 260) + r3 = [hit["seats"] for hit in s.search(q, limit=5)] + + assert r1 == r2 == r3 == [] + + +def test_sequence(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, text=u("alfa bravo charlie delta echo")) + w.add_document(id=1, text=u("bravo charlie delta echo alfa")) + w.add_document(id=2, text=u("charlie delta echo bravo")) + w.add_document(id=3, text=u("delta echo charlie")) + w.add_document(id=4, text=u("echo delta")) + + with ix.searcher() as s: + seq = query.Sequence([query.Term("text", u("echo")), + query.Term("text", u("alfa"))]) + q = query.And([query.Term("text", "bravo"), seq]) + + r = s.search(q, limit=4) + assert len(r) == 1 + assert r[0]["id"] == 1 + + +def test_andmaybe(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, text=u("alfa bravo charlie delta echo")) + w.add_document(id=1, text=u("bravo charlie delta echo alfa")) + w.add_document(id=2, text=u("charlie delta echo bravo")) + w.add_document(id=3, text=u("delta echo charlie")) + w.add_document(id=4, text=u("echo delta")) + + qp = qparser.QueryParser("text", schema) + q = qp.parse(u('bravo ANDMAYBE "echo alfa"')) + + with ix.searcher() as s: + r = s.search(q) + assert len(r) == 3 + assert [hit["id"] for hit in r] == [1, 2, 0] + + +def test_numeric_filter(): + schema = fields.Schema(status=fields.NUMERIC, tags=fields.TEXT) + ix = RamStorage().create_index(schema) + + # Add a single document with status = -2 + with ix.writer() as w: + w.add_document(status=-2, tags=u"alfa bravo") + + with ix.searcher() as s: + # No document should match the filter + fq = query.NumericRange("status", 0, 2) + fr = s.search(fq) + assert fr.scored_length() == 0 + + # Make sure the query would otherwise match + q = query.Term("tags", u"alfa") + r = s.search(q) + assert r.scored_length() == 1 + + # Check the query doesn't match with the filter + r = s.search(q, filter=fq) + assert r.scored_length() == 0 diff --git a/tests/test_reading.py b/tests/test_reading.py new file mode 100644 index 0000000..99a3193 --- /dev/null +++ b/tests/test_reading.py @@ -0,0 +1,397 @@ +from __future__ import with_statement +import random, threading, time + +from whoosh import analysis, fields, formats, reading +from whoosh.compat import b, u, xrange +from whoosh.reading import SegmentReader +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +def _create_index(): + s = fields.Schema(f1=fields.KEYWORD(stored=True), + f2=fields.KEYWORD, + f3=fields.KEYWORD) + st = RamStorage() + ix = st.create_index(s) + return ix + + +def _one_segment_index(): + ix = _create_index() + w = ix.writer() + w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z")) + w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S")) + w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S")) + w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z")) + w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y")) + w.commit() + + return ix + + +def _multi_segment_index(): + ix = _create_index() + w = ix.writer() + w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z")) + w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S")) + w.commit() + + w = 
ix.writer() + w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S")) + w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z")) + w.commit(merge=False) + + w = ix.writer() + w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y")) + w.commit(merge=False) + + return ix + + +def _stats(r): + return [(fname, text, ti.doc_frequency(), ti.weight()) + for (fname, text), ti in r] + + +def _fstats(r): + return [(text, ti.doc_frequency(), ti.weight()) + for text, ti in r] + + +def test_readers(): + target = [("f1", b('A'), 4, 6), ("f1", b('B'), 2, 2), ("f1", b('C'), 2, 2), + ("f1", b('D'), 1, 1), ("f1", b('E'), 2, 2), ("f1", b('F'), 1, 1), + ("f2", b('1'), 3, 3), ("f2", b('2'), 3, 3), ("f2", b('3'), 2, 2), + ("f2", b('4'), 2, 2), ("f2", b('5'), 2, 2), ("f2", b('6'), 2, 2), + ("f3", b('Q'), 2, 2), ("f3", b('R'), 2, 2), ("f3", b('S'), 2, 2), + ("f3", b('X'), 3, 3), ("f3", b('Y'), 3, 3), ("f3", b('Z'), 2, 2)] + target = sorted(target) + + stored = [{"f1": "A B C"}, {"f1": "D E F"}, {"f1": "A E C"}, + {"f1": "A A A"}, {"f1": "A B"}] + + def t(ix): + r = ix.reader() + assert list(r.all_stored_fields()) == stored + assert sorted(_stats(r)) == target + + ix = _one_segment_index() + assert len(ix._segments()) == 1 + t(ix) + + ix = _multi_segment_index() + assert len(ix._segments()) == 3 + t(ix) + + +def test_term_inspection(): + schema = fields.Schema(title=fields.TEXT(stored=True), + content=fields.TEXT) + st = RamStorage() + ix = st.create_index(schema) + writer = ix.writer() + writer.add_document(title=u("My document"), + content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE")) + writer.add_document(title=u("My other document"), + content=u("AA AB BB CC EE EE AX AX DD")) + writer.commit() + + reader = ix.reader() + assert " ".join(reader.field_terms("content")) == "aa ab ax bb cc dd ee" + assert list(reader.expand_prefix("content", "a")) == [b('aa'), b('ab'), b('ax')] + assert set(reader.all_terms()) == set([('content', b('aa')), ('content', b('ab')), + ('content', b('ax')), ('content', b('bb')), + ('content', b('cc')), ('content', b('dd')), + ('content', b('ee')), ('title', b('document')), + ('title', b('my')), ('title', b('other'))]) + # (text, doc_freq, index_freq) + assert _fstats(reader.iter_field("content")) == [(b('aa'), 2, 6), (b('ab'), 1, 1), (b('ax'), 1, 2), + (b('bb'), 2, 5), (b('cc'), 2, 3), (b('dd'), 2, 2), + (b('ee'), 2, 4)] + assert _fstats(reader.iter_field("content", prefix="c")) == [(b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)] + assert list(reader.most_frequent_terms("content")) == [(6, b('aa')), (5, b('bb')), (4, b('ee')), (3, b('cc')), (2, b('dd'))] + assert list(reader.most_frequent_terms("content", prefix="a")) == [(6, b('aa')), (2, b('ax')), (1, b('ab'))] + assert list(reader.most_distinctive_terms("content", 3)) == [(1.3862943611198906, b('ax')), (0.6931471805599453, b('ab')), (0.0, b('ee'))] + + +def test_vector_postings(): + s = fields.Schema(id=fields.ID(stored=True, unique=True), + content=fields.TEXT(vector=formats.Positions())) + st = RamStorage() + ix = st.create_index(s) + + writer = ix.writer() + writer.add_document(id=u('1'), + content=u('the quick brown fox jumped over the ' + + 'lazy dogs')) + writer.commit() + r = ix.reader() + + terms = list(r.vector_as("weight", 0, "content")) + assert terms == [(u('brown'), 1.0), (u('dogs'), 1.0), (u('fox'), 1.0), + (u('jumped'), 1.0), (u('lazy'), 1.0), + (u('over'), 1.0), (u('quick'), 1.0)] + + +def test_stored_fields(): + s = fields.Schema(a=fields.ID(stored=True), b=fields.STORED, + c=fields.KEYWORD, 
d=fields.TEXT(stored=True)) + st = RamStorage() + ix = st.create_index(s) + + writer = ix.writer() + writer.add_document(a=u("1"), b="a", c=u("zulu"), d=u("Alfa")) + writer.add_document(a=u("2"), b="b", c=u("yankee"), d=u("Bravo")) + writer.add_document(a=u("3"), b="c", c=u("xray"), d=u("Charlie")) + writer.commit() + + with ix.searcher() as sr: + assert sr.stored_fields(0) == {"a": u("1"), "b": "a", "d": u("Alfa")} + assert sr.stored_fields(2) == {"a": u("3"), "b": "c", "d": u("Charlie")} + + assert sr.document(a=u("1")) == {"a": u("1"), "b": "a", "d": u("Alfa")} + assert sr.document(a=u("2")) == {"a": u("2"), "b": "b", "d": u("Bravo")} + + +def test_stored_fields2(): + schema = fields.Schema(content=fields.TEXT(stored=True), + title=fields.TEXT(stored=True), + summary=fields.STORED, + path=fields.ID(stored=True)) + + storedkeys = ["content", "path", "summary", "title"] + assert storedkeys == schema.stored_names() + + ix = RamStorage().create_index(schema) + + writer = ix.writer() + writer.add_document(content=u("Content of this document."), + title=u("This is the title"), + summary=u("This is the summary"), path=u("/main")) + writer.add_document(content=u("Second document."), title=u("Second title"), + summary=u("Summary numero due"), path=u("/second")) + writer.add_document(content=u("Third document."), title=u("Title 3"), + summary=u("Summary treo"), path=u("/san")) + writer.commit() + + with ix.searcher() as s: + doc = s.document(path="/main") + assert doc is not None + assert ([doc[k] for k in sorted(doc.keys())] + == ["Content of this document.", "/main", + "This is the summary", "This is the title"]) + + ix.close() + + +def test_all_stored_fields(): + # all_stored_fields() should yield all stored fields, even for deleted + # documents + + schema = fields.Schema(a=fields.ID(stored=True), b=fields.STORED) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(a=u("alfa"), b=u("bravo")) + w.add_document(a=u("apple"), b=u("bear")) + w.add_document(a=u("alpaca"), b=u("beagle")) + w.add_document(a=u("aim"), b=u("box")) + + w = ix.writer() + w.delete_by_term("a", "apple") + w.delete_by_term("a", "aim") + w.commit(merge=False) + + with ix.searcher() as s: + assert s.doc_count_all() == 4 + assert s.doc_count() == 2 + sfs = list((sf["a"], sf["b"]) for sf in s.all_stored_fields()) + assert sfs == [("alfa", "bravo"), ("alpaca", "beagle")] + + +def test_first_id(): + schema = fields.Schema(path=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(path=u("/a")) + w.add_document(path=u("/b")) + w.add_document(path=u("/c")) + w.commit() + + r = ix.reader() + docid = r.first_id("path", u("/b")) + assert r.stored_fields(docid) == {"path": "/b"} + + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(path=u("/a")) + w.add_document(path=u("/b")) + w.add_document(path=u("/c")) + w.commit(merge=False) + + w = ix.writer() + w.add_document(path=u("/d")) + w.add_document(path=u("/e")) + w.add_document(path=u("/f")) + w.commit(merge=False) + + w = ix.writer() + w.add_document(path=u("/g")) + w.add_document(path=u("/h")) + w.add_document(path=u("/i")) + w.commit(merge=False) + + r = ix.reader() + assert r.__class__ == reading.MultiReader + docid = r.first_id("path", u("/e")) + assert r.stored_fields(docid) == {"path": "/e"} + + +class RecoverReader(threading.Thread): + def __init__(self, ix): + threading.Thread.__init__(self) + self.ix = ix + + def run(self): + for _ in xrange(50): + r = self.ix.reader() + 
r.close() + + +class RecoverWriter(threading.Thread): + domain = u("alfa bravo charlie deleta echo foxtrot golf hotel india") + domain = domain.split() + + def __init__(self, ix): + threading.Thread.__init__(self) + self.ix = ix + + def run(self): + for _ in xrange(10): + w = self.ix.writer() + w.add_document(text=random.sample(self.domain, 4)) + w.commit() + time.sleep(0.01) + + +def test_delete_recovery(): + schema = fields.Schema(text=fields.TEXT) + with TempIndex(schema, "delrecover") as ix: + rw = RecoverWriter(ix) + rr = RecoverReader(ix) + rw.start() + rr.start() + rw.join() + rr.join() + + +def test_nonexclusive_read(): + schema = fields.Schema(text=fields.TEXT) + with TempIndex(schema, "readlock") as ix: + for num in u("one two three four five").split(): + w = ix.writer() + w.add_document(text=u("Test document %s") % num) + w.commit(merge=False) + + def fn(): + for _ in xrange(5): + r = ix.reader() + assert list(r.field_terms("text")) == ["document", "five", "four", "one", "test", "three", "two"] + r.close() + + ths = [threading.Thread(target=fn) for _ in xrange(5)] + for th in ths: + th.start() + for th in ths: + th.join() + + +def test_doc_count(): + schema = fields.Schema(id=fields.NUMERIC) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for i in xrange(10): + w.add_document(id=i) + + r = ix.reader() + assert r.doc_count() == 10 + assert r.doc_count_all() == 10 + + w = ix.writer() + w.delete_document(2) + w.delete_document(4) + w.delete_document(6) + w.delete_document(8) + w.commit() + + r = ix.reader() + assert r.doc_count() == 6 + assert r.doc_count_all() == 10 + + w = ix.writer() + for i in xrange(10, 15): + w.add_document(id=i) + w.commit(merge=False) + + r = ix.reader() + assert r.doc_count() == 11 + assert r.doc_count_all() == 15 + + w = ix.writer() + w.delete_document(10) + w.delete_document(12) + w.delete_document(14) + w.commit(merge=False) + + r = ix.reader() + assert r.doc_count() == 8 + assert r.doc_count_all() == 15 + + ix.optimize() + r = ix.reader() + assert r.doc_count() == 8 + assert r.doc_count_all() == 8 + + +def test_reader_subclasses(): + from whoosh.util.testing import check_abstract_methods + + check_abstract_methods(reading.IndexReader, SegmentReader) + check_abstract_methods(reading.IndexReader, reading.MultiReader) + check_abstract_methods(reading.IndexReader, reading.EmptyReader) + + +def test_cursor(): + schema = fields.Schema(text=fields.TEXT) + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(text=u"papa quebec romeo sierra tango") + w.add_document(text=u"foxtrot golf hotel india juliet") + w.add_document(text=u"alfa bravo charlie delta echo") + w.add_document(text=u"uniform victor whiskey x-ray") + w.add_document(text=u"kilo lima mike november oskar") + w.add_document(text=u"charlie alfa alfa bravo bravo bravo") + + with ix.reader() as r: + cur = r.cursor("text") + assert cur.text() == "alfa" + assert cur.next() == "bravo" + assert cur.text() == "bravo" + + assert cur.find(b"inc") == "india" + assert cur.text() == "india" + + assert cur.first() == "alfa" + assert cur.text() == "alfa" + + assert cur.find(b"zulu") is None + assert cur.text() is None + assert not cur.is_valid() + + assert cur.find(b"a") == "alfa" + assert cur.term_info().weight() == 3 + assert cur.next() == "bravo" + assert cur.term_info().weight() == 4 + assert cur.next() == "charlie" + assert cur.term_info().weight() == 2 diff --git a/tests/test_results.py b/tests/test_results.py new file mode 100644 index 0000000..dc04a2e --- /dev/null +++ 
b/tests/test_results.py @@ -0,0 +1,635 @@ +from __future__ import with_statement + +import pytest + +from whoosh import analysis, fields, formats, highlight, qparser, query +from whoosh.codec.whoosh3 import W3Codec +from whoosh.compat import u, xrange, text_type, permutations +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempStorage, TempIndex + + +def test_score_retrieval(): + schema = fields.Schema(title=fields.TEXT(stored=True), + content=fields.TEXT(stored=True)) + storage = RamStorage() + ix = storage.create_index(schema) + writer = ix.writer() + writer.add_document(title=u("Miss Mary"), + content=u("Mary had a little white lamb its fleece" + " was white as snow")) + writer.add_document(title=u("Snow White"), + content=u("Snow white lived in the forest with seven" + " dwarfs")) + writer.commit() + + with ix.searcher() as s: + results = s.search(query.Term("content", "white")) + assert len(results) == 2 + assert results[0]['title'] == u("Miss Mary") + assert results[1]['title'] == u("Snow White") + assert results.score(0) is not None + assert results.score(0) != 0 + assert results.score(0) != 1 + + +def test_resultcopy(): + schema = fields.Schema(a=fields.TEXT(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(a=u("alfa bravo charlie")) + w.add_document(a=u("bravo charlie delta")) + w.add_document(a=u("charlie delta echo")) + w.add_document(a=u("delta echo foxtrot")) + w.commit() + + with ix.searcher() as s: + r = s.search(qparser.QueryParser("a", None).parse(u("charlie"))) + assert len(r) == 3 + rcopy = r.copy() + assert r.top_n == rcopy.top_n + + +def test_resultslength(): + schema = fields.Schema(id=fields.ID(stored=True), + value=fields.TEXT) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), value=u("alfa alfa alfa alfa alfa")) + w.add_document(id=u("2"), value=u("alfa alfa alfa alfa")) + w.add_document(id=u("3"), value=u("alfa alfa alfa")) + w.add_document(id=u("4"), value=u("alfa alfa")) + w.add_document(id=u("5"), value=u("alfa")) + w.add_document(id=u("6"), value=u("bravo")) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Term("value", u("alfa")), limit=3) + assert len(r) == 5 + assert r.scored_length() == 3 + assert r[10:] == [] + + +def test_combine(): + schema = fields.Schema(id=fields.ID(stored=True), + value=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=u("1"), value=u("alfa bravo charlie all")) + w.add_document(id=u("2"), value=u("bravo charlie echo all")) + w.add_document(id=u("3"), value=u("charlie echo foxtrot all")) + w.add_document(id=u("4"), value=u("echo foxtrot india all")) + w.add_document(id=u("5"), value=u("foxtrot india juliet all")) + w.add_document(id=u("6"), value=u("india juliet alfa all")) + w.add_document(id=u("7"), value=u("juliet alfa bravo all")) + w.add_document(id=u("8"), value=u("charlie charlie charlie all")) + w.commit() + + with ix.searcher() as s: + def idsof(r): + return "".join(hit["id"] for hit in r) + + def check(r1, methodname, r2, ids): + getattr(r1, methodname)(r2) + assert idsof(r1) == ids + + def rfor(t): + return s.search(query.Term("value", t)) + + assert idsof(rfor(u("foxtrot"))) == "345" + check(rfor(u("foxtrot")), "extend", rfor("charlie"), "345812") + check(rfor(u("foxtrot")), "filter", rfor("juliet"), "5") + check(rfor(u("charlie")), "filter", rfor("foxtrot"), "3") + check(rfor(u("all")), "filter", rfor("foxtrot"), "345") + check(rfor(u("all")), 
"upgrade", rfor("india"), "45612378") + check(rfor(u("charlie")), "upgrade_and_extend", rfor("echo"), "23814") + + +def test_results_filter(): + schema = fields.Schema(id=fields.STORED, words=fields.KEYWORD(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id="1", words=u("bravo top")) + w.add_document(id="2", words=u("alfa top")) + w.add_document(id="3", words=u("alfa top")) + w.add_document(id="4", words=u("alfa bottom")) + w.add_document(id="5", words=u("bravo bottom")) + w.add_document(id="6", words=u("charlie bottom")) + w.add_document(id="7", words=u("charlie bottom")) + w.commit() + + with ix.searcher() as s: + def check(r, target): + result = "".join(s.stored_fields(d)["id"] for d in r.docs()) + assert result == target + + r = s.search(query.Term("words", u("alfa"))) + r.filter(s.search(query.Term("words", u("bottom")))) + check(r, "4") + + +def test_extend_empty(): + schema = fields.Schema(id=fields.STORED, words=fields.KEYWORD) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, words=u("alfa bravo charlie")) + w.add_document(id=2, words=u("bravo charlie delta")) + w.add_document(id=3, words=u("charlie delta echo")) + w.add_document(id=4, words=u("delta echo foxtrot")) + w.add_document(id=5, words=u("echo foxtrot golf")) + w.commit() + + with ix.searcher() as s: + # Get an empty results object + r1 = s.search(query.Term("words", u("hotel"))) + # Copy it + r1c = r1.copy() + # Get a non-empty results object + r2 = s.search(query.Term("words", u("delta"))) + # Copy it + r2c = r2.copy() + # Extend r1 with r2 + r1c.extend(r2c) + assert [hit["id"] for hit in r1c] == [2, 3, 4] + assert r1c.scored_length() == 3 + + +def test_extend_filtered(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, text=u("alfa bravo charlie")) + w.add_document(id=2, text=u("bravo charlie delta")) + w.add_document(id=3, text=u("juliet delta echo")) + w.add_document(id=4, text=u("delta bravo alfa")) + w.add_document(id=5, text=u("foxtrot sierra tango")) + w.commit() + + hits = lambda result: [hit["id"] for hit in result] + + with ix.searcher() as s: + r1 = s.search(query.Term("text", u("alfa")), filter=set([1, 4])) + assert r1.allowed == set([1, 4]) + assert len(r1.top_n) == 0 + + r2 = s.search(query.Term("text", u("bravo"))) + assert len(r2.top_n) == 3 + assert hits(r2) == [1, 2, 4] + + r3 = r1.copy() + assert r3.allowed == set([1, 4]) + assert len(r3.top_n) == 0 + r3.extend(r2) + assert len(r3.top_n) == 3 + assert hits(r3) == [1, 2, 4] + + +def test_pages(): + from whoosh.scoring import Frequency + + schema = fields.Schema(id=fields.ID(stored=True), c=fields.TEXT) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), c=u("alfa alfa alfa alfa alfa alfa")) + w.add_document(id=u("2"), c=u("alfa alfa alfa alfa alfa")) + w.add_document(id=u("3"), c=u("alfa alfa alfa alfa")) + w.add_document(id=u("4"), c=u("alfa alfa alfa")) + w.add_document(id=u("5"), c=u("alfa alfa")) + w.add_document(id=u("6"), c=u("alfa")) + w.commit() + + with ix.searcher(weighting=Frequency) as s: + q = query.Term("c", u("alfa")) + r = s.search(q) + assert [d["id"] for d in r] == ["1", "2", "3", "4", "5", "6"] + r = s.search_page(q, 2, pagelen=2) + assert [d["id"] for d in r] == ["3", "4"] + + r = s.search_page(q, 2, pagelen=4) + assert r.total == 6 + assert r.pagenum == 2 + assert r.pagelen == 2 + + +def test_pages_with_filter(): + 
from whoosh.scoring import Frequency + + schema = fields.Schema(id=fields.ID(stored=True), + type=fields.TEXT(), + c=fields.TEXT) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), type=u("odd"), c=u("alfa alfa alfa alfa alfa alfa")) + w.add_document(id=u("2"), type=u("even"), c=u("alfa alfa alfa alfa alfa")) + w.add_document(id=u("3"), type=u("odd"), c=u("alfa alfa alfa alfa")) + w.add_document(id=u("4"), type=u("even"), c=u("alfa alfa alfa")) + w.add_document(id=u("5"), type=u("odd"), c=u("alfa alfa")) + w.add_document(id=u("6"), type=u("even"), c=u("alfa")) + w.commit() + + with ix.searcher(weighting=Frequency) as s: + q = query.Term("c", u("alfa")) + filterq = query.Term("type", u("even")) + r = s.search(q, filter=filterq) + assert [d["id"] for d in r] == ["2", "4", "6"] + r = s.search_page(q, 2, pagelen=2, filter=filterq) + assert [d["id"] for d in r] == ["6"] + + +def test_extra_slice(): + schema = fields.Schema(key=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + for char in u("abcdefghijklmnopqrstuvwxyz"): + w.add_document(key=char) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Every(), limit=5) + assert r[6:7] == [] + + +def test_page_counts(): + from whoosh.scoring import Frequency + + schema = fields.Schema(id=fields.ID(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + for i in xrange(10): + w.add_document(id=text_type(i)) + w.commit() + + with ix.searcher(weighting=Frequency) as s: + q = query.Every("id") + + r = s.search(q) + assert len(r) == 10 + + with pytest.raises(ValueError): + s.search_page(q, 0) + + r = s.search_page(q, 1, 5) + assert len(r) == 10 + assert r.pagecount == 2 + + r = s.search_page(q, 1, 5) + assert len(r) == 10 + assert r.pagecount == 2 + + r = s.search_page(q, 2, 5) + assert len(r) == 10 + assert r.pagecount == 2 + assert r.pagenum == 2 + + r = s.search_page(q, 1, 10) + assert len(r) == 10 + assert r.pagecount == 1 + assert r.pagenum == 1 + + +def test_resultspage(): + schema = fields.Schema(id=fields.STORED, content=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + + domain = ("alfa", "bravo", "bravo", "charlie", "delta") + w = ix.writer() + for i, lst in enumerate(permutations(domain, 3)): + w.add_document(id=text_type(i), content=u(" ").join(lst)) + w.commit() + + with ix.searcher() as s: + q = query.Term("content", u("bravo")) + r = s.search(q, limit=10) + tops = list(r) + + rp = s.search_page(q, 1, pagelen=5) + assert rp.scored_length() == 5 + assert list(rp) == tops[0:5] + assert rp[10:] == [] + + rp = s.search_page(q, 2, pagelen=5) + assert list(rp) == tops[5:10] + + rp = s.search_page(q, 1, pagelen=10) + assert len(rp) == 54 + assert rp.pagecount == 6 + rp = s.search_page(q, 6, pagelen=10) + assert len(list(rp)) == 4 + assert rp.is_last_page() + + with pytest.raises(ValueError): + s.search_page(q, 0) + assert s.search_page(q, 10).pagenum == 6 + + rp = s.search_page(query.Term("content", "glonk"), 1) + assert len(rp) == 0 + assert rp.is_last_page() + + +def test_highlight_setters(): + schema = fields.Schema(text=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(text=u("Hello")) + w.commit() + + r = ix.searcher().search(query.Term("text", "hello")) + hl = highlight.Highlighter() + ucf = highlight.UppercaseFormatter() + r.highlighter = hl + r.formatter = ucf + assert hl.formatter is ucf + + +def test_snippets(): + ana = analysis.StemmingAnalyzer() + schema = 
fields.Schema(text=fields.TEXT(stored=True, analyzer=ana)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(text=u("Lay out the rough animation by creating the important poses where they occur on the timeline.")) + w.add_document(text=u("Set key frames on everything that's key-able. This is for control and predictability: you don't want to accidentally leave something un-keyed. This is also much faster than selecting the parameters to key.")) + w.add_document(text=u("Use constant (straight) or sometimes linear transitions between keyframes in the channel editor. This makes the character jump between poses.")) + w.add_document(text=u("Keying everything gives quick, immediate results. But it can become difficult to tweak the animation later, especially for complex characters.")) + w.add_document(text=u("Copy the current pose to create the next one: pose the character, key everything, then copy the keyframe in the playbar to another frame, and key everything at that frame.")) + w.commit() + + target = ["Set KEY frames on everything that's KEY-able", + "Copy the current pose to create the next one: pose the character, KEY everything, then copy the keyframe in the playbar to another frame, and KEY everything at that frame", + "KEYING everything gives quick, immediate results"] + + with ix.searcher() as s: + qp = qparser.QueryParser("text", ix.schema) + q = qp.parse(u("key")) + r = s.search(q, terms=True) + r.fragmenter = highlight.SentenceFragmenter() + r.formatter = highlight.UppercaseFormatter() + + assert sorted([hit.highlights("text", top=1) for hit in r]) == sorted(target) + + +def test_keyterms(): + ana = analysis.StandardAnalyzer() + vectorformat = formats.Frequency() + schema = fields.Schema(path=fields.ID, + content=fields.TEXT(analyzer=ana, + vector=vectorformat)) + st = RamStorage() + ix = st.create_index(schema) + w = ix.writer() + w.add_document(path=u("a"), content=u("This is some generic content")) + w.add_document(path=u("b"), content=u("This is some distinctive content")) + w.commit() + + with ix.searcher() as s: + docnum = s.document_number(path=u("b")) + keyterms = list(s.key_terms([docnum], "content")) + assert len(keyterms) > 0 + assert keyterms[0][0] == "distinctive" + + r = s.search(query.Term("path", u("b"))) + keyterms2 = list(r.key_terms("content")) + assert len(keyterms2) > 0 + assert keyterms2[0][0] == "distinctive" + + +def test_lengths(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=1, text=u("alfa bravo charlie delta echo")) + w.add_document(id=2, text=u("bravo charlie delta echo foxtrot")) + w.add_document(id=3, text=u("charlie needle echo foxtrot golf")) + w.add_document(id=4, text=u("delta echo foxtrot golf hotel")) + w.add_document(id=5, text=u("echo needle needle hotel india")) + w.add_document(id=6, text=u("foxtrot golf hotel india juliet")) + w.add_document(id=7, text=u("golf needle india juliet kilo")) + w.add_document(id=8, text=u("hotel india juliet needle lima")) + w.commit() + + with ix.searcher() as s: + q = query.Or([query.Term("text", u("needle")), query.Term("text", u("charlie"))]) + r = s.search(q, limit=2) + assert not r.has_exact_length() + assert r.estimated_length() == 7 + assert r.estimated_min_length() == 3 + assert r.scored_length() == 2 + assert len(r) == 6 + + +def test_lengths2(): + schema = fields.Schema(text=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + count = 0 + for _ in xrange(3): + w = 
ix.writer() + for ls in permutations(u("alfa bravo charlie").split()): + if "bravo" in ls and "charlie" in ls: + count += 1 + w.add_document(text=u(" ").join(ls)) + w.commit(merge=False) + + with ix.searcher() as s: + q = query.Or([query.Term("text", u("bravo")), query.Term("text", u("charlie"))]) + r = s.search(q, limit=None) + assert len(r) == count + + r = s.search(q, limit=3) + assert len(r) == count + + +def test_stability(): + schema = fields.Schema(text=fields.TEXT) + ix = RamStorage().create_index(schema) + domain = u("alfa bravo charlie delta").split() + w = ix.writer() + for ls in permutations(domain, 3): + w.add_document(text=u(" ").join(ls)) + w.commit() + + with ix.searcher() as s: + q = query.Term("text", u("bravo")) + last = [] + for i in xrange(s.doc_frequency("text", u("bravo"))): + # Only un-optimized results are stable + r = s.search(q, limit=i + 1, optimize=False) + docnums = [hit.docnum for hit in r] + assert docnums[:-1] == last + last = docnums + + +def test_terms(): + schema = fields.Schema(text=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(text=u("alfa sierra tango")) + w.add_document(text=u("bravo charlie delta")) + w.add_document(text=u("charlie delta echo")) + w.add_document(text=u("delta echo foxtrot")) + w.commit() + + qp = qparser.QueryParser("text", ix.schema) + q = qp.parse(u("(bravo AND charlie) OR foxtrot OR missing")) + r = ix.searcher().search(q, terms=True) + + fieldobj = schema["text"] + + def txts(tset): + return sorted(fieldobj.from_bytes(t[1]) for t in tset) + + assert txts(r.matched_terms()) == ["bravo", "charlie", "foxtrot"] + for hit in r: + value = hit["text"] + for txt in txts(hit.matched_terms()): + assert txt in value + + +def test_hit_column(): + # Not stored + schema = fields.Schema(text=fields.TEXT()) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(text=u("alfa bravo charlie")) + + with ix.searcher() as s: + r = s.search(query.Term("text", "alfa")) + assert len(r) == 1 + hit = r[0] + with pytest.raises(KeyError): + _ = hit["text"] + + # With column + schema = fields.Schema(text=fields.TEXT(sortable=True)) + ix = RamStorage().create_index(schema) + with ix.writer(codec=W3Codec()) as w: + w.add_document(text=u("alfa bravo charlie")) + + with ix.searcher() as s: + r = s.search(query.Term("text", "alfa")) + assert len(r) == 1 + hit = r[0] + assert hit["text"] == u("alfa bravo charlie") + + +def test_closed_searcher(): + from whoosh.reading import ReaderClosed + + schema = fields.Schema(key=fields.KEYWORD(stored=True, sortable=True)) + + with TempStorage() as st: + ix = st.create_index(schema) + with ix.writer() as w: + w.add_document(key=u"alfa") + w.add_document(key=u"bravo") + w.add_document(key=u"charlie") + w.add_document(key=u"delta") + w.add_document(key=u"echo") + + s = ix.searcher() + r = s.search(query.TermRange("key", "b", "d")) + s.close() + assert s.is_closed + with pytest.raises(ReaderClosed): + assert r[0]["key"] == "bravo" + with pytest.raises(ReaderClosed): + s.reader().column_reader("key") + with pytest.raises(ReaderClosed): + s.suggest("key", "brovo") + + s = ix.searcher() + r = s.search(query.TermRange("key", "b", "d")) + assert r[0] + assert r[0]["key"] == "bravo" + c = s.reader().column_reader("key") + assert c[1] == "bravo" + assert s.suggest("key", "brovo") == ["bravo"] + + +def test_paged_highlights(): + schema = fields.Schema(text=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + 
w.add_document(text=u("alfa bravo charlie delta echo foxtrot")) + w.add_document(text=u("bravo charlie delta echo foxtrot golf")) + w.add_document(text=u("charlie delta echo foxtrot golf hotel")) + w.add_document(text=u("delta echo foxtrot golf hotel india")) + w.add_document(text=u("echo foxtrot golf hotel india juliet")) + w.add_document(text=u("foxtrot golf hotel india juliet kilo")) + + with ix.searcher() as s: + q = query.Term("text", u("alfa")) + page = s.search_page(q, 1, pagelen=3) + + page.results.fragmenter = highlight.WholeFragmenter() + page.results.formatter = highlight.UppercaseFormatter() + hi = page[0].highlights("text") + assert hi == u("ALFA bravo charlie delta echo foxtrot") + + +def test_phrase_keywords(): + schema = fields.Schema(text=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(text=u("alfa bravo charlie delta")) + w.add_document(text=u("bravo charlie delta echo")) + w.add_document(text=u("charlie delta echo foxtrot")) + w.add_document(text=u("delta echo foxtrot alfa")) + w.add_document(text=u("echo foxtrot alfa bravo")) + + with ix.searcher() as s: + q = query.Phrase("text", u("alfa bravo").split()) + r = s.search(q) + assert len(r) == 2 + kts = " ".join(t for t, score in r.key_terms("text")) + assert kts == "alfa bravo charlie foxtrot delta" + + +def test_every_keywords(): + schema = fields.Schema(title=fields.TEXT, content=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(title=u("alfa"), content=u("bravo")) + w.add_document(title=u("charlie"), content=u("delta")) + + with ix.searcher() as s: + q = qparser.QueryParser("content", ix.schema).parse("*") + assert isinstance(q, query.Every) + + r = s.search(q, terms=True) + assert len(r) == 2 + hit = r[0] + assert hit["content"] == "bravo" + assert hit.highlights("content") == "" + + +def test_filter_by_result(): + schema = fields.Schema(title=fields.TEXT(stored=True), + content=fields.TEXT(stored=True)) + + with TempIndex(schema, "filter") as ix: + words = u("foo bar baz qux barney").split() + with ix.writer() as w: + for x in xrange(100): + t = u("even" if x % 2 == 0 else "odd") + c = words[x % len(words)] + w.add_document(title=t, content=c) + + with ix.searcher() as searcher: + fq = query.Term("title", "even") + filter_result = searcher.search(fq) + assert filter_result.docset is None + + q = query.Term("content", "foo") + + # filter_result.docs() + result = searcher.search(q, filter=filter_result) + assert all(x["title"] == "even" and x["content"] == "foo" + for x in result) + diff --git a/tests/test_searching.py b/tests/test_searching.py new file mode 100644 index 0000000..ec9dd85 --- /dev/null +++ b/tests/test_searching.py @@ -0,0 +1,1737 @@ +#encoding: utf-8 + +from __future__ import with_statement +import copy +from datetime import datetime, timedelta + +import pytest + +from whoosh import analysis, fields, index, qparser, query, searching, scoring +from whoosh.codec.whoosh3 import W3Codec +from whoosh.compat import b, u, text_type +from whoosh.compat import xrange, permutations, izip_longest +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +def make_index(): + s = fields.Schema(key=fields.ID(stored=True), + name=fields.TEXT, + value=fields.TEXT) + st = RamStorage() + ix = st.create_index(s) + + w = ix.writer() + w.add_document(key=u("A"), name=u("Yellow brown"), + value=u("Blue red green render purple?")) + w.add_document(key=u("B"), 
name=u("Alpha beta"), + value=u("Gamma delta epsilon omega.")) + w.add_document(key=u("C"), name=u("One two"), + value=u("Three rendered four five.")) + w.add_document(key=u("D"), name=u("Quick went"), + value=u("Every red town.")) + w.add_document(key=u("E"), name=u("Yellow uptown"), + value=u("Interest rendering outer photo!")) + w.commit() + + return ix + + +def _get_keys(stored_fields): + return sorted([d.get("key") for d in stored_fields]) + + +def _docs(q, s): + return _get_keys([s.stored_fields(docnum) for docnum + in q.docs(s)]) + + +def _run_query(q, target): + ix = make_index() + with ix.searcher() as s: + assert target == _docs(q, s) + + +def test_empty_index(): + schema = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT) + st = RamStorage() + with pytest.raises(index.EmptyIndexError): + st.open_index(schema=schema) + + +def test_docs_method(): + ix = make_index() + with ix.searcher() as s: + assert _get_keys(s.documents(name="yellow")) == ["A", "E"] + assert _get_keys(s.documents(value="red")) == ["A", "D"] + assert _get_keys(s.documents()) == ["A", "B", "C", "D", "E"] + + +def test_term(): + _run_query(query.Term("name", u("yellow")), [u("A"), u("E")]) + _run_query(query.Term("value", u("zeta")), []) + _run_query(query.Term("value", u("red")), [u("A"), u("D")]) + + +def test_require(): + _run_query(query.Require(query.Term("value", u("red")), + query.Term("name", u("yellow"))), + [u("A")]) + + +def test_and(): + _run_query(query.And([query.Term("value", u("red")), + query.Term("name", u("yellow"))]), + [u("A")]) + # Missing + _run_query(query.And([query.Term("value", u("ochre")), + query.Term("name", u("glonk"))]), + []) + + +def test_or(): + _run_query(query.Or([query.Term("value", u("red")), + query.Term("name", u("yellow"))]), + [u("A"), u("D"), u("E")]) + # Missing + _run_query(query.Or([query.Term("value", u("ochre")), + query.Term("name", u("glonk"))]), + []) + _run_query(query.Or([]), []) + + +def test_ors(): + domain = u("alfa bravo charlie delta").split() + s = fields.Schema(num=fields.STORED, text=fields.TEXT) + st = RamStorage() + ix = st.create_index(s) + with ix.writer() as w: + for i, ls in enumerate(permutations(domain)): + w.add_document(num=i, text=" ".join(ls)) + + with ix.searcher() as s: + qs = [query.Term("text", word) for word in domain] + for i in xrange(1, len(domain)): + q = query.Or(qs[:i]) + r1 = [(hit.docnum, hit.score) for hit in s.search(q, limit=None)] + + q.binary_matcher = True + r2 = [(hit.docnum, hit.score) for hit in s.search(q, limit=None)] + + for item1, item2 in izip_longest(r1, r2): + assert item1[0] == item2[0] + assert item1[1] == item2[1] + + +def test_not(): + _run_query(query.And([query.Or([query.Term("value", u("red")), + query.Term("name", u("yellow"))]), + query.Not(query.Term("name", u("quick")))]), + [u("A"), u("E")]) + + +def test_topnot(): + _run_query(query.Not(query.Term("value", "red")), [u("B"), "C", "E"]) + _run_query(query.Not(query.Term("name", "yellow")), [u("B"), u("C"), + u("D")]) + + +def test_andnot(): + _run_query(query.AndNot(query.Term("name", u("yellow")), + query.Term("value", u("purple"))), + [u("E")]) + + +def test_andnot2(): + schema = fields.Schema(a=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(a=u("bravo")) + w.add_document(a=u("echo")) + w.add_document(a=u("juliet")) + w.commit() + w = ix.writer() + w.add_document(a=u("kilo")) + w.add_document(a=u("foxtrot")) + w.add_document(a=u("charlie")) + w.commit(merge=False) + w = ix.writer() + 
w.delete_by_term("a", u("echo")) + w.add_document(a=u("alfa")) + w.add_document(a=u("india")) + w.add_document(a=u("delta")) + w.commit(merge=False) + + with ix.searcher() as s: + q = query.TermRange("a", u("bravo"), u("k")) + qr = [hit["a"] for hit in s.search(q)] + assert " ".join(sorted(qr)) == "bravo charlie delta foxtrot india juliet" + + oq = query.Or([query.Term("a", "bravo"), query.Term("a", "delta")]) + oqr = [hit["a"] for hit in s.search(oq)] + assert " ".join(sorted(oqr)) == "bravo delta" + + anq = query.AndNot(q, oq) + + m = anq.matcher(s) + r = s.search(anq) + assert list(anq.docs(s)) == sorted(hit.docnum for hit in r) + assert " ".join(sorted(hit["a"] for hit in r)) == "charlie foxtrot india juliet" + + +def test_variations(): + _run_query(query.Variations("value", u("render")), + [u("A"), u("C"), u("E")]) + + +def test_wildcard(): + _run_query(query.Or([query.Wildcard('value', u('*red*')), + query.Wildcard('name', u('*yellow*'))]), + [u("A"), u("C"), u("D"), u("E")]) + # Missing + _run_query(query.Wildcard('value', 'glonk*'), []) + + +def test_not2(): + schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT) + storage = RamStorage() + ix = storage.create_index(schema) + writer = ix.writer() + writer.add_document(name=u("a"), value=u("alfa bravo charlie delta echo")) + writer.add_document(name=u("b"), + value=u("bravo charlie delta echo foxtrot")) + writer.add_document(name=u("c"), + value=u("charlie delta echo foxtrot golf")) + writer.add_document(name=u("d"), value=u("delta echo golf hotel india")) + writer.add_document(name=u("e"), value=u("echo golf hotel india juliet")) + writer.commit() + + with ix.searcher() as s: + p = qparser.QueryParser("value", None) + results = s.search(p.parse("echo NOT golf")) + assert sorted([d["name"] for d in results]) == ["a", "b"] + + results = s.search(p.parse("echo NOT bravo")) + assert sorted([d["name"] for d in results]) == ["c", "d", "e"] + + ix.delete_by_term("value", u("bravo")) + + with ix.searcher() as s: + results = s.search(p.parse("echo NOT charlie")) + assert sorted([d["name"] for d in results]) == ["d", "e"] + +# def test_or_minmatch(): +# schema = fields.Schema(k=fields.STORED, v=fields.TEXT) +# st = RamStorage() +# ix = st.create_index(schema) +# +# w = ix.writer() +# w.add_document(k=1, v=u("alfa bravo charlie delta echo")) +# w.add_document(k=2, v=u("bravo charlie delta echo foxtrot")) +# w.add_document(k=3, v=u("charlie delta echo foxtrot golf")) +# w.add_document(k=4, v=u("delta echo foxtrot golf hotel")) +# w.add_document(k=5, v=u("echo foxtrot golf hotel india")) +# w.add_document(k=6, v=u("foxtrot golf hotel india juliet")) +# w.commit() +# +# s = ix.searcher() +# q = Or([Term("v", "echo"), Term("v", "foxtrot")], minmatch=2) +# r = s.search(q) +# assert sorted(d["k"] for d in r), [2, 3, 4, 5]) + + +def test_range(): + schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(id=u("A"), content=u("alfa bravo charlie delta echo")) + w.add_document(id=u("B"), content=u("bravo charlie delta echo foxtrot")) + w.add_document(id=u("C"), content=u("charlie delta echo foxtrot golf")) + w.add_document(id=u("D"), content=u("delta echo foxtrot golf hotel")) + w.add_document(id=u("E"), content=u("echo foxtrot golf hotel india")) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("content", schema) + + q = qp.parse(u("charlie [delta TO foxtrot]")) + assert q.__class__ == query.And + assert 
q[0].__class__ == query.Term + assert q[1].__class__ == query.TermRange + assert q[1].start == "delta" + assert q[1].end == "foxtrot" + assert not q[1].startexcl + assert not q[1].endexcl + ids = sorted([d['id'] for d in s.search(q)]) + assert ids == [u('A'), u('B'), u('C')] + + q = qp.parse(u("foxtrot {echo TO hotel]")) + assert q.__class__ == query.And + assert q[0].__class__ == query.Term + assert q[1].__class__ == query.TermRange + assert q[1].start == "echo" + assert q[1].end == "hotel" + assert q[1].startexcl + assert not q[1].endexcl + ids = sorted([d['id'] for d in s.search(q)]) + assert ids == [u('B'), u('C'), u('D'), u('E')] + + q = qp.parse(u("{bravo TO delta}")) + assert q.__class__ == query.TermRange + assert q.start == "bravo" + assert q.end == "delta" + assert q.startexcl + assert q.endexcl + ids = sorted([d['id'] for d in s.search(q)]) + assert ids == [u('A'), u('B'), u('C')] + + # Shouldn't match anything + q = qp.parse(u("[1 to 10]")) + assert q.__class__ == query.TermRange + assert len(s.search(q)) == 0 + + +def test_range_clusiveness(): + schema = fields.Schema(id=fields.ID(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + w = ix.writer() + for letter in u("abcdefg"): + w.add_document(id=letter) + w.commit() + + with ix.searcher() as s: + def check(startexcl, endexcl, string): + q = query.TermRange("id", "b", "f", startexcl, endexcl) + r = "".join(sorted(d['id'] for d in s.search(q))) + assert r == string + + check(False, False, "bcdef") + check(True, False, "cdef") + check(True, True, "cde") + check(False, True, "bcde") + + +def test_open_ranges(): + schema = fields.Schema(id=fields.ID(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + w = ix.writer() + for letter in u("abcdefg"): + w.add_document(id=letter) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("id", schema) + + def check(qstring, result): + q = qp.parse(qstring) + r = "".join(sorted([d['id'] for d in s.search(q)])) + assert r == result + + check(u("[b TO]"), "bcdefg") + check(u("[TO e]"), "abcde") + check(u("[b TO d]"), "bcd") + check(u("{b TO]"), "cdefg") + check(u("[TO e}"), "abcd") + check(u("{b TO d}"), "c") + + +def test_open_numeric_ranges(): + domain = range(0, 1000, 7) + + schema = fields.Schema(num=fields.NUMERIC(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + for i in domain: + w.add_document(num=i) + w.commit() + + qp = qparser.QueryParser("num", schema) + with ix.searcher() as s: + q = qp.parse("[100 to]") + r = [hit["num"] for hit in s.search(q, limit=None)] + assert r == [n for n in domain if n >= 100] + + q = qp.parse("[to 500]") + r = [hit["num"] for hit in s.search(q, limit=None)] + assert r == [n for n in domain if n <= 500] + + +def test_open_date_ranges(): + basedate = datetime(2011, 1, 24, 6, 25, 0, 0) + domain = [basedate + timedelta(days=n) for n in xrange(-20, 20)] + + schema = fields.Schema(date=fields.DATETIME(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + for d in domain: + w.add_document(date=d) + w.commit() + + with ix.searcher() as s: + # Without date parser + qp = qparser.QueryParser("date", schema) + q = qp.parse("[2011-01-10 to]") + r = [hit["date"] for hit in s.search(q, limit=None)] + assert len(r) > 0 + target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)] + assert r == target + + q = qp.parse("[to 2011-01-30]") + r = [hit["date"] for hit in s.search(q, limit=None)] + assert len(r) > 0 + target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)] 
+ assert r == target + + # With date parser + from whoosh.qparser.dateparse import DateParserPlugin + qp.add_plugin(DateParserPlugin(basedate)) + + q = qp.parse("[10 jan 2011 to]") + r = [hit["date"] for hit in s.search(q, limit=None)] + assert len(r) > 0 + target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)] + assert r == target + + q = qp.parse("[to 30 jan 2011]") + r = [hit["date"] for hit in s.search(q, limit=None)] + assert len(r) > 0 + target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)] + assert r == target + + +def test_negated_unlimited_ranges(): + # Whoosh should treat u("[to]") as if it was "*" + schema = fields.Schema(id=fields.ID(stored=True), num=fields.NUMERIC, + date=fields.DATETIME) + ix = RamStorage().create_index(schema) + w = ix.writer() + from string import ascii_letters + domain = text_type(ascii_letters) + + dt = datetime.now() + for i, letter in enumerate(domain): + w.add_document(id=letter, num=i, date=dt + timedelta(days=i)) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("id", schema) + + nq = qp.parse(u("NOT [to]")) + assert nq.__class__ == query.Not + q = nq.query + assert q.__class__ == query.Every + assert "".join(h["id"] for h in s.search(q, limit=None)) == domain + assert not list(nq.docs(s)) + + nq = qp.parse(u("NOT num:[to]")) + assert nq.__class__ == query.Not + q = nq.query + assert q.__class__ == query.NumericRange + assert q.start is None + assert q.end is None + assert "".join(h["id"] for h in s.search(q, limit=None)) == domain + assert not list(nq.docs(s)) + + nq = qp.parse(u("NOT date:[to]")) + assert nq.__class__ == query.Not + q = nq.query + assert q.__class__ == query.Every + assert "".join(h["id"] for h in s.search(q, limit=None)) == domain + assert not list(nq.docs(s)) + + +def test_keyword_or(): + schema = fields.Schema(a=fields.ID(stored=True), b=fields.KEYWORD) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(a=u("First"), b=u("ccc ddd")) + w.add_document(a=u("Second"), b=u("aaa ddd")) + w.add_document(a=u("Third"), b=u("ccc eee")) + w.commit() + + qp = qparser.QueryParser("b", schema) + with ix.searcher() as s: + qr = qp.parse(u("b:ccc OR b:eee")) + assert qr.__class__ == query.Or + r = s.search(qr) + assert len(r) == 2 + assert r[0]["a"] == "Third" + assert r[1]["a"] == "First" + + +def test_merged(): + schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(id=u("alfa"), content=u("alfa")) + w.add_document(id=u("bravo"), content=u("bravo")) + + with ix.searcher() as s: + r = s.search(query.Term("content", u("bravo"))) + assert len(r) == 1 + assert r[0]["id"] == "bravo" + + with ix.writer() as w: + w.add_document(id=u("charlie"), content=u("charlie")) + w.optimize = True + + assert len(ix._segments()) == 1 + + with ix.searcher() as s: + r = s.search(query.Term("content", u("bravo"))) + assert len(r) == 1 + assert r[0]["id"] == "bravo" + + +def test_multireader(): + sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) + st = RamStorage() + ix = st.create_index(sc) + w = ix.writer() + w.add_document(id=u("alfa"), content=u("alfa")) + w.add_document(id=u("bravo"), content=u("bravo")) + w.add_document(id=u("charlie"), content=u("charlie")) + w.add_document(id=u("delta"), content=u("delta")) + w.add_document(id=u("echo"), content=u("echo")) + w.add_document(id=u("foxtrot"), content=u("foxtrot")) + w.add_document(id=u("golf"), content=u("golf")) + 
w.add_document(id=u("hotel"), content=u("hotel")) + w.add_document(id=u("india"), content=u("india")) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Term("content", u("bravo"))) + assert len(r) == 1 + assert r[0]["id"] == "bravo" + + w = ix.writer() + w.add_document(id=u("juliet"), content=u("juliet")) + w.add_document(id=u("kilo"), content=u("kilo")) + w.add_document(id=u("lima"), content=u("lima")) + w.add_document(id=u("mike"), content=u("mike")) + w.add_document(id=u("november"), content=u("november")) + w.add_document(id=u("oscar"), content=u("oscar")) + w.add_document(id=u("papa"), content=u("papa")) + w.add_document(id=u("quebec"), content=u("quebec")) + w.add_document(id=u("romeo"), content=u("romeo")) + w.commit() + assert len(ix._segments()) == 2 + + #r = ix.reader() + #assert r.__class__.__name__ == "MultiReader" + #pr = r.postings("content", u("bravo")) + + with ix.searcher() as s: + r = s.search(query.Term("content", u("bravo"))) + assert len(r) == 1 + assert r[0]["id"] == "bravo" + + +def test_posting_phrase(): + schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT) + storage = RamStorage() + ix = storage.create_index(schema) + writer = ix.writer() + writer.add_document(name=u("A"), + value=u("Little Miss Muffet sat on a tuffet")) + writer.add_document(name=u("B"), value=u("Miss Little Muffet tuffet")) + writer.add_document(name=u("C"), value=u("Miss Little Muffet tuffet sat")) + writer.add_document(name=u("D"), + value=u("Gibberish blonk falunk miss muffet sat " + + "tuffet garbonzo")) + writer.add_document(name=u("E"), value=u("Blah blah blah pancakes")) + writer.commit() + + with ix.searcher() as s: + def names(results): + return sorted([fields['name'] for fields in results]) + + q = query.Phrase("value", [u("little"), u("miss"), u("muffet"), + u("sat"), u("tuffet")]) + m = q.matcher(s) + assert m.__class__.__name__ == "SpanNear2Matcher" + + r = s.search(q) + assert names(r) == ["A"] + assert len(r) == 1 + + q = query.Phrase("value", [u("miss"), u("muffet"), u("sat"), + u("tuffet")]) + assert names(s.search(q)) == ["A", "D"] + + q = query.Phrase("value", [u("falunk"), u("gibberish")]) + r = s.search(q) + assert not names(r) + assert len(r) == 0 + + q = query.Phrase("value", [u("gibberish"), u("falunk")], slop=2) + assert names(s.search(q)) == ["D"] + + q = query.Phrase("value", [u("blah")] * 4) + assert not names(s.search(q)) # blah blah blah blah + + q = query.Phrase("value", [u("blah")] * 3) + m = q.matcher(s) + assert names(s.search(q)) == ["E"] + + +def test_phrase_score(): + schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT) + storage = RamStorage() + ix = storage.create_index(schema) + writer = ix.writer() + writer.add_document(name=u("A"), + value=u("Little Miss Muffet sat on a tuffet")) + writer.add_document(name=u("D"), + value=u("Gibberish blonk falunk miss muffet sat " + + "tuffet garbonzo")) + writer.add_document(name=u("E"), value=u("Blah blah blah pancakes")) + writer.add_document(name=u("F"), + value=u("Little miss muffet little miss muffet")) + writer.commit() + + with ix.searcher() as s: + q = query.Phrase("value", [u("little"), u("miss"), u("muffet")]) + m = q.matcher(s) + assert m.id() == 0 + score1 = m.weight() + assert score1 > 0 + m.next() + assert m.id() == 3 + assert m.weight() > score1 + + +def test_stop_phrase(): + schema = fields.Schema(title=fields.TEXT(stored=True)) + storage = RamStorage() + ix = storage.create_index(schema) + writer = ix.writer() + writer.add_document(title=u("Richard of 
York")) + writer.add_document(title=u("Lily the Pink")) + writer.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("title", schema) + q = qp.parse(u("richard of york")) + assert q.__unicode__() == "(title:richard AND title:york)" + assert len(s.search(q)) == 1 + #q = qp.parse(u("lily the pink")) + #assert len(s.search(q)), 1) + assert len(s.find("title", u("lily the pink"))) == 1 + + +def test_phrase_order(): + tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()) + schema = fields.Schema(text=tfield) + storage = RamStorage() + ix = storage.create_index(schema) + + writer = ix.writer() + for ls in permutations(["ape", "bay", "can", "day"], 4): + writer.add_document(text=u(" ").join(ls)) + writer.commit() + + with ix.searcher() as s: + def result(q): + r = s.search(q, limit=None, sortedby=None) + return sorted([d['text'] for d in r]) + + q = query.Phrase("text", ["bay", "can", "day"]) + assert result(q) == [u('ape bay can day'), u('bay can day ape')] + + +def test_phrase_sameword(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + storage = RamStorage() + ix = storage.create_index(schema) + + writer = ix.writer() + writer.add_document(id=1, text=u("The film Linda Linda Linda is good")) + writer.add_document(id=2, text=u("The model Linda Evangelista is pretty")) + writer.commit() + + with ix.searcher() as s: + r = s.search(query.Phrase("text", ["linda", "linda", "linda"]), + limit=None) + assert len(r) == 1 + assert r[0]["id"] == 1 + + +def test_phrase_multi(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + ix = RamStorage().create_index(schema) + + domain = u("alfa bravo charlie delta echo").split() + w = None + for i, ls in enumerate(permutations(domain)): + if w is None: + w = ix.writer() + w.add_document(id=i, text=u(" ").join(ls)) + if not i % 30: + w.commit() + w = None + if w is not None: + w.commit() + + with ix.searcher() as s: + q = query.Phrase("text", ["alfa", "bravo"]) + _ = s.search(q) + + +def test_missing_field_scoring(): + schema = fields.Schema(name=fields.TEXT(stored=True), + hobbies=fields.TEXT(stored=True)) + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(name=u('Frank'), hobbies=u('baseball, basketball')) + + with ix.reader() as r: + assert r.field_length("hobbies") == 2 + assert r.field_length("name") == 1 + + with ix.writer() as w: + w.add_document(name=u('Jonny')) + + with ix.searcher() as s: + assert s.field_length("hobbies") == 2 + assert s.field_length("name") == 2 + + parser = qparser.MultifieldParser(['name', 'hobbies'], schema) + q = parser.parse(u("baseball")) + result = s.search(q) + assert len(result) == 1 + + +def test_search_fieldname_underscores(): + s = fields.Schema(my_name=fields.ID(stored=True), my_value=fields.TEXT) + st = RamStorage() + ix = st.create_index(s) + + w = ix.writer() + w.add_document(my_name=u("Green"), my_value=u("It's not easy being green")) + w.add_document(my_name=u("Red"), + my_value=u("Hopping mad like a playground ball")) + w.commit() + + qp = qparser.QueryParser("my_value", schema=s) + with ix.searcher() as s: + r = s.search(qp.parse(u("my_name:Green"))) + assert r[0]['my_name'] == "Green" + + +def test_short_prefix(): + s = fields.Schema(name=fields.ID, value=fields.TEXT) + qp = qparser.QueryParser("value", schema=s) + q = qp.parse(u("s*")) + assert q.__class__.__name__ == "Prefix" + assert q.text == "s" + + +def test_weighting(): + from whoosh.scoring import Weighting, BaseScorer + + schema = fields.Schema(id=fields.ID(stored=True), + 
n_comments=fields.STORED) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), n_comments=5) + w.add_document(id=u("2"), n_comments=12) + w.add_document(id=u("3"), n_comments=2) + w.add_document(id=u("4"), n_comments=7) + w.commit() + + # Fake Weighting implementation + class CommentWeighting(Weighting): + def scorer(self, searcher, fieldname, text, qf=1): + return self.CommentScorer(searcher.stored_fields) + + class CommentScorer(BaseScorer): + def __init__(self, stored_fields): + self.stored_fields = stored_fields + + def score(self, matcher): + sf = self.stored_fields(matcher.id()) + ncomments = sf.get("n_comments", 0) + return ncomments + + with ix.searcher(weighting=CommentWeighting()) as s: + q = query.TermRange("id", u("1"), u("4"), constantscore=False) + + r = s.search(q) + ids = [fs["id"] for fs in r] + assert ids == ["2", "4", "1", "3"] + + +def test_dismax(): + schema = fields.Schema(id=fields.STORED, + f1=fields.TEXT, f2=fields.TEXT, f3=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, f1=u("alfa bravo charlie delta"), + f2=u("alfa alfa alfa"), + f3=u("alfa echo foxtrot hotel india")) + w.commit() + + with ix.searcher(weighting=scoring.Frequency()) as s: + assert list(s.documents(f1="alfa")) == [{"id": 1}] + assert list(s.documents(f2="alfa")) == [{"id": 1}] + assert list(s.documents(f3="alfa")) == [{"id": 1}] + + qs = [query.Term("f1", "alfa"), query.Term("f2", "alfa"), + query.Term("f3", "alfa")] + dm = query.DisjunctionMax(qs) + r = s.search(dm) + assert r.score(0) == 3.0 + + +def test_deleted_wildcard(): + schema = fields.Schema(id=fields.ID(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(id=u("alfa")) + w.add_document(id=u("bravo")) + w.add_document(id=u("charlie")) + w.add_document(id=u("delta")) + w.add_document(id=u("echo")) + w.add_document(id=u("foxtrot")) + w.commit() + + w = ix.writer() + w.delete_by_term("id", "bravo") + w.delete_by_term("id", "delta") + w.delete_by_term("id", "echo") + w.commit() + + with ix.searcher() as s: + r = s.search(query.Every("id")) + assert sorted([d['id'] for d in r]) == ["alfa", "charlie", "foxtrot"] + + +def test_missing_wildcard(): + schema = fields.Schema(id=fields.ID(stored=True), f1=fields.TEXT, + f2=fields.TEXT) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), f1=u("alfa"), f2=u("apple")) + w.add_document(id=u("2"), f1=u("bravo")) + w.add_document(id=u("3"), f1=u("charlie"), f2=u("candy")) + w.add_document(id=u("4"), f2=u("donut")) + w.add_document(id=u("5")) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Every("id")) + assert sorted([d['id'] for d in r]) == ["1", "2", "3", "4", "5"] + + r = s.search(query.Every("f1")) + assert sorted([d['id'] for d in r]) == ["1", "2", "3"] + + r = s.search(query.Every("f2")) + assert sorted([d['id'] for d in r]) == ["1", "3", "4"] + + +def test_finalweighting(): + from whoosh.scoring import Frequency + + schema = fields.Schema(id=fields.ID(stored=True), + summary=fields.TEXT, + n_comments=fields.STORED) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), summary=u("alfa bravo"), n_comments=5) + w.add_document(id=u("2"), summary=u("alfa"), n_comments=12) + w.add_document(id=u("3"), summary=u("bravo"), n_comments=2) + w.add_document(id=u("4"), summary=u("bravo bravo"), n_comments=7) + w.commit() + + class CommentWeighting(Frequency): + 
use_final = True + + def final(self, searcher, docnum, score): + ncomments = searcher.stored_fields(docnum).get("n_comments", 0) + return ncomments + + with ix.searcher(weighting=CommentWeighting()) as s: + q = qparser.QueryParser("summary", None).parse("alfa OR bravo") + r = s.search(q) + ids = [fs["id"] for fs in r] + assert ["2", "4", "1", "3"] == ids + + +def test_outofdate(): + schema = fields.Schema(id=fields.ID(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + + w = ix.writer() + w.add_document(id=u("1")) + w.add_document(id=u("2")) + w.commit() + + s = ix.searcher() + assert s.up_to_date() + + w = ix.writer() + w.add_document(id=u("3")) + w.add_document(id=u("4")) + + assert s.up_to_date() + w.commit() + assert not s.up_to_date() + + s = s.refresh() + assert s.up_to_date() + s.close() + + +def test_find_missing(): + schema = fields.Schema(id=fields.ID, text=fields.KEYWORD(stored=True)) + ix = RamStorage().create_index(schema) + + w = ix.writer() + w.add_document(id=u("1"), text=u("alfa")) + w.add_document(id=u("2"), text=u("bravo")) + w.add_document(text=u("charlie")) + w.add_document(id=u("4"), text=u("delta")) + w.add_document(text=u("echo")) + w.add_document(id=u("6"), text=u("foxtrot")) + w.add_document(text=u("golf")) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("text", schema) + q = qp.parse(u("NOT id:*")) + r = s.search(q, limit=None) + assert list(h["text"] for h in r) == ["charlie", "echo", "golf"] + + +def test_ngram_phrase(): + f = fields.NGRAM(minsize=2, maxsize=2, phrase=True) + schema = fields.Schema(text=f, path=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + writer = ix.writer() + writer.add_document(text=u('\u9AD8\u6821\u307E\u3067\u306F\u6771\u4EAC' + '\u3067\u3001\u5927\u5B66\u304B\u3089\u306F' + '\u4EAC\u5927\u3067\u3059\u3002'), + path=u('sample')) + writer.commit() + + with ix.searcher() as s: + p = qparser.QueryParser("text", schema) + + q = p.parse(u('\u6771\u4EAC\u5927\u5B66')) + assert len(s.search(q)) == 1 + + q = p.parse(u('"\u6771\u4EAC\u5927\u5B66"')) + assert len(s.search(q)) == 0 + + q = p.parse(u('"\u306F\u6771\u4EAC\u3067"')) + assert len(s.search(q)) == 1 + + +def test_ordered(): + domain = u("alfa bravo charlie delta echo foxtrot").split(" ") + + schema = fields.Schema(f=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + writer = ix.writer() + for ls in permutations(domain): + writer.add_document(f=u(" ").join(ls)) + writer.commit() + + with ix.searcher() as s: + q = query.Ordered([query.Term("f", u("alfa")), + query.Term("f", u("charlie")), + query.Term("f", u("echo"))]) + r = s.search(q) + for hit in r: + ls = hit["f"].split() + assert "alfa" in ls + assert "charlie" in ls + assert "echo" in ls + a = ls.index("alfa") + c = ls.index("charlie") + e = ls.index("echo") + assert a < c and c < e, repr(ls) + + +def test_otherwise(): + schema = fields.Schema(id=fields.STORED, f=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, f=u("alfa one two")) + w.add_document(id=2, f=u("alfa three four")) + w.add_document(id=3, f=u("bravo four five")) + w.add_document(id=4, f=u("bravo six seven")) + w.commit() + + with ix.searcher() as s: + q = query.Otherwise(query.Term("f", u("alfa")), + query.Term("f", u("six"))) + assert [d["id"] for d in s.search(q)] == [1, 2] + + q = query.Otherwise(query.Term("f", u("tango")), + query.Term("f", u("four"))) + assert [d["id"] for d in s.search(q)] == [2, 3] + + q = query.Otherwise(query.Term("f", 
u("tango")), + query.Term("f", u("nine"))) + assert [d["id"] for d in s.search(q)] == [] + + +def test_fuzzyterm(): + schema = fields.Schema(id=fields.STORED, f=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, f=u("alfa bravo charlie delta")) + w.add_document(id=2, f=u("bravo charlie delta echo")) + w.add_document(id=3, f=u("charlie delta echo foxtrot")) + w.add_document(id=4, f=u("delta echo foxtrot golf")) + w.commit() + + with ix.searcher() as s: + q = query.FuzzyTerm("f", "brave") + assert [d["id"] for d in s.search(q)] == [1, 2] + + +def test_fuzzyterm2(): + schema = fields.Schema(id=fields.STORED, f=fields.TEXT(spelling=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, f=u("alfa bravo charlie delta")) + w.add_document(id=2, f=u("bravo charlie delta echo")) + w.add_document(id=3, f=u("charlie delta echo foxtrot")) + w.add_document(id=4, f=u("delta echo foxtrot golf")) + w.commit() + + with ix.searcher() as s: + assert list(s.reader().terms_within("f", u("brave"), 1)) == ["bravo"] + q = query.FuzzyTerm("f", "brave") + assert [d["id"] for d in s.search(q)] == [1, 2] + + +def test_multireader_not(): + schema = fields.Schema(id=fields.STORED, f=fields.TEXT) + + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, f=u("alfa bravo chralie")) + w.add_document(id=1, f=u("bravo chralie delta")) + w.add_document(id=2, f=u("charlie delta echo")) + w.add_document(id=3, f=u("delta echo foxtrot")) + w.add_document(id=4, f=u("echo foxtrot golf")) + w.commit() + + with ix.searcher() as s: + q = query.And([query.Term("f", "delta"), + query.Not(query.Term("f", "delta"))]) + r = s.search(q) + assert len(r) == 0 + + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=5, f=u("alfa bravo chralie")) + w.add_document(id=6, f=u("bravo chralie delta")) + w.commit(merge=False) + w = ix.writer() + w.add_document(id=7, f=u("charlie delta echo")) + w.add_document(id=8, f=u("delta echo foxtrot")) + w.commit(merge=False) + w = ix.writer() + w.add_document(id=9, f=u("echo foxtrot golf")) + w.add_document(id=10, f=u("foxtrot golf delta")) + w.commit(merge=False) + assert len(ix._segments()) > 1 + + with ix.searcher() as s: + q = query.And([query.Term("f", "delta"), + query.Not(query.Term("f", "delta"))]) + r = s.search(q) + assert len(r) == 0 + + +def test_boost_phrase(): + schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True), + text=fields.TEXT) + ix = RamStorage().create_index(schema) + domain = u("alfa bravo charlie delta").split() + w = ix.writer() + for ls in permutations(domain): + t = u(" ").join(ls) + w.add_document(title=t, text=t) + w.commit() + + q = query.Or([query.Term("title", u("alfa")), + query.Term("title", u("bravo")), + query.Phrase("text", [u("bravo"), u("charlie"), u("delta")]) + ]) + + def boost_phrases(q): + if isinstance(q, query.Phrase): + q.boost *= 1000.0 + return q + else: + return q.apply(boost_phrases) + q = boost_phrases(q) + + with ix.searcher() as s: + r = s.search(q, limit=None) + for hit in r: + if "bravo charlie delta" in hit["title"]: + assert hit.score > 100.0 + + +def test_filter(): + schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, path=u("/a/1"), text=u("alfa bravo charlie")) + w.add_document(id=2, path=u("/b/1"), text=u("bravo charlie delta")) + w.add_document(id=3, path=u("/c/1"), text=u("charlie delta echo")) + 
w.commit(merge=False) + w = ix.writer() + w.add_document(id=4, path=u("/a/2"), text=u("delta echo alfa")) + w.add_document(id=5, path=u("/b/2"), text=u("echo alfa bravo")) + w.add_document(id=6, path=u("/c/2"), text=u("alfa bravo charlie")) + w.commit(merge=False) + w = ix.writer() + w.add_document(id=7, path=u("/a/3"), text=u("bravo charlie delta")) + w.add_document(id=8, path=u("/b/3"), text=u("charlie delta echo")) + w.add_document(id=9, path=u("/c/3"), text=u("delta echo alfa")) + w.commit(merge=False) + + with ix.searcher() as s: + fq = query.Or([query.Prefix("path", "/a"), + query.Prefix("path", "/b")]) + r = s.search(query.Term("text", "alfa"), filter=fq) + assert [d["id"] for d in r] == [1, 4, 5] + + r = s.search(query.Term("text", "bravo"), filter=fq) + assert [d["id"] for d in r] == [1, 2, 5, 7, ] + + +def test_fieldboost(): + schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, a=u("alfa bravo charlie"), b=u("echo foxtrot india")) + w.add_document(id=1, a=u("delta bravo charlie"), b=u("alfa alfa alfa")) + w.add_document(id=2, a=u("alfa alfa alfa"), b=u("echo foxtrot india")) + w.add_document(id=3, a=u("alfa sierra romeo"), b=u("alfa tango echo")) + w.add_document(id=4, a=u("bravo charlie delta"), b=u("alfa foxtrot india")) + w.add_document(id=5, a=u("alfa alfa echo"), b=u("tango tango tango")) + w.add_document(id=6, a=u("alfa bravo echo"), b=u("alfa alfa tango")) + w.commit() + + def field_booster(fieldname, factor=2.0): + "Returns a function which will boost the given field in a query tree" + def booster_fn(obj): + if obj.is_leaf() and obj.field() == fieldname: + obj = copy.deepcopy(obj) + obj.boost *= factor + return obj + else: + return obj + return booster_fn + + with ix.searcher() as s: + q = query.Or([query.Term("a", u("alfa")), + query.Term("b", u("alfa"))]) + q = q.accept(field_booster("a", 100.0)) + assert text_type(q) == text_type("(a:alfa^100.0 OR b:alfa)") + r = s.search(q) + assert [hit["id"] for hit in r] == [2, 5, 6, 3, 0, 1, 4] + + +def test_andmaybe_quality(): + schema = fields.Schema(id=fields.STORED, title=fields.TEXT(stored=True), + year=fields.NUMERIC) + ix = RamStorage().create_index(schema) + + domain = [(u('Alpha Bravo Charlie Delta'), 2000), + (u('Echo Bravo Foxtrot'), 2000), (u('Bravo Golf Hotel'), 2002), + (u('Bravo India'), 2002), (u('Juliet Kilo Bravo'), 2004), + (u('Lima Bravo Mike'), 2004)] + w = ix.writer() + for title, year in domain: + w.add_document(title=title, year=year) + w.commit() + + with ix.searcher() as s: + qp = qparser.QueryParser("title", ix.schema) + q = qp.parse(u("title:bravo ANDMAYBE year:2004")) + + titles = [hit["title"] for hit in s.search(q, limit=None)[:2]] + assert "Juliet Kilo Bravo" in titles + + titles = [hit["title"] for hit in s.search(q, limit=2)] + assert "Juliet Kilo Bravo" in titles + + +def test_collect_limit(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id="a", text=u("alfa bravo charlie delta echo")) + w.add_document(id="b", text=u("bravo charlie delta echo foxtrot")) + w.add_document(id="c", text=u("charlie delta echo foxtrot golf")) + w.add_document(id="d", text=u("delta echo foxtrot golf hotel")) + w.add_document(id="e", text=u("echo foxtrot golf hotel india")) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Term("text", u("golf")), limit=10) + assert len(r) == 3 + count = 0 + for _ in r: + count += 1 + 
assert count == 3 + + w = ix.writer() + w.add_document(id="f", text=u("foxtrot golf hotel india juliet")) + w.add_document(id="g", text=u("golf hotel india juliet kilo")) + w.add_document(id="h", text=u("hotel india juliet kilo lima")) + w.add_document(id="i", text=u("india juliet kilo lima mike")) + w.add_document(id="j", text=u("juliet kilo lima mike november")) + w.commit(merge=False) + + with ix.searcher() as s: + r = s.search(query.Term("text", u("golf")), limit=20) + assert len(r) == 5 + count = 0 + for _ in r: + count += 1 + assert count == 5 + + +def test_scorer(): + schema = fields.Schema(key=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(key=u("alfa alfa alfa")) + w.add_document(key=u("alfa alfa alfa alfa")) + w.add_document(key=u("alfa alfa")) + w.commit() + w = ix.writer() + w.add_document(key=u("alfa alfa alfa alfa alfa alfa")) + w.add_document(key=u("alfa")) + w.add_document(key=u("alfa alfa alfa alfa alfa")) + w.commit(merge=False) + +# dw = scoring.DebugModel() +# s = ix.searcher(weighting=dw) +# r = s.search(query.Term("key", "alfa")) +# log = dw.log +# assert log, [('key', 'alfa', 0, 3.0, 3), +# ('key', 'alfa', 1, 4.0, 4), +# ('key', 'alfa', 2, 2.0, 2), +# ('key', 'alfa', 0, 6.0, 6), +# ('key', 'alfa', 1, 1.0, 1), +# ('key', 'alfa', 2, 5.0, 5)]) + + +def test_pos_scorer(): + ana = analysis.SimpleAnalyzer() + schema = fields.Schema(id=fields.STORED, key=fields.TEXT(analyzer=ana)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, key=u("0 0 1 0 0 0")) + w.add_document(id=1, key=u("0 0 0 1 0 0")) + w.add_document(id=2, key=u("0 1 0 0 0 0")) + w.commit() + w = ix.writer() + w.add_document(id=3, key=u("0 0 0 0 0 1")) + w.add_document(id=4, key=u("1 0 0 0 0 0")) + w.add_document(id=5, key=u("0 0 0 0 1 0")) + w.commit(merge=False) + + def pos_score_fn(searcher, fieldname, text, matcher): + poses = matcher.value_as("positions") + return 1.0 / (poses[0] + 1) + pos_weighting = scoring.FunctionWeighting(pos_score_fn) + + s = ix.searcher(weighting=pos_weighting) + r = s.search(query.Term("key", "1")) + assert [hit["id"] for hit in r] == [4, 2, 0, 1, 5, 3] + + +# def test_too_many_prefix_positions(): +# schema = fields.Schema(id=fields.STORED, text=fields.TEXT) +# ix = RamStorage().create_index(schema) +# with ix.writer() as w: +# for i in xrange(200): +# text = u("a%s" % i) +# w.add_document(id=i, text=text) +# +# q = query.Prefix("text", u("a")) +# q.TOO_MANY_CLAUSES = 100 +# +# with ix.searcher() as s: +# m = q.matcher(s) +# assert m.supports("positions") +# items = list(m.items_as("positions")) +# assert [(i, [0]) for i in xrange(200)] == items + + +def test_collapse(): + from whoosh import collectors + + # id, text, size, tag + domain = [("a", "blah blah blah", 5, "x"), + ("b", "blah", 3, "y"), + ("c", "blah blah blah blah", 2, "z"), + ("d", "blah blah", 4, "x"), + ("e", "bloop", 1, "-"), + ("f", "blah blah blah blah blah", 6, "x"), + ("g", "blah", 8, "w"), + ("h", "blah blah", 7, "=")] + + schema = fields.Schema(id=fields.STORED, text=fields.TEXT, + size=fields.NUMERIC, + tag=fields.KEYWORD(sortable=True)) + ix = RamStorage().create_index(schema) + with ix.writer(codec=W3Codec()) as w: + for id, text, size, tag in domain: + w.add_document(id=u(id), text=u(text), size=size, tag=u(tag)) + + with ix.searcher() as s: + q = query.Term("text", "blah") + r = s.search(q, limit=None) + assert " ".join(hit["id"] for hit in r) == "f c a d h b g" + + col = s.collector(limit=3) + col = 
collectors.CollapseCollector(col, "tag") + s.search_with_collector(q, col) + r = col.results() + assert " ".join(hit["id"] for hit in r) == "f c h" + + col = s.collector(limit=None) + col = collectors.CollapseCollector(col, "tag") + s.search_with_collector(q, col) + r = col.results() + assert " ".join(hit["id"] for hit in r) == "f c h b g" + + r = s.search(query.Every(), sortedby="size") + assert " ".join(hit["id"] for hit in r) == "e c b d a f h g" + + col = s.collector(sortedby="size") + col = collectors.CollapseCollector(col, "tag") + s.search_with_collector(query.Every(), col) + r = col.results() + assert " ".join(hit["id"] for hit in r) == "e c b d h g" + + +def test_collapse_nocolumn(): + from whoosh import collectors + + # id, text, size, tag + domain = [("a", "blah blah blah", 5, "x"), + ("b", "blah", 3, "y"), + ("c", "blah blah blah blah", 2, "z"), + ("d", "blah blah", 4, "x"), + ("e", "bloop", 1, "-"), + ("f", "blah blah blah blah blah", 6, "x"), + ("g", "blah", 8, "w"), + ("h", "blah blah", 7, "=")] + + schema = fields.Schema(id=fields.STORED, text=fields.TEXT, + size=fields.NUMERIC, + tag=fields.KEYWORD) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for id, text, size, tag in domain: + w.add_document(id=u(id), text=u(text), size=size, tag=u(tag)) + + with ix.searcher() as s: + q = query.Term("text", "blah") + r = s.search(q, limit=None) + assert " ".join(hit["id"] for hit in r) == "f c a d h b g" + + col = s.collector(limit=3) + col = collectors.CollapseCollector(col, "tag") + s.search_with_collector(q, col) + r = col.results() + assert " ".join(hit["id"] for hit in r) == "f c h" + + col = s.collector(limit=None) + col = collectors.CollapseCollector(col, "tag") + s.search_with_collector(q, col) + r = col.results() + assert " ".join(hit["id"] for hit in r) == "f c h b g" + + r = s.search(query.Every(), sortedby="size") + assert " ".join(hit["id"] for hit in r) == "e c b d a f h g" + + col = s.collector(sortedby="size") + col = collectors.CollapseCollector(col, "tag") + s.search_with_collector(query.Every(), col) + r = col.results() + assert " ".join(hit["id"] for hit in r) == "e c b d h g" + + +def test_collapse_length(): + domain = u("alfa apple agnostic aplomb arc " + "bravo big braid beer " + "charlie crouch car " + "delta dog " + "echo " + "foxtrot fold flip " + "golf gym goop" + ).split() + + schema = fields.Schema(key=fields.ID(sortable=True), + word=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer(codec=W3Codec()) as w: + for word in domain: + w.add_document(key=word[0], word=word) + + with ix.searcher() as s: + q = query.Every() + + def check(r): + words = " ".join(hit["word"] for hit in r) + assert words == "alfa bravo charlie delta echo foxtrot golf" + assert r.scored_length() == 7 + assert len(r) == 7 + + r = s.search(q, collapse="key", collapse_limit=1, limit=None) + check(r) + + r = s.search(q, collapse="key", collapse_limit=1, limit=50) + check(r) + + r = s.search(q, collapse="key", collapse_limit=1, limit=10) + check(r) + + +def test_collapse_length_nocolumn(): + domain = u("alfa apple agnostic aplomb arc " + "bravo big braid beer " + "charlie crouch car " + "delta dog " + "echo " + "foxtrot fold flip " + "golf gym goop" + ).split() + + schema = fields.Schema(key=fields.ID(), + word=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for word in domain: + w.add_document(key=word[0], word=word) + + with ix.searcher() as s: + q = query.Every() + + def check(r): + words = " 
".join(hit["word"] for hit in r) + assert words == "alfa bravo charlie delta echo foxtrot golf" + assert r.scored_length() == 7 + assert len(r) == 7 + + r = s.search(q, collapse="key", collapse_limit=1, limit=None) + check(r) + + r = s.search(q, collapse="key", collapse_limit=1, limit=50) + check(r) + + r = s.search(q, collapse="key", collapse_limit=1, limit=10) + check(r) + + +def test_collapse_order(): + from whoosh import sorting + + schema = fields.Schema(id=fields.STORED, + price=fields.NUMERIC(sortable=True), + rating=fields.NUMERIC(sortable=True), + tag=fields.ID(sortable=True)) + ix = RamStorage().create_index(schema) + with ix.writer(codec=W3Codec()) as w: + w.add_document(id="a", price=10, rating=1, tag=u("x")) + w.add_document(id="b", price=80, rating=3, tag=u("y")) + w.add_document(id="c", price=60, rating=1, tag=u("z")) + w.add_document(id="d", price=30, rating=2) + w.add_document(id="e", price=50, rating=3, tag=u("x")) + w.add_document(id="f", price=20, rating=1, tag=u("y")) + w.add_document(id="g", price=50, rating=2, tag=u("z")) + w.add_document(id="h", price=90, rating=5) + w.add_document(id="i", price=50, rating=5, tag=u("x")) + w.add_document(id="j", price=40, rating=1, tag=u("y")) + w.add_document(id="k", price=50, rating=4, tag=u("z")) + w.add_document(id="l", price=70, rating=2) + + with ix.searcher() as s: + def check(kwargs, target): + r = s.search(query.Every(), limit=None, **kwargs) + assert " ".join(hit["id"] for hit in r) == target + + price = sorting.FieldFacet("price", reverse=True) + rating = sorting.FieldFacet("rating", reverse=True) + tag = sorting.FieldFacet("tag") + + check(dict(sortedby=price), "h b l c e g i k j d f a") + check(dict(sortedby=price, collapse=tag), "h b l c e d") + check(dict(sortedby=price, collapse=tag, collapse_order=rating), + "h b l i k d") + + +def test_collapse_order_nocolumn(): + from whoosh import sorting + + schema = fields.Schema(id=fields.STORED, + price=fields.NUMERIC(), + rating=fields.NUMERIC(), + tag=fields.ID()) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id="a", price=10, rating=1, tag=u("x")) + w.add_document(id="b", price=80, rating=3, tag=u("y")) + w.add_document(id="c", price=60, rating=1, tag=u("z")) + w.add_document(id="d", price=30, rating=2) + w.add_document(id="e", price=50, rating=3, tag=u("x")) + w.add_document(id="f", price=20, rating=1, tag=u("y")) + w.add_document(id="g", price=50, rating=2, tag=u("z")) + w.add_document(id="h", price=90, rating=5) + w.add_document(id="i", price=50, rating=5, tag=u("x")) + w.add_document(id="j", price=40, rating=1, tag=u("y")) + w.add_document(id="k", price=50, rating=4, tag=u("z")) + w.add_document(id="l", price=70, rating=2) + + with ix.searcher() as s: + def check(kwargs, target): + r = s.search(query.Every(), limit=None, **kwargs) + assert " ".join(hit["id"] for hit in r) == target + + price = sorting.FieldFacet("price", reverse=True) + rating = sorting.FieldFacet("rating", reverse=True) + tag = sorting.FieldFacet("tag") + + check(dict(sortedby=price), "h b l c e g i k j d f a") + check(dict(sortedby=price, collapse=tag), "h b l c e d") + check(dict(sortedby=price, collapse=tag, collapse_order=rating), + "h b l i k d") + + +def test_coord(): + from whoosh.matching import CoordMatcher + + schema = fields.Schema(id=fields.STORED, hits=fields.STORED, + tags=fields.KEYWORD) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, hits=0, tags=u("blah blah blah blah")) + w.add_document(id=1, hits=0, 
tags=u("echo echo blah blah")) + w.add_document(id=2, hits=1, tags=u("bravo charlie delta echo")) + w.add_document(id=3, hits=2, tags=u("charlie delta echo foxtrot")) + w.add_document(id=4, hits=3, tags=u("delta echo foxtrot golf")) + w.add_document(id=5, hits=3, tags=u("echo foxtrot golf hotel")) + w.add_document(id=6, hits=2, tags=u("foxtrot golf hotel india")) + w.add_document(id=7, hits=1, tags=u("golf hotel india juliet")) + w.add_document(id=8, hits=0, tags=u("foxtrot foxtrot foo foo")) + w.add_document(id=9, hits=0, tags=u("foo foo foo foo")) + + og = qparser.OrGroup.factory(0.99) + qp = qparser.QueryParser("tags", schema, group=og) + q = qp.parse("golf foxtrot echo") + assert q.__class__ == query.Or + assert q.scale == 0.99 + + with ix.searcher() as s: + m = q.matcher(s) + assert type(m) == CoordMatcher + + r = s.search(q, optimize=False) + assert [hit["id"] for hit in r] == [4, 5, 3, 6, 1, 8, 2, 7] + + +def test_keyword_search(): + schema = fields.Schema(tags=fields.KEYWORD) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(tags=u("keyword1 keyword2 keyword3 keyword4 keyword5")) + + with ix.searcher() as s: + r = s.search_page(query.Term("tags", "keyword3"), 1) + assert r + + +def test_groupedby_with_terms(): + schema = fields.Schema(content=fields.TEXT, organism=fields.ID) + ix = RamStorage().create_index(schema) + + with ix.writer() as w: + w.add_document(organism=u("mus"), content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study00:00:00")) + w.add_document(organism=u("mus"), content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study")) + w.add_document(organism=u("hs"), content=u("This is the first document we've added!")) + + with ix.searcher() as s: + q = qparser.QueryParser("content", schema=ix.schema).parse(u("IPFSTD1")) + r = s.search(q, groupedby=["organism"], terms=True) + assert len(r) == 2 + assert r.groups("organism") == {"mus": [1, 0]} + assert r.has_matched_terms() + assert r.matched_terms() == set([('content', b('ipfstd1'))]) + + +def test_score_length(): + schema = fields.Schema(a=fields.TEXT, b=fields.TEXT) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(a=u("alfa bravo charlie")) + w.add_document(b=u("delta echo foxtrot")) + w.add_document(a=u("golf hotel india")) + + with ix.writer() as w: + w.merge = False + w.add_document(b=u("juliet kilo lima")) + # In the second segment, there is an "a" field here, but in the + # corresponding document in the first segment, the field doesn't exist, + # so if the scorer is getting segment offsets wrong, scoring this + # document will error + w.add_document(a=u("mike november oskar")) + w.add_document(b=u("papa quebec romeo")) + + with ix.searcher() as s: + assert not s.is_atomic() + p = s.postings("a", "mike") + while p.is_active(): + docnum = p.id() + score = p.score() + p.next() + + +def test_terms_with_filter(): + schema = fields.Schema(text=fields.TEXT) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(text=u("alfa bravo charlie delta")) + w.add_document(text=u("bravo charlie delta echo")) + w.add_document(text=u("charlie delta echo foxtrot")) + w.add_document(text=u("delta echo foxtrot golf")) + w.add_document(text=u("echo foxtrot golf hotel")) + w.add_document(text=u("foxtrot golf hotel alfa")) + w.add_document(text=u("golf hotel alfa bravo")) + w.add_document(text=u("hotel alfa bravo charlie")) + + with ix.searcher() as s: + workingset = set([1, 2, 3]) + q = query.Term("text", u("foxtrot")) + r = s.search_page(q, pagenum=1, 
pagelen=5, terms=True, + filter=workingset) + + assert r.scored_length() == 2 + assert [hit.docnum for hit in r] == [2, 3] + + +def test_terms_to_bytes(): + schema = fields.Schema(a=fields.TEXT, b=fields.NUMERIC, id=fields.STORED) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, a=u("alfa bravo"), b=100) + w.add_document(id=1, a=u("bravo charlie"), b=200) + w.add_document(id=2, a=u("charlie delta"), b=100) + w.add_document(id=3, a=u("delta echo"), b=200) + + with ix.searcher() as s: + t1 = query.Term("b", 200) + t2 = query.Term("a", "bravo") + q = query.And([t1, t2]) + r = s.search(q) + assert [hit["id"] for hit in r] == [1] + + +def test_issue_334(): + schema = fields.Schema( + kind=fields.ID(stored=True), + name=fields.ID(stored=True), + returns=fields.ID(stored=True), + ) + ix = RamStorage().create_index(schema) + + with ix.writer() as w: + + with w.group(): + w.add_document(kind=u('class'), name=u('Index')) + w.add_document(kind=u('method'), name=u('add document'), + returns=u('void')) + w.add_document(kind=u('method'), name=u('add reader'), + returns=u('void')) + w.add_document(kind=u('method'), name=u('close'), + returns=u('void')) + with w.group(): + w.add_document(kind=u('class'), name=u('Accumulator')) + w.add_document(kind=u('method'), name=u('add'), + returns=u('void')) + w.add_document(kind=u('method'), name=u('get result'), + returns=u('number')) + with w.group(): + w.add_document(kind=u('class'), name=u('Calculator')) + w.add_document(kind=u('method'), name=u('add'), + returns=u('number')) + w.add_document(kind=u('method'), name=u('add all'), + returns=u('number')) + w.add_document(kind=u('method'), name=u('add some'), + returns=u('number')) + w.add_document(kind=u('method'), name=u('multiply'), + returns=u('number')) + w.add_document(kind=u('method'), name=u('close'), + returns=u('void')) + with w.group(): + w.add_document(kind=u('class'), name=u('Deleter')) + w.add_document(kind=u('method'), name=u('add'), + returns=u('void')) + w.add_document(kind=u('method'), name=u('delete'), + returns=u('void')) + + with ix.searcher() as s: + pq = query.Term('kind', 'class') + cq = query.Term('name', 'Calculator') + + q = query.NestedChildren(pq, cq) & query.Term('returns', 'void') + r = s.search(q) + assert len(r) == 1 + assert r[0]["name"] == u("close") + + +def test_find_decimals(): + from decimal import Decimal + + schema = fields.Schema(name=fields.KEYWORD(stored=True), + num=fields.NUMERIC(Decimal, decimal_places=5)) + ix = RamStorage().create_index(schema) + + with ix.writer() as w: + w.add_document(name=u("alfa"), num=Decimal("1.5")) + w.add_document(name=u("bravo"), num=Decimal("2.1")) + w.add_document(name=u("charlie"), num=Decimal("5.3")) + w.add_document(name=u("delta"), num=Decimal(3)) + w.add_document(name=u("echo"), num=Decimal("3.00001")) + w.add_document(name=u("foxtrot"), num=Decimal("3")) + + qp = qparser.QueryParser("name", ix.schema) + q = qp.parse("num:3.0") + assert isinstance(q, query.Term) + + with ix.searcher() as s: + r = s.search(q) + names = " ".join(sorted(hit["name"] for hit in r)) + assert names == "delta foxtrot" + + diff --git a/tests/test_sorting.py b/tests/test_sorting.py new file mode 100644 index 0000000..f81645c --- /dev/null +++ b/tests/test_sorting.py @@ -0,0 +1,1053 @@ +from __future__ import with_statement +from datetime import datetime, timedelta +import random +import gc + +from whoosh import fields, query, sorting +from whoosh.compat import b, u +from whoosh.compat import permutations, xrange +from 
whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +try: + import multiprocessing +except ImportError: + pass +else: + class MPFCTask(multiprocessing.Process): + def __init__(self, storage, indexname): + multiprocessing.Process.__init__(self) + self.storage = storage + self.indexname = indexname + + def run(self): + ix = self.storage.open_index(self.indexname) + with ix.searcher() as s: + r = s.search(query.Every(), sortedby="key", limit=None) + result = "".join([h["key"] for h in r]) + assert result == "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + + +docs = ({"id": u("zulu"), "num": 100, "tag": u("one"), "frac": 0.75}, + {"id": u("xray"), "num": -5, "tag": u("three"), "frac": 2.0}, + {"id": u("yankee"), "num": 3, "tag": u("two"), "frac": 5.5}, + + {"id": u("alfa"), "num": 7, "tag": u("three"), "frac": 2.25}, + {"id": u("tango"), "num": 2, "tag": u("two"), "frac": 1.75}, + {"id": u("foxtrot"), "num": -800, "tag": u("two"), "frac": 3.25}, + + {"id": u("sierra"), "num": 1, "tag": u("one"), "frac": 4.75}, + {"id": u("whiskey"), "num": 0, "tag": u("three"), "frac": 5.25}, + {"id": u("bravo"), "num": 582045, "tag": u("three"), "frac": 1.25}, + ) + + +def get_schema(): + return fields.Schema(id=fields.ID(stored=True), + num=fields.NUMERIC(stored=True), + frac=fields.NUMERIC(float, stored=True), + tag=fields.ID(stored=True), + ev=fields.ID, + ) + + +def make_single_index(ix): + w = ix.writer() + for doc in docs: + w.add_document(ev=u("a"), **doc) + w.commit() + + +def make_multi_index(ix): + for i in xrange(0, len(docs), 3): + w = ix.writer() + for doc in docs[i:i + 3]: + w.add_document(ev=u("a"), **doc) + w.commit(merge=False) + + +def try_sort(sortedby, key, q=None, limit=None, reverse=False): + if q is None: + q = query.Term("ev", u("a")) + + correct = [d["id"] for d in sorted(docs, key=key, reverse=reverse)][:limit] + schema = get_schema() + + for fn in (make_single_index, make_multi_index): + ix = RamStorage().create_index(schema) + fn(ix) + with ix.searcher() as s: + r = s.search(q, sortedby=sortedby, limit=limit, + reverse=reverse) + rids = [d["id"] for d in r] + assert rids == correct + + +def test_sortedby(): + try_sort("id", lambda d: d["id"]) + try_sort("id", lambda d: d["id"], limit=5) + try_sort("id", lambda d: d["id"], reverse=True) + try_sort("id", lambda d: d["id"], limit=5, reverse=True) + + +def test_multisort(): + mf = sorting.MultiFacet(["tag", "id"]) + try_sort(mf, lambda d: (d["tag"], d["id"])) + try_sort(mf, lambda d: (d["tag"], d["id"]), reverse=True) + try_sort(mf, lambda d: (d["tag"], d["id"]), limit=5) + try_sort(mf, lambda d: (d["tag"], d["id"]), reverse=True, limit=5) + + +def test_numeric(): + try_sort("num", lambda d: d["num"]) + try_sort("num", lambda d: d["num"], reverse=True) + try_sort("num", lambda d: d["num"], limit=5) + try_sort("frac", lambda d: d["frac"]) + + +def test_empty_field(): + schema = fields.Schema(id=fields.STORED, key=fields.KEYWORD) + with TempIndex(schema, "emptysort") as ix: + w = ix.writer() + w.add_document(id=1) + w.add_document(id=2) + w.add_document(id=3) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Every(), sortedby="key") + assert [h["id"] for h in r] == [1, 2, 3] + + +def test_page_sorted(): + schema = fields.Schema(key=fields.ID(stored=True)) + with TempIndex(schema, "pagesorted") as ix: + domain = list(u("abcdefghijklmnopqrstuvwxyz")) + random.shuffle(domain) + + w = ix.writer() + for char in domain: + w.add_document(key=char) + w.commit() + + with ix.searcher() as 
s: + r = s.search(query.Every(), sortedby="key", limit=5) + assert r.scored_length() == 5 + assert len(r) == s.doc_count_all() + + rp = s.search_page(query.Every(), 1, pagelen=5, sortedby="key") + assert "".join([h["key"] for h in rp]) == "abcde" + assert rp[10:] == [] + + rp = s.search_page(query.Term("key", "glonk"), 1, pagelen=5, + sortedby="key") + assert len(rp) == 0 + assert rp.is_last_page() + + +def test_score_facet(): + schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT, + c=fields.ID) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, a=u("alfa alfa bravo"), b=u("bottle"), c=u("c")) + w.add_document(id=2, a=u("alfa alfa alfa"), b=u("bottle"), c=u("c")) + w.commit() + w = ix.writer() + w.add_document(id=3, a=u("alfa bravo bravo"), b=u("bottle"), c=u("c")) + w.add_document(id=4, a=u("alfa bravo alfa"), b=u("apple"), c=u("c")) + w.commit(merge=False) + w = ix.writer() + w.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c")) + w.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c")) + w.commit(merge=False) + + with ix.searcher() as s: + facet = sorting.MultiFacet(["b", sorting.ScoreFacet()]) + r = s.search(q=query.Term("a", u("alfa")), sortedby=facet) + assert [h["id"] for h in r] == [6, 4, 5, 2, 1, 3] + + +def test_function_facet(): + schema = fields.Schema(id=fields.STORED, + text=fields.TEXT(stored=True, vector=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + domain = ("alfa", "bravo", "charlie") + count = 1 + for w1 in domain: + for w2 in domain: + for w3 in domain: + for w4 in domain: + w.add_document(id=count, + text=u(" ").join((w1, w2, w3, w4))) + count += 1 + w.commit() + + def fn(searcher, docnum): + v = dict(searcher.vector_as("frequency", docnum, "text")) + # Give high score to documents that have equal number of "alfa" + # and "bravo". 
Negate value so higher values sort first + return 0 - (1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0)) + + with ix.searcher() as s: + q = query.And([query.Term("text", u("alfa")), + query.Term("text", u("bravo"))]) + + fnfacet = sorting.FunctionFacet(fn) + r = s.search(q, sortedby=fnfacet) + texts = [hit["text"] for hit in r] + for t in texts[:10]: + tks = t.split() + assert tks.count("alfa") == tks.count("bravo") + + +def test_numeric_field_facet(): + schema = fields.Schema(id=fields.STORED, v1=fields.NUMERIC, + v2=fields.NUMERIC) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=1, v1=2, v2=100) + w.add_document(id=2, v1=1, v2=50) + w.commit() + w = ix.writer() + w.add_document(id=3, v1=2, v2=200) + w.add_document(id=4, v1=1, v2=100) + w.commit() + w = ix.writer(merge=False) + w.add_document(id=5, v1=2, v2=50) + w.add_document(id=6, v1=1, v2=200) + w.commit() + + with ix.searcher() as s: + mf = sorting.MultiFacet().add_field("v1").add_field("v2", reverse=True) + r = s.search(query.Every(), sortedby=mf) + assert [hit["id"] for hit in r] == [6, 4, 2, 3, 1, 5] + + +def test_query_facet(): + schema = fields.Schema(id=fields.STORED, v=fields.ID) + ix = RamStorage().create_index(schema) + for i, ltr in enumerate(u("iacgbehdf")): + w = ix.writer() + w.add_document(id=i, v=ltr) + w.commit(merge=False) + + with ix.searcher() as s: + q1 = query.TermRange("v", "a", "c") + q2 = query.TermRange("v", "d", "f") + q3 = query.TermRange("v", "g", "i") + + assert [hit["id"] for hit in s.search(q1)] == [1, 2, 4] + assert [hit["id"] for hit in s.search(q2)] == [5, 7, 8] + assert [hit["id"] for hit in s.search(q3)] == [0, 3, 6] + + facet = sorting.QueryFacet({"a-c": q1, "d-f": q2, "g-i": q3}) + r = s.search(query.Every(), groupedby=facet) + # If you specify a facet without a name, it's automatically called + # "facet" + assert r.groups("facet") == {"a-c": [1, 2, 4], + "d-f": [5, 7, 8], + "g-i": [0, 3, 6]} + + +def test_query_facet_overlap(): + domain = u("abcdefghi") + schema = fields.Schema(v=fields.KEYWORD(stored=True), num=fields.NUMERIC(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for i, ltr in enumerate(domain): + v = "%s %s" % (ltr, domain[8 - i]) + w.add_document(num=i, v=v) + + with ix.searcher() as s: + q1 = query.TermRange("v", "a", "c") + q2 = query.TermRange("v", "d", "f") + q3 = query.TermRange("v", "g", "i") + + facets = sorting.Facets() + facets.add_query("myfacet", {"a-c": q1, "d-f": q2, "g-i": q3}, allow_overlap=True) + r = s.search(query.Every(), groupedby=facets) + gr = r.groups("myfacet") + assert r.groups("myfacet") == {'a-c': [0, 1, 2, 6, 7, 8], + 'd-f': [3, 4, 5], + 'g-i': [0, 1, 2, 6, 7, 8]} + + +def test_missing_field_facet(): + schema = fields.Schema(id=fields.STORED, tag=fields.ID) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, tag=u("alfa")) + w.add_document(id=1, tag=u("alfa")) + w.add_document(id=2) + w.add_document(id=3, tag=u("bravo")) + w.add_document(id=4) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Every(), groupedby="tag") + assert r.groups("tag") == {None: [2, 4], 'bravo': [3], 'alfa': [0, 1]} + + +def test_missing_numeric_facet(): + schema = fields.Schema(id=fields.STORED, tag=fields.NUMERIC) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, tag=1) + w.add_document(id=1, tag=1) + w.add_document(id=2) + w.add_document(id=3, tag=0) + w.add_document(id=4) + w.commit() + + with ix.searcher() as s: + r = 
s.search(query.Every(), groupedby="tag") + assert r.groups("tag") == {None: [2, 4], 0: [3], 1: [0, 1]} + + +def test_missing_overlap(): + schema = fields.Schema(a=fields.NUMERIC(stored=True), + b=fields.KEYWORD(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(a=0, b=u("one two")) + w.add_document(a=1) + w.add_document(a=2, b=u("two three")) + w.add_document(a=3) + w.add_document(a=4, b=u("three four")) + + with ix.searcher() as s: + facet = sorting.FieldFacet("b", allow_overlap=True) + r = s.search(query.Every(), groupedby=facet) + target = {"one": [0], "two": [0, 2], "three": [2, 4],"four": [4], + None: [1, 3]} + assert r.groups() == target + + +def test_date_facet(): + from whoosh import columns + + schema = fields.Schema(id=fields.STORED, date=fields.DATETIME) + dc = schema["date"].default_column() + assert isinstance(dc, columns.NumericColumn) + + ix = RamStorage().create_index(schema) + w = ix.writer() + d1 = datetime(2011, 7, 13) + d2 = datetime(1984, 3, 29) + w.add_document(id=0, date=d1) + w.add_document(id=1, date=d1) + w.add_document(id=2) + w.add_document(id=3, date=d2) + w.add_document(id=4) + w.commit() + + with ix.searcher() as s: + r = s.search(query.Every(), groupedby="date") + assert r.groups() + assert r.groups() == {d1: [0, 1], d2: [3], None: [2, 4]} + + +def test_range_facet(): + schema = fields.Schema(id=fields.STORED, price=fields.NUMERIC) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, price=200) + w.add_document(id=1, price=100) + w.add_document(id=2) + w.add_document(id=3, price=50) + w.add_document(id=4, price=500) + w.add_document(id=5, price=125) + w.commit() + + with ix.searcher() as s: + rf = sorting.RangeFacet("price", 0, 1000, 100) + r = s.search(query.Every(), groupedby={"price": rf}) + assert r.groups("price") == {(0, 100): [3], (100, 200): [1, 5], + (200, 300): [0], (500, 600): [4], + None: [2]} + + +def test_range_gaps(): + schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC) + ix = RamStorage().create_index(schema) + w = ix.writer() + for i in range(10): + w.add_document(id=i, num=i) + w.commit() + + with ix.searcher() as s: + rf = sorting.RangeFacet("num", 0, 1000, [1, 2, 3]) + r = s.search(query.Every(), groupedby={"num": rf}) + assert r.groups("num") == {(0, 1): [0], + (1, 3): [1, 2], + (3, 6): [3, 4, 5], + (6, 9): [6, 7, 8], + (9, 12): [9]} + + +def test_daterange_facet(): + schema = fields.Schema(id=fields.STORED, date=fields.DATETIME) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(id=0, date=datetime(2001, 1, 15)) + w.add_document(id=1, date=datetime(2001, 1, 10)) + w.add_document(id=2) + w.add_document(id=3, date=datetime(2001, 1, 3)) + w.add_document(id=4, date=datetime(2001, 1, 8)) + w.add_document(id=5, date=datetime(2001, 1, 6)) + w.commit() + + with ix.searcher() as s: + rf = sorting.DateRangeFacet("date", datetime(2001, 1, 1), + datetime(2001, 1, 20), timedelta(days=5)) + r = s.search(query.Every(), groupedby={"date": rf}) + dt = datetime + assert r.groups("date") == {(dt(2001, 1, 1, 0, 0), dt(2001, 1, 6, 0, 0)): [3], + (dt(2001, 1, 6, 0, 0), dt(2001, 1, 11, 0, 0)): [1, 4, 5], + (dt(2001, 1, 11, 0, 0), dt(2001, 1, 16, 0, 0)): [0], + None: [2]} + + +def test_relative_daterange(): + from whoosh.support.relativedelta import relativedelta + dt = datetime + + schema = fields.Schema(id=fields.STORED, date=fields.DATETIME) + ix = RamStorage().create_index(schema) + basedate = datetime(2001, 1, 1) + count = 0 + with ix.writer() as 
w: + while basedate < datetime(2001, 12, 1): + w.add_document(id=count, date=basedate) + basedate += timedelta(days=14, hours=16) + count += 1 + + with ix.searcher() as s: + gap = relativedelta(months=1) + rf = sorting.DateRangeFacet("date", dt(2001, 1, 1), + dt(2001, 12, 31), gap) + r = s.search(query.Every(), groupedby={"date": rf}) + assert r.groups("date") == {(dt(2001, 1, 1), dt(2001, 2, 1)): [0, 1, 2], + (dt(2001, 2, 1), dt(2001, 3, 1)): [3, 4], + (dt(2001, 3, 1), dt(2001, 4, 1)): [5, 6], + (dt(2001, 4, 1), dt(2001, 5, 1)): [7, 8], + (dt(2001, 5, 1), dt(2001, 6, 1)): [9, 10], + (dt(2001, 6, 1), dt(2001, 7, 1)): [11, 12], + (dt(2001, 7, 1), dt(2001, 8, 1)): [13, 14], + (dt(2001, 8, 1), dt(2001, 9, 1)): [15, 16], + (dt(2001, 9, 1), dt(2001, 10, 1)): [17, 18], + (dt(2001, 10, 1), dt(2001, 11, 1)): [19, 20], + (dt(2001, 11, 1), dt(2001, 12, 1)): [21, 22], + } + + +def test_overlapping_vector(): + schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD(vector=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, tags=u("alfa bravo charlie")) + w.add_document(id=1, tags=u("bravo charlie delta")) + w.add_document(id=2, tags=u("charlie delta echo")) + w.add_document(id=3, tags=u("delta echo alfa")) + w.add_document(id=4, tags=u("echo alfa bravo")) + + with ix.searcher() as s: + of = sorting.FieldFacet("tags", allow_overlap=True) + cat = of.categorizer(s) + assert cat._use_vectors + + r = s.search(query.Every(), groupedby={"tags": of}) + assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4], + 'charlie': [0, 1, 2], 'delta': [1, 2, 3], + 'echo': [2, 3, 4]} + + fcts = sorting.Facets() + fcts.add_field("tags", allow_overlap=True) + r = s.search(query.Every(), groupedby=fcts) + assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4], + 'charlie': [0, 1, 2], 'delta': [1, 2, 3], + 'echo': [2, 3, 4]} + + +def test_overlapping_lists(): + schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, tags=u("alfa bravo charlie")) + w.add_document(id=1, tags=u("bravo charlie delta")) + w.add_document(id=2, tags=u("charlie delta echo")) + w.add_document(id=3, tags=u("delta echo alfa")) + w.add_document(id=4, tags=u("echo alfa bravo")) + + with ix.searcher() as s: + of = sorting.FieldFacet("tags", allow_overlap=True) + cat = of.categorizer(s) + assert not cat._use_vectors + + r = s.search(query.Every(), groupedby={"tags": of}) + assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4], + 'charlie': [0, 1, 2], 'delta': [1, 2, 3], + 'echo': [2, 3, 4]} + + fcts = sorting.Facets() + fcts.add_field("tags", allow_overlap=True) + r = s.search(query.Every(), groupedby=fcts) + assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4], + 'charlie': [0, 1, 2], 'delta': [1, 2, 3], + 'echo': [2, 3, 4]} + + +def test_field_facets(): + def check(method): + with TempIndex(get_schema()) as ix: + method(ix) + with ix.searcher() as s: + results = s.search(query.Every(), groupedby="tag") + groups = results.groups() + assert sorted(groups.items()) == [(u('one'), [0, 6]), + (u('three'), [1, 3, 7, 8]), + (u('two'), [2, 4, 5])] + + check(make_single_index) + check(make_multi_index) + + +def test_multifacet(): + schema = fields.Schema(tag=fields.ID(stored=True), + size=fields.ID(stored=True)) + with TempIndex(schema, "multifacet") as ix: + w = ix.writer() + w.add_document(tag=u("alfa"), size=u("small")) + w.add_document(tag=u("bravo"), size=u("medium")) + 
w.add_document(tag=u("alfa"), size=u("large")) + w.add_document(tag=u("bravo"), size=u("small")) + w.add_document(tag=u("alfa"), size=u("medium")) + w.add_document(tag=u("bravo"), size=u("medium")) + w.commit() + + correct = {(u('bravo'), u('medium')): [1, 5], + (u('alfa'), u('large')): [2], + (u('alfa'), u('medium')): [4], + (u('alfa'), u('small')): [0], + (u('bravo'), u('small')): [3]} + + with ix.searcher() as s: + facet = sorting.MultiFacet(["tag", "size"]) + r = s.search(query.Every(), groupedby={"tag/size": facet}) + cats = r.groups(("tag/size")) + assert cats == correct + + +def test_sort_filter(): + schema = fields.Schema(group=fields.ID(stored=True), + key=fields.ID(stored=True)) + groups = u("alfa bravo charlie").split() + keys = u("abcdefghijklmnopqrstuvwxyz") + source = [] + for i in xrange(100): + key = keys[i % len(keys)] + group = groups[i % len(groups)] + source.append({"key": key, "group": group}) + source.sort(key=lambda x: (x["key"], x["group"])) + + sample = list(source) + random.shuffle(sample) + + with TempIndex(schema, "sortfilter") as ix: + w = ix.writer() + for i, fs in enumerate(sample): + w.add_document(**fs) + i += 1 + if not i % 26: + w.commit(merge=False) + w = ix.writer() + w.commit() + + fq = query.Term("group", u("bravo")) + + with ix.searcher() as s: + r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, + limit=20) + assert [h.fields() for h in r] == [d for d in source if d["group"] == "bravo"][:20] + + fq = query.Term("group", u("bravo")) + r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, + limit=None) + assert [h.fields() for h in r] == [d for d in source if d["group"] == "bravo"] + + ix.optimize() + + with ix.searcher() as s: + r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, + limit=20) + assert [h.fields() for h in r] == [d for d in source if d["group"] == "bravo"][:20] + + fq = query.Term("group", u("bravo")) + r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, + limit=None) + assert [h.fields() for h in r] == [d for d in source if d["group"] == "bravo"] + + +def test_sorting_function(): + schema = fields.Schema(id=fields.STORED, + text=fields.TEXT(stored=True, vector=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + domain = ("alfa", "bravo", "charlie") + count = 1 + for w1 in domain: + for w2 in domain: + for w3 in domain: + for w4 in domain: + w.add_document(id=count, + text=u(" ").join((w1, w2, w3, w4))) + count += 1 + w.commit() + + def fn(searcher, docnum): + v = dict(searcher.vector_as("frequency", docnum, "text")) + # Sort documents that have equal number of "alfa" + # and "bravo" first + return 0 - 1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0) + fnfacet = sorting.FunctionFacet(fn) + + with ix.searcher() as s: + q = query.And([query.Term("text", u("alfa")), + query.Term("text", u("bravo"))]) + results = s.search(q, sortedby=fnfacet) + r = [hit["text"] for hit in results] + for t in r[:10]: + tks = t.split() + assert tks.count("alfa") == tks.count("bravo") + + +class test_translate(): + domain = [("alfa", 100, 50), ("bravo", 20, 80), ("charlie", 10, 10), + ("delta", 82, 39), ("echo", 20, 73), ("foxtrot", 81, 59), + ("golf", 39, 93), ("hotel", 57, 48), ("india", 84, 75), + ] + + schema = fields.Schema(name=fields.TEXT(sortable=True), + a=fields.NUMERIC(sortable=True), + b=fields.NUMERIC(sortable=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for name, a, b in domain: + w.add_document(name=u(name), a=a, b=b) + + with 
ix.searcher() as s: + q = query.Every() + + # Baseline: just sort by a field + r = s.search(q, sortedby="a") + assert " ".join([hit["name"] for hit in r]) == "charlie bravo echo golf hotel foxtrot delta india alfa" + + # Sort by reversed name + target = [x[0] for x in sorted(domain, key=lambda x: x[0][::-1])] + tf = sorting.TranslateFacet(lambda name: name[::-1], sorting.FieldFacet("name")) + r = s.search(q, sortedby=tf) + assert [hit["name"] for hit in r] == target + + # Sort by average of a and b + def avg(a, b): + return (a + b) / 2 + + target = [x[0] for x in sorted(domain, key=lambda x: (x[1] + x[2]) / 2)] + af = sorting.FieldFacet("a") + bf = sorting.FieldFacet("b") + tf = sorting.TranslateFacet(avg, af, bf) + r = s.search(q, sortedby=tf) + assert [hit["name"] for hit in r] == target + + +def test_sorted_groups(): + schema = fields.Schema(a=fields.STORED, b=fields.TEXT, c=fields.ID) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(a=0, b=u("blah"), c=u("apple")) + w.add_document(a=1, b=u("blah blah"), c=u("bear")) + w.add_document(a=2, b=u("blah blah blah"), c=u("apple")) + w.add_document(a=3, b=u("blah blah blah blah"), c=u("bear")) + w.add_document(a=4, b=u("blah blah blah blah blah"), c=u("apple")) + w.add_document(a=5, b=u("blah blah blah blah blah blah"), c=u("bear")) + + with ix.searcher() as s: + q = query.Term("b", "blah") + r = s.search(q, groupedby="c") + gs = r.groups("c") + assert gs["apple"] == [4, 2, 0] + assert gs["bear"] == [5, 3, 1] + + +def test_group_types(): + schema = fields.Schema(a=fields.STORED, b=fields.TEXT, c=fields.ID) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(a=0, b=u("blah"), c=u("apple")) + w.add_document(a=1, b=u("blah blah"), c=u("bear")) + w.add_document(a=2, b=u("blah blah blah"), c=u("apple")) + w.add_document(a=3, b=u("blah blah blah blah"), c=u("bear")) + w.add_document(a=4, b=u("blah blah blah blah blah"), c=u("apple")) + w.add_document(a=5, b=u("blah blah blah blah blah blah"), c=u("bear")) + w.add_document(a=6, b=u("blah blah blah blah blah blah blah"), + c=u("apple")) + + with ix.searcher() as s: + q = query.Term("b", "blah") + + f = sorting.FieldFacet("c", maptype=sorting.UnorderedList) + r = s.search(q, groupedby=f) + gs = r.groups() + assert gs["apple"] == [0, 2, 4, 6] + assert gs["bear"] == [1, 3, 5] + + f = sorting.FieldFacet("c", maptype=sorting.Count) + r = s.search(q, groupedby=f) + gs = r.groups() + assert gs["apple"] == 4 + assert gs["bear"] == 3 + + r = s.search(q, groupedby="c", maptype=sorting.Count) + gs = r.groups() + assert gs["apple"] == 4 + assert gs["bear"] == 3 + + f = sorting.FieldFacet("c", maptype=sorting.Best) + r = s.search(q, groupedby=f) + gs = r.groups() + assert gs["apple"] == 6 + assert gs["bear"] == 5 + + +def test_nocachefield_segments(): + schema = fields.Schema(a=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(a=u("bravo")) + w.add_document(a=u("echo")) + w.add_document(a=u("juliet")) + w.commit() + w = ix.writer() + w.add_document(a=u("kilo")) + w.add_document(a=u("foxtrot")) + w.add_document(a=u("charlie")) + w.commit(merge=False) + w = ix.writer() + w.delete_by_term("a", u("echo")) + w.add_document(a=u("alfa")) + w.add_document(a=u("india")) + w.add_document(a=u("delta")) + w.commit(merge=False) + + with ix.searcher() as s: + q = query.TermRange("a", u("bravo"), u("k")) + facet = sorting.FieldFacet("a", reverse=True) + + r = s.search(q, sortedby=facet) + assert [hit["a"] for hit in 
r] == ["juliet", "india", "foxtrot", "delta", "charlie", "bravo"] + + mq = query.Or([query.Term("a", u("bravo")), + query.Term("a", u("delta"))]) + anq = query.AndNot(q, mq) + r = s.search(anq, sortedby=facet) + assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot", "charlie"] + + mq = query.Or([query.Term("a", u("bravo")), + query.Term("a", u("delta"))]) + r = s.search(q, mask=mq, sortedby=facet) + assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot", "charlie"] + + fq = query.Or([query.Term("a", u("alfa")), + query.Term("a", u("charlie")), + query.Term("a", u("echo")), + query.Term("a", u("india")), + ]) + r = s.search(query.Every(), filter=fq, sortedby=facet) + assert [hit["a"] for hit in r] == ["india", "charlie", "alfa"] + + nq = query.Not(query.Or([query.Term("a", u("alfa")), + query.Term("a", u("india"))])) + r = s.search(query.Every(), filter=nq, sortedby=facet) + assert [hit["a"] for hit in r] == ["kilo", "juliet", "foxtrot", "delta", "charlie", "bravo"] + + +def test_groupby_phrase(): + domain = {"Alan Ball": "Tel Aviv", "Alan Charles": "San Francisco", + "Alan Darwin": "London", "Alan Eames": "Paris"} + + schema = fields.Schema(name=fields.TEXT(stored=True), + city=fields.TEXT(stored=True), + city_g=fields.ID(stored=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + for name, city in domain.items(): + w.add_document(name=u(name), city=u(city), city_g=u(city)) + + with ix.searcher() as s: + q = query.Term("name", "alan") + r = s.search(q, groupedby="city_g") + keys = sorted(r.groups().keys()) + assert keys == ["London", "Paris", "San Francisco", "Tel Aviv"] + + sff = sorting.StoredFieldFacet("city") + r = s.search(q, groupedby=sff) + keys = sorted(r.groups().keys()) + assert keys == ["London", "Paris", "San Francisco", "Tel Aviv"] + + +def test_sort_text_field(): + domain = (("Visual Display of Quantitative Information, The", 10), + ("Envisioning Information", 10), + ("Visual Explanations", 10), + ("Beautiful Evidence", -10), + ("Visual and Statistical Thinking", -10), + ("Cognitive Style of Powerpoint", -10)) + sorted_titles = sorted(d[0] for d in domain) + + schema = fields.Schema(title=fields.TEXT(stored=True, sortable=True), + num=fields.NUMERIC(sortable=True)) + + def test(ix): + with ix.searcher() as s: + # Sort by title + r = s.search(query.Every(), sortedby="title") + titles = [hit["title"] for hit in r] + assert titles == sorted_titles + + # Sort by reverse title + facet = sorting.FieldFacet("title", reverse=True) + r = s.search(query.Every(), sortedby=facet) + assert [hit["title"] for hit in r] == list(reversed(sorted_titles)) + + # Sort by num (-10 to 10) first, and within that, by reverse title + facet = sorting.MultiFacet() + facet.add_field("num") + facet.add_field("title", reverse=True) + + r = s.search(query.Every(), sortedby=facet) + target = ["Visual and Statistical Thinking", + "Cognitive Style of Powerpoint", + "Beautiful Evidence", + "Visual Explanations", + "Visual Display of Quantitative Information, The", + "Envisioning Information", + ] + assert [hit["title"] for hit in r] == target + + # Single segment + with TempIndex(schema) as ix: + with ix.writer() as w: + for title, num in domain: + w.add_document(title=u(title), num=num) + test(ix) + + # Multisegment + with TempIndex(schema) as ix: + # Segment 1 + with ix.writer() as w: + for title, num in domain[:3]: + w.add_document(title=u(title), num=num) + # Segment 2 + with ix.writer() as w: + for title, num in domain[3:]: + w.add_document(title=u(title), num=num) 
+ w.merge = False + test(ix) + + +def test_filtered_grouped(): + schema = fields.Schema(tag=fields.ID, text=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + domain = u("alfa bravo charlie delta echo foxtrot").split() + + with ix.writer() as w: + for i, ls in enumerate(permutations(domain, 3)): + tag = u(str(i % 3)) + w.add_document(tag=tag, text=u(" ").join(ls)) + + with ix.searcher() as s: + f = query.And([query.Term("text", "charlie"), + query.Term("text", "delta")]) + r = s.search(query.Every(), filter=f, groupedby="tag", limit=None) + assert len(r) == 24 + + +def test_add_sortable(): + from whoosh import columns + + st = RamStorage() + schema = fields.Schema(chapter=fields.ID(stored=True), price=fields.NUMERIC) + ix = st.create_index(schema) + with ix.writer() as w: + w.add_document(chapter=u("alfa"), price=100) + w.add_document(chapter=u("bravo"), price=200) + w.add_document(chapter=u("charlie"), price=300) + w.add_document(chapter=u("delta"), price=400) + with ix.writer() as w: + w.add_document(chapter=u("bravo"), price=500) + w.add_document(chapter=u("alfa"), price=600) + w.add_document(chapter=u("delta"), price=100) + w.add_document(chapter=u("charlie"), price=200) + w.merge = False + + with ix.reader() as r: + assert not r.has_column("chapter") + assert not r.has_column("price") + + with ix.writer() as w: + sorting.add_sortable(w, "chapter", sorting.StoredFieldFacet("chapter")) + sorting.add_sortable(w, "price", sorting.FieldFacet("price")) + w.merge = False + + with ix.reader() as r: + assert r.has_column("chapter") + assert r.has_column("price") + + chapr = r.column_reader("chapter") + pricer = r.column_reader("price") + + assert chapr[0] == u"alfa" + assert pricer[0] == 100 + + ix.optimize() + + with ix.reader() as r: + assert r.has_column("chapter") + assert r.has_column("price") + + chapr = r.column_reader("chapter") + pricer = r.column_reader("price") + + assert chapr[0] == u"alfa" + assert pricer[0] == 100 + + +def test_missing_column(): + from whoosh import collectors + + schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=0, tags=u("alfa bravo charlie")) + w.add_document(id=1, tags=u("bravo charlie delta")) + w.add_document(id=2, tags=u("charlie delta echo")) + w.merge = False + + with ix.writer() as w: + w.add_field("age", fields.NUMERIC(sortable=True)) + + w.add_document(id=3, tags=u("delta echo foxtrot"), age=10) + w.add_document(id=4, tags=u("echo foxtrot golf"), age=5) + w.add_document(id=5, tags=u("foxtrot golf alfa"), age=20) + w.merge = False + + with ix.writer() as w: + w.add_document(id=6, tags=u("golf alfa bravo"), age=2) + w.add_document(id=7, tags=u("alfa hotel india"), age=50) + w.add_document(id=8, tags=u("hotel india bravo"), age=15) + w.merge = False + + with ix.searcher() as s: + assert not s.is_atomic() + + q = query.Term("tags", u("alfa")) + + # Have to use yucky low-level collector API to make sure we used a + # ColumnCategorizer to do the sorting + c = s.collector(sortedby="age") + assert isinstance(c, collectors.SortingCollector) + s.search_with_collector(q, c) + assert isinstance(c.categorizer, sorting.ColumnCategorizer) + + r = c.results() + assert [hit["id"] for hit in r] == [6, 5, 7, 0] + + r = s.search(q, sortedby="age", reverse=True) + assert [hit["id"] for hit in r] == [0, 7, 5, 6] + + +def test_compound_sort(): + fspec = fields.KEYWORD(stored=True, sortable=True) + schema = fields.Schema(a=fspec, b=fspec, c=fspec) + ix = 
RamStorage().create_index(schema) + + alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split() + blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa").split() + clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet").split() + assert all(len(ls) == 10 for ls in (alist, blist, clist)) + + with ix.writer() as w: + for i in xrange(10): + w.add_document(a=alist[i], b=blist[i], c=clist[i]) + + with ix.searcher() as s: + q = query.Every() + sortedby = [sorting.FieldFacet("a"), + sorting.FieldFacet("b", reverse=True), + sorting.FieldFacet("c")] + + r = s.search(q, sortedby=sortedby) + output = [] + for hit in r: + output.append(" ".join((hit["a"], hit["b"], hit["c"]))) + + assert output == [ + "alfa charlie charlie", + "alfa charlie india", + "alfa bravo echo", + "alfa alfa alfa", + "alfa alfa golf", + "bravo charlie foxtrot", + "bravo bravo bravo", + "bravo bravo hotel", + "bravo alfa delta", + "bravo alfa juliet", + ] + + +def test_column_scoring(): + from whoosh import scoring + + # "sortable=True" on the "id" field tells it to build a column store + # of field values. If you didn't ever need to actually search on this field, + # you could get JUST the column using count=fields.COLUMN + schema = fields.Schema(id=fields.ID(sortable=True), + tag=fields.KEYWORD) + + class MyWeighting(scoring.WeightingModel): + def scorer(self, searcher, fieldname, text, qf=1): + # Pass the searcher to the scorer so it can look up values in the + # "count" field + return MyScorer(searcher) + + class MyScorer(scoring.BaseScorer): + def __init__(self, searcher): + self.searcher = searcher + # Get a column value reader for the "id" field + self.col = searcher.reader().column_reader("id") + + def score(self, matcher): + # Get the document number of the current match + docnum = matcher.id() + # Use the value from the column as the score + # Note: the return value must be a number, so for this contrived + # example we'll call ord() on the ID letter + id_value = self.col[docnum] + return ord(id_value) + + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(id=u"a", tag=u"foo") + w.add_document(id=u"b", tag=u"foo") + w.add_document(id=u"c", tag=u"foo") + w.add_document(id=u"d", tag=u"foo") + + with ix.searcher(weighting=MyWeighting()) as s: + r = s.search(query.Term("tag", u"foo")) + # Note that higher scores are better, so higher letters come first + assert [hit["id"] for hit in r] == ["d", "c", "b", "a"] + + diff --git a/tests/test_spans.py b/tests/test_spans.py new file mode 100644 index 0000000..926b20d --- /dev/null +++ b/tests/test_spans.py @@ -0,0 +1,339 @@ +from __future__ import with_statement + +from whoosh import analysis, fields, formats +from whoosh.compat import u, xrange, permutations +from whoosh.filedb.filestore import RamStorage +from whoosh.query import spans +from whoosh.query import And, Or, Term, Phrase + + +domain = ("alfa", "bravo", "bravo", "charlie", "delta", "echo") +_ix = None + + +def get_index(): + global _ix + + if _ix is not None: + return _ix + + charfield = fields.FieldType(formats.Characters(), + analysis.SimpleAnalyzer(), + scorable=True, stored=True) + schema = fields.Schema(text=charfield) + st = RamStorage() + _ix = st.create_index(schema) + + w = _ix.writer() + for ls in permutations(domain, 4): + w.add_document(text=u(" ").join(ls), _stored_text=ls) + w.commit() + + return _ix + + +def test_multimatcher(): + schema = fields.Schema(content=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) 
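+    # The same permutations are indexed in three batches committed with merge=False below, so the term matcher has to step through postings (and report correct spans) across several segments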
+ + domain = ("alfa", "bravo", "charlie", "delta") + + for _ in xrange(3): + w = ix.writer() + for ls in permutations(domain): + w.add_document(content=u(" ").join(ls)) + w.commit(merge=False) + + q = Term("content", "bravo") + with ix.searcher() as s: + m = q.matcher(s) + while m.is_active(): + content = s.stored_fields(m.id())["content"].split() + spans = m.spans() + for span in spans: + assert content[span.start] == "bravo" + m.next() + + +def test_excludematcher(): + schema = fields.Schema(content=fields.TEXT(stored=True)) + ix = RamStorage().create_index(schema) + + domain = ("alfa", "bravo", "charlie", "delta") + + for _ in xrange(3): + w = ix.writer() + for ls in permutations(domain): + w.add_document(content=u(" ").join(ls)) + w.commit(merge=False) + + w = ix.writer() + w.delete_document(5) + w.delete_document(10) + w.delete_document(28) + w.commit(merge=False) + + q = Term("content", "bravo") + with ix.searcher() as s: + m = q.matcher(s) + while m.is_active(): + content = s.stored_fields(m.id())["content"].split() + spans = m.spans() + for span in spans: + assert content[span.start] == "bravo" + m.next() + + +def test_span_term(): + ix = get_index() + with ix.searcher() as s: + alllists = [d["text"] for d in s.all_stored_fields()] + + for word in domain: + q = Term("text", word) + m = q.matcher(s) + + ids = set() + while m.is_active(): + id = m.id() + sps = m.spans() + ids.add(id) + original = list(s.stored_fields(id)["text"]) + assert word in original + + if word != "bravo": + assert len(sps) == 1 + assert original.index(word) == sps[0].start + assert original.index(word) == sps[0].end + m.next() + + for i, ls in enumerate(alllists): + if word in ls: + assert i in ids + else: + assert i not in ids + + +def test_span_first(): + ix = get_index() + with ix.searcher() as s: + for word in domain: + q = spans.SpanFirst(Term("text", word)) + m = q.matcher(s) + while m.is_active(): + sps = m.spans() + original = s.stored_fields(m.id())["text"] + assert original[0] == word + assert len(sps) == 1 + assert sps[0].start == 0 + assert sps[0].end == 0 + m.next() + + q = spans.SpanFirst(Term("text", "bravo"), limit=1) + m = q.matcher(s) + while m.is_active(): + orig = s.stored_fields(m.id())["text"] + for sp in m.spans(): + assert orig[sp.start] == "bravo" + m.next() + + +def test_span_near(): + ix = get_index() + with ix.searcher() as s: + def test(q): + m = q.matcher(s) + while m.is_active(): + yield s.stored_fields(m.id())["text"], m.spans() + m.next() + + for orig, sps in test(spans.SpanNear(Term("text", "alfa"), + Term("text", "bravo"), + ordered=True)): + assert orig[sps[0].start] == "alfa" + assert orig[sps[0].end] == "bravo" + + for orig, sps in test(spans.SpanNear(Term("text", "alfa"), + Term("text", "bravo"), + ordered=False)): + first = orig[sps[0].start] + second = orig[sps[0].end] + assert ((first == "alfa" and second == "bravo") or (first == "bravo" and second == "alfa")) + + for orig, sps in test(spans.SpanNear(Term("text", "bravo"), + Term("text", "bravo"), + ordered=True)): + text = " ".join(orig) + assert text.find("bravo bravo") > -1 + + q = spans.SpanNear(spans.SpanNear(Term("text", "alfa"), + Term("text", "charlie")), + Term("text", "echo")) + for orig, sps in test(q): + text = " ".join(orig) + assert text.find("alfa charlie echo") > -1 + + q = spans.SpanNear(Or([Term("text", "alfa"), Term("text", "charlie")]), + Term("text", "echo"), ordered=True) + for orig, sps in test(q): + text = " ".join(orig) + assert (text.find("alfa echo") > -1 + or text.find("charlie echo") > -1) 
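The span tests above exercise spans.SpanNear in two ways: through Searcher.search(), and by stepping the low-level matcher and reading m.spans(). A minimal sketch of that pattern, using only API calls already exercised in this file (the field name and the two sample documents are illustrative, not part of the test suite):

    from whoosh import fields
    from whoosh.compat import u
    from whoosh.filedb.filestore import RamStorage
    from whoosh.query import Term, spans

    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("alfa bravo charlie"))
        w.add_document(text=u("bravo alfa charlie"))

    with ix.searcher() as s:
        # "alfa" immediately followed by "bravo": only the first document matches
        q = spans.SpanNear(Term("text", "alfa"), Term("text", "bravo"),
                           slop=1, ordered=True)
        assert len(s.search(q)) == 1

        # The matcher reports which word positions each match covers
        m = q.matcher(s)
        while m.is_active():
            words = s.stored_fields(m.id())["text"].split()
            for span in m.spans():
                assert words[span.start] == "alfa"
            m.next()

test_near_unordered below runs the same kind of query with ordered=False, where either ordering of the two terms is accepted.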
+ + +def test_near_unordered(): + schema = fields.Schema(text=fields.TEXT(stored=True)) + st = RamStorage() + ix = st.create_index(schema) + w = ix.writer() + w.add_document(text=u("alfa bravo charlie delta echo")) + w.add_document(text=u("alfa bravo delta echo charlie")) + w.add_document(text=u("alfa charlie bravo delta echo")) + w.add_document(text=u("echo delta alfa foxtrot")) + w.commit() + + with ix.searcher() as s: + q = spans.SpanNear(Term("text", "bravo"), Term("text", "charlie"), + ordered=False) + r = sorted(d["text"] for d in s.search(q)) + assert r == [u('alfa bravo charlie delta echo'), u('alfa charlie bravo delta echo')] + + +def test_span_near2(): + ana = analysis.SimpleAnalyzer() + schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True)) + st = RamStorage() + ix = st.create_index(schema) + w = ix.writer() + w.add_document(text=u("The Lucene library is by Doug Cutting and Whoosh " + + "was made by Matt Chaput")) + w.commit() + + nq1 = spans.SpanNear(Term("text", "lucene"), Term("text", "doug"), slop=5) + nq2 = spans.SpanNear(nq1, Term("text", "whoosh"), slop=4) + + with ix.searcher() as s: + m = nq2.matcher(s) + assert m.spans() == [spans.Span(1, 8)] + + +def test_span_not(): + ix = get_index() + with ix.searcher() as s: + nq = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"), + slop=2) + bq = Term("text", "bravo") + q = spans.SpanNot(nq, bq) + m = q.matcher(s) + while m.is_active(): + orig = list(s.stored_fields(m.id())["text"]) + i1 = orig.index("alfa") + i2 = orig.index("charlie") + dist = i2 - i1 + assert 0 < dist < 3 + if "bravo" in orig: + assert orig.index("bravo") != i1 + 1 + m.next() + + +def test_span_or(): + ix = get_index() + with ix.searcher() as s: + nq = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"), + slop=2) + bq = Term("text", "bravo") + q = spans.SpanOr([nq, bq]) + m = q.matcher(s) + while m.is_active(): + orig = s.stored_fields(m.id())["text"] + assert ("alfa" in orig and "charlie" in orig) or "bravo" in orig + m.next() + + +def test_span_contains(): + ix = get_index() + with ix.searcher() as s: + nq = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"), + slop=3) + cq = spans.SpanContains(nq, Term("text", "echo")) + + m = cq.matcher(s) + ls = [] + while m.is_active(): + orig = s.stored_fields(m.id())["text"] + ls.append(" ".join(orig)) + m.next() + ls.sort() + assert ls == ['alfa bravo echo charlie', 'alfa bravo echo charlie', + 'alfa delta echo charlie', 'alfa echo bravo charlie', + 'alfa echo bravo charlie', 'alfa echo charlie bravo', + 'alfa echo charlie bravo', 'alfa echo charlie delta', + 'alfa echo delta charlie', 'bravo alfa echo charlie', + 'bravo alfa echo charlie', 'delta alfa echo charlie', + ] + + +def test_span_before(): + ix = get_index() + with ix.searcher() as s: + bq = spans.SpanBefore(Term("text", "alfa"), Term("text", "charlie")) + m = bq.matcher(s) + while m.is_active(): + orig = list(s.stored_fields(m.id())["text"]) + assert "alfa" in orig + assert "charlie" in orig + assert orig.index("alfa") < orig.index("charlie") + m.next() + + +def test_span_condition(): + ix = get_index() + with ix.searcher() as s: + sc = spans.SpanCondition(Term("text", "alfa"), Term("text", "charlie")) + m = sc.matcher(s) + while m.is_active(): + orig = list(s.stored_fields(m.id())["text"]) + assert "alfa" in orig + assert "charlie" in orig + for span in m.spans(): + assert orig[span.start] == "alfa" + m.next() + + +def test_regular_or(): + ix = get_index() + with ix.searcher() as s: + oq = Or([Term("text", 
"bravo"), Term("text", "alfa")]) + m = oq.matcher(s) + while m.is_active(): + orig = s.stored_fields(m.id())["text"] + for span in m.spans(): + v = orig[span.start] + assert v == "bravo" or v == "alfa" + m.next() + + +def test_regular_and(): + ix = get_index() + with ix.searcher() as s: + aq = And([Term("text", "bravo"), Term("text", "alfa")]) + m = aq.matcher(s) + while m.is_active(): + orig = s.stored_fields(m.id())["text"] + for span in m.spans(): + v = orig[span.start] + assert v == "bravo" or v == "alfa" + m.next() + + +def test_span_characters(): + ix = get_index() + with ix.searcher() as s: + pq = Phrase("text", ["bravo", "echo"]) + m = pq.matcher(s) + while m.is_active(): + orig = " ".join(s.stored_fields(m.id())["text"]) + for span in m.spans(): + startchar, endchar = span.startchar, span.endchar + assert orig[startchar:endchar] == "bravo echo" + m.next() diff --git a/tests/test_spelling.py b/tests/test_spelling.py new file mode 100644 index 0000000..ce5284f --- /dev/null +++ b/tests/test_spelling.py @@ -0,0 +1,353 @@ +from __future__ import with_statement +import gzip + +from whoosh import analysis, fields, highlight, query, spelling +from whoosh.compat import b, u, permutations +from whoosh.qparser import QueryParser +from whoosh.support.levenshtein import levenshtein +from whoosh.util.testing import TempIndex + + +_wordlist = sorted(u("render animation animate shader shading zebra koala" + "ready kismet reaction page delete quick fox jumped" + "over lazy dog wicked erase red team yellow under interest" + "open print acrid sear deaf feed grow heal jolly kilt" + "low zone xylophone crown vale brown neat meat reduction" + "blunder preaction lamppost").split()) + + +def test_list_corrector(): + corr = spelling.ListCorrector(_wordlist) + typo = "reoction" + sugs = list(corr.suggest(typo, maxdist=2)) + target = [w for w in _wordlist if levenshtein(typo, w) <= 2] + assert sugs == target + + +def test_automaton(): + schema = fields.Schema(text=fields.TEXT) + with TempIndex(schema, "automatonspell") as ix: + with ix.writer() as w: + w.add_document(text=u" ".join(_wordlist)) + + with ix.reader() as r: + bterms = list(r.lexicon("text")) + words = [bterm.decode("utf8") for bterm in bterms] + assert words == _wordlist + + typo = "reoction" + sugs = list(r.terms_within("text", typo, maxdist=2)) + target = [w for w in _wordlist if levenshtein(typo, w) <= 2] + assert sugs == target + + +def test_reader_corrector(): + schema = fields.Schema(text=fields.TEXT()) + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(text=u"render zorro kaori postal") + w.add_document(text=u"reader zebra koala pastry") + w.add_document(text=u"leader libra oola paster") + w.add_document(text=u"feeder lorry zoala baster") + + with ix.reader() as r: + sp = spelling.ReaderCorrector(r, "text", schema["text"]) + assert sp.suggest(u"koala", maxdist=1) == [u'koala', u"zoala"] + + target = [u'kaori', u'koala', u'oola'] + sugs = sp.suggest(u"kaola", maxdist=2) + assert sugs == target + + +def test_unicode_spelling(): + schema = fields.Schema(text=fields.ID()) + + domain = [u"\u0924\u092a\u093e\u0907\u0939\u0930\u0941", + u"\u65e5\u672c", + u"\uc774\uc124\ud76c", + ] + + with TempIndex(schema) as ix: + with ix.writer() as w: + for word in domain: + w.add_document(text=word) + + with ix.reader() as r: + rc = spelling.ReaderCorrector(r, "text", schema["text"]) + assert rc.suggest(u"\u65e5\u672e\u672c") == [u"\u65e5\u672c"] + + +def test_wordfile(): + import os.path + + path = 
os.path.join(os.path.dirname(__file__), "english-words.10.gz") + wordfile = gzip.open(path, "rb") + words = sorted(line.decode("latin1").strip().lower() for line in wordfile) + + cor = spelling.ListCorrector(words) + assert cor.suggest("specail") == ["special"] + + +def test_query_highlight(): + qp = QueryParser("a", None) + hf = highlight.HtmlFormatter() + + def do(text, terms): + q = qp.parse(text) + tks = [tk for tk in q.all_tokens() if tk.text in terms] + for tk in tks: + if tk.startchar is None or tk.endchar is None: + assert False, tk + fragment = highlight.Fragment(text, tks) + return hf.format_fragment(fragment) + + assert do("a b c d", ["b"]) == 'a b c d' + assert do('a (x:b OR y:"c d") e', ("b", "c")) == 'a (x:b OR y:"c d") e' + + +def test_query_terms(): + qp = QueryParser("a", None) + + q = qp.parse("alfa b:(bravo OR c:charlie) delta") + assert sorted(q.iter_all_terms()) == [("a", "alfa"), ("a", "delta"), + ("b", "bravo"), ("c", "charlie")] + + q = qp.parse("alfa brav*") + assert sorted(q.iter_all_terms()) == [("a", "alfa")] + + q = qp.parse('a b:("b c" d)^2 e') + tokens = [(t.fieldname, t.text, t.boost) for t in q.all_tokens()] + assert tokens == [('a', 'a', 1.0), ('b', 'b', 2.0), ('b', 'c', 2.0), + ('b', 'd', 2.0), ('a', 'e', 1.0)] + + +def test_correct_query(): + schema = fields.Schema(a=fields.TEXT(), b=fields.TEXT) + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(a=u"alfa bravo charlie delta") + w.add_document(a=u"delta echo foxtrot golf") + w.add_document(a=u"golf hotel india juliet") + w.add_document(a=u"juliet kilo lima mike") + + with ix.searcher() as s: + qp = QueryParser("a", ix.schema) + qtext = u'alpha ("brovo november" OR b:dolta) detail' + q = qp.parse(qtext, ix.schema) + + c = s.correct_query(q, qtext) + cq = c.query + assert isinstance(cq, query.And) + assert cq[0].text == "alfa" + assert isinstance(cq[1], query.Or) + assert isinstance(cq[1][0], query.Phrase) + assert cq[1][0].words == ["bravo", "november"] + + qtext = u'alpha b:("brovo november" a:delta) detail' + q = qp.parse(qtext, ix.schema) + c = s.correct_query(q, qtext) + assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)' + assert c.string == 'alfa b:("brovo november" a:delta) detail' + + hf = highlight.HtmlFormatter(classname="c") + assert c.format_string(hf) == 'alfa b:("brovo november" a:delta) detail' + + +def test_spelling_field(): + text = u"rendering shading modeling reactions" + ana = analysis.StemmingAnalyzer() + schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True)) + + assert schema["text"].spelling + assert schema["text"].separate_spelling() + + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(text=text) + + with ix.searcher() as s: + r = s.reader() + fieldobj = schema["text"] + words = [fieldobj.from_bytes(t) for t in r.lexicon("text")] + assert words == ["model", "reaction", "render", "shade"] + + words = [fieldobj.from_bytes(t) for t in r.lexicon("spell_text")] + assert words == ["modeling", "reactions", "rendering", "shading"] + + # suggest() automatically looks in the spell_text field because + # it calls fieldobj.spelling_fieldname() first + assert s.suggest("text", "renderink") == ["rendering"] + + with ix.writer() as w: + w.delete_document(0) + + +def test_correct_spell_field(): + ana = analysis.StemmingAnalyzer() + schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True)) + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(text=u"rendering shading 
modeling reactions") + + with ix.searcher() as s: + text = s.schema["text"] + spell_text = s.schema["spell_text"] + + r = s.reader() + words = [text.from_bytes(t) for t in r.lexicon("text")] + assert words == ["model", "reaction", "render", "shade"] + + words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")] + assert words == ["modeling", "reactions", "rendering", "shading"] + + qp = QueryParser("text", s.schema) + qtext = u"renderink" + q = qp.parse(qtext, s.schema) + + r = s.search(q) + assert len(r) == 0 + + c = s.correct_query(q, qtext) + assert c.string == "rendering" + assert c.query == query.Term("text", "rendering") + + hf = highlight.HtmlFormatter(classname="c") + assert c.format_string(hf) == 'rendering' + + +def test_suggest_prefix(): + domain = ("Shoot To Kill", + "Bloom, Split and Deviate", + "Rankle the Seas and the Skies", + "Lightning Flash Flame Shell", + "Flower Wind Rage and Flower God Roar, Heavenly Wind Rage and " + "Heavenly Demon Sneer", + "All Waves, Rise now and Become my Shield, Lightning, Strike " + "now and Become my Blade", + "Cry, Raise Your Head, Rain Without end", + "Sting All Enemies To Death", + "Reduce All Creation to Ash", + "Sit Upon the Frozen Heavens", + "Call forth the Twilight") + + schema = fields.Schema(content=fields.TEXT(stored=True, ), + quick=fields.NGRAM(maxsize=10, stored=True)) + with TempIndex(schema, "sugprefix") as ix: + with ix.writer() as w: + for item in domain: + content = u(item) + w.add_document(content=content, quick=content) + + with ix.searcher() as s: + sugs = s.suggest("content", u"ra", maxdist=2, prefix=2) + assert sugs == ['rage', 'rain'] + + sugs = s.suggest("content", "ra", maxdist=2, prefix=1) + assert sugs == ["rage", "rain", "roar"] + + +def test_prefix_address(): + fieldtype = fields.TEXT() + schema = fields.Schema(f1=fieldtype, f2=fieldtype) + with TempIndex(schema, "prefixaddr") as ix: + with ix.writer() as w: + w.add_document(f1=u"aabc aawx aaqr aade", + f2=u"aa12 aa34 aa56 aa78") + + with ix.searcher() as s: + sugs = s.suggest("f1", u"aa", maxdist=2, prefix=2) + assert sorted(sugs) == ["aabc", "aade", "aaqr", "aawx"] + + sugs = s.suggest("f2", u"aa", maxdist=2, prefix=2) + assert sorted(sugs) == ["aa12", "aa34", "aa56", "aa78"] + + +def test_correct_correct(): + from whoosh import qparser + + schema = fields.Schema(a=fields.TEXT()) + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(a=u'dworska') + w.add_document(a=u'swojska') + + with ix.searcher() as s: + s = ix.searcher() + qtext = u'dworska' + + qp = qparser.QueryParser('a', ix.schema) + q = qp.parse(qtext, ix.schema) + c = s.correct_query(q, qtext) + + assert c.string == "dworska" + string = c.format_string(highlight.UppercaseFormatter()) + assert string == "dworska" + + +def test_very_long_words(): + import sys + length = int(sys.getrecursionlimit() * 1.5) + + strings1 = [u(chr(i) * length) for i in range(65, 70)] + strings2 = [u(chr(i) * length) for i in range(71, 75)] + + ana = analysis.StemmingAnalyzer() + schema = fields.Schema(text=fields.TEXT(analyzer=ana, )) + with TempIndex(schema) as ix: + with ix.writer() as w: + for string in strings1: + w.add_document(text=string) + + with ix.writer() as w: + for string in strings2: + w.add_document(text=string) + w.optimize = True + + +# def test_add_spelling(): +# schema = fields.Schema(text1=fields.TEXT, text2=fields.TEXT) +# ix = RamStorage().create_index(schema) +# w = ix.writer() +# w.add_document(text1=u"render zorro kaori postal", text2=u"alfa") +# 
w.add_document(text1=u"reader zebra koala pastry", text2=u"alpa") +# w.add_document(text1=u"leader libra ooala paster", text2=u"alpha") +# w.add_document(text1=u"feeder lorry zoala baster", text2=u"olfo") +# w.commit() +# +# with ix.reader() as r: +# assert not r.has_word_graph("text1") +# assert not r.has_word_graph("text2") +# +# from whoosh.writing import add_spelling +# add_spelling(ix, ["text1", "text2"]) +# +# with ix.reader() as r: +# assert r.has_word_graph("text1") +# assert r.has_word_graph("text2") +# +# sp = spelling.ReaderCorrector(r, "text1") +# assert sp.suggest(u"kaola", maxdist=1) == [u'koala'] +# assert sp.suggest(u"kaola", maxdist=2) == [u'koala', u'kaori', u'ooala', u'zoala'] +# +# sp = spelling.ReaderCorrector(r, "text2") +# assert sp.suggest(u"alfo", maxdist=1) == [u"alfa", u"olfo"] + + +# def test_multicorrector(): +# schema = fields.Schema(text=fields.TEXT()) +# ix = RamStorage().create_index(schema) +# domain = u"special specious spectacular spongy spring specials".split() +# for word in domain: +# w = ix.writer() +# w.add_document(text=word) +# w.commit(merge=False) +# +# c1 = ix.reader().corrector("text") +# +# wordlist = sorted(u"bear bare beer sprung".split()) +# c2 = words_to_corrector(wordlist) +# +# mc = spelling.MultiCorrector([c1, c2]) +# assert mc.suggest("specail") == ["special", "specials"] +# assert mc.suggest("beur") == ["bear", "beer"] +# assert mc.suggest("sprang") == ["sprung", "spring"] diff --git a/tests/test_tables.py b/tests/test_tables.py new file mode 100644 index 0000000..dab02ce --- /dev/null +++ b/tests/test_tables.py @@ -0,0 +1,215 @@ +# encoding: utf-8 + +from __future__ import with_statement +import random + +from whoosh.compat import b, xrange, iteritems +from whoosh.filedb.filestore import RamStorage +from whoosh.filedb.filetables import HashReader, HashWriter +from whoosh.filedb.filetables import OrderedHashWriter, OrderedHashReader +from whoosh.util.testing import TempStorage + + +def test_hash_single(): + st = RamStorage() + hw = HashWriter(st.create_file("test.hsh")) + hw.add(b("alfa"), b("bravo")) + hw.close() + + hr = HashReader.open(st, "test.hsh") + assert hr.get(b("alfa")) == b("bravo") + assert hr.get(b("foo")) is None + + +def test_hash(): + with TempStorage("hash") as st: + hwf = st.create_file("test.hsh") + hw = HashWriter(hwf) + hw.add(b("foo"), b("bar")) + hw.add(b("glonk"), b("baz")) + hw.close() + + hr = HashReader.open(st, "test.hsh") + assert hr.get(b("foo")) == b("bar") + assert hr.get(b("baz")) is None + hr.close() + + +def test_hash_extras(): + st = RamStorage() + hw = HashWriter(st.create_file("test.hsh")) + hw.extras["test"] = 100 + hw.add(b("foo"), b("bar")) + hw.add(b("glonk"), b("baz")) + hw.close() + + hr = HashReader.open(st, "test.hsh") + assert hr.extras["test"] == 100 + assert hr.get(b("foo")) == b("bar") + assert hr.get(b("baz")) is None + hr.close() + + +def test_hash_contents(): + samp = [('alfa', 'bravo'), ('charlie', 'delta'), ('echo', 'foxtrot'), + ('golf', 'hotel'), ('india', 'juliet'), ('kilo', 'lima'), + ('mike', 'november'), ('oskar', 'papa'), ('quebec', 'romeo'), + ('sierra', 'tango'), ('ultra', 'victor'), ('whiskey', 'xray'), + ] + # Convert to bytes + samp = set((b(k), b(v)) for k, v in samp) + + with TempStorage("hashcontents") as st: + hw = HashWriter(st.create_file("test.hsh")) + hw.add_all(samp) + hw.close() + + hr = HashReader.open(st, "test.hsh") + + probes = list(samp) + random.shuffle(probes) + for key, value in probes: + assert hr[key] == value + + assert set(hr.keys()) == set([k 
for k, v in samp]) + assert set(hr.values()) == set([v for k, v in samp]) + assert set(hr.items()) == samp + + hr.close() + + +def test_random_hash(): + from string import ascii_letters as domain + + times = 1000 + minlen = 1 + maxlen = len(domain) + + def randstring(): + s = "".join(random.sample(domain, random.randint(minlen, maxlen))) + return b(s) + + with TempStorage("randomhash") as st: + samp = dict((randstring(), randstring()) for _ in xrange(times)) + + hw = HashWriter(st.create_file("test.hsh")) + for k, v in iteritems(samp): + hw.add(k, v) + hw.close() + + keys = list(samp.keys()) + random.shuffle(keys) + hr = HashReader.open(st, "test.hsh") + for k in keys: + assert hr[k] == samp[k] + hr.close() + + +def test_random_access(): + times = 1000 + with TempStorage("orderedhash") as st: + hw = HashWriter(st.create_file("test.hsh")) + hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times)) + hw.close() + + keys = list(range(times)) + random.shuffle(keys) + hr = HashReader.open(st, "test.hsh") + for x in keys: + assert hr[b("%08x" % x)] == b(str(x)) + hr.close() + + +def test_ordered_closest(): + keys = ['alfa', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf', + 'hotel', 'india', 'juliet', 'kilo', 'lima', 'mike', 'november'] + # Make into bytes for Python 3 + keys = [b(k) for k in keys] + values = [str(len(k)).encode("ascii") for k in keys] + + with TempStorage("orderedclosest") as st: + hw = OrderedHashWriter(st.create_file("test.hsh")) + hw.add_all(zip(keys, values)) + hw.close() + + hr = OrderedHashReader.open(st, "test.hsh") + ck = hr.closest_key + assert ck(b('')) == b('alfa') + assert ck(b(' ')) == b('alfa') + assert ck(b('alfa')) == b('alfa') + assert ck(b('bravot')) == b('charlie') + assert ck(b('charlie')) == b('charlie') + assert ck(b('kiloton')) == b('lima') + assert ck(b('oskar')) is None + assert list(hr.keys()) == keys + assert list(hr.values()) == values + assert list(hr.keys_from(b('f'))) == keys[5:] + hr.close() + + +def test_extras(): + st = RamStorage() + hw = HashWriter(st.create_file("test")) + hw.extras["test"] = 100 + hw.extras["blah"] = "foo" + hw.close() + + hr = HashReader(st.open_file("test"), st.file_length("test")) + assert hr.extras["test"] == 100 + assert hr.extras["blah"] == "foo" + hr.close() + + hw = OrderedHashWriter(st.create_file("test")) + hw.extras["test"] = 100 + hw.extras["blah"] = "foo" + hw.close() + + hr = HashReader(st.open_file("test"), st.file_length("test")) + assert hr.extras["test"] == 100 + assert hr.extras["blah"] == "foo" + hr.close() + + hr = OrderedHashReader(st.open_file("test"), st.file_length("test")) + assert hr.extras["test"] == 100 + assert hr.extras["blah"] == "foo" + hr.close() + + +def test_checksum_file(): + from whoosh.filedb.structfile import ChecksumFile + from zlib import crc32 + + def wr(f): + f.write(b("Testing")) + f.write_int(-100) + f.write_varint(10395) + f.write_string(b("Hello")) + f.write_ushort(32959) + + st = RamStorage() + # Write a file normally + f = st.create_file("control") + wr(f) + f.close() + # Checksum the contents + f = st.open_file("control") + target = crc32(f.read()) & 0xffffffff + f.close() + + # Write a file with checksumming + f = st.create_file("test") + cf = ChecksumFile(f) + wr(cf) + assert cf.checksum() == target + f.close() + + # Read the file with checksumming + f = st.open_file("test") + cf = ChecksumFile(f) + assert cf.read(7) == b("Testing") + assert cf.read_int() == -100 + assert cf.read_varint() == 10395 + assert cf.read_string() == b("Hello") + assert 
cf.read_ushort() == 32959 + assert cf.checksum() == target + cf.close() diff --git a/tests/test_vectors.py b/tests/test_vectors.py new file mode 100644 index 0000000..11e3c0b --- /dev/null +++ b/tests/test_vectors.py @@ -0,0 +1,103 @@ +from __future__ import with_statement + +from whoosh import fields, formats +from whoosh.compat import u +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +def test_single_term(): + schema = fields.Schema(text=fields.TEXT(vector=True)) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(text=u("TEST TEST TEST")) + with ix.searcher() as s: + v = s.vector(0, "text") + assert v.is_active() + + +def test_vector_reading(): + schema = fields.Schema(title=fields.TEXT, + content=fields.TEXT(vector=formats.Frequency())) + + with TempIndex(schema, "vectorreading") as ix: + writer = ix.writer() + writer.add_document(title=u("one"), + content=u("This is the story of the black " + + "hole story")) + writer.commit() + + with ix.reader() as r: + assert list(r.vector_as("frequency", 0, "content")) == [(u('black'), 1), (u('hole'), 1), (u('story'), 2)] + + +def test_vector_merge(): + schema = fields.Schema(title=fields.TEXT, + content=fields.TEXT(vector=formats.Frequency())) + + with TempIndex(schema, "vectormerge") as ix: + writer = ix.writer() + writer.add_document(title=u("one"), + content=u("This is the story of the black hole " + + "story")) + writer.commit() + + writer = ix.writer() + writer.add_document(title=u("two"), + content=u("You can read along in your book")) + writer.commit() + + with ix.searcher() as s: + r = s.reader() + + docnum = s.document_number(title=u("one")) + vec = list(r.vector_as("frequency", docnum, "content")) + assert vec == [(u('black'), 1), (u('hole'), 1), (u('story'), 2)] + + docnum = s.document_number(title=u("two")) + + vec = list(r.vector_as("frequency", docnum, "content")) + assert vec == [(u('along'), 1), (u('book'), 1), (u('read'), 1)] + + +def test_vector_unicode(): + cf = fields.TEXT(vector=True) + schema = fields.Schema(id=fields.NUMERIC, text=cf) + with TempIndex(schema) as ix: + with ix.writer() as w: + w.add_document(id=0, text=u"\u13a0\u13a1\u13a2 \u13a3\u13a4\u13a5") + w.add_document(id=1, text=u"\u13a6\u13a7\u13a8 \u13a9\u13aa\u13ab") + + with ix.writer() as w: + w.add_document(id=2, text=u"\u13ac\u13ad\u13ae \u13af\u13b0\u13b1") + w.add_document(id=3, text=u"\u13b2\u13b3\u13b4 \u13b5\u13b6\u13b7") + + with ix.searcher() as s: + docnum = s.document_number(id=2) + vec = list(s.vector_as("frequency", docnum, "text")) + assert len(vec) == 2 + + assert vec[0][0] == u"\u13ac\u13ad\u13ae" + assert vec[0][1] == 1 + + assert vec[1][0] == u"\u13af\u13b0\u13b1" + assert vec[1][1] == 1 + + +def test_add_vectored_field(): + schema = fields.Schema(id=fields.ID(stored=True), f1=fields.TEXT) + ix = RamStorage().create_index(schema) + with ix.writer() as w: + w.add_document(id=u("a"), f1=u("Testing one two three")) + + with ix.writer() as w: + w.add_field("f2", fields.TEXT(vector=True)) + w.add_document(id=u("b"), f2=u("Frosting four five six")) + + with ix.searcher() as s: + docnum1 = s.document_number(id="a") + assert not s.has_vector(docnum1, "f1") + + docnum2 = s.document_number(id="b") + assert not s.has_vector(docnum2, "f1") + assert s.has_vector(docnum2, "f2") diff --git a/tests/test_weightings.py b/tests/test_weightings.py new file mode 100644 index 0000000..a83aab6 --- /dev/null +++ b/tests/test_weightings.py @@ -0,0 +1,81 @@ +from __future__ import 
with_statement +import inspect +from random import choice, randint +import sys + +from whoosh import fields, query, scoring +from whoosh.compat import u, xrange, permutations +from whoosh.filedb.filestore import RamStorage + + +def _weighting_classes(ignore): + # Get all the subclasses of Weighting in whoosh.scoring + return [c for _, c in inspect.getmembers(scoring, inspect.isclass) + if scoring.Weighting in c.__bases__ and c not in ignore] + + +def test_all(): + domain = [u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"), + u("foxtrot")] + schema = fields.Schema(text=fields.TEXT) + storage = RamStorage() + ix = storage.create_index(schema) + w = ix.writer() + for _ in xrange(100): + w.add_document(text=u(" ").join(choice(domain) + for _ in xrange(randint(10, 20)))) + w.commit() + + # List ABCs that should not be tested + abcs = () + # provide initializer arguments for any weighting classes that require them + init_args = {"MultiWeighting": ([scoring.BM25F()], + {"text": scoring.Frequency()}), + "ReverseWeighting": ([scoring.BM25F()], {})} + + for wclass in _weighting_classes(abcs): + try: + if wclass.__name__ in init_args: + args, kwargs = init_args[wclass.__name__] + weighting = wclass(*args, **kwargs) + else: + weighting = wclass() + except TypeError: + e = sys.exc_info()[1] + raise TypeError("Error instantiating %r: %s" % (wclass, e)) + + with ix.searcher(weighting=weighting) as s: + try: + for word in domain: + s.search(query.Term("text", word)) + except Exception: + e = sys.exc_info()[1] + e.msg = "Error searching with %r: %s" % (wclass, e) + raise + + +def test_compatibility(): + from whoosh.scoring import Weighting + + # This is the old way of doing a custom weighting model, check that + # it's still supported... + class LegacyWeighting(Weighting): + use_final = True + + def score(self, searcher, fieldname, text, docnum, weight): + return weight + 0.5 + + def final(self, searcher, docnum, score): + return score * 1.5 + + schema = fields.Schema(text=fields.TEXT) + ix = RamStorage().create_index(schema) + w = ix.writer() + domain = "alfa bravo charlie delta".split() + for ls in permutations(domain, 3): + w.add_document(text=u(" ").join(ls)) + w.commit() + + s = ix.searcher(weighting=LegacyWeighting()) + r = s.search(query.Term("text", u("bravo"))) + assert r.score(0) == 2.25 diff --git a/tests/test_writing.py b/tests/test_writing.py new file mode 100644 index 0000000..5b985f7 --- /dev/null +++ b/tests/test_writing.py @@ -0,0 +1,430 @@ +from __future__ import with_statement +import random, time, threading + +import pytest + +from whoosh import analysis, fields, query, writing +from whoosh.compat import b, u, xrange, text_type +from whoosh.filedb.filestore import RamStorage +from whoosh.util.testing import TempIndex + + +def test_no_stored(): + schema = fields.Schema(id=fields.ID, text=fields.TEXT) + with TempIndex(schema, "nostored") as ix: + domain = (u"alfa", u"bravo", u"charlie", u"delta", u"echo", + u"foxtrot", u"golf", u"hotel", u"india") + + w = ix.writer() + for i in xrange(20): + w.add_document(id=text_type(i), + text=u" ".join(random.sample(domain, 5))) + w.commit() + + with ix.reader() as r: + assert sorted([int(id) for id in r.lexicon("id")]) == list(range(20)) + + +def test_asyncwriter(): + schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) + with TempIndex(schema, "asyncwriter") as ix: + domain = (u"alfa", u"bravo", u"charlie", u"delta", u"echo", + u"foxtrot", u"golf", u"hotel", u"india") + + writers = [] + # Simulate doing 20 
(near-)simultaneous commits. If we weren't using + # AsyncWriter, at least some of these would fail because the first + # writer wouldn't be finished yet. + for i in xrange(20): + w = writing.AsyncWriter(ix) + writers.append(w) + w.add_document(id=text_type(i), + text=u" ".join(random.sample(domain, 5))) + w.commit() + + # Wait for all writers to finish before checking the results + for w in writers: + if w.running: + w.join() + + # Check whether all documents made it into the index. + with ix.reader() as r: + assert sorted([int(id) for id in r.lexicon("id")]) == list(range(20)) + + +def test_asyncwriter_no_stored(): + schema = fields.Schema(id=fields.ID, text=fields.TEXT) + with TempIndex(schema, "asyncnostored") as ix: + domain = (u"alfa", u"bravo", u"charlie", u"delta", u"echo", + u"foxtrot", u"golf", u"hotel", u"india") + + writers = [] + # Simulate doing 20 (near-)simultaneous commits. If we weren't using + # AsyncWriter, at least some of these would fail because the first + # writer wouldn't be finished yet. + for i in xrange(20): + w = writing.AsyncWriter(ix) + writers.append(w) + w.add_document(id=text_type(i), + text=u" ".join(random.sample(domain, 5))) + w.commit() + + # Wait for all writers to finish before checking the results + for w in writers: + if w.running: + w.join() + + # Check whether all documents made it into the index. + with ix.reader() as r: + assert sorted([int(id) for id in r.lexicon("id")]) == list(range(20)) + + +def test_updates(): + schema = fields.Schema(id=fields.ID(unique=True, stored=True)) + ix = RamStorage().create_index(schema) + for _ in xrange(10): + with ix.writer() as w: + w.update_document(id=u"a") + assert ix.doc_count() == 1 + + +def test_buffered(): + schema = fields.Schema(id=fields.ID, text=fields.TEXT) + with TempIndex(schema, "buffered") as ix: + domain = u"alfa bravo charlie delta echo foxtrot golf hotel india" + domain = domain.split() + + w = writing.BufferedWriter(ix, period=None, limit=10, + commitargs={"merge": False}) + for i in xrange(20): + w.add_document(id=text_type(i), + text=u" ".join(random.sample(domain, 5))) + time.sleep(0.1) + w.close() + + assert len(ix._segments()) == 2 + + +def test_buffered_search(): + schema = fields.Schema(id=fields.STORED, text=fields.TEXT) + with TempIndex(schema, "bufferedsearch") as ix: + w = writing.BufferedWriter(ix, period=None, limit=5) + w.add_document(id=1, text=u"alfa bravo charlie") + w.add_document(id=2, text=u"bravo tango delta") + w.add_document(id=3, text=u"tango delta echo") + w.add_document(id=4, text=u"charlie delta echo") + + with w.searcher() as s: + r = s.search(query.Term("text", u"tango")) + assert sorted([d["id"] for d in r]) == [2, 3] + + w.add_document(id=5, text=u"foxtrot golf hotel") + w.add_document(id=6, text=u"india tango juliet") + w.add_document(id=7, text=u"tango kilo lima") + w.add_document(id=8, text=u"mike november echo") + + with w.searcher() as s: + r = s.search(query.Term("text", u"tango")) + assert sorted([d["id"] for d in r]) == [2, 3, 6, 7] + + w.close() + + +def test_buffered_update(): + schema = fields.Schema(id=fields.ID(stored=True, unique=True), + payload=fields.STORED) + with TempIndex(schema, "bufferedupdate") as ix: + w = writing.BufferedWriter(ix, period=None, limit=5) + for i in xrange(10): + for char in u"abc": + fs = dict(id=char, payload=text_type(i) + char) + w.update_document(**fs) + + with w.reader() as r: + sfs = [sf for _, sf in r.iter_docs()] + sfs = sorted(sfs, key=lambda x: x["id"]) + assert sfs == [{'id': u('a'), 'payload': u('9a')}, + 
{'id': u('b'), 'payload': u('9b')}, + {'id': u('c'), 'payload': u('9c')}] + assert r.doc_count() == 3 + + w.close() + + +def test_buffered_threads(): + domain = u"alfa bravo charlie delta".split() + schema = fields.Schema(name=fields.ID(unique=True, stored=True)) + with TempIndex(schema, "buffthreads") as ix: + w = writing.BufferedWriter(ix, limit=10) + + class SimWriter(threading.Thread): + def run(self): + for _ in xrange(5): + w.update_document(name=random.choice(domain)) + time.sleep(random.uniform(0.01, 0.1)) + + threads = [SimWriter() for _ in xrange(5)] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + w.close() + + with ix.reader() as r: + assert r.doc_count() == 4 + names = sorted([d["name"] for d in r.all_stored_fields()]) + assert names == domain + + +def test_fractional_weights(): + ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() + + # With Positions format + schema = fields.Schema(f=fields.TEXT(analyzer=ana)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(f=u"alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5") + w.commit() + + with ix.searcher() as s: + wts = [] + for word in s.lexicon("f"): + p = s.postings("f", word) + wts.append(p.weight()) + assert wts == [0.5, 1.5, 2.0, 1.5] + + # Try again with Frequency format + schema = fields.Schema(f=fields.TEXT(analyzer=ana, phrase=False)) + ix = RamStorage().create_index(schema) + w = ix.writer() + w.add_document(f=u"alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5") + w.commit() + + with ix.searcher() as s: + wts = [] + for word in s.lexicon("f"): + p = s.postings("f", word) + wts.append(p.weight()) + assert wts == [0.5, 1.5, 2.0, 1.5] + + +def test_cancel_delete(): + schema = fields.Schema(id=fields.ID(stored=True)) + # Single segment + with TempIndex(schema, "canceldelete1") as ix: + w = ix.writer() + for char in u"ABCD": + w.add_document(id=char) + w.commit() + + with ix.reader() as r: + assert not r.has_deletions() + + w = ix.writer() + w.delete_document(2) + w.delete_document(3) + w.cancel() + + with ix.reader() as r: + assert not r.has_deletions() + assert not r.is_deleted(2) + assert not r.is_deleted(3) + + # Multiple segments + with TempIndex(schema, "canceldelete2") as ix: + for char in u"ABCD": + w = ix.writer() + w.add_document(id=char) + w.commit(merge=False) + + with ix.reader() as r: + assert not r.has_deletions() + + w = ix.writer() + w.delete_document(2) + w.delete_document(3) + w.cancel() + + with ix.reader() as r: + assert not r.has_deletions() + assert not r.is_deleted(2) + assert not r.is_deleted(3) + + +def test_delete_nonexistant(): + from whoosh.writing import IndexingError + + schema = fields.Schema(id=fields.ID(stored=True)) + # Single segment + with TempIndex(schema, "deletenon1") as ix: + w = ix.writer() + for char in u"ABC": + w.add_document(id=char) + w.commit() + + try: + w = ix.writer() + with pytest.raises(IndexingError): + w.delete_document(5) + finally: + w.cancel() + + # Multiple segments + with TempIndex(schema, "deletenon1") as ix: + for char in u"ABC": + w = ix.writer() + w.add_document(id=char) + w.commit(merge=False) + + try: + w = ix.writer() + with pytest.raises(IndexingError): + w.delete_document(5) + finally: + w.cancel() + + +def test_add_field(): + schema = fields.Schema(a=fields.TEXT) + with TempIndex(schema, "addfield") as ix: + with ix.writer() as w: + w.add_document(a=u"alfa bravo charlie") + with ix.writer() as w: + w.add_field("b", fields.ID(stored=True)) + w.add_field("c*", fields.ID(stored=True), 
glob=True) + w.add_document(a=u"delta echo foxtrot", b=u"india", cat=u"juliet") + + with ix.searcher() as s: + fs = s.document(b=u"india") + assert fs == {"b": "india", "cat": "juliet"} + + +def test_add_reader(): + schema = fields.Schema(i=fields.ID(stored=True, unique=True), + a=fields.TEXT(stored=True, spelling=True), + b=fields.TEXT(vector=True)) + with TempIndex(schema, "addreader") as ix: + with ix.writer() as w: + w.add_document(i=u"0", a=u"alfa bravo charlie delta", + b=u"able baker coxwell dog") + w.add_document(i=u"1", a=u"bravo charlie delta echo", + b=u"elf fabio gong hiker") + w.add_document(i=u"2", a=u"charlie delta echo foxtrot", + b=u"india joker king loopy") + w.add_document(i=u"3", a=u"delta echo foxtrot golf", + b=u"mister noogie oompah pancake") + + with ix.writer() as w: + w.delete_by_term("i", "1") + w.delete_by_term("i", "3") + + with ix.writer() as w: + w.add_document(i=u"4", a=u"hotel india juliet kilo", + b=u"quick rhubarb soggy trap") + w.add_document(i=u"5", a=u"india juliet kilo lima", + b=u"umber violet weird xray") + w.optimize = True + + with ix.reader() as r: + assert r.doc_count() == 4 + + sfs = sorted(r.all_stored_fields(), key=lambda d: d["i"]) + assert sfs == [ + {"i": u"0", "a": u"alfa bravo charlie delta"}, + {"i": u"2", "a": u"charlie delta echo foxtrot"}, + {"i": u"4", "a": u"hotel india juliet kilo"}, + {"i": u"5", "a": u"india juliet kilo lima"}, + ] + + assert " ".join(r.field_terms("a")) == "alfa bravo charlie delta echo foxtrot hotel india juliet kilo lima" + + vs = [] + for docnum in r.all_doc_ids(): + v = r.vector(docnum, "b") + vs.append(list(v.all_ids())) + assert vs == [["quick", "rhubarb", "soggy", "trap"], + ["umber", "violet", "weird", "xray"], + ["able", "baker", "coxwell", "dog"], + ["india", "joker", "king", "loopy"] + ] + + +def test_add_reader_spelling(): + # Test whether add_spell_word() items get copied over in a merge + + # Because b is stemming and spelled, it will use add_spell_word() + ana = analysis.StemmingAnalyzer() + schema = fields.Schema(a=fields.TEXT(analyzer=ana), + b=fields.TEXT(analyzer=ana, spelling=True)) + + with TempIndex(schema, "addreadersp") as ix: + with ix.writer() as w: + w.add_document(a=u"rendering modeling", + b=u"rendering modeling") + w.add_document(a=u"flying rolling", + b=u"flying rolling") + + with ix.writer() as w: + w.add_document(a=u"writing eyeing", + b=u"writing eyeing") + w.add_document(a=u"undoing indicating", + b=u"undoing indicating") + w.optimize = True + + with ix.reader() as r: + sws = list(r.lexicon("spell_b")) + assert sws == [b"eyeing", b"flying", b"indicating", b"modeling", + b"rendering", b"rolling", b"undoing", b"writing"] + + assert list(r.terms_within("a", "undoink", 1)) == [] + assert list(r.terms_within("b", "undoink", 1)) == ["undoing"] + + +def test_clear(): + schema = fields.Schema(a=fields.KEYWORD) + ix = RamStorage().create_index(schema) + + # Add some segments + with ix.writer() as w: + w.add_document(a=u"one two three") + w.merge = False + with ix.writer() as w: + w.add_document(a=u"two three four") + w.merge = False + with ix.writer() as w: + w.add_document(a=u"three four five") + w.merge = False + + # Clear + with ix.writer() as w: + w.add_document(a=u"foo bar baz") + w.mergetype = writing.CLEAR + + with ix.searcher() as s: + assert s.doc_count_all() == 1 + assert list(s.reader().lexicon("a")) == [b("bar"), b("baz"), b("foo")] + + +def test_spellable_list(): + # Make sure a spellable field works with a list of pre-analyzed tokens + + ana = analysis.StemmingAnalyzer() 
+ schema = fields.Schema(Location=fields.STORED,Lang=fields.STORED, + Title=fields.TEXT(spelling=True, analyzer=ana)) + ix = RamStorage().create_index(schema) + + doc = {'Location': '1000/123', 'Lang': 'E', + 'Title': ['Introduction', 'Numerical', 'Analysis']} + + with ix.writer() as w: + w.add_document(**doc) + + +def test_zero_procs(): + schema = fields.Schema(text=fields.TEXT) + ix = RamStorage().create_index(schema) + with ix.writer(procs=0) as w: + assert isinstance(w, writing.IndexWriter) + + with ix.writer(procs=1) as w: + assert isinstance(w, writing.IndexWriter)
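+
+# Usage sketch (cf. test_clear above): assigning ``writing.CLEAR`` to a
+# writer's ``mergetype`` discards all previously committed segments on
+# commit, leaving only the documents added through that writer:
+#
+#     with ix.writer() as w:
+#         w.add_document(a=u"foo bar baz")
+#         w.mergetype = writing.CLEAR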