# 104 lines, 3.6 KiB, Python
from __future__ import with_statement
|
|
|
|
from whoosh import fields, formats
|
|
from whoosh.compat import u
|
|
from whoosh.filedb.filestore import RamStorage
|
|
from whoosh.util.testing import TempIndex
|
|
|
|
|
|
def test_single_term():
    """Index one document made of a single repeated term and check that
    the vector fetched for document 0 reports itself as active.
    """
    vectored_schema = fields.Schema(text=fields.TEXT(vector=True))
    index = RamStorage().create_index(vectored_schema)

    with index.writer() as writer:
        writer.add_document(text=u("TEST TEST TEST"))

    with index.searcher() as searcher:
        vector = searcher.vector(0, "text")
        assert vector.is_active()
|
|
|
|
|
|
def test_vector_reading():
    """Check that a stored frequency vector can be read back via a reader.

    Indexes a single document whose ``content`` field is declared with a
    ``formats.Frequency()`` vector, then asserts that
    ``vector_as("frequency", 0, "content")`` yields the expected
    ``(term, count)`` pairs.
    """
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    with TempIndex(schema, "vectorreading") as ix:
        # Use the writer as a context manager, like the other tests in this
        # file, so it is committed on success and is not left open if
        # add_document() raises.
        with ix.writer() as w:
            w.add_document(title=u("one"),
                           content=u("This is the story of the black "
                                     "hole story"))

        with ix.reader() as r:
            expected = [(u('black'), 1), (u('hole'), 1), (u('story'), 2)]
            assert list(r.vector_as("frequency", 0, "content")) == expected
|
|
|
|
|
|
def test_vector_merge():
    """Check that frequency vectors stay readable after a second commit.

    Two documents are written in two separate writer sessions; the vector
    of each one is then looked up by title and compared against the
    expected ``(term, count)`` pairs.
    """
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    with TempIndex(schema, "vectormerge") as ix:
        # Each document gets its own writer session on purpose.
        # NOTE(review): the test name suggests the second commit is meant to
        # exercise merging with the first one - confirm.
        # The writers are used as context managers (matching the other tests
        # in this file) so each is committed on success and is not left open
        # if add_document() raises.
        with ix.writer() as w:
            w.add_document(title=u("one"),
                           content=u("This is the story of the black hole "
                                     "story"))

        with ix.writer() as w:
            w.add_document(title=u("two"),
                           content=u("You can read along in your book"))

        with ix.searcher() as s:
            r = s.reader()

            docnum = s.document_number(title=u("one"))
            vec = list(r.vector_as("frequency", docnum, "content"))
            assert vec == [(u('black'), 1), (u('hole'), 1), (u('story'), 2)]

            docnum = s.document_number(title=u("two"))
            vec = list(r.vector_as("frequency", docnum, "content"))
            assert vec == [(u('along'), 1), (u('book'), 1), (u('read'), 1)]
|
|
|
|
|
|
def test_vector_unicode():
    """Write non-ASCII documents in two writer sessions and check the
    frequency vector of the document with ``id=2``.
    """
    text_field = fields.TEXT(vector=True)
    schema = fields.Schema(id=fields.NUMERIC, text=text_field)

    # (id, text) pairs, grouped by the writer session that adds them.
    first_batch = (
        (0, u"\u13a0\u13a1\u13a2 \u13a3\u13a4\u13a5"),
        (1, u"\u13a6\u13a7\u13a8 \u13a9\u13aa\u13ab"),
    )
    second_batch = (
        (2, u"\u13ac\u13ad\u13ae \u13af\u13b0\u13b1"),
        (3, u"\u13b2\u13b3\u13b4 \u13b5\u13b6\u13b7"),
    )

    with TempIndex(schema) as ix:
        for batch in (first_batch, second_batch):
            with ix.writer() as writer:
                for doc_id, doc_text in batch:
                    writer.add_document(id=doc_id, text=doc_text)

        with ix.searcher() as searcher:
            target = searcher.document_number(id=2)
            pairs = list(searcher.vector_as("frequency", target, "text"))
            assert len(pairs) == 2

            first, second = pairs
            assert first[0] == u"\u13ac\u13ad\u13ae"
            assert first[1] == 1

            assert second[0] == u"\u13af\u13b0\u13b1"
            assert second[1] == 1
|
|
|
|
|
|
def test_add_vectored_field():
    """Add a vectored field to an index that already holds a document.

    Document "a" is written before field ``f2`` exists; document "b" is
    written in the same writer session that adds ``f2``. Only "b" is
    expected to have a vector, and only for ``f2``.
    """
    initial_schema = fields.Schema(id=fields.ID(stored=True), f1=fields.TEXT)
    index = RamStorage().create_index(initial_schema)

    with index.writer() as writer:
        writer.add_document(id=u("a"), f1=u("Testing one two three"))

    # The new field and the document that uses it go in together.
    with index.writer() as writer:
        writer.add_field("f2", fields.TEXT(vector=True))
        writer.add_document(id=u("b"), f2=u("Frosting four five six"))

    with index.searcher() as searcher:
        older_doc = searcher.document_number(id="a")
        assert not searcher.has_vector(older_doc, "f1")

        newer_doc = searcher.document_number(id="b")
        assert not searcher.has_vector(newer_doc, "f1")
        assert searcher.has_vector(newer_doc, "f2")
|