开源搜索组件whoosh笔记

2018-08-17 本文已影响127人在下GoGo闯

# coding=utf-8

import json
import os
import sys
import time

from whoosh.index import create_in
from whoosh.fields import *
from jieba.analyse import ChineseAnalyzer
from whoosh.qparser import QueryParser
from whoosh import qparser, scoring
from whoosh import index
import MySQLdb as mdb

# Python 2 only: site.py removes sys.setdefaultencoding at interpreter
# start-up, so the module has to be reloaded to get it back.  The script
# relies on a utf8 default so that implicit str <-> unicode conversions
# (e.g. concatenating a byte-string query word with a unicode title in
# search_index) do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')

# Wall-clock start, used for the elapsed-time report at the end of the script.
start = time.time()

# Connection settings may be overridden through the environment so that the
# credentials do not have to be edited in the source; the defaults are the
# values the script has always used.
# NOTE(review): the fallback password is still committed in plain text --
# move it out of the source once every deployment sets KZNEWS_DB_PASSWORD.
con = mdb.connect(
    os.environ.get('KZNEWS_DB_HOST', 'localhost'),
    os.environ.get('KZNEWS_DB_USER', 'root'),
    os.environ.get('KZNEWS_DB_PASSWORD', 'ab24562660'),
    os.environ.get('KZNEWS_DB_NAME', 'kznews'),
    charset='utf8',
    unix_socket=os.environ.get('KZNEWS_DB_SOCKET', '/tmp/mysql.sock'),
)

def new_index_sql():
    """Add every row of the ``xinwen`` table to the whoosh index ``ix``.

    Reads ``id``, ``cid`` and ``title`` through the module-level MySQL
    connection ``con`` and adds one document per row, following the
    schema fields ``pid``, ``cid`` and ``title``.  A percentage progress
    line is rewritten in place on stdout while the rows are processed.

    Text values handed to whoosh must be unicode objects.
    NOTE(review): this presumably holds because the connection is opened
    with ``charset='utf8'`` -- confirm for the installed MySQL driver.

    Raises:
        Whatever the database driver or the index writer raises.  In that
        case the writer is cancelled first, so the index write lock is
        released and no partial segment is committed.
    """
    writer = ix.writer(limitmb=256, procs=4)
    cur = con.cursor()
    try:
        # NOTE(review): with MySQLdb 1.2.x, leaving `with con` commits (or
        # rolls back) the current transaction -- confirm for the driver in use.
        with con:
            cur.execute("select id,cid,title from xinwen")
            # The result row count replaces the separate `select count(*)`
            # query, so the percentage is computed against exactly the rows
            # being indexed.
            total = int(cur.rowcount)
            last_shown = None
            for done, (pid, cid, title) in enumerate(cur, 1):
                writer.add_document(title=title, pid=pid, cid=cid)

                progress = "-----------> 完成百分比：%.2f" % (float(done) * 100 / total)
                # Only touch the terminal when the two-decimal figure changes;
                # the visible output is the same, without one write/flush
                # round-trip per row.
                if progress != last_shown:
                    sys.stdout.write(progress)
                    sys.stdout.write("%\r")
                    sys.stdout.flush()
                    last_shown = progress
    except BaseException:
        # Includes KeyboardInterrupt: an abandoned writer would otherwise
        # keep the index locked.
        writer.cancel()
        raise
    else:
        writer.commit()
        sys.stdout.flush()
    finally:
        cur.close()

def search_index(words):
    """Search the ``title`` field of the index ``ix`` for each word and print the hits.

    For every entry of ``words`` the query is parsed against the index
    schema and at most 20 results are fetched.  Each hit is printed as
    ``<word>-----><stored title>``, followed by the highlighted title
    fragment and the hit's score.
    """
    with ix.searcher() as searcher:
        parser = QueryParser('title', schema=ix.schema)
        for word in words:
            query = parser.parse(u'%s' % word)
            for hit in searcher.search(query, limit=20):
                label = word + "----->" + hit["title"]
                print("%s %s %s" % (label, hit.highlights("title"), hit.score))

# Tokenize Chinese text with the jieba analyzer.
analyzer = ChineseAnalyzer()

# Build the schema.  stored=True keeps the field's original value in the
# index so it can be returned (and highlighted) with each search hit.
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    pid=NUMERIC(stored=True),
    cid=NUMERIC(stored=True),
)

# The index files are kept under this directory.
indexdir = 'xinwen_search/'
if not os.path.exists(indexdir):
    os.mkdir(indexdir)

# Ask whoosh whether an index is present rather than wrapping open_dir() in a
# bare `except:`.  The bare handler also caught Ctrl-C and errors from a
# damaged or incompatible index, and then overwrote the index via create_in().
if index.exists_in(indexdir):
    ix = index.open_dir(indexdir)
    print('>>>>>>>> 已创建索引 <<<<<<<<<<')
else:
    print('>>>>>>>> 未创建索引 <<<<<<<<<<')
    ix = create_in(indexdir, schema)
    new_index_sql()

# Sample queries used to exercise the index.
words = ["太阳", "下雨", "月亮", "螺丝", "刀把", "挖掘机"]
search_index(words)
end = time.time()
print("完成时间: %f s" % (end - start))
whoosh 创建索引的速度确实很慢：同样是 200 万条标题，xunsearch 创建索引大约需要半小时，而 whoosh 花了一个多小时。

但是创建索引速度慢不要紧，大不了在晚上运行，只要查询索引速度不慢就好了。

况且，whoosh索引数据的体积很小，200万条标题，创建索引后只有不到700M。

而xunsearch创建完后居然有几个G，太尼玛占空间了。但是看了文档，估计whoosh在搜索相关性和搜索灵活性上比不过xunsearch。所以要根据实际情况，相机选择。

微信公众号：流量贩子

GoGo闯的公众号

知识星球（日后有福利发之，比如一段能写黄段子的Python代码~~~）

GoGo闯的知识星球

开源搜索组件whoosh笔记

coding=utf-8

使用结巴中文分词

创建schema, stored为True表示能够被检索

存储schema信息至'indexdir'目录下

猜你喜欢

热点阅读