sphinxapi.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579
  1. #
  2. # $Id$
  3. #
  4. # Python version of Sphinx searchd client (Python API)
  5. #
  6. # Copyright (c) 2006-2007, Andrew Aksyonoff
  7. # Copyright (c) 2006, Mike Osadnik
  8. # All rights reserved
  9. #
  10. # This program is free software; you can redistribute it and/or modify
  11. # it under the terms of the GNU General Public License. You should have
  12. # received a copy of the GPL license along with this program; if you
  13. # did not, you can find it at http://www.gnu.org/
  14. #
  15. import select
  16. import socket
  17. from struct import *
  18. # known searchd commands
  19. SEARCHD_COMMAND_SEARCH = 0
  20. SEARCHD_COMMAND_EXCERPT = 1
  21. # current client-side command implementation versions
  22. VER_COMMAND_SEARCH = 0x107
  23. VER_COMMAND_EXCERPT = 0x100
  24. # known searchd status codes
  25. SEARCHD_OK = 0
  26. SEARCHD_ERROR = 1
  27. SEARCHD_RETRY = 2
  28. SEARCHD_WARNING = 3
  29. # known match modes
  30. SPH_MATCH_ALL = 0
  31. SPH_MATCH_ANY = 1
  32. SPH_MATCH_PHRASE = 2
  33. SPH_MATCH_BOOLEAN = 3
  34. SPH_MATCH_EXTENDED = 4
  35. # known sort modes
  36. SPH_SORT_RELEVANCE = 0
  37. SPH_SORT_ATTR_DESC = 1
  38. SPH_SORT_ATTR_ASC = 2
  39. SPH_SORT_TIME_SEGMENTS = 3
  40. SPH_SORT_EXTENDED = 4
  41. # known attribute types
  42. SPH_ATTR_INTEGER = 1
  43. SPH_ATTR_TIMESTAMP = 2
  44. # known grouping functions
  45. SPH_GROUPBY_DAY = 0
  46. SPH_GROUPBY_WEEK = 1
  47. SPH_GROUPBY_MONTH = 2
  48. SPH_GROUPBY_YEAR = 3
  49. SPH_GROUPBY_ATTR = 4
  50. class SphinxClient:
  51. _host = 'localhost' # searchd host (default is "localhost")
  52. _port = 3312 # searchd port (default is 3312)
  53. _offset = 0 # how much records to seek from result-set start (default is 0)
  54. _limit = 20 # how much records to return from result-set starting at offset (default is 20)
  55. _mode = SPH_MATCH_ALL # query matching mode (default is SPH_MATCH_ALL)
  56. _weights = [] # per-field weights (default is 1 for all fields)
  57. _sort = SPH_SORT_RELEVANCE # match sorting mode (default is SPH_SORT_RELEVANCE)
  58. _sortby = '' # attribute to sort by (defualt is "")
  59. _min_id = 0 # min ID to match (default is 0)
  60. _max_id = 0xFFFFFFFF # max ID to match (default is UINT_MAX)
  61. _filters = [] # search filters
  62. _groupby = '' # group-by attribute name
  63. _groupfunc = SPH_GROUPBY_DAY # group-by function (to pre-process group-by attribute value with)
  64. _groupsort = '@group desc' # group-by sorting clause (to sort groups in result set with)
  65. _maxmatches = 1000 # max matches to retrieve
  66. _error = '' # last error message
  67. _warning = '' # last warning message
  68. def __init__ (self):
  69. """
  70. create a new client object and fill defaults
  71. """
  72. pass
  73. def GetLastError (self):
  74. """
  75. get last error message (string)
  76. """
  77. return self._error
  78. def GetLastWarning (self):
  79. """
  80. get last warning message (string)
  81. """
  82. return self._warning
  83. def SetServer (self, host, port):
  84. """
  85. set searchd server
  86. """
  87. assert(isinstance(host, str))
  88. assert(isinstance(port, int))
  89. self._host = host
  90. self._port = port
  91. def _Connect (self):
  92. """
  93. connect to searchd server
  94. """
  95. try:
  96. sock = socket.socket ( socket.AF_INET, socket.SOCK_STREAM )
  97. sock.connect ( ( self._host, self._port ) )
  98. except socket.error, msg:
  99. if sock:
  100. sock.close()
  101. self._error = 'connection to %s:%s failed (%s)' % ( self._host, self._port, msg )
  102. return 0
  103. v = unpack('>L', sock.recv(4))
  104. if v<1:
  105. sock.close()
  106. self._error = 'expected searchd protocol version, got %s' % v
  107. return 0
  108. # all ok, send my version
  109. sock.send(pack('>L', 1))
  110. return sock
  111. def _GetResponse (self, sock, client_ver):
  112. """
  113. get and check response packet from searchd server
  114. """
  115. (status, ver, length) = unpack('>2HL', sock.recv(8))
  116. response = ''
  117. left = length
  118. while left>0:
  119. chunk = sock.recv(left)
  120. if chunk:
  121. response += chunk
  122. left -= len(chunk)
  123. else:
  124. break
  125. sock.close()
  126. # check response
  127. read = len(response)
  128. if not response or read!=length:
  129. if length:
  130. self._error = 'failed to read searchd response (status=%s, ver=%s, len=%s, read=%s)' \
  131. % (status, ver, length, read)
  132. else:
  133. self._error = 'received zero-sized searchd response'
  134. return None
  135. # check status
  136. if status==SEARCHD_WARNING:
  137. wend = 4 + unpack ( '>L', response[0:4] )[0]
  138. self._warning = response[4:wend]
  139. return response[wend:]
  140. if status==SEARCHD_ERROR:
  141. self._error = 'searchd error: '+response[4:]
  142. return None
  143. if status==SEARCHD_RETRY:
  144. self._error = 'temporary searchd error: '+response[4:]
  145. return None
  146. if status!=SEARCHD_OK:
  147. self._error = 'unknown status code %d' % status
  148. return None
  149. # check version
  150. if ver<client_ver:
  151. self._warning = 'searchd command v.%d.%d older than client\'s v.%d.%d, some options might not work' \
  152. % (ver>>8, ver&0xff, client_ver>>8, client_ver&0xff)
  153. return response
  154. def SetLimits (self, offset, limit, maxmatches=0):
  155. """
  156. set match offset, count, and max number to retrieve
  157. """
  158. assert(isinstance(offset, int) and offset>=0)
  159. assert(isinstance(limit, int) and limit>0)
  160. assert(maxmatches>=0)
  161. self._offset = offset
  162. self._limit = limit
  163. if maxmatches>0:
  164. self._maxmatches = maxmatches
  165. def SetMatchMode (self, mode):
  166. """
  167. set match mode
  168. """
  169. assert(mode in [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE, SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED])
  170. self._mode = mode
  171. def SetSortMode ( self, mode, clause='' ):
  172. """
  173. set sort mode
  174. """
  175. assert ( mode in [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC, SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED] )
  176. assert ( isinstance ( clause, str ) )
  177. self._sort = mode
  178. self._sortby = clause
  179. def SetWeights (self, weights):
  180. """
  181. set per-field weights
  182. """
  183. assert(isinstance(weights, list))
  184. for w in weights:
  185. assert(isinstance(w, int))
  186. self._weights = weights
  187. def SetIDRange (self, minid, maxid):
  188. """
  189. set IDs range to match
  190. only match those records where document ID
  191. is beetwen minid and maxid (including minid and maxid)
  192. """
  193. assert(isinstance(minid, int))
  194. assert(isinstance(maxid, int))
  195. assert(minid<=maxid)
  196. self._min_id = minid
  197. self._max_id = maxid
  198. def SetFilter ( self, attribute, values, exclude=0 ):
  199. """
  200. set values filter
  201. only match those records where $attribute column values
  202. are in specified set
  203. """
  204. assert(isinstance(attribute, str))
  205. assert(isinstance(values, list))
  206. assert(values)
  207. for value in values:
  208. assert(isinstance(value, int))
  209. self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'values':values } )
  210. def SetFilterRange (self, attribute, min_, max_, exclude=0 ):
  211. """
  212. set range filter
  213. only match those records where $attribute column value
  214. is beetwen $min and $max (including $min and $max)
  215. """
  216. assert(isinstance(attribute, str))
  217. assert(isinstance(min_, int))
  218. assert(isinstance(max_, int))
  219. assert(min_<=max_)
  220. self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_ } )
  221. def SetGroupBy ( self, attribute, func, groupsort='@group desc' ):
  222. """
  223. set grouping attribute and function
  224. in grouping mode, all matches are assigned to different groups
  225. based on grouping function value.
  226. each group keeps track of the total match count, and the best match
  227. (in this group) according to current sorting function.
  228. the final result set contains one best match per group, with
  229. grouping function value and matches count attached.
  230. groups in result set could be sorted by any sorting clause,
  231. including both document attributes and the following special
  232. internal Sphinx attributes:
  233. - @id - match document ID;
  234. - @weight, @rank, @relevance - match weight;
  235. - @group - groupby function value;
  236. - @count - amount of matches in group.
  237. the default mode is to sort by groupby value in descending order,
  238. ie. by "@group desc".
  239. "total_found" would contain total amount of matching groups over
  240. the whole index.
  241. WARNING: grouping is done in fixed memory and thus its results
  242. are only approximate; so there might be more groups reported
  243. in total_found than actually present. @count might also
  244. be underestimated.
  245. for example, if sorting by relevance and grouping by "published"
  246. attribute with SPH_GROUPBY_DAY function, then the result set will
  247. contain one most relevant match per each day when there were any
  248. matches published, with day number and per-day match count attached,
  249. and sorted by day number in descending order (ie. recent days first).
  250. """
  251. assert(isinstance(attribute, str))
  252. assert(func in [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH, SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR] )
  253. assert(isinstance(groupsort, str))
  254. self._groupby = attribute
  255. self._groupfunc = func
  256. self._groupsort = groupsort
  257. def Query (self, query, index='*'):
  258. """
  259. connect to searchd server and run given search query
  260. "query" is query string
  261. "index" is index name to query, default is "*" which means to query all indexes
  262. returns false on failure
  263. returns hash which has the following keys on success:
  264. "matches"
  265. an array of found matches represented as ( "id", "weight", "attrs" ) hashes
  266. "total"
  267. total amount of matches retrieved (upto SPH_MAX_MATCHES, see sphinx.h)
  268. "total_found"
  269. total amount of matching documents in index
  270. "time"
  271. search time
  272. "words"
  273. an array of ( "word", "docs", "hits" ) hashes which contains
  274. docs and hits count for stemmed (!) query words
  275. """
  276. sock = self._Connect()
  277. if not sock:
  278. return {}
  279. # build request
  280. req = [pack('>4L', self._offset, self._limit, self._mode, self._sort)]
  281. req.append(pack('>L', len(self._sortby)))
  282. req.append(self._sortby)
  283. req.append(pack('>L', len(query)))
  284. req.append(query)
  285. req.append(pack('>L', len(self._weights)))
  286. for w in self._weights:
  287. req.append(pack('>L', w))
  288. req.append(pack('>L', len(index)))
  289. req.append(index)
  290. req.append(pack('>L', self._min_id))
  291. req.append(pack('>L', self._max_id))
  292. # filters
  293. req.append ( pack ( '>L', len(self._filters) ) )
  294. for f in self._filters:
  295. req.append ( pack ( '>L', len(f['attr']) ) )
  296. req.append ( f['attr'] )
  297. if ( 'values' in f ):
  298. req.append ( pack ( '>L', len(f['values']) ) )
  299. for v in f['values']:
  300. req.append ( pack ( '>L', v ) )
  301. else:
  302. req.append ( pack ( '>3L', 0, f['min'], f['max'] ) )
  303. req.append ( pack ( '>L', f['exclude'] ) )
  304. # group-by, max-matches, group-sort
  305. req.append ( pack ( '>2L', self._groupfunc, len(self._groupby) ) )
  306. req.append ( self._groupby )
  307. req.append ( pack ( '>2L', self._maxmatches, len(self._groupsort) ) )
  308. req.append ( self._groupsort )
  309. # send query, get response
  310. req = ''.join(req)
  311. length = len(req)
  312. req = pack('>2HL', SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, length)+req
  313. sock.send(req)
  314. response = self._GetResponse(sock, VER_COMMAND_SEARCH)
  315. if not response:
  316. return {}
  317. # parse response
  318. result = {}
  319. max_ = len(response)
  320. # read schema
  321. p = 0
  322. fields = []
  323. attrs = []
  324. nfields = unpack('>L', response[p:p+4])[0]
  325. p += 4
  326. while nfields>0 and p<max_:
  327. nfields -= 1
  328. length = unpack('>L', response[p:p+4])[0]
  329. p += 4
  330. fields.append(response[p:p+length])
  331. p += length
  332. result['fields'] = fields
  333. nattrs = unpack('>L', response[p:p+4])[0]
  334. p += 4
  335. while nattrs>0 and p<max_:
  336. nattrs -= 1
  337. length = unpack('>L', response[p:p+4])[0]
  338. p += 4
  339. attr = response[p:p+length]
  340. p += length
  341. type_ = unpack('>L', response[p:p+4])[0]
  342. p += 4
  343. attrs.append([attr,type_])
  344. result['attrs'] = attrs
  345. # read match count
  346. count = unpack('>L', response[p:p+4])[0]
  347. p += 4
  348. # read matches
  349. result['matches'] = []
  350. while count>0 and p<max_:
  351. count -= 1
  352. doc, weight = unpack('>2L', response[p:p+8])
  353. p += 8
  354. match = { 'id':doc, 'weight':weight, 'attrs':{} }
  355. for i in range(len(attrs)):
  356. match['attrs'][attrs[i][0]] = unpack('>L', response[p:p+4])[0]
  357. p += 4
  358. result['matches'].append ( match )
  359. result['total'], result['total_found'], result['time'], words = \
  360. unpack('>4L', response[p:p+16])
  361. result['time'] = '%.3f' % (result['time']/1000.0)
  362. p += 16
  363. result['words'] = []
  364. while words>0:
  365. words -= 1
  366. length = unpack('>L', response[p:p+4])[0]
  367. p += 4
  368. word = response[p:p+length]
  369. p += length
  370. docs, hits = unpack('>2L', response[p:p+8])
  371. p += 8
  372. result['words'].append({'word':word, 'docs':docs, 'hits':hits})
  373. sock.close()
  374. return result
  375. def BuildExcerpts (self, docs, index, words, opts=None):
  376. """
  377. connect to searchd server and generate exceprts from given documents
  378. "docs" is an array of strings which represent the documents' contents
  379. "index" is a string specifiying the index which settings will be used
  380. for stemming, lexing and case folding
  381. "words" is a string which contains the words to highlight
  382. "opts" is a hash which contains additional optional highlighting parameters:
  383. "before_match"
  384. a string to insert before a set of matching words, default is "<b>"
  385. "after_match"
  386. a string to insert after a set of matching words, default is "<b>"
  387. "chunk_separator"
  388. a string to insert between excerpts chunks, default is " ... "
  389. "limit"
  390. max excerpt size in symbols (codepoints), default is 256
  391. "around"
  392. how much words to highlight around each match, default is 5
  393. returns false on failure
  394. returns an array of string excerpts on success
  395. """
  396. if not opts:
  397. opts = {}
  398. assert(isinstance(docs, list))
  399. assert(isinstance(index, str))
  400. assert(isinstance(words, str))
  401. assert(isinstance(opts, dict))
  402. sock = self._Connect()
  403. if not sock:
  404. return []
  405. # fixup options
  406. opts.setdefault('before_match', '<b>')
  407. opts.setdefault('after_match', '</b>')
  408. opts.setdefault('chunk_separator', ' ... ')
  409. opts.setdefault('limit', 256)
  410. opts.setdefault('around', 5)
  411. # build request
  412. # v.1.0 req
  413. # mode=0, flags=1 (remove spaces)
  414. req = [pack('>2L', 0, 1)]
  415. # req index
  416. req.append(pack('>L', len(index)))
  417. req.append(index)
  418. # req words
  419. req.append(pack('>L', len(words)))
  420. req.append(words)
  421. # options
  422. req.append(pack('>L', len(opts['before_match'])))
  423. req.append(opts['before_match'])
  424. req.append(pack('>L', len(opts['after_match'])))
  425. req.append(opts['after_match'])
  426. req.append(pack('>L', len(opts['chunk_separator'])))
  427. req.append(opts['chunk_separator'])
  428. req.append(pack('>L', int(opts['limit'])))
  429. req.append(pack('>L', int(opts['around'])))
  430. # documents
  431. req.append(pack('>L', len(docs)))
  432. for doc in docs:
  433. assert(isinstance(doc, str))
  434. req.append(pack('>L', len(doc)))
  435. req.append(doc)
  436. req = ''.join(req)
  437. # send query, get response
  438. length = len(req)
  439. # add header
  440. req = pack('>2HL', SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, length)+req
  441. wrote = sock.send(req)
  442. response = self._GetResponse(sock, VER_COMMAND_EXCERPT )
  443. if not response:
  444. return []
  445. # parse response
  446. pos = 0
  447. res = []
  448. rlen = len(response)
  449. for i in range(len(docs)):
  450. length = unpack('>L', response[pos:pos+4])[0]
  451. pos += 4
  452. if pos+length > rlen:
  453. self._error = 'incomplete reply'
  454. return []
  455. res.append(response[pos:pos+length])
  456. pos += length
  457. return res
  458. #
  459. # $Id$
  460. #