sphinxapi.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231
  1. #
  2. # $Id$
  3. #
  4. # Python version of Sphinx searchd client (Python API)
  5. #
  6. # Copyright (c) 2006, Mike Osadnik
  7. # Copyright (c) 2006-2015, Andrew Aksyonoff
  8. # Copyright (c) 2008-2015, Sphinx Technologies Inc
  9. # All rights reserved
  10. #
  11. # This program is free software; you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License. You should have
  13. # received a copy of the GPL license along with this program; if you
  14. # did not, you can find it at http://www.gnu.org/
  15. #
  16. # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  17. # WARNING
  18. # We strongly recommend you to use SphinxQL instead of the API
  19. # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  20. import sys
  21. import select
  22. import socket
  23. import re
  24. from struct import *
# known searchd commands (wire-protocol command codes)
SEARCHD_COMMAND_SEARCH = 0
SEARCHD_COMMAND_EXCERPT = 1
SEARCHD_COMMAND_UPDATE = 2
SEARCHD_COMMAND_KEYWORDS = 3
SEARCHD_COMMAND_PERSIST = 4
SEARCHD_COMMAND_STATUS = 5
SEARCHD_COMMAND_FLUSHATTRS = 7
# current client-side command implementation versions (major byte, minor byte)
VER_COMMAND_SEARCH = 0x11E
VER_COMMAND_EXCERPT = 0x104
VER_COMMAND_UPDATE = 0x103
VER_COMMAND_KEYWORDS = 0x100
VER_COMMAND_STATUS = 0x101
VER_COMMAND_FLUSHATTRS = 0x100
# known searchd status codes
SEARCHD_OK = 0
SEARCHD_ERROR = 1
SEARCHD_RETRY = 2
SEARCHD_WARNING = 3
# known match modes (legacy; extended2 is the default)
SPH_MATCH_ALL = 0
SPH_MATCH_ANY = 1
SPH_MATCH_PHRASE = 2
SPH_MATCH_BOOLEAN = 3
SPH_MATCH_EXTENDED = 4
SPH_MATCH_FULLSCAN = 5
SPH_MATCH_EXTENDED2 = 6
# known ranking modes (extended2 mode only)
SPH_RANK_PROXIMITY_BM25 = 0 # default mode, phrase proximity major factor and BM25 minor one
SPH_RANK_BM25 = 1 # statistical mode, BM25 ranking only (faster but worse quality)
SPH_RANK_NONE = 2 # no ranking, all matches get a weight of 1
SPH_RANK_WORDCOUNT = 3 # simple word-count weighting, rank is a weighted sum of per-field keyword occurence counts
SPH_RANK_PROXIMITY = 4
SPH_RANK_MATCHANY = 5
SPH_RANK_FIELDMASK = 6
SPH_RANK_SPH04 = 7
SPH_RANK_EXPR = 8 # ranker formula supplied via SetRankingMode() rankexpr
SPH_RANK_TOTAL = 9 # sentinel: number of rankers, used for range checks
# known sort modes
SPH_SORT_RELEVANCE = 0
SPH_SORT_ATTR_DESC = 1
SPH_SORT_ATTR_ASC = 2
SPH_SORT_TIME_SEGMENTS = 3
SPH_SORT_EXTENDED = 4
SPH_SORT_EXPR = 5
# known filter types (see SetFilter* methods)
SPH_FILTER_VALUES = 0
SPH_FILTER_RANGE = 1
SPH_FILTER_FLOATRANGE = 2
SPH_FILTER_STRING = 3
# known attribute types (as reported in the result-set schema)
SPH_ATTR_NONE = 0
SPH_ATTR_INTEGER = 1
SPH_ATTR_TIMESTAMP = 2
SPH_ATTR_ORDINAL = 3
SPH_ATTR_BOOL = 4
SPH_ATTR_FLOAT = 5
SPH_ATTR_BIGINT = 6
SPH_ATTR_STRING = 7
SPH_ATTR_FACTORS = 1001
# multi-value flavors carry a high marker bit (Python 2 long literals)
SPH_ATTR_MULTI = 0X40000001L
SPH_ATTR_MULTI64 = 0X40000002L
# every type SetOverride() accepts
SPH_ATTR_TYPES = (SPH_ATTR_NONE,
SPH_ATTR_INTEGER,
SPH_ATTR_TIMESTAMP,
SPH_ATTR_ORDINAL,
SPH_ATTR_BOOL,
SPH_ATTR_FLOAT,
SPH_ATTR_BIGINT,
SPH_ATTR_STRING,
SPH_ATTR_MULTI,
SPH_ATTR_MULTI64)
# known grouping functions (see SetGroupBy)
SPH_GROUPBY_DAY = 0
SPH_GROUPBY_WEEK = 1
SPH_GROUPBY_MONTH = 2
SPH_GROUPBY_YEAR = 3
SPH_GROUPBY_ATTR = 4
SPH_GROUPBY_ATTRPAIR = 5
  105. class SphinxClient:
	def __init__ (self):
		"""
		Create a new client object, and fill defaults.
		"""
		self._host = 'localhost' # searchd host (default is "localhost")
		self._port = 9312 # searchd port (default is 9312)
		self._path = None # searchd unix-domain socket path
		self._socket = None # cached persistent connection, if any
		self._offset = 0 # how much records to seek from result-set start (default is 0)
		self._limit = 20 # how much records to return from result-set starting at offset (default is 20)
		self._mode = SPH_MATCH_EXTENDED2 # query matching mode (default is SPH_MATCH_EXTENDED2)
		self._weights = [] # per-field weights (default is 1 for all fields)
		self._sort = SPH_SORT_RELEVANCE # match sorting mode (default is SPH_SORT_RELEVANCE)
		self._sortby = '' # attribute to sort by (default is "")
		self._min_id = 0 # min ID to match (default is 0)
		self._max_id = 0 # max ID to match (default is UINT_MAX)
		self._filters = [] # search filters
		self._groupby = '' # group-by attribute name
		self._groupfunc = SPH_GROUPBY_DAY # group-by function (to pre-process group-by attribute value with)
		self._groupsort = '@group desc' # group-by sorting clause (to sort groups in result set with)
		self._groupdistinct = '' # group-by count-distinct attribute
		self._maxmatches = 1000 # max matches to retrieve
		self._cutoff = 0 # cutoff to stop searching at
		self._retrycount = 0 # distributed retry count
		self._retrydelay = 0 # distributed retry delay
		self._anchor = {} # geographical anchor point
		self._indexweights = {} # per-index weights
		self._ranker = SPH_RANK_PROXIMITY_BM25 # ranking mode
		self._rankexpr = '' # ranking expression for SPH_RANK_EXPR
		self._maxquerytime = 0 # max query time, milliseconds (default is 0, do not limit)
		self._timeout = 1.0 # connection timeout, seconds
		self._fieldweights = {} # per-field-name weights
		self._overrides = {} # per-query attribute values overrides
		self._select = '*' # select-list (attributes or expressions, with optional aliases)
		self._query_flags = SetBit ( 0, 6, True ) # default idf=tfidf_normalized
		self._predictedtime = 0 # per-query max_predicted_time
		self._outerorderby = '' # outer match sort by
		self._outeroffset = 0 # outer offset
		self._outerlimit = 0 # outer limit
		self._hasouter = False # sub-select enabled
		self._error = '' # last error message
		self._warning = '' # last warning message
		self._reqs = [] # requests array for multi-query
  149. def __del__ (self):
  150. if self._socket:
  151. self._socket.close()
  152. def GetLastError (self):
  153. """
  154. Get last error message (string).
  155. """
  156. return self._error
  157. def GetLastWarning (self):
  158. """
  159. Get last warning message (string).
  160. """
  161. return self._warning
  162. def SetServer (self, host, port = None):
  163. """
  164. Set searchd server host and port.
  165. """
  166. assert(isinstance(host, str))
  167. if host.startswith('/'):
  168. self._path = host
  169. return
  170. elif host.startswith('unix://'):
  171. self._path = host[7:]
  172. return
  173. self._host = host
  174. if isinstance(port, int):
  175. assert(port>0 and port<65536)
  176. self._port = port
  177. self._path = None
  178. def SetConnectTimeout ( self, timeout ):
  179. """
  180. Set connection timeout ( float second )
  181. """
  182. assert (isinstance(timeout, float))
  183. # set timeout to 0 make connaection non-blocking that is wrong so timeout got clipped to reasonable minimum
  184. self._timeout = max ( 0.001, timeout )
  185. def _Connect (self):
  186. """
  187. INTERNAL METHOD, DO NOT CALL. Connects to searchd server.
  188. """
  189. if self._socket:
  190. # we have a socket, but is it still alive?
  191. sr, sw, _ = select.select ( [self._socket], [self._socket], [], 0 )
  192. # this is how alive socket should look
  193. if len(sr)==0 and len(sw)==1:
  194. return self._socket
  195. # oops, looks like it was closed, lets reopen
  196. self._socket.close()
  197. self._socket = None
  198. try:
  199. if self._path:
  200. af = socket.AF_UNIX
  201. addr = self._path
  202. desc = self._path
  203. else:
  204. af = socket.AF_INET
  205. addr = ( self._host, self._port )
  206. desc = '%s;%s' % addr
  207. sock = socket.socket ( af, socket.SOCK_STREAM )
  208. sock.settimeout ( self._timeout )
  209. sock.connect ( addr )
  210. except socket.error, msg:
  211. if sock:
  212. sock.close()
  213. self._error = 'connection to %s failed (%s)' % ( desc, msg )
  214. return
  215. v = unpack('>L', sock.recv(4))
  216. if v<1:
  217. sock.close()
  218. self._error = 'expected searchd protocol version, got %s' % v
  219. return
  220. # all ok, send my version
  221. sock.send(pack('>L', 1))
  222. return sock
	def _GetResponse (self, sock, client_ver):
		"""
		INTERNAL METHOD, DO NOT CALL. Gets and checks response packet from searchd server.

		Reads the 8-byte header (status, version, body length), then the body.
		Returns the body (warning text stripped and stored in self._warning),
		or None on error (with self._error set).
		"""
		(status, ver, length) = unpack('>2HL', sock.recv(8))
		response = ''
		left = length
		# recv() may return short reads; loop until the full body arrives
		while left>0:
			chunk = sock.recv(left)
			if chunk:
				response += chunk
				left -= len(chunk)
			else:
				break # peer closed mid-body; caught by the length check below
		# no cached persistent connection means this socket is one-shot
		if not self._socket:
			sock.close()
		# check response
		read = len(response)
		if not response or read!=length:
			if length:
				self._error = 'failed to read searchd response (status=%s, ver=%s, len=%s, read=%s)' \
					% (status, ver, length, read)
			else:
				self._error = 'received zero-sized searchd response'
			return None
		# check status
		if status==SEARCHD_WARNING:
			# warning text is length-prefixed at the head of the body
			wend = 4 + unpack ( '>L', response[0:4] )[0]
			self._warning = response[4:wend]
			return response[wend:]
		if status==SEARCHD_ERROR:
			self._error = 'searchd error: '+response[4:]
			return None
		if status==SEARCHD_RETRY:
			self._error = 'temporary searchd error: '+response[4:]
			return None
		if status!=SEARCHD_OK:
			self._error = 'unknown status code %d' % status
			return None
		# check version; older server commands still work, so only warn
		if ver<client_ver:
			self._warning = 'searchd command v.%d.%d older than client\'s v.%d.%d, some options might not work' \
				% (ver>>8, ver&0xff, client_ver>>8, client_ver&0xff)
		return response
  267. def _Send ( self, sock, req ):
  268. """
  269. INTERNAL METHOD, DO NOT CALL. send request to searchd server.
  270. """
  271. total = 0
  272. while True:
  273. sent = sock.send ( req[total:] )
  274. if sent<=0:
  275. break
  276. total = total + sent
  277. return total
  278. def SetLimits (self, offset, limit, maxmatches=0, cutoff=0):
  279. """
  280. Set offset and count into result set, and optionally set max-matches and cutoff limits.
  281. """
  282. assert ( type(offset) in [int,long] and 0<=offset<16777216 )
  283. assert ( type(limit) in [int,long] and 0<limit<16777216 )
  284. assert(maxmatches>=0)
  285. self._offset = offset
  286. self._limit = limit
  287. if maxmatches>0:
  288. self._maxmatches = maxmatches
  289. if cutoff>=0:
  290. self._cutoff = cutoff
  291. def SetMaxQueryTime (self, maxquerytime):
  292. """
  293. Set maximum query time, in milliseconds, per-index. 0 means 'do not limit'.
  294. """
  295. assert(isinstance(maxquerytime,int) and maxquerytime>0)
  296. self._maxquerytime = maxquerytime
  297. def SetMatchMode (self, mode):
  298. """
  299. Set matching mode.
  300. """
  301. print >> sys.stderr, 'DEPRECATED: Do not call this method or, even better, use SphinxQL instead of an API'
  302. assert(mode in [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE, SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED, SPH_MATCH_FULLSCAN, SPH_MATCH_EXTENDED2])
  303. self._mode = mode
  304. def SetRankingMode ( self, ranker, rankexpr='' ):
  305. """
  306. Set ranking mode.
  307. """
  308. assert(ranker>=0 and ranker<SPH_RANK_TOTAL)
  309. self._ranker = ranker
  310. self._rankexpr = rankexpr
  311. def SetSortMode ( self, mode, clause='' ):
  312. """
  313. Set sorting mode.
  314. """
  315. assert ( mode in [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC, SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED, SPH_SORT_EXPR] )
  316. assert ( isinstance ( clause, str ) )
  317. self._sort = mode
  318. self._sortby = clause
  319. def SetFieldWeights (self, weights):
  320. """
  321. Bind per-field weights by name; expects (name,field_weight) dictionary as argument.
  322. """
  323. assert(isinstance(weights,dict))
  324. for key,val in weights.items():
  325. assert(isinstance(key,str))
  326. AssertUInt32 ( val )
  327. self._fieldweights = weights
  328. def SetIndexWeights (self, weights):
  329. """
  330. Bind per-index weights by name; expects (name,index_weight) dictionary as argument.
  331. """
  332. assert(isinstance(weights,dict))
  333. for key,val in weights.items():
  334. assert(isinstance(key,str))
  335. AssertUInt32(val)
  336. self._indexweights = weights
  337. def SetIDRange (self, minid, maxid):
  338. """
  339. Set IDs range to match.
  340. Only match records if document ID is beetwen $min and $max (inclusive).
  341. """
  342. assert(isinstance(minid, (int, long)))
  343. assert(isinstance(maxid, (int, long)))
  344. assert(minid<=maxid)
  345. self._min_id = minid
  346. self._max_id = maxid
  347. def SetFilter ( self, attribute, values, exclude=0 ):
  348. """
  349. Set values set filter.
  350. Only match records where 'attribute' value is in given 'values' set.
  351. """
  352. assert(isinstance(attribute, str))
  353. assert iter(values)
  354. for value in values:
  355. AssertInt32 ( value )
  356. self._filters.append ( { 'type':SPH_FILTER_VALUES, 'attr':attribute, 'exclude':exclude, 'values':values } )
  357. def SetFilterString ( self, attribute, value, exclude=0 ):
  358. """
  359. Set string filter.
  360. Only match records where 'attribute' value is equal
  361. """
  362. assert(isinstance(attribute, str))
  363. assert(isinstance(value, str))
  364. print ( "attr='%s' val='%s' " % ( attribute, value ) )
  365. self._filters.append ( { 'type':SPH_FILTER_STRING, 'attr':attribute, 'exclude':exclude, 'value':value } )
  366. def SetFilterRange (self, attribute, min_, max_, exclude=0 ):
  367. """
  368. Set range filter.
  369. Only match records if 'attribute' value is beetwen 'min_' and 'max_' (inclusive).
  370. """
  371. assert(isinstance(attribute, str))
  372. AssertInt32(min_)
  373. AssertInt32(max_)
  374. assert(min_<=max_)
  375. self._filters.append ( { 'type':SPH_FILTER_RANGE, 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_ } )
  376. def SetFilterFloatRange (self, attribute, min_, max_, exclude=0 ):
  377. assert(isinstance(attribute,str))
  378. assert(isinstance(min_,float))
  379. assert(isinstance(max_,float))
  380. assert(min_ <= max_)
  381. self._filters.append ( {'type':SPH_FILTER_FLOATRANGE, 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_} )
  382. def SetGeoAnchor (self, attrlat, attrlong, latitude, longitude):
  383. assert(isinstance(attrlat,str))
  384. assert(isinstance(attrlong,str))
  385. assert(isinstance(latitude,float))
  386. assert(isinstance(longitude,float))
  387. self._anchor['attrlat'] = attrlat
  388. self._anchor['attrlong'] = attrlong
  389. self._anchor['lat'] = latitude
  390. self._anchor['long'] = longitude
  391. def SetGroupBy ( self, attribute, func, groupsort='@group desc' ):
  392. """
  393. Set grouping attribute and function.
  394. """
  395. assert(isinstance(attribute, str))
  396. assert(func in [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH, SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR, SPH_GROUPBY_ATTRPAIR] )
  397. assert(isinstance(groupsort, str))
  398. self._groupby = attribute
  399. self._groupfunc = func
  400. self._groupsort = groupsort
  401. def SetGroupDistinct (self, attribute):
  402. assert(isinstance(attribute,str))
  403. self._groupdistinct = attribute
  404. def SetRetries (self, count, delay=0):
  405. assert(isinstance(count,int) and count>=0)
  406. assert(isinstance(delay,int) and delay>=0)
  407. self._retrycount = count
  408. self._retrydelay = delay
  409. def SetOverride (self, name, type, values):
  410. print >> sys.stderr, 'DEPRECATED: Do not call this method. Use SphinxQL REMAP() function instead.'
  411. assert(isinstance(name, str))
  412. assert(type in SPH_ATTR_TYPES)
  413. assert(isinstance(values, dict))
  414. self._overrides[name] = {'name': name, 'type': type, 'values': values}
  415. def SetSelect (self, select):
  416. assert(isinstance(select, str))
  417. self._select = select
	def SetQueryFlag ( self, name, value ):
		"""
		Set a named per-query flag; each name maps to one bit of self._query_flags.
		"""
		known_names = [ "reverse_scan", "sort_method", "max_predicted_time", "boolean_simplify", "idf", "global_idf" ]
		# allowed values per flag name
		flags = { "reverse_scan":[0, 1], "sort_method":["pq", "kbuffer"],"max_predicted_time":[0], "boolean_simplify":[True, False], "idf":["normalized", "plain", "tfidf_normalized", "tfidf_unnormalized"], "global_idf":[True, False] }
		assert ( name in known_names )
		# max_predicted_time additionally accepts any non-negative integer
		assert ( value in flags[name] or ( name=="max_predicted_time" and isinstance(value, (int, long)) and value>=0))
		if name=="reverse_scan":
			self._query_flags = SetBit ( self._query_flags, 0, value==1 )
		if name=="sort_method":
			self._query_flags = SetBit ( self._query_flags, 1, value=="kbuffer" )
		if name=="max_predicted_time":
			self._query_flags = SetBit ( self._query_flags, 2, value>0 )
			self._predictedtime = int(value)
		if name=="boolean_simplify":
			self._query_flags= SetBit ( self._query_flags, 3, value )
		# "idf" controls two separate bits: 4 (plain vs normalized) ...
		if name=="idf" and ( value=="plain" or value=="normalized" ) :
			self._query_flags = SetBit ( self._query_flags, 4, value=="plain" )
		if name=="global_idf":
			self._query_flags= SetBit ( self._query_flags, 5, value )
		# ... and 6 (tfidf normalization)
		if name=="idf" and ( value=="tfidf_normalized" or value=="tfidf_unnormalized" ) :
			self._query_flags = SetBit ( self._query_flags, 6, value=="tfidf_normalized" )
  438. def SetOuterSelect ( self, orderby, offset, limit ):
  439. assert(isinstance(orderby, str))
  440. assert(isinstance(offset, (int, long)))
  441. assert(isinstance(limit, (int, long)))
  442. assert ( offset>=0 )
  443. assert ( limit>0 )
  444. self._outerorderby = orderby
  445. self._outeroffset = offset
  446. self._outerlimit = limit
  447. self._hasouter = True
  448. def ResetOverrides (self):
  449. self._overrides = {}
  450. def ResetFilters (self):
  451. """
  452. Clear all filters (for multi-queries).
  453. """
  454. self._filters = []
  455. self._anchor = {}
  456. def ResetGroupBy (self):
  457. """
  458. Clear groupby settings (for multi-queries).
  459. """
  460. self._groupby = ''
  461. self._groupfunc = SPH_GROUPBY_DAY
  462. self._groupsort = '@group desc'
  463. self._groupdistinct = ''
	def ResetQueryFlag (self):
		"""
		Restore query flags and predicted time to their defaults.
		"""
		self._query_flags = SetBit ( 0, 6, True ) # default idf=tfidf_normalized
		self._predictedtime = 0
  467. def ResetOuterSelect (self):
  468. self._outerorderby = ''
  469. self._outeroffset = 0
  470. self._outerlimit = 0
  471. self._hasouter = False
  472. def Query (self, query, index='*', comment=''):
  473. """
  474. Connect to searchd server and run given search query.
  475. Returns None on failure; result set hash on success (see documentation for details).
  476. """
  477. assert(len(self._reqs)==0)
  478. self.AddQuery(query,index,comment)
  479. results = self.RunQueries()
  480. self._reqs = [] # we won't re-run erroneous batch
  481. if not results or len(results)==0:
  482. return None
  483. self._error = results[0]['error']
  484. self._warning = results[0]['warning']
  485. if results[0]['status'] == SEARCHD_ERROR:
  486. return None
  487. return results[0]
	def AddQuery (self, query, index='*', comment=''):
		"""
		Add query to batch.

		Serializes the current client settings plus the given query into the
		binary SEARCH request format and appends it to self._reqs. Does not
		talk to the server; RunQueries() performs the actual I/O.
		"""
		# build request
		req = []
		req.append(pack('>5L', self._query_flags, self._offset, self._limit, self._mode, self._ranker))
		# expression ranker carries its formula inline
		if self._ranker==SPH_RANK_EXPR:
			req.append(pack('>L', len(self._rankexpr)))
			req.append(self._rankexpr)
		req.append(pack('>L', self._sort))
		req.append(pack('>L', len(self._sortby)))
		req.append(self._sortby)
		if isinstance(query,unicode):
			query = query.encode('utf-8')
		assert(isinstance(query,str))
		req.append(pack('>L', len(query)))
		req.append(query)
		# legacy per-field weights list
		req.append(pack('>L', len(self._weights)))
		for w in self._weights:
			req.append(pack('>L', w))
		assert(isinstance(index,str))
		req.append(pack('>L', len(index)))
		req.append(index)
		req.append(pack('>L',1)) # id64 range marker
		req.append(pack('>Q', self._min_id))
		req.append(pack('>Q', self._max_id))
		# filters
		req.append ( pack ( '>L', len(self._filters) ) )
		for f in self._filters:
			req.append ( pack ( '>L', len(f['attr'])) + f['attr'])
			filtertype = f['type']
			req.append ( pack ( '>L', filtertype))
			if filtertype == SPH_FILTER_VALUES:
				req.append ( pack ('>L', len(f['values'])))
				for val in f['values']:
					req.append ( pack ('>q', val))
			elif filtertype == SPH_FILTER_RANGE:
				req.append ( pack ('>2q', f['min'], f['max']))
			elif filtertype == SPH_FILTER_FLOATRANGE:
				req.append ( pack ('>2f', f['min'], f['max']))
			elif filtertype == SPH_FILTER_STRING:
				req.append ( pack ( '>L', len(f['value']) ) )
				req.append ( f['value'] )
			req.append ( pack ( '>L', f['exclude'] ) )
		# group-by, max-matches, group-sort
		req.append ( pack ( '>2L', self._groupfunc, len(self._groupby) ) )
		req.append ( self._groupby )
		req.append ( pack ( '>2L', self._maxmatches, len(self._groupsort) ) )
		req.append ( self._groupsort )
		req.append ( pack ( '>LLL', self._cutoff, self._retrycount, self._retrydelay))
		req.append ( pack ( '>L', len(self._groupdistinct)))
		req.append ( self._groupdistinct)
		# anchor point (flag word 0/1, then attr names and coordinates)
		if len(self._anchor) == 0:
			req.append ( pack ('>L', 0))
		else:
			attrlat, attrlong = self._anchor['attrlat'], self._anchor['attrlong']
			latitude, longitude = self._anchor['lat'], self._anchor['long']
			req.append ( pack ('>L', 1))
			req.append ( pack ('>L', len(attrlat)) + attrlat)
			req.append ( pack ('>L', len(attrlong)) + attrlong)
			req.append ( pack ('>f', latitude) + pack ('>f', longitude))
		# per-index weights
		req.append ( pack ('>L',len(self._indexweights)))
		for indx,weight in self._indexweights.items():
			req.append ( pack ('>L',len(indx)) + indx + pack ('>L',weight))
		# max query time
		req.append ( pack ('>L', self._maxquerytime) )
		# per-field weights (named form)
		req.append ( pack ('>L',len(self._fieldweights) ) )
		for field,weight in self._fieldweights.items():
			req.append ( pack ('>L',len(field)) + field + pack ('>L',weight) )
		# comment
		comment = str(comment)
		req.append ( pack('>L',len(comment)) + comment )
		# attribute overrides: per override, name, type, then id/value pairs
		req.append ( pack('>L', len(self._overrides)) )
		for v in self._overrides.values():
			req.extend ( ( pack('>L', len(v['name'])), v['name'] ) )
			req.append ( pack('>LL', v['type'], len(v['values'])) )
			for id, value in v['values'].iteritems():
				req.append ( pack('>Q', id) )
				# value encoding depends on the override's declared type
				if v['type'] == SPH_ATTR_FLOAT:
					req.append ( pack('>f', value) )
				elif v['type'] == SPH_ATTR_BIGINT:
					req.append ( pack('>q', value) )
				else:
					req.append ( pack('>l', value) )
		# select-list
		req.append ( pack('>L', len(self._select)) )
		req.append ( self._select )
		# predicted time is only serialized when its query flag bit was set
		if self._predictedtime>0:
			req.append ( pack('>L', self._predictedtime ) )
		# outer sub-select
		req.append ( pack('>L',len(self._outerorderby)) + self._outerorderby )
		req.append ( pack ( '>2L', self._outeroffset, self._outerlimit ) )
		if self._hasouter:
			req.append ( pack('>L', 1) )
		else:
			req.append ( pack('>L', 0) )
		# store the assembled request for RunQueries()
		req = ''.join(req)
		self._reqs.append(req)
		return
	def RunQueries (self):
		"""
		Run queries batch.
		Returns None on network IO failure; or an array of result set hashes on success.
		"""
		if len(self._reqs)==0:
			self._error = 'no queries defined, issue AddQuery() first'
			return None
		sock = self._Connect()
		if not sock:
			return None
		req = ''.join(self._reqs)
		length = len(req)+8
		# header: command, command version, body length, dummy word, request count
		req = pack('>HHLLL', SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, length, 0, len(self._reqs))+req
		self._Send ( sock, req )
		response = self._GetResponse(sock, VER_COMMAND_SEARCH)
		if not response:
			return None
		nreqs = len(self._reqs)
		# parse response: one result set per queued request, read sequentially via pointer p
		max_ = len(response)
		p = 0
		results = []
		for i in range(0,nreqs,1):
			result = {}
			results.append(result)
			result['error'] = ''
			result['warning'] = ''
			# per-query status word, then an optional length-prefixed message
			status = unpack('>L', response[p:p+4])[0]
			p += 4
			result['status'] = status
			if status != SEARCHD_OK:
				length = unpack('>L', response[p:p+4])[0]
				p += 4
				message = response[p:p+length]
				p += length
				if status == SEARCHD_WARNING:
					result['warning'] = message
				else:
					# errors carry no result set; skip to the next query
					result['error'] = message
					continue
			# read schema: field names, then (attr name, attr type) pairs
			fields = []
			attrs = []
			nfields = unpack('>L', response[p:p+4])[0]
			p += 4
			while nfields>0 and p<max_:
				nfields -= 1
				length = unpack('>L', response[p:p+4])[0]
				p += 4
				fields.append(response[p:p+length])
				p += length
			result['fields'] = fields
			nattrs = unpack('>L', response[p:p+4])[0]
			p += 4
			while nattrs>0 and p<max_:
				nattrs -= 1
				length = unpack('>L', response[p:p+4])[0]
				p += 4
				attr = response[p:p+length]
				p += length
				type_ = unpack('>L', response[p:p+4])[0]
				p += 4
				attrs.append([attr,type_])
			result['attrs'] = attrs
			# read match count and document-ID width flag
			count = unpack('>L', response[p:p+4])[0]
			p += 4
			id64 = unpack('>L', response[p:p+4])[0]
			p += 4
			# read matches
			result['matches'] = []
			while count>0 and p<max_:
				count -= 1
				if id64:
					doc, weight = unpack('>QL', response[p:p+12])
					p += 12
				else:
					doc, weight = unpack('>2L', response[p:p+8])
					p += 8
				match = { 'id':doc, 'weight':weight, 'attrs':{} }
				# NOTE: every branch below relies on the shared 'p += 4' at the
				# end of this loop; variable-width types adjust p so that the
				# shared increment lands exactly past their payload
				for i in range(len(attrs)):
					if attrs[i][1] == SPH_ATTR_FLOAT:
						match['attrs'][attrs[i][0]] = unpack('>f', response[p:p+4])[0]
					elif attrs[i][1] == SPH_ATTR_BIGINT:
						# 8-byte value: 4 here plus the shared 4 below
						match['attrs'][attrs[i][0]] = unpack('>q', response[p:p+8])[0]
						p += 4
					elif attrs[i][1] == SPH_ATTR_STRING:
						slen = unpack('>L', response[p:p+4])[0]
						p += 4
						match['attrs'][attrs[i][0]] = ''
						if slen>0:
							match['attrs'][attrs[i][0]] = response[p:p+slen]
						p += slen-4
					elif attrs[i][1] == SPH_ATTR_FACTORS:
						# length-prefixed blob; prefix length includes itself
						slen = unpack('>L', response[p:p+4])[0]
						p += 4
						match['attrs'][attrs[i][0]] = ''
						if slen>0:
							match['attrs'][attrs[i][0]] = response[p:p+slen-4]
							p += slen-4
						p -= 4
					elif attrs[i][1] == SPH_ATTR_MULTI:
						match['attrs'][attrs[i][0]] = []
						nvals = unpack('>L', response[p:p+4])[0]
						p += 4
						for n in range(0,nvals,1):
							match['attrs'][attrs[i][0]].append(unpack('>L', response[p:p+4])[0])
							p += 4
						p -= 4
					elif attrs[i][1] == SPH_ATTR_MULTI64:
						# value count is reported in 32-bit units; halve for 64-bit entries
						match['attrs'][attrs[i][0]] = []
						nvals = unpack('>L', response[p:p+4])[0]
						nvals = nvals/2
						p += 4
						for n in range(0,nvals,1):
							match['attrs'][attrs[i][0]].append(unpack('>q', response[p:p+8])[0])
							p += 8
						p -= 4
					else:
						match['attrs'][attrs[i][0]] = unpack('>L', response[p:p+4])[0]
					p += 4
				result['matches'].append ( match )
			# per-query stats trailer, then per-word stats
			result['total'], result['total_found'], result['time'], words = unpack('>4L', response[p:p+16])
			result['time'] = '%.3f' % (result['time']/1000.0)
			p += 16
			result['words'] = []
			while words>0:
				words -= 1
				length = unpack('>L', response[p:p+4])[0]
				p += 4
				word = response[p:p+length]
				p += length
				docs, hits = unpack('>2L', response[p:p+8])
				p += 8
				result['words'].append({'word':word, 'docs':docs, 'hits':hits})
		self._reqs = []
		return results
def BuildExcerpts (self, docs, index, words, opts=None):
    """
    Connect to searchd server and generate excerpts (snippets)
    from the given documents.

    docs  -- list of plain-text document strings to highlight
    index -- name of the index whose text-processing settings are used
    words -- keywords to highlight, as a single string
    opts  -- optional dict of highlighting options; any key not supplied
             is filled in with a default below

    Returns a list of excerpt strings (one per input document) on success,
    [] on a malformed/failed response, or None if the connection failed.
    """
    if not opts:
        opts = {}
    # the wire protocol carries UTF-8 byte strings, not unicode objects
    if isinstance(words,unicode):
        words = words.encode('utf-8')

    assert(isinstance(docs, list))
    assert(isinstance(index, str))
    assert(isinstance(words, str))
    assert(isinstance(opts, dict))

    sock = self._Connect()
    if not sock:
        return None

    # fixup options: fill defaults for every option the caller omitted
    opts.setdefault('before_match', '<b>')
    opts.setdefault('after_match', '</b>')
    opts.setdefault('chunk_separator', ' ... ')
    opts.setdefault('html_strip_mode', 'index')
    opts.setdefault('limit', 256)
    opts.setdefault('limit_passages', 0)
    opts.setdefault('limit_words', 0)
    opts.setdefault('around', 5)
    opts.setdefault('start_passage_id', 1)
    opts.setdefault('passage_boundary', 'none')

    # build request
    # v.1.0 req
    # boolean options are folded into one bitmask; bit 0 is always set
    flags = 1 # (remove spaces)
    if opts.get('exact_phrase'): flags |= 2
    if opts.get('single_passage'): flags |= 4
    if opts.get('use_boundaries'): flags |= 8
    if opts.get('weight_order'): flags |= 16
    if opts.get('query_mode'): flags |= 32
    if opts.get('force_all_words'): flags |= 64
    if opts.get('load_files'): flags |= 128
    if opts.get('allow_empty'): flags |= 256
    if opts.get('emit_zones'): flags |= 512
    if opts.get('load_files_scattered'): flags |= 1024

    # mode=0, flags
    req = [pack('>2L', 0, flags)]

    # req index
    req.append(pack('>L', len(index)))
    req.append(index)

    # req words
    req.append(pack('>L', len(words)))
    req.append(words)

    # options: strings are length-prefixed, numeric options are plain uint32
    req.append(pack('>L', len(opts['before_match'])))
    req.append(opts['before_match'])
    req.append(pack('>L', len(opts['after_match'])))
    req.append(opts['after_match'])
    req.append(pack('>L', len(opts['chunk_separator'])))
    req.append(opts['chunk_separator'])
    req.append(pack('>L', int(opts['limit'])))
    req.append(pack('>L', int(opts['around'])))
    req.append(pack('>L', int(opts['limit_passages'])))
    req.append(pack('>L', int(opts['limit_words'])))
    req.append(pack('>L', int(opts['start_passage_id'])))
    req.append(pack('>L', len(opts['html_strip_mode'])))
    req.append((opts['html_strip_mode']))
    req.append(pack('>L', len(opts['passage_boundary'])))
    req.append((opts['passage_boundary']))

    # documents: count, then each document as a length-prefixed string
    req.append(pack('>L', len(docs)))
    for doc in docs:
        if isinstance(doc,unicode):
            doc = doc.encode('utf-8')
        assert(isinstance(doc, str))
        req.append(pack('>L', len(doc)))
        req.append(doc)

    req = ''.join(req)

    # send query, get response
    length = len(req)

    # add header
    req = pack('>2HL', SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, length)+req
    self._Send ( sock, req )

    response = self._GetResponse(sock, VER_COMMAND_EXCERPT )
    if not response:
        return []

    # parse response: one length-prefixed excerpt per input document
    pos = 0
    res = []
    rlen = len(response)
    for i in range(len(docs)):
        length = unpack('>L', response[pos:pos+4])[0]
        pos += 4
        # guard against a truncated reply overrunning the buffer
        if pos+length > rlen:
            self._error = 'incomplete reply'
            return []
        res.append(response[pos:pos+length])
        pos += length
    return res
def UpdateAttributes ( self, index, attrs, values, mva=False, ignorenonexistent=False ):
    """
    Update given attribute values on given documents in given indexes.
    Returns amount of updated documents (0 or more) on success, or -1 on failure
    (and None if the connection could not be established).

    'attrs' must be a list of strings.
    'values' must be a dict with int key (document ID) and list of int values
    (new attribute values).
    Optional boolean parameter 'mva' points that there is update of MVA attributes.
    In this case the 'values' must be a dict with int key (document ID) and list of
    lists of int values (new MVA attribute values).
    Optional boolean parameter 'ignorenonexistent' points that the update will
    silently ignore any warnings about trying to update a column which does not
    exist in current index schema.

    Example:
        res = cl.UpdateAttributes ( 'test1', [ 'group_id', 'date_added' ], { 2:[123,1000000000], 4:[456,1234567890] } )
    """
    assert ( isinstance ( index, str ) )
    assert ( isinstance ( attrs, list ) )
    assert ( isinstance ( values, dict ) )
    for attr in attrs:
        assert ( isinstance ( attr, str ) )
    # validate every docid/value up front so a malformed entry fails
    # before any bytes are sent
    for docid, entry in values.items():
        AssertUInt32(docid)
        assert ( isinstance ( entry, list ) )
        assert ( len(attrs)==len(entry) )
        for val in entry:
            if mva:
                assert ( isinstance ( val, list ) )
                for vals in val:
                    AssertInt32(vals)
            else:
                AssertInt32(val)

    # build request: index name, attr count, ignore-absent flag,
    # then per-attr (name, mva flag), then per-document value lists
    req = [ pack('>L',len(index)), index ]

    req.append ( pack('>L',len(attrs)) )
    ignore_absent = 0
    if ignorenonexistent: ignore_absent = 1
    req.append ( pack('>L', ignore_absent ) )
    mva_attr = 0
    if mva: mva_attr = 1
    for attr in attrs:
        req.append ( pack('>L',len(attr)) + attr )
        req.append ( pack('>L', mva_attr ) )

    req.append ( pack('>L',len(values)) )
    for docid, entry in values.items():
        req.append ( pack('>Q',docid) )
        for val in entry:
            # for MVA updates each attr carries a value count followed by
            # the values; for plain attrs the single value goes here directly
            val_len = val
            if mva: val_len = len ( val )
            req.append ( pack('>L',val_len ) )
            if mva:
                for vals in val:
                    req.append ( pack ('>L',vals) )

    # connect, send query, get response
    sock = self._Connect()
    if not sock:
        return None

    req = ''.join(req)
    length = len(req)
    req = pack ( '>2HL', SEARCHD_COMMAND_UPDATE, VER_COMMAND_UPDATE, length ) + req
    self._Send ( sock, req )

    response = self._GetResponse ( sock, VER_COMMAND_UPDATE )
    if not response:
        return -1

    # parse response: single uint32 with the updated-documents count
    updated = unpack ( '>L', response[0:4] )[0]
    return updated
  889. def BuildKeywords ( self, query, index, hits ):
  890. """
  891. Connect to searchd server, and generate keywords list for a given query.
  892. Returns None on failure, or a list of keywords on success.
  893. """
  894. assert ( isinstance ( query, str ) )
  895. assert ( isinstance ( index, str ) )
  896. assert ( isinstance ( hits, int ) )
  897. # build request
  898. req = [ pack ( '>L', len(query) ) + query ]
  899. req.append ( pack ( '>L', len(index) ) + index )
  900. req.append ( pack ( '>L', hits ) )
  901. # connect, send query, get response
  902. sock = self._Connect()
  903. if not sock:
  904. return None
  905. req = ''.join(req)
  906. length = len(req)
  907. req = pack ( '>2HL', SEARCHD_COMMAND_KEYWORDS, VER_COMMAND_KEYWORDS, length ) + req
  908. self._Send ( sock, req )
  909. response = self._GetResponse ( sock, VER_COMMAND_KEYWORDS )
  910. if not response:
  911. return None
  912. # parse response
  913. res = []
  914. nwords = unpack ( '>L', response[0:4] )[0]
  915. p = 4
  916. max_ = len(response)
  917. while nwords>0 and p<max_:
  918. nwords -= 1
  919. length = unpack ( '>L', response[p:p+4] )[0]
  920. p += 4
  921. tokenized = response[p:p+length]
  922. p += length
  923. length = unpack ( '>L', response[p:p+4] )[0]
  924. p += 4
  925. normalized = response[p:p+length]
  926. p += length
  927. entry = { 'tokenized':tokenized, 'normalized':normalized }
  928. if hits:
  929. entry['docs'], entry['hits'] = unpack ( '>2L', response[p:p+8] )
  930. p += 8
  931. res.append ( entry )
  932. if nwords>0 or p>max_:
  933. self._error = 'incomplete reply'
  934. return None
  935. return res
  936. def Status ( self, session=False ):
  937. """
  938. Get the status
  939. """
  940. # connect, send query, get response
  941. sock = self._Connect()
  942. if not sock:
  943. return None
  944. sess = 1
  945. if session:
  946. sess = 0
  947. req = pack ( '>2HLL', SEARCHD_COMMAND_STATUS, VER_COMMAND_STATUS, 4, sess )
  948. self._Send ( sock, req )
  949. response = self._GetResponse ( sock, VER_COMMAND_STATUS )
  950. if not response:
  951. return None
  952. # parse response
  953. res = []
  954. p = 8
  955. max_ = len(response)
  956. while p<max_:
  957. length = unpack ( '>L', response[p:p+4] )[0]
  958. k = response[p+4:p+length+4]
  959. p += 4+length
  960. length = unpack ( '>L', response[p:p+4] )[0]
  961. v = response[p+4:p+length+4]
  962. p += 4+length
  963. res += [[k, v]]
  964. return res
  965. ### persistent connections
  966. def Open(self):
  967. if self._socket:
  968. self._error = 'already connected'
  969. return None
  970. server = self._Connect()
  971. if not server:
  972. return None
  973. # command, command version = 0, body length = 4, body = 1
  974. request = pack ( '>hhII', SEARCHD_COMMAND_PERSIST, 0, 4, 1 )
  975. self._Send ( server, request )
  976. self._socket = server
  977. return True
  978. def Close(self):
  979. if not self._socket:
  980. self._error = 'not connected'
  981. return
  982. self._socket.close()
  983. self._socket = None
  984. def EscapeString(self, string):
  985. return re.sub(r"([=\(\)|\-!@~\"&/\\\^\$\=\<])", r"\\\1", string)
  986. def FlushAttributes(self):
  987. sock = self._Connect()
  988. if not sock:
  989. return -1
  990. request = pack ( '>hhI', SEARCHD_COMMAND_FLUSHATTRS, VER_COMMAND_FLUSHATTRS, 0 ) # cmd, ver, bodylen
  991. self._Send ( sock, request )
  992. response = self._GetResponse ( sock, VER_COMMAND_FLUSHATTRS )
  993. if not response or len(response)!=4:
  994. self._error = 'unexpected response length'
  995. return -1
  996. tag = unpack ( '>L', response[0:4] )[0]
  997. return tag
  998. def AssertInt32 ( value ):
  999. assert(isinstance(value, (int, long)))
  1000. assert(value>=-2**32-1 and value<=2**32-1)
  1001. def AssertUInt32 ( value ):
  1002. assert(isinstance(value, (int, long)))
  1003. assert(value>=0 and value<=2**32-1)
  1004. def SetBit ( flag, bit, on ):
  1005. if on:
  1006. flag += ( 1<<bit )
  1007. else:
  1008. reset = 255 ^ ( 1<<bit )
  1009. flag = flag & reset
  1010. return flag
  1011. #
  1012. # $Id$
  1013. #