#
# $Id$
#
# Python version of Sphinx searchd client (Python API)
#
# Copyright (c) 2006-2007, Andrew Aksyonoff
# Copyright (c) 2006, Mike Osadnik
# All rights reserved
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License. You should have
# received a copy of the GPL license along with this program; if you
# did not, you can find it at http://www.gnu.org/
#

import numbers
import select
import socket
from struct import *

# known searchd commands
SEARCHD_COMMAND_SEARCH = 0
SEARCHD_COMMAND_EXCERPT = 1

# current client-side command implementation versions
VER_COMMAND_SEARCH = 0x107
VER_COMMAND_EXCERPT = 0x100

# known searchd status codes
SEARCHD_OK = 0
SEARCHD_ERROR = 1
SEARCHD_RETRY = 2
SEARCHD_WARNING = 3

# known match modes
SPH_MATCH_ALL = 0
SPH_MATCH_ANY = 1
SPH_MATCH_PHRASE = 2
SPH_MATCH_BOOLEAN = 3
SPH_MATCH_EXTENDED = 4

# known sort modes
SPH_SORT_RELEVANCE = 0
SPH_SORT_ATTR_DESC = 1
SPH_SORT_ATTR_ASC = 2
SPH_SORT_TIME_SEGMENTS = 3
SPH_SORT_EXTENDED = 4

# known attribute types
SPH_ATTR_INTEGER = 1
SPH_ATTR_TIMESTAMP = 2

# known grouping functions
SPH_GROUPBY_DAY = 0
SPH_GROUPBY_WEEK = 1
SPH_GROUPBY_MONTH = 2
SPH_GROUPBY_YEAR = 3
SPH_GROUPBY_ATTR = 4


class SphinxClient:
    """
    client for the searchd daemon

    configure the client with the Set*() methods, then call Query() or
    BuildExcerpts(); every call opens its own connection and closes it
    once the reply has been read.

    failures are reported by returning an empty result and storing a
    message retrievable through GetLastError(); non-fatal notices are
    available through GetLastWarning().

    strings are sent as UTF-8 (byte strings are sent untouched) and
    strings found in replies are returned as native "str" objects.
    """

    _host = 'localhost'             # searchd host (default is "localhost")
    _port = 3312                    # searchd port (default is 3312)
    _offset = 0                     # how many records to seek from result-set start (default is 0)
    _limit = 20                     # how many records to return from result-set starting at offset (default is 20)
    _mode = SPH_MATCH_ALL           # query matching mode (default is SPH_MATCH_ALL)
    _sort = SPH_SORT_RELEVANCE      # match sorting mode (default is SPH_SORT_RELEVANCE)
    _sortby = ''                    # attribute to sort by (default is "")
    _min_id = 0                     # min ID to match (default is 0)
    _max_id = 0xFFFFFFFF            # max ID to match (default is UINT_MAX)
    _groupby = ''                   # group-by attribute name
    _groupfunc = SPH_GROUPBY_DAY    # group-by function (to pre-process group-by attribute value with)
    _groupsort = '@group desc'      # group-by sorting clause (to sort groups in result set with)
    _maxmatches = 1000              # max matches to retrieve
    _error = ''                     # last error message
    _warning = ''                   # last warning message

    def __init__(self):
        """
        create a new client object and fill defaults
        """
        # lists are created per instance on purpose: as class attributes
        # they would be shared, so a filter added on one client would be
        # sent by every other client too
        self._weights = []          # per-field weights (default is 1 for all fields)
        self._filters = []          # search filters

    # ------------------------------------------------------------------
    # private helpers: string/bytes conversion and wire encoding
    # ------------------------------------------------------------------

    @staticmethod
    def _ToBytes(value):
        """
        return "value" as a byte string; text is encoded as UTF-8,
        byte strings are passed through unchanged
        """
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    @staticmethod
    def _ToText(value):
        """
        return "value" as a native str; byte strings that are not already
        str are decoded as UTF-8 (undecodable bytes are replaced)
        """
        if isinstance(value, str):
            return value
        return value.decode('utf-8', 'replace')

    @staticmethod
    def _PackString(value):
        """
        encode a string for the wire: 32-bit big-endian byte length
        followed by the bytes themselves
        """
        data = SphinxClient._ToBytes(value)
        return pack('>L', len(data)) + data

    @staticmethod
    def _ReadUInt(data, pos):
        """
        read a 32-bit big-endian unsigned integer from "data" at "pos"
        returns a ( value, next position ) pair
        """
        return unpack('>L', data[pos:pos + 4])[0], pos + 4

    @staticmethod
    def _ReadString(data, pos):
        """
        read a length-prefixed string from "data" at "pos"
        returns a ( string, next position ) pair
        """
        length, pos = SphinxClient._ReadUInt(data, pos)
        end = pos + length
        return SphinxClient._ToText(data[pos:end]), end

    @staticmethod
    def _RecvExact(sock, count):
        """
        read up to "count" bytes from "sock", looping over short reads
        returns fewer bytes only if the peer closed the connection
        """
        chunks = []
        left = count
        while left > 0:
            chunk = sock.recv(left)
            if not chunk:
                break
            chunks.append(chunk)
            left -= len(chunk)
        return b''.join(chunks)

    # ------------------------------------------------------------------
    # error reporting and server settings
    # ------------------------------------------------------------------

    def GetLastError(self):
        """
        get last error message (string)
        """
        return self._error

    def GetLastWarning(self):
        """
        get last warning message (string)
        """
        return self._warning

    def SetServer(self, host, port):
        """
        set searchd server
        """
        assert(isinstance(host, str))
        assert(isinstance(port, numbers.Integral))
        self._host = host
        self._port = port

    # ------------------------------------------------------------------
    # transport
    # ------------------------------------------------------------------

    def _Connect(self):
        """
        connect to searchd server and exchange protocol versions

        returns the connected socket on success
        returns 0 on failure, with the reason stored as the last error
        """
        sock = None
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.connect((self._host, self._port))
            greeting = self._RecvExact(sock, 4)
        except socket.error as msg:
            # sock is still None when socket() itself was what failed
            if sock is not None:
                sock.close()
            self._error = 'connection to %s:%s failed (%s)' % (self._host, self._port, msg)
            return 0

        # unpack() returns a tuple, so take its only element before comparing;
        # a short or missing greeting is treated as version 0
        version = unpack('>L', greeting)[0] if len(greeting) == 4 else 0
        if version < 1:
            sock.close()
            self._error = 'expected searchd protocol version, got %s' % version
            return 0

        # all ok, send my version
        try:
            sock.sendall(pack('>L', 1))
        except socket.error as msg:
            sock.close()
            self._error = 'connection to %s:%s failed (%s)' % (self._host, self._port, msg)
            return 0
        return sock

    def _GetResponse(self, sock, client_ver):
        """
        get and check response packet from searchd server

        "sock" is always closed before returning
        returns the response body (byte string) on success
        returns None on failure, with the reason stored as the last error
        """
        try:
            header = self._RecvExact(sock, 8)
            if len(header) == 8:
                (status, ver, length) = unpack('>2HL', header)
                response = self._RecvExact(sock, length)
        finally:
            sock.close()

        if len(header) != 8:
            self._error = 'failed to read searchd response header (read=%d)' % len(header)
            return None

        # check response
        read = len(response)
        if not response or read != length:
            if length:
                self._error = 'failed to read searchd response (status=%s, ver=%s, len=%s, read=%s)' \
                    % (status, ver, length, read)
            else:
                self._error = 'received zero-sized searchd response'
            return None

        # check status
        if status == SEARCHD_WARNING:
            # body starts with a length-prefixed warning, the payload follows
            wend = 4 + unpack('>L', response[0:4])[0]
            self._warning = self._ToText(response[4:wend])
            return response[wend:]

        if status == SEARCHD_ERROR:
            self._error = 'searchd error: ' + self._ToText(response[4:])
            return None

        if status == SEARCHD_RETRY:
            self._error = 'temporary searchd error: ' + self._ToText(response[4:])
            return None

        if status != SEARCHD_OK:
            self._error = 'unknown status code %d' % status
            return None

        # check version
        if ver < client_ver:
            self._warning = "searchd command v.%d.%d older than client's v.%d.%d, some options might not work" \
                % (ver >> 8, ver & 0xff, client_ver >> 8, client_ver & 0xff)

        return response

    def _Request(self, command, version, body):
        """
        connect, send one command packet and read the reply

        "body" is the already encoded request (byte string)
        returns the response body, or None on failure
        """
        sock = self._Connect()
        if not sock:
            return None
        try:
            # sendall() keeps writing until the whole packet is out
            sock.sendall(pack('>2HL', command, version, len(body)) + body)
        except socket.error:
            sock.close()
            raise
        return self._GetResponse(sock, version)

    # ------------------------------------------------------------------
    # query settings
    # ------------------------------------------------------------------

    def SetLimits(self, offset, limit, maxmatches=0):
        """
        set match offset, count, and max number to retrieve
        "maxmatches" of 0 keeps the current max matches setting
        """
        assert(isinstance(offset, numbers.Integral) and offset >= 0)
        assert(isinstance(limit, numbers.Integral) and limit > 0)
        assert(maxmatches >= 0)
        self._offset = offset
        self._limit = limit
        if maxmatches > 0:
            self._maxmatches = maxmatches

    def SetMatchMode(self, mode):
        """
        set match mode
        """
        assert(mode in [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE, SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED])
        self._mode = mode

    def SetSortMode(self, mode, clause=''):
        """
        set sort mode
        """
        assert(mode in [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC, SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED])
        assert(isinstance(clause, str))
        self._sort = mode
        self._sortby = clause

    def SetWeights(self, weights):
        """
        set per-field weights
        """
        assert(isinstance(weights, list))
        for w in weights:
            assert(isinstance(w, numbers.Integral))
        # keep a copy so later changes to the caller's list do not leak in
        self._weights = list(weights)

    def SetIDRange(self, minid, maxid):
        """
        set IDs range to match
        only match those records where document ID
        is between minid and maxid (including minid and maxid)
        """
        assert(isinstance(minid, numbers.Integral))
        assert(isinstance(maxid, numbers.Integral))
        assert(minid <= maxid)
        self._min_id = minid
        self._max_id = maxid

    def SetFilter(self, attribute, values, exclude=0):
        """
        set values filter
        only match those records where $attribute column values
        are in specified set
        """
        assert(isinstance(attribute, str))
        assert(isinstance(values, list))
        assert(values)
        for value in values:
            assert(isinstance(value, numbers.Integral))
        self._filters.append({'attr': attribute, 'exclude': exclude, 'values': list(values)})

    def SetFilterRange(self, attribute, min_, max_, exclude=0):
        """
        set range filter
        only match those records where $attribute column value
        is between $min and $max (including $min and $max)
        """
        assert(isinstance(attribute, str))
        assert(isinstance(min_, numbers.Integral))
        assert(isinstance(max_, numbers.Integral))
        assert(min_ <= max_)
        self._filters.append({'attr': attribute, 'exclude': exclude, 'min': min_, 'max': max_})

    def ResetFilters(self):
        """
        clear all filters set with SetFilter() and SetFilterRange()
        """
        self._filters = []

    def SetGroupBy(self, attribute, func, groupsort='@group desc'):
        """
        set grouping attribute and function

        in grouping mode, all matches are assigned to different groups
        based on grouping function value.

        each group keeps track of the total match count, and the best match
        (in this group) according to current sorting function.

        the final result set contains one best match per group, with
        grouping function value and matches count attached.

        groups in result set could be sorted by any sorting clause,
        including both document attributes and the following special
        internal Sphinx attributes:

        - @id - match document ID;
        - @weight, @rank, @relevance - match weight;
        - @group - groupby function value;
        - @count - amount of matches in group.

        the default mode is to sort by groupby value in descending order,
        ie. by "@group desc".

        "total_found" would contain total amount of matching groups over
        the whole index.

        WARNING: grouping is done in fixed memory and thus its results
        are only approximate; so there might be more groups reported
        in total_found than actually present. @count might also
        be underestimated.

        for example, if sorting by relevance and grouping by "published"
        attribute with SPH_GROUPBY_DAY function, then the result set will
        contain one most relevant match per each day when there were any
        matches published, with day number and per-day match count attached,
        and sorted by day number in descending order (ie. recent days first).
        """
        assert(isinstance(attribute, str))
        assert(func in [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH, SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR])
        assert(isinstance(groupsort, str))
        self._groupby = attribute
        self._groupfunc = func
        self._groupsort = groupsort

    # ------------------------------------------------------------------
    # search
    # ------------------------------------------------------------------

    def _BuildSearchRequest(self, query, index):
        """
        encode the current settings plus "query" and "index" as the
        body of a search command (byte string, without the packet header)
        """
        req = [pack('>4L', self._offset, self._limit, self._mode, self._sort)]
        req.append(self._PackString(self._sortby))
        req.append(self._PackString(query))

        req.append(pack('>L', len(self._weights)))
        for w in self._weights:
            req.append(pack('>L', w))

        req.append(self._PackString(index))
        req.append(pack('>2L', self._min_id, self._max_id))

        # filters
        req.append(pack('>L', len(self._filters)))
        for f in self._filters:
            req.append(self._PackString(f['attr']))
            if 'values' in f:
                req.append(pack('>L', len(f['values'])))
                for v in f['values']:
                    req.append(pack('>L', v))
            else:
                # a zero value count marks a range filter
                req.append(pack('>3L', 0, f['min'], f['max']))
            req.append(pack('>L', f['exclude']))

        # group-by, max-matches, group-sort
        req.append(pack('>L', self._groupfunc))
        req.append(self._PackString(self._groupby))
        req.append(pack('>L', self._maxmatches))
        req.append(self._PackString(self._groupsort))

        return b''.join(req)

    @staticmethod
    def _ParseSearchResponse(response):
        """
        decode the body of a search reply into the result hash
        described in Query()
        """
        read_uint = SphinxClient._ReadUInt
        read_string = SphinxClient._ReadString

        result = {}
        end = len(response)
        pos = 0

        # read schema: full-text field names
        fields = []
        nfields, pos = read_uint(response, pos)
        while nfields > 0 and pos < end:
            nfields -= 1
            name, pos = read_string(response, pos)
            fields.append(name)
        result['fields'] = fields

        # read schema: attribute names and types
        attrs = []
        nattrs, pos = read_uint(response, pos)
        while nattrs > 0 and pos < end:
            nattrs -= 1
            name, pos = read_string(response, pos)
            type_, pos = read_uint(response, pos)
            attrs.append([name, type_])
        result['attrs'] = attrs

        # read match count
        count, pos = read_uint(response, pos)

        # read matches; each carries one value per schema attribute
        matches = []
        while count > 0 and pos < end:
            count -= 1
            doc, weight = unpack('>2L', response[pos:pos + 8])
            pos += 8
            values = {}
            for name, _ in attrs:
                value, pos = read_uint(response, pos)
                values[name] = value
            matches.append({'id': doc, 'weight': weight, 'attrs': values})
        result['matches'] = matches

        # read totals; the time value is divided by 1000 for display
        total, total_found, msecs, words = unpack('>4L', response[pos:pos + 16])
        pos += 16
        result['total'] = total
        result['total_found'] = total_found
        result['time'] = '%.3f' % (msecs / 1000.0)

        # read per-word statistics
        result['words'] = []
        while words > 0:
            words -= 1
            word, pos = read_string(response, pos)
            docs, hits = unpack('>2L', response[pos:pos + 8])
            pos += 8
            result['words'].append({'word': word, 'docs': docs, 'hits': hits})

        return result

    def Query(self, query, index='*'):
        """
        connect to searchd server and run given search query

        "query" is query string
        "index" is index name to query, default is "*" which means to query all indexes

        returns an empty hash on failure (see GetLastError())
        returns hash which has the following keys on success:
            "fields"
                an array of full-text field names
            "attrs"
                an array of [ name, type ] pairs, one per attribute
            "matches"
                an array of found matches represented as ( "id", "weight", "attrs" ) hashes
            "total"
                total amount of matches retrieved (upto SPH_MAX_MATCHES, see sphinx.h)
            "total_found"
                total amount of matching documents in index
            "time"
                search time
            "words"
                an array of ( "word", "docs", "hits" ) hashes which contains
                docs and hits count for stemmed (!) query words
        """
        req = self._BuildSearchRequest(query, index)
        # _Request() closes the connection once the reply has been read
        response = self._Request(SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, req)
        if not response:
            return {}
        return self._ParseSearchResponse(response)

    # ------------------------------------------------------------------
    # excerpts
    # ------------------------------------------------------------------

    def _BuildExcerptRequest(self, docs, index, words, opts):
        """
        encode the body of an excerpt command (byte string, without the
        packet header); "opts" is read but never modified
        """
        # fixup options on a private copy so the caller's hash stays as is
        options = {
            'before_match': '<b>',
            'after_match': '</b>',
            'chunk_separator': ' ... ',
            'limit': 256,
            'around': 5,
        }
        options.update(opts)

        # v.1.0 req
        # mode=0, flags=1 (remove spaces)
        req = [pack('>2L', 0, 1)]
        req.append(self._PackString(index))
        req.append(self._PackString(words))

        # options
        req.append(self._PackString(options['before_match']))
        req.append(self._PackString(options['after_match']))
        req.append(self._PackString(options['chunk_separator']))
        req.append(pack('>L', int(options['limit'])))
        req.append(pack('>L', int(options['around'])))

        # documents
        req.append(pack('>L', len(docs)))
        for doc in docs:
            assert(isinstance(doc, str))
            req.append(self._PackString(doc))

        return b''.join(req)

    def BuildExcerpts(self, docs, index, words, opts=None):
        """
        connect to searchd server and generate excerpts from given documents

        "docs" is an array of strings which represent the documents' contents
        "index" is a string specifying the index which settings will be used
            for stemming, lexing and case folding
        "words" is a string which contains the words to highlight
        "opts" is a hash which contains additional optional highlighting parameters:
            "before_match"
                a string to insert before a set of matching words, default is "<b>"
            "after_match"
                a string to insert after a set of matching words, default is "</b>"
            "chunk_separator"
                a string to insert between excerpts chunks, default is " ... "
            "limit"
                max excerpt size in symbols (codepoints), default is 256
            "around"
                how many words to highlight around each match, default is 5

        returns an empty array on failure (see GetLastError())
        returns an array of string excerpts on success
        """
        if not opts:
            opts = {}

        assert(isinstance(docs, list))
        assert(isinstance(index, str))
        assert(isinstance(words, str))
        assert(isinstance(opts, dict))

        # the request is built (and the documents validated) before
        # connecting, so bad input never leaves a socket open
        req = self._BuildExcerptRequest(docs, index, words, opts)
        response = self._Request(SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, req)
        if not response:
            return []

        # parse response: one length-prefixed excerpt per document
        pos = 0
        res = []
        rlen = len(response)
        for _ in docs:
            if pos + 4 > rlen:
                self._error = 'incomplete reply'
                return []
            length = unpack('>L', response[pos:pos + 4])[0]
            pos += 4
            if pos + length > rlen:
                self._error = 'incomplete reply'
                return []
            res.append(self._ToText(response[pos:pos + length]))
            pos += length

        return res
#
# $Id$
#