Browse Source

implemented docstore debug checks

Ilya Kuznetsov 6 years ago
parent
commit
d8a174c90d

+ 2 - 1
src/CMakeLists.txt

@@ -17,7 +17,8 @@ set ( LIBSPHINX_SRCS sphinx.cpp sphinxexcerpt.cpp
 		sphinxqcache.cpp sphinxjsonquery.cpp
 		attribute.cpp secondaryindex.cpp killlist.cpp searchnode.cpp json/cJSON.c
 		sphinxpq.cpp icu.cpp global_idf.cpp docstore.cpp lz4/lz4.c lz4/lz4hc.c
-		searchdexpr.cpp snippetfunctor.cpp snippetindex.cpp snippetstream.cpp )
+		searchdexpr.cpp snippetfunctor.cpp snippetindex.cpp snippetstream.cpp
+		indexcheck.cpp datareader.cpp indexformat.cpp )
 set ( INDEXER_SRCS indexer.cpp )
 set ( INDEXTOOL_SRCS indextool.cpp )
 set ( SEARCHD_SRCS_TESTABLE searchdha.cpp http/http_parser.c searchdhttp.cpp

+ 1 - 0
src/attribute.cpp

@@ -16,6 +16,7 @@
 
 #include "sphinxint.h"
 #include "sphinxjson.h"
+#include "indexcheck.h"
 
 //////////////////////////////////////////////////////////////////////////
 // blob attributes

+ 276 - 0
src/datareader.cpp

@@ -0,0 +1,276 @@
+//
+// Copyright (c) 2017-2019, Manticore Software LTD (http://manticoresearch.com)
+// All rights reserved
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License. You should have
+// received a copy of the GPL license along with this program; if you
+// did not, you can find it at http://www.gnu.org/
+//
+
+#include "datareader.h"
+#include "sphinxint.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+// Picks the ESphQueryState value that corresponds to the given reader kind:
+// DOCS and HITS have dedicated states, any other kind falls back to SPH_QSTATE_IO.
+inline static ESphQueryState StateByKind ( DataReaderFactory_c::Kind_e eKind )
+{
+	if ( eKind==DataReaderFactory_c::DOCS )
+		return SPH_QSTATE_READ_DOCS;
+
+	if ( eKind==DataReaderFactory_c::HITS )
+		return SPH_QSTATE_READ_HITS;
+
+	return SPH_QSTATE_IO;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Shared base of the concrete readers in this file: keeps the name of the file
+// being read and routes rowid/wordid decoding to the generic unzip calls.
+class FileBlockReader_c : public FileBlockReader_i
+{
+public:
+	explicit FileBlockReader_c ( const char * szFileName )
+		: m_szFileName ( szFileName )
+	{}
+
+	// rowids are decoded through UnzipInt()
+	RowID_t UnzipRowid() override
+	{
+		return UnzipInt();
+	}
+
+	// wordids are decoded through UnzipOffset()
+	SphWordID_t UnzipWordid() override
+	{
+		return UnzipOffset();
+	}
+
+protected:
+	const char * m_szFileName = nullptr;	// stored as passed in; no copy is made here
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// mimics CSphReader, but works entirely over a memory region (intended to be used with mmap)
+class ThinMMapReader_c : public FileBlockReader_c
+{
+	friend class MMapFactory_c;	// the only one allowed to construct readers
+
+public:
+	// offset of the read cursor from the start of the region; 0 if there is no cursor
+	SphOffset_t GetPos () const final
+	{
+		if ( m_pPointer )
+		{
+			assert ( m_pBase );
+			return m_pPointer - m_pBase;
+		}
+
+		return 0;
+	}
+
+	// moves the cursor; the position is not validated here, GetByte() range-checks on every read
+	void SeekTo ( SphOffset_t iPos, int /*iSizeHint*/ ) final
+	{
+		m_pPointer = m_pBase + iPos;
+	}
+
+	DWORD		UnzipInt () final;
+	uint64_t	UnzipOffset () final;
+
+	// rewinds the cursor to the start of the region
+	void Reset () final
+	{
+		m_pPointer = m_pBase;
+	}
+
+protected:
+	~ThinMMapReader_c() final {}
+
+private:
+	const BYTE *	m_pBase = nullptr;		// start of the region
+	const BYTE *	m_pPointer = nullptr;	// read cursor
+	SphOffset_t		m_iSize = 0;			// region size, in bytes
+
+	ThinMMapReader_c ( const BYTE * pArena, SphOffset_t iSize, const char * sFileName )
+		: FileBlockReader_c ( sFileName )
+	{
+		m_pBase = pArena;
+		m_pPointer = pArena;
+		m_iSize = iSize;
+	}
+
+	// returns the byte under the cursor and advances it;
+	// outside of the region it logs a warning and returns 0, without moving the cursor
+	BYTE GetByte()
+	{
+		auto iOffset = m_pPointer - m_pBase;
+		if ( iOffset<0 || iOffset>=m_iSize )
+		{
+			sphWarning( "INTERNAL: out-of-range in ThinMMapReader_c: trying to read '%s' at " INT64_FMT ", from mmap of "
+				INT64_FMT ", query most probably would FAIL; report the fact to dev!",
+				( m_szFileName ? m_szFileName : "" ), int64_t(iOffset), int64_t(m_iSize) );
+
+			return 0; // it's better than a crash because of an unexpected out-of-range read (file reader does the same there)
+		}
+
+		return *m_pPointer++;
+	}
+};
+
+
+// Decodes one variable-length DWORD, pulling the source bytes through GetByte().
+// The whole body is the SPH_VARINT_DECODE macro; presumably it also supplies
+// the return statement (the macro is defined outside of this file).
+DWORD ThinMMapReader_c::UnzipInt()
+{
+	SPH_VARINT_DECODE ( DWORD, GetByte() );
+}
+
+
+// Decodes one variable-length uint64_t, pulling the source bytes through GetByte().
+// The whole body is the SPH_VARINT_DECODE macro; presumably it also supplies
+// the return statement (the macro is defined outside of this file).
+uint64_t ThinMMapReader_c::UnzipOffset()
+{
+	SPH_VARINT_DECODE ( uint64_t, GetByte() );
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// FileBlockReader_i implementation on top of FileReader_c: every interface call
+// is forwarded to the FileReader_c base. Instances are made by DirectFactory_c.
+class DirectFileReader_c : public FileBlockReader_c, protected FileReader_c
+{
+	friend class DirectFactory_c;
+
+public:
+	SphOffset_t GetPos () const final				{ return FileReader_c::GetPos(); }
+	void SeekTo ( SphOffset_t iPos, int iSizeHint ) final	{ FileReader_c::SeekTo ( iPos, iSizeHint ); }
+	DWORD UnzipInt() final							{ return FileReader_c::UnzipInt(); }
+	uint64_t UnzipOffset() final					{ return FileReader_c::UnzipOffset(); }
+	void Reset() final								{ FileReader_c::Reset(); }
+
+protected:
+	// pBuf/iSize go to FileReader_c as is; szFileName is kept by FileBlockReader_c
+	explicit DirectFileReader_c ( BYTE * pBuf, int iSize, const char * szFileName )
+		: FileBlockReader_c ( szFileName )
+		, FileReader_c ( pBuf, iSize )
+	{}
+
+	~DirectFileReader_c() final {}
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// factory of readers that access the data with Seek + Read
+class DirectFactory_c : public DataReaderFactory_c
+{
+public:
+	// opens sFile right away; the factory is valid only if that succeeded
+	DirectFactory_c ( const CSphString & sFile, CSphString & sError, ESphQueryState eState, int iReadBuffer, int iReadUnhinted )
+		: m_eWorkState ( eState )
+		, m_iReadBuffer ( iReadBuffer )
+		, m_iReadUnhinted ( iReadUnhinted )
+	{
+		const bool bOpened = m_dReader.Open ( sFile, sError );
+		SetValid ( bOpened );
+	}
+
+	SphOffset_t GetFilesize () const final	{ return m_dReader.GetFilesize(); }
+
+	// position that newly made readers will start from
+	SphOffset_t GetPos () const final		{ return m_iPos; }
+	void SeekTo ( SphOffset_t iPos ) final	{ m_iPos = iPos; }
+
+	// makes a dependent reader that shares the FD with this factory
+	FileBlockReader_c * MakeReader ( BYTE * pBuf, int iSize ) final
+	{
+		auto pMade = new DirectFileReader_c ( pBuf, iSize, m_dReader.GetFilename().cstr() );
+		pMade->SetFile ( m_dReader.GetFD(), m_dReader.GetFilename().cstr() );
+		pMade->SetBuffers ( m_iReadBuffer, m_iReadUnhinted );
+
+		// a zero start position needs no seek
+		if ( m_iPos!=0 )
+			pMade->SeekTo ( m_iPos, READ_NO_SIZE_HINT );
+
+		// the new reader uses the same profile, with the state this factory was created for
+		pMade->m_pProfile = m_dReader.m_pProfile;
+		pMade->m_eProfileState = m_eWorkState;
+		return pMade;
+	}
+
+	void SetProfile ( CSphQueryProfile* pProfile ) final
+	{
+		m_dReader.m_pProfile = pProfile;
+	}
+
+protected:
+	~DirectFactory_c() final {} // destroyed only via Release
+
+private:
+	CSphAutoreader	m_dReader;
+	ESphQueryState	m_eWorkState;
+	SphOffset_t		m_iPos = 0;
+	int				m_iReadBuffer = 0;
+	int				m_iReadUnhinted = 0;
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+// factory of readers that access the data via mmap
+class MMapFactory_c : public DataReaderFactory_c
+{
+public:
+	// maps sFile right away; the factory is valid only if the mapping was set up.
+	// With FileAccess_e::MLOCK the mapping is additionally mem-locked.
+	MMapFactory_c ( const CSphString & sFile, CSphString & sError, FileAccess_e eAccess )
+	{
+		const bool bMapped = m_tBackendFile.Setup ( sFile, sError );
+		SetValid ( bMapped );
+
+		if ( eAccess==FileAccess_e::MLOCK )
+			m_tBackendFile.MemLock( sError );
+	}
+
+	SphOffset_t GetFilesize () const final	{ return m_tBackendFile.GetLength64 (); }
+
+	// position that newly made readers will start from
+	SphOffset_t GetPos () const final		{ return m_iPos; }
+	void SeekTo ( SphOffset_t iPos ) final	{ m_iPos = iPos; }
+
+	// makes a dependent reader that shares the mmap with this factory; the arguments are not used
+	FileBlockReader_c * MakeReader ( BYTE *, int ) final
+	{
+		auto pMade = new ThinMMapReader_c ( m_tBackendFile.GetWritePtr(),
+			m_tBackendFile.GetLength64(), m_tBackendFile.GetFileName() );
+
+		// a zero start position needs no seek
+		if ( m_iPos!=0 )
+			pMade->SeekTo ( m_iPos, 0 );
+
+		return pMade;
+	}
+
+protected:
+	~MMapFactory_c() final {} // destroyed only via Release
+
+private:
+	CSphMappedBuffer<BYTE>	m_tBackendFile;
+	SphOffset_t				m_iPos = 0;
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+extern int g_iReadUnhinted;
+
+// Creates a reader factory for sFile: a Seek+Read one for FileAccess_e::FILE,
+// an mmap-based one for any other access mode (which is passed on to the factory).
+// eKind selects the query state handed to the Seek+Read factory; iReadBuffer and
+// g_iReadUnhinted are its buffer settings. sError is passed to the factory constructor.
+// A factory that did not come up valid is released before returning.
+DataReaderFactory_c * NewProxyReader ( const CSphString & sFile, CSphString & sError, DataReaderFactory_c::Kind_e eKind, int iReadBuffer, FileAccess_e eAccess )
+{
+	auto eState = StateByKind ( eKind );
+	DataReaderFactory_c * pReader = nullptr;
+
+	if ( eAccess==FileAccess_e::FILE )
+		pReader = new DirectFactory_c ( sFile, sError, eState, iReadBuffer, g_iReadUnhinted );
+	else
+		pReader = new MMapFactory_c ( sFile, sError, eAccess );
+
+	// SafeRelease is a macro: terminate it explicitly, and keep the return visibly
+	// outside of the 'if' (it is executed unconditionally)
+	if ( !pReader->IsValid() )
+		SafeRelease ( pReader );
+
+	return pReader;
+}

+ 65 - 0
src/datareader.h

@@ -0,0 +1,65 @@
+//
+// Copyright (c) 2017-2019, Manticore Software LTD (http://manticoresearch.com)
+// All rights reserved
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License. You should have
+// received a copy of the GPL license along with this program; if you
+// did not, you can find it at http://www.gnu.org/
+//
+
+#ifndef _datareader_
+#define _datareader_
+
+#include "sphinx.h"
+
+#define READ_NO_SIZE_HINT 0
+
+// Reader over a file or a file mapping
+class FileBlockReader_i : public ISphRefcountedMT
+{
+public:
+	// current read position
+	virtual SphOffset_t GetPos() const = 0;
+
+	// move the read position; iSizeHint is an implementation-specific hint
+	virtual void SeekTo ( SphOffset_t iPos, int iSizeHint ) = 0;
+
+	// compressed value decoders
+	virtual DWORD UnzipInt() = 0;
+	virtual uint64_t UnzipOffset() = 0;
+	virtual RowID_t UnzipRowid() = 0;
+	virtual SphWordID_t UnzipWordid() = 0;
+
+	virtual void Reset () = 0;
+};
+
+
+using FileBlockReaderPtr_c = CSphRefcountedPtr<FileBlockReader_i>;
+
+// factory of readers over a file or a file mapping
+class DataReaderFactory_c : public ISphRefcountedMT
+{
+public:
+	enum Kind_e
+	{
+		DOCS,
+		HITS
+	};
+
+	// whether the factory was flagged valid by its implementation
+	bool IsValid () const
+	{
+		return m_bValid;
+	}
+
+	virtual SphOffset_t GetFilesize () const = 0;
+	virtual SphOffset_t GetPos () const = 0;
+	virtual void SeekTo ( SphOffset_t ) = 0;
+	virtual FileBlockReader_i * MakeReader ( BYTE * pBuf, int iSize ) = 0;
+
+	// does nothing by default
+	virtual void SetProfile ( CSphQueryProfile * ) {}
+
+protected:
+	~DataReaderFactory_c () override {}
+
+	// for implementations: report whether the underlying file is usable
+	void SetValid ( bool bValid )
+	{
+		m_bValid = bValid;
+	}
+
+private:
+	bool m_bValid = false;
+};
+
+using DataReaderFactoryPtr_c = CSphRefcountedPtr<DataReaderFactory_c>;
+
+DataReaderFactory_c * NewProxyReader ( const CSphString & sFile, CSphString & sError, DataReaderFactory_c::Kind_e eKind, int iReadBuffer, FileAccess_e eAccess );
+
+#endif // _datareader_

+ 320 - 3
src/docstore.cpp

@@ -16,6 +16,7 @@
 
 #include "sphinxint.h"
 #include "attribute.h"
+#include "indexcheck.h"
 #include "lz4/lz4.h"
 #include "lz4/lz4hc.h"
 
@@ -28,7 +29,8 @@ enum BlockFlags_e : BYTE
 enum BlockType_e : BYTE
 {
 	BLOCK_TYPE_SMALL,
-	BLOCK_TYPE_BIG
+	BLOCK_TYPE_BIG,
+	BLOCK_TYPE_TOTAL
 };
 
 enum DocFlags_e : BYTE
@@ -717,6 +719,8 @@ static void CreateFieldRemap ( VecTraits_T<int> & dFieldInRset, const VecTraits_
 
 class Docstore_c : public Docstore_i, public DocstoreSettings_t
 {
+	friend class DocstoreChecker_c;
+
 public:
 						Docstore_c ( const CSphString & sFilename );
 						~Docstore_c();
@@ -765,7 +769,7 @@ private:
 
 	bool						ProcessSmallBlockDoc ( RowID_t tCurDocRowID, RowID_t tRowID, const VecTraits_T<int> * pFieldIds, const CSphFixedVector<int> & dFieldInRset, bool bPack,
 		MemoryReader2_c & tReader, CSphBitvec & tEmptyFields, DocstoreDoc_t & tResult ) const;
-	const void					ProcessBigBlockField ( int iField, const FieldInfo_t & tInfo, int iFieldInRset, bool bPack, int64_t iSessionId, SphOffset_t & tOffset, DocstoreDoc_t & tResult ) const;
+	void						ProcessBigBlockField ( int iField, const FieldInfo_t & tInfo, int iFieldInRset, bool bPack, int64_t iSessionId, SphOffset_t & tOffset, DocstoreDoc_t & tResult ) const;
 };
 
 
@@ -1084,7 +1088,7 @@ BlockCache_c::BlockData_t Docstore_c::UncompressBigBlockField ( SphOffset_t tOff
 }
 
 
-const void Docstore_c::ProcessBigBlockField ( int iField, const FieldInfo_t & tInfo, int iFieldInRset, bool bPack, int64_t iSessionId, SphOffset_t & tOffset, DocstoreDoc_t & tResult ) const
+void Docstore_c::ProcessBigBlockField ( int iField, const FieldInfo_t & tInfo, int iFieldInRset, bool bPack, int64_t iSessionId, SphOffset_t & tOffset, DocstoreDoc_t & tResult ) const
 {
 	if ( tInfo.m_uFlags & FIELD_FLAG_EMPTY )
 		return;
@@ -1743,6 +1747,312 @@ DocstoreSession_c::~DocstoreSession_c()
 
 //////////////////////////////////////////////////////////////////////////
 
+// Debug checker for a docstore file: reads the file through the supplied reader,
+// validates the settings, the field list, the block index and every block,
+// and reports whatever it finds via the supplied DebugCheckError_c.
+class DocstoreChecker_c
+{
+public:
+						DocstoreChecker_c ( CSphAutoreader & tReader, DebugCheckError_c & tReporter );
+
+	// runs the whole check; false means the check was aborted on a fatal problem
+	bool				Check();
+
+private:
+	CSphAutoreader &	m_tReader;			// source of the docstore data (not owned)
+	DebugCheckError_c &	m_tReporter;		// sink for the failure reports (not owned)
+	const char *		m_szFilename = nullptr;	// file name used in the messages, taken from the reader
+	DocstoreFields_c	m_tFields;			// fields loaded from the file by Check()
+	CSphScopedPtr<Compressor_i> m_pCompressor{nullptr};	// created by Check() from the compression stored in the file
+
+	// per-block checks; CheckBlock() dispatches by the block type
+	void				CheckSmallBlockDoc ( MemoryReader2_c & tReader, CSphBitvec & tEmptyFields, SphOffset_t tOffset );
+	void				CheckSmallBlock ( const Docstore_c::Block_t & tBlock );
+	void				CheckBlock ( const Docstore_c::Block_t & tBlock );
+	void				CheckBigBlockField ( const Docstore_c::FieldInfo_t & tInfo, SphOffset_t & tOffset );
+	void				CheckBigBlock ( const Docstore_c::Block_t & tBlock );
+};
+
+
+// Binds the checker to the reader and the reporter (both are kept by reference)
+// and remembers the reader's file name for use in the messages.
+DocstoreChecker_c::DocstoreChecker_c ( CSphAutoreader & tReader, DebugCheckError_c & tReporter )
+	: m_tReader ( tReader )
+	, m_tReporter ( tReporter )
+{
+	m_szFilename = tReader.GetFilename().cstr();
+}
+
+
+// Checks the whole docstore file, in this order: storage version, compression,
+// field list, block index (located via the header offset), and then every block.
+// Returns false as soon as a fatal structural problem is found (Fail() always returns false);
+// problems found inside of the blocks are only reported and do not change the result.
+bool DocstoreChecker_c::Check()
+{
+	DWORD uStorageVersion = m_tReader.GetDword();
+	if ( uStorageVersion > STORAGE_VERSION )
+		return m_tReporter.Fail ( "Unable to load docstore: %s is v.%d, binary is v.%d", m_szFilename, uStorageVersion, STORAGE_VERSION );
+
+	m_tReader.GetDword();	// block size
+	BYTE uCompression = m_tReader.GetByte();
+	if ( uCompression > 2 )
+		return m_tReporter.Fail ( "Unknown docstore compression %u in %s", uCompression, m_szFilename );
+
+	Compression_e eCompression = Byte2Compression(uCompression);
+	m_pCompressor = CreateCompressor ( eCompression, DEFAULT_COMPRESSION_LEVEL );
+	if ( !m_pCompressor.Ptr() )
+		return m_tReporter.Fail ( "Unable to create compressor in %s", m_szFilename );
+
+	DWORD uNumFields = m_tReader.GetDword();
+	const DWORD MAX_SANE_FIELDS = 32768;
+	if ( uNumFields > MAX_SANE_FIELDS )
+		return m_tReporter.Fail ( "Too many docstore fields (%u) in %s", uNumFields, m_szFilename );
+
+	for ( int i = 0; i < (int)uNumFields; i++ )
+	{
+		BYTE uDataType = m_tReader.GetByte();
+		// NOTE(review): DOCSTORE_TOTAL is taken to be the count sentinel of DocstoreDataType_e
+		// (same convention as BLOCK_TYPE_TOTAL), so it is not a valid type itself - confirm
+		if ( uDataType >= DOCSTORE_TOTAL )
+			return m_tReporter.Fail ( "Unknown docstore data type (%u) in %s", uDataType, m_szFilename );
+
+		DocstoreDataType_e eType = (DocstoreDataType_e)uDataType;
+		CSphString sName = m_tReader.GetString();
+		const int MAX_SANE_FIELD_NAME_LEN = 32768;
+		if ( sName.Length() > MAX_SANE_FIELD_NAME_LEN )
+			return m_tReporter.Fail ( "Docstore field name too long (%d) in %s", sName.Length(), m_szFilename );
+
+		m_tFields.AddField ( sName, eType );
+	}
+
+	DWORD uNumBlocks = m_tReader.GetDword();
+	if ( !uNumBlocks )
+		return m_tReporter.Fail ( "Docstore has 0 blocks in %s", m_szFilename );
+
+	SphOffset_t tHeaderOffset = m_tReader.GetOffset();
+	if ( tHeaderOffset <= 0 || tHeaderOffset >= m_tReader.GetFilesize() )
+		return m_tReporter.Fail ( "Wrong docstore header offset (" INT64_FMT ") in %s", tHeaderOffset, m_szFilename );
+
+	m_tReader.SeekTo ( tHeaderOffset, 0 );
+
+	CSphFixedVector<Docstore_c::Block_t> dBlocks(uNumBlocks);
+
+	// rowids and offsets are stored as deltas from the previous block
+	DWORD tPrevBlockRowID = 0;
+	SphOffset_t tPrevBlockOffset = 0;
+	for ( auto & i : dBlocks )
+	{
+		RowID_t uUnzipped = m_tReader.UnzipRowid();
+		if ( (int64_t)uUnzipped + tPrevBlockRowID >= (int64_t)0xFFFFFFFF )
+			m_tReporter.Fail ( "Docstore rowid overflow in %s", m_szFilename );
+
+		i.m_tRowID = uUnzipped + tPrevBlockRowID;
+		BYTE uBlockType = m_tReader.GetByte();
+		// BLOCK_TYPE_TOTAL is the sentinel that follows the last real block type, so it is invalid too
+		if ( uBlockType>=BLOCK_TYPE_TOTAL )
+			return m_tReporter.Fail ( "Unknown docstore block type (%u) in %s", uBlockType, m_szFilename );
+
+		i.m_eType = (BlockType_e)uBlockType;
+		i.m_tOffset = m_tReader.UnzipOffset() + tPrevBlockOffset;
+		if ( i.m_tOffset <= 0 || i.m_tOffset >= m_tReader.GetFilesize() )
+			return m_tReporter.Fail ( "Wrong docstore block offset (" INT64_FMT ") in %s", i.m_tOffset, m_szFilename );
+
+		if ( i.m_eType==BLOCK_TYPE_BIG )
+			i.m_uHeaderSize = m_tReader.UnzipInt();
+
+		tPrevBlockRowID = i.m_tRowID;
+		tPrevBlockOffset = i.m_tOffset;
+	}
+
+	// the size of a block is the distance to the next block
+	for ( int i = 1; i<dBlocks.GetLength(); i++ )
+	{
+		if ( dBlocks[i-1].m_tOffset>=dBlocks[i].m_tOffset )
+			return m_tReporter.Fail ( "Descending docstore block offset in %s", m_szFilename );
+
+		dBlocks[i-1].m_uSize = dBlocks[i].m_tOffset-dBlocks[i-1].m_tOffset;
+	}
+
+	// the size of the last block is the distance to the header, so that block
+	// must start before the header; otherwise the subtraction below would wrap around
+	if ( dBlocks.Last().m_tOffset>=tHeaderOffset )
+		return m_tReporter.Fail ( "Wrong docstore block offset (" INT64_FMT ") in %s", dBlocks.Last().m_tOffset, m_szFilename );
+
+	dBlocks.Last().m_uSize = tHeaderOffset-dBlocks.Last().m_tOffset;
+
+	for ( auto & i : dBlocks )
+	{
+		if ( i.m_tOffset+i.m_uSize > m_tReader.GetFilesize() )
+			return m_tReporter.Fail ( "Docstore block size+offset out of bounds in %s", m_szFilename );
+
+		CheckBlock(i);
+	}
+
+	// read errors are sticky in the reader; report them once, at the very end
+	if ( m_tReader.GetErrorFlag() )
+		return m_tReporter.Fail ( "%s", m_tReader.GetErrorMessage().cstr() );
+
+	return true;
+}
+
+
+// Checks one document of a small block, starting at the current position of tReader:
+// the doc flags, the optional bitmask of empty fields, and the length-prefixed data
+// of every stored field. tEmptyFields is scratch storage for the bitmask; tOffset is
+// the file offset of the block, used in the messages only.
+// Once the document runs out of the block, the reader is parked at the end of the
+// block, so that the following documents are reported instead of being parsed out of bounds.
+void DocstoreChecker_c::CheckSmallBlockDoc ( MemoryReader2_c & tReader, CSphBitvec & tEmptyFields, SphOffset_t tOffset )
+{
+	// every document takes at least the flags byte
+	if ( tReader.GetPos() >= tReader.GetLength() )
+	{
+		m_tReporter.Fail ( "Out of bounds in docstore field data in %s (offset " INT64_FMT ")", m_szFilename, tOffset );
+		return;
+	}
+
+	BYTE uDocFlags = tReader.GetByte();
+
+	if ( uDocFlags & ( ~(DOC_FLAG_ALL_EMPTY | DOC_FLAG_EMPTY_BITMASK) ) )
+		m_tReporter.Fail ( "Unknown docstore doc flag (%u) in %s (offset " INT64_FMT ")", uDocFlags, m_szFilename, tOffset );
+
+	if ( uDocFlags & DOC_FLAG_ALL_EMPTY )
+		return;
+
+	DWORD uBitMaskSize = tEmptyFields.GetSize()*sizeof(DWORD);
+
+	bool bHasBitmask = !!(uDocFlags & DOC_FLAG_EMPTY_BITMASK);
+	if ( bHasBitmask )
+	{
+		// the bitmask is copied straight out of the block, so make sure it is all there
+		if ( (int64_t)tReader.GetPos() + uBitMaskSize > (int64_t)tReader.GetLength() )
+		{
+			m_tReporter.Fail ( "Out of bounds in docstore field data in %s (offset " INT64_FMT ")", m_szFilename, tOffset );
+			tReader.SetPos ( tReader.GetLength() );
+			return;
+		}
+
+		memcpy ( tEmptyFields.Begin(), tReader.Begin()+tReader.GetPos(), uBitMaskSize );
+		tReader.SetPos ( tReader.GetPos()+uBitMaskSize );
+	}
+
+	for ( int iField = 0; iField < m_tFields.GetNumFields(); iField++ )
+		if ( !bHasBitmask || !tEmptyFields.BitGet(iField) )
+		{
+			// the length prefix itself must be inside of the block
+			if ( tReader.GetPos() >= tReader.GetLength() )
+			{
+				m_tReporter.Fail ( "Out of bounds in docstore field data in %s (offset " INT64_FMT ")", m_szFilename, tOffset );
+				tReader.SetPos ( tReader.GetLength() );
+				return;
+			}
+
+			DWORD uFieldLength = tReader.UnzipInt();
+
+			// 64-bit math, so that a huge bogus length can not wrap the position
+			int64_t iFieldEnd = (int64_t)tReader.GetPos() + uFieldLength;
+			if ( iFieldEnd > (int64_t)tReader.GetLength() )
+			{
+				m_tReporter.Fail ( "Out of bounds in docstore field data in %s (offset " INT64_FMT ")", m_szFilename, tOffset );
+				tReader.SetPos ( tReader.GetLength() );
+				return;
+			}
+
+			tReader.SetPos ( tReader.GetPos()+uFieldLength );
+		}
+}
+
+
+// Checks one small block: reads it from the file, validates the block header
+// (flags, number of docs, uncompressed and compressed sizes), decompresses the body
+// when it is flagged compressed, and then checks every document stored in the body.
+// The check of the block stops early if the stored body does not fit into the block,
+// or can not be decompressed, since there is nothing sane to parse in those cases.
+void DocstoreChecker_c::CheckSmallBlock ( const Docstore_c::Block_t & tBlock )
+{
+	CSphFixedVector<BYTE> dBlock ( tBlock.m_uSize );
+
+	m_tReader.SeekTo ( tBlock.m_tOffset, 0 );
+	m_tReader.GetBytes ( dBlock.Begin(), dBlock.GetLength() );
+
+	MemoryReader2_c tBlockReader ( dBlock.Begin(), dBlock.GetLength() );
+	BYTE uFlags = tBlockReader.GetByte();
+	DWORD uNumDocs = tBlockReader.UnzipInt();
+	DWORD uSize = tBlockReader.UnzipInt();
+
+	// without compression the body is stored as is, so both lengths are the same
+	DWORD uCompressedLength = uSize;
+	bool bCompressed = !!( uFlags & BLOCK_FLAG_COMPRESSED );
+	if ( bCompressed )
+		uCompressedLength = tBlockReader.UnzipInt();
+
+	if ( uFlags!=0 && uFlags!=BLOCK_FLAG_COMPRESSED )
+		m_tReporter.Fail ( "Unknown docstore small block flag (%u) in %s (offset " INT64_FMT ")", uFlags, m_szFilename, tBlock.m_tOffset );
+
+	if ( uCompressedLength>uSize )
+		m_tReporter.Fail ( "Docstore block size mismatch: compressed=%u, uncompressed=%u in %s (offset " INT64_FMT ")", uCompressedLength, uSize, m_szFilename, tBlock.m_tOffset );
+
+	// the stored body must fit into what was read for this block
+	if ( (int64_t)tBlockReader.GetPos() + uCompressedLength > (int64_t)dBlock.GetLength() )
+	{
+		m_tReporter.Fail ( "Out of bounds in docstore field data in %s (offset " INT64_FMT ")", m_szFilename, tBlock.m_tOffset );
+		return;
+	}
+
+	BYTE * pBody = dBlock.Begin() + tBlockReader.GetPos();
+
+	// the docs are parsed either in place (raw body), or from the decompressed copy;
+	// both buffers are owned by the vectors, so there is nothing to free by hand
+	BYTE * pDocs = pBody;
+	CSphFixedVector<BYTE> dDecompressed(0);
+	if ( bCompressed )
+	{
+		dDecompressed.Reset ( uSize );
+		if ( !m_pCompressor->Decompress ( VecTraits_T<const BYTE> (pBody, uCompressedLength), dDecompressed) )
+		{
+			m_tReporter.Fail ( "Error decompressing small block in %s (offset " INT64_FMT ")", m_szFilename, tBlock.m_tOffset );
+			return;
+		}
+
+		pDocs = dDecompressed.Begin();
+	}
+
+	MemoryReader2_c tReader ( pDocs, uSize );
+	CSphBitvec tEmptyFields ( m_tFields.GetNumFields() );
+	for ( int i = 0; i < (int)uNumDocs; i++ )
+		CheckSmallBlockDoc ( tReader, tEmptyFields, tBlock.m_tOffset );
+}
+
+
+// Checks the stored data of one field of a big block. tOffset is the file offset
+// of the field data on entry; it is advanced past the field on exit (empty fields
+// store no data and leave it untouched). The data is read from the file, and
+// decompressed if the field is flagged compressed.
+void DocstoreChecker_c::CheckBigBlockField ( const Docstore_c::FieldInfo_t & tInfo, SphOffset_t & tOffset )
+{
+	if ( tInfo.m_uFlags & FIELD_FLAG_EMPTY )
+		return;
+
+	bool bCompressed = !!( tInfo.m_uFlags & FIELD_FLAG_COMPRESSED );
+	SphOffset_t tStoredLen = bCompressed ? tInfo.m_uCompressedLen : tInfo.m_uUncompressedLen;
+	SphOffset_t tFieldOffset = tOffset;
+
+	tOffset += tStoredLen;
+
+	// validate the range first; there is no point in allocating and reading past the end of the file
+	if ( tOffset > m_tReader.GetFilesize() )
+	{
+		m_tReporter.Fail ( "Docstore block size+offset out of bounds in %s (offset " INT64_FMT ")", m_szFilename, tOffset );
+		return;
+	}
+
+	CSphFixedVector<BYTE> dField ( tStoredLen );
+	m_tReader.SeekTo ( tFieldOffset, 0 );
+	m_tReader.GetBytes ( dField.Begin(), dField.GetLength() );
+
+	if ( bCompressed )
+	{
+		CSphFixedVector<BYTE> dDecompressed(0);
+		dDecompressed.Reset ( tInfo.m_uUncompressedLen );
+		if ( !m_pCompressor->Decompress ( dField, dDecompressed ) )
+			m_tReporter.Fail ( "Error decompressing big block in %s (offset " INT64_FMT ")", m_szFilename, tFieldOffset );
+	}
+}
+
+
+// Checks one big block: reads and parses the block header (block flags, the optional
+// field reorder table, and flags plus lengths of every field), and then checks the
+// stored data of every field, which follows the header in the stored field order.
+// The check of the block stops at a broken reorder table, because its entries
+// are used as indexes into the per-field info.
+void DocstoreChecker_c::CheckBigBlock ( const Docstore_c::Block_t & tBlock )
+{
+	const int iNumFields = m_tFields.GetNumFields();
+	CSphFixedVector<Docstore_c::FieldInfo_t> dFieldInfo ( iNumFields );
+
+	CSphFixedVector<BYTE> dBlockHeader(tBlock.m_uHeaderSize);
+
+	m_tReader.SeekTo ( tBlock.m_tOffset, 0 );
+	m_tReader.GetBytes ( dBlockHeader.Begin(), dBlockHeader.GetLength() );
+
+	MemoryReader2_c tReader ( dBlockHeader.Begin(), dBlockHeader.GetLength() );
+
+	CSphVector<int> dFieldSort;
+	BYTE uBlockFlags = tReader.GetByte();
+	if ( uBlockFlags & ~BLOCK_FLAG_FIELD_REORDER )
+		m_tReporter.Fail ( "Unknown docstore big block flag (%u) in %s (offset " INT64_FMT ")", uBlockFlags, m_szFilename, tBlock.m_tOffset );
+
+	bool bNeedReorder = !!( uBlockFlags & BLOCK_FLAG_FIELD_REORDER );
+	if ( bNeedReorder )
+	{
+		dFieldSort.Resize ( iNumFields );
+		for ( auto & i : dFieldSort )
+		{
+			i = tReader.UnzipInt();
+
+			// valid field ids are 0..iNumFields-1
+			if ( i<0 || i>=iNumFields )
+			{
+				m_tReporter.Fail ( "Error in docstore field remap (%d) in %s (offset " INT64_FMT ")", i, m_szFilename, tBlock.m_tOffset );
+				return;
+			}
+		}
+	}
+
+	for ( int i = 0; i < iNumFields; i++ )
+	{
+		int iField = bNeedReorder ? dFieldSort[i] : i;
+		Docstore_c::FieldInfo_t & tInfo = dFieldInfo[iField];
+
+		tInfo.m_uFlags = tReader.GetByte();
+		if ( tInfo.m_uFlags & (~(FIELD_FLAG_EMPTY | FIELD_FLAG_COMPRESSED) ) )
+			m_tReporter.Fail ( "Unknown docstore big block field flag (%u) in %s (offset " INT64_FMT ")", tInfo.m_uFlags, m_szFilename, tBlock.m_tOffset );
+
+		// empty fields store no lengths
+		if ( tInfo.m_uFlags & FIELD_FLAG_EMPTY )
+			continue;
+
+		tInfo.m_uUncompressedLen = tReader.UnzipInt();
+		if ( tInfo.m_uFlags & FIELD_FLAG_COMPRESSED )
+			tInfo.m_uCompressedLen = tReader.UnzipInt();
+
+		if ( tInfo.m_uCompressedLen>tInfo.m_uUncompressedLen )
+			m_tReporter.Fail ( "Docstore block size mismatch: compressed=%u, uncompressed=%u in %s (offset " INT64_FMT ")", tInfo.m_uCompressedLen, tInfo.m_uUncompressedLen, m_szFilename, tBlock.m_tOffset );
+
+		if ( tReader.GetPos() > tReader.GetLength() )
+			m_tReporter.Fail ( "Out of bounds in docstore field data in %s (offset " INT64_FMT ")", m_szFilename, tBlock.m_tOffset );
+	}
+
+	// field data starts right after the header
+	SphOffset_t tOffset = tBlock.m_tOffset+tBlock.m_uHeaderSize;
+
+	for ( int i = 0; i < iNumFields; i++ )
+		CheckBigBlockField ( dFieldInfo[bNeedReorder ? dFieldSort[i] : i], tOffset );
+}
+
+
+// Dispatches the check by the block type: whatever is not a small block is checked as a big one.
+void DocstoreChecker_c::CheckBlock ( const Docstore_c::Block_t & tBlock )
+{
+	if ( tBlock.m_eType!=BLOCK_TYPE_SMALL )
+	{
+		CheckBigBlock(tBlock);
+		return;
+	}
+
+	CheckSmallBlock(tBlock);
+}
+
+//////////////////////////////////////////////////////////////////////////
+
 Docstore_i * CreateDocstore ( const CSphString & sFilename, CSphString & sError )
 {
 	CSphScopedPtr<Docstore_c> pDocstore ( new Docstore_c(sFilename) );
@@ -1786,4 +2096,11 @@ void ShutdownDocstore()
 {
 	BlockCache_c::Done();
 	DocstoreReaders_c::Done();
+}
+
+
+// Entry point of the docstore debug check: runs DocstoreChecker_c over the given
+// reader, sending the reports to tReporter, and returns the result of its Check().
+bool CheckDocstore ( CSphAutoreader & tReader, DebugCheckError_c & tReporter )
+{
+	return DocstoreChecker_c ( tReader, tReporter ).Check();
 }

+ 4 - 0
src/docstore.h

@@ -102,6 +102,10 @@ DocstoreFields_i *	CreateDocstoreFields();
 void				InitDocstore ( int64_t iCacheSize );
 void				ShutdownDocstore();
 
+class DebugCheckError_c;
+class CSphAutoreader;
+bool				CheckDocstore ( CSphAutoreader & tReader, DebugCheckError_c & tReporter );
+
 #endif
 
 //

+ 3 - 0
src/gtests/gtests_tokenizer.cpp

@@ -798,6 +798,8 @@ TEST_F ( QueryParser, soft_whitespace4 )
 
 
 static CSphSourceStats g_tTmpDummyStat;
+static FileAccessSettings_t g_tDummyFASettings;
+
 class CSphDummyIndex : public CSphIndex
 {
 public:
@@ -811,6 +813,7 @@ public:
 	void				Dealloc () override {}
 	void				Preread () override {}
 	void				SetMemorySettings ( const FileAccessSettings_t & ) override {}
+	const FileAccessSettings_t & GetMemorySettings() const override { return g_tDummyFASettings; }
 	void				SetBase ( const char * ) override {}
 	bool				Rename ( const char * ) override { return false; }
 	bool				Lock () override { return true; }

+ 1 - 3
src/index_converter.cpp

@@ -24,6 +24,7 @@
 #include "sphinxstem.h"
 #include "sphinxpq.h"
 #include "accumulator.h"
+#include "indexformat.h"
 
 namespace legacy
 {
@@ -41,9 +42,6 @@ STATIC_SIZE_ASSERT ( SphDocID_t, 8 );
 
 const DWORD SPH_SKIPLIST_BLOCK=128;
 
-static const int MAX_KEYWORD_BYTES = SPH_MAX_WORD_LEN*3+4;
-static const int DOCLIST_HINT_THRESH = 256;
-
 static const DWORD META_HEADER_MAGIC	= 0x54525053;	///< my magic 'SPRT' header
 static const DWORD META_VERSION		= 14;			///< current version
 

+ 1338 - 0
src/indexcheck.cpp

@@ -0,0 +1,1338 @@
+//
+// Copyright (c) 2017-2019, Manticore Software LTD (http://manticoresearch.com)
+// Copyright (c) 2001-2016, Andrew Aksyonoff
+// Copyright (c) 2008-2016, Sphinx Technologies Inc
+// All rights reserved
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License. You should have
+// received a copy of the GPL license along with this program; if you
+// did not, you can find it at http://www.gnu.org/
+//
+
+#include "indexcheck.h"
+
+#include "sphinxint.h"
+#include "attribute.h"
+#include "indexformat.h"
+#include "secondaryindex.h"
+#include "docstore.h"
+
+
+// Sets up reporting into pFile (must not be null). Progress output gets enabled
+// only when pFile is a terminal; the start time is taken for the summary made by Done().
+DebugCheckError_c::DebugCheckError_c ( FILE * pFile )
+	: m_pFile ( pFile )
+{
+	assert ( pFile );
+
+	const int iFD = fileno ( pFile );
+	m_bProgress = ( isatty ( iFD )!=0 );
+	m_tStartTime = sphMicroTimer();
+}
+
+
+// Prints a printf-style formatted informational message, followed by a newline.
+void DebugCheckError_c::Msg ( const char * szFmt, ... )
+{
+	assert ( m_pFile );
+
+	va_list tArgs;
+	va_start ( tArgs, szFmt );
+	vfprintf ( m_pFile, szFmt, tArgs );
+	va_end ( tArgs );
+
+	fprintf ( m_pFile, "\n" );
+}
+
+
+// Counts a failure and prints it as "FAILED, <formatted message>", followed by the
+// current segment (when one is set) and a newline. Only the first FAILS_THRESH
+// failures are printed; the last printed one is followed by a note that further
+// output gets suppressed, and everything after that is counted only.
+// Always returns false, so that callers can write 'return tReporter.Fail ( ... )'.
+bool DebugCheckError_c::Fail ( const char * szFmt, ... )
+{
+	assert ( m_pFile );
+	const int FAILS_THRESH = 100;
+
+	// failure number FAILS_THRESH itself must still get here,
+	// otherwise the suppression note below could never be printed
+	if ( ++m_nFails>FAILS_THRESH )
+		return false;
+
+	va_list ap;
+	va_start ( ap, szFmt );
+	fprintf ( m_pFile, "FAILED, " );
+	vfprintf ( m_pFile, szFmt, ap );
+	if ( m_iSegment>=0 )
+		fprintf ( m_pFile, " (segment: %d)", m_iSegment );
+
+	fprintf ( m_pFile, "\n" );
+	va_end ( ap );
+
+	m_nFailsPrinted++;
+	if ( m_nFailsPrinted==FAILS_THRESH )
+		fprintf ( m_pFile, "(threshold reached; suppressing further output)\n" );
+
+	return false;
+}
+
+
+// Prints a printf-style formatted progress line, terminated with a carriage return
+// instead of a newline, and flushes the output. Does nothing unless progress output is enabled.
+void DebugCheckError_c::Progress ( const char * szFmt, ... )
+{
+	if ( m_bProgress )
+	{
+		assert ( m_pFile );
+
+		va_list tArgs;
+		va_start ( tArgs, szFmt );
+		vfprintf ( m_pFile, szFmt, tArgs );
+		va_end ( tArgs );
+
+		fprintf ( m_pFile, "\r" );
+		fflush ( m_pFile );
+	}
+}
+
+
+// Prints the summary line: the verdict (with the failure counts, if any),
+// and the time elapsed since construction, with a tenth of a second precision.
+void DebugCheckError_c::Done()
+{
+	assert ( m_pFile );
+
+	const int64_t tmElapsed = sphMicroTimer() - m_tStartTime;
+
+	if ( m_nFails==0 )
+	{
+		// no known kinds of failures; there still might be some unknown ones
+		fprintf ( m_pFile, "check passed" );
+	} else if ( m_nFails==m_nFailsPrinted )
+	{
+		fprintf ( m_pFile, "check FAILED, " INT64_FMT " failures reported", m_nFails );
+	} else
+	{
+		// some of the failures were counted, but not printed
+		fprintf ( m_pFile, "check FAILED, " INT64_FMT " of " INT64_FMT " failures reported", m_nFailsPrinted, m_nFails );
+	}
+
+	const int iSeconds = (int)(tmElapsed/1000000);
+	const int iTenths = (int)((tmElapsed/100000)%10);
+	fprintf ( m_pFile, ", %d.%d sec elapsed\n", iSeconds, iTenths );
+}
+
+
+// Sets the segment number that Fail() appends to its messages (it does so for values >= 0).
+void DebugCheckError_c::SetSegment ( int iSegment )
+{
+	this->m_iSegment = iSegment;
+}
+
+
+// Returns the number of failures counted so far, printed or not.
+int64_t DebugCheckError_c::GetNumFails() const
+{
+	return this->m_nFails;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// DebugCheckReader_i over a CSphAutoreader. The reader is not owned and may be null:
+// a null reader has zero length, and fails every read and seek.
+class FileDebugCheckReader_c : public DebugCheckReader_i
+{
+public:
+	FileDebugCheckReader_c ( CSphAutoreader * pReader )
+		: m_pReader ( pReader )
+	{}
+
+	~FileDebugCheckReader_c() final {}
+
+	// size of the underlying file
+	int64_t GetLengthBytes() final
+	{
+		if ( m_pReader )
+			return m_pReader->GetFilesize();
+
+		return 0;
+	}
+
+	// reads iSize bytes into pData; false if the reader is in the error state afterwards
+	bool GetBytes ( void * pData, int iSize ) final
+	{
+		if ( m_pReader )
+		{
+			m_pReader->GetBytes ( pData, iSize );
+			return !m_pReader->GetErrorFlag();
+		}
+
+		return false;
+	}
+
+	// moves the read position; false if the reader is in the error state afterwards
+	bool SeekTo ( int64_t iOff, int iHint ) final
+	{
+		if ( m_pReader )
+		{
+			m_pReader->SeekTo ( iOff, iHint );
+			return !m_pReader->GetErrorFlag();
+		}
+
+		return false;
+	}
+
+private:
+	CSphAutoreader * m_pReader = nullptr;
+};
+
+
+// Checks the attribute storage against the schema.
+//   tAttrs       - reader over the row data; nothing is checked if it is empty
+//   tBlobs       - reader over the blob data
+//   nRows        - number of rows to walk through
+//   iMinMaxBytes - part of tAttrs length that is excluded from the row data size
+//   tSchema      - schema that describes the rows
+//   tReporter    - receives all the messages and failures
+// Validates the leading schema attributes (docid, and the blob locator when the schema
+// has blob attrs), the overall size of the row data, and then every row: the blob row
+// the row refers to, and all the float attributes (denormals, infinities and NaNs are reported).
+void DebugCheckHelper_c::DebugCheck_Attributes ( DebugCheckReader_i & tAttrs, DebugCheckReader_i & tBlobs, int64_t nRows, int64_t iMinMaxBytes, const CSphSchema & tSchema, DebugCheckError_c & tReporter ) const
+{
+	// empty?
+	if ( !tAttrs.GetLengthBytes() )
+		return;
+
+	tReporter.Msg ( "checking rows..." );
+
+	if ( !tSchema.GetAttrsCount() )
+	{
+		// everything below needs attribute 0 at the very least
+		tReporter.Fail ( "no attributes in schema; schema should at least have '%s' attr", sphGetDocidName() );
+		return;
+	}
+
+	// report the name that is expected there, not the one that was found
+	if ( tSchema.GetAttr(0).m_sName!=sphGetDocidName() )
+		tReporter.Fail ( "first attribute in schema should be '%s'", sphGetDocidName() );
+
+	if ( tSchema.GetAttr(0).m_eAttrType!=SPH_ATTR_BIGINT )
+		tReporter.Fail ( "%s attribute should be BIGINT", sphGetDocidName() );
+
+	const CSphColumnInfo * pBlobLocator = nullptr;
+
+	if ( tSchema.HasBlobAttrs() )
+	{
+		pBlobLocator = tSchema.GetAttr ( sphGetBlobLocatorName() );
+
+		if ( !pBlobLocator )
+			tReporter.Fail ( "schema has blob attrs, but no blob locator '%s'", sphGetBlobLocatorName() );
+
+		// do not touch attribute 1 unless the schema actually has it
+		const bool bHasSecondAttr = tSchema.GetAttrsCount()>1;
+		if ( !bHasSecondAttr || tSchema.GetAttr(1).m_sName!=sphGetBlobLocatorName() )
+			tReporter.Fail ( "second attribute in schema should be '%s'", sphGetBlobLocatorName() );
+
+		if ( bHasSecondAttr && tSchema.GetAttr(1).m_eAttrType!=SPH_ATTR_BIGINT )
+			tReporter.Fail ( "%s attribute should be BIGINT", sphGetBlobLocatorName() );
+
+		if ( !tBlobs.GetLengthBytes() )
+			tReporter.Fail ( "schema has blob attrs, but blob file is empty" );
+	} else
+	{
+		if ( tBlobs.GetLengthBytes() )
+			tReporter.Fail ( "schema has no blob attrs but has blob rows" );
+	}
+
+	// sizes and counts
+	DWORD uStride = tSchema.GetRowSize();
+
+	int64_t iAttrElemCount = ( tAttrs.GetLengthBytes() - iMinMaxBytes ) / sizeof(CSphRowitem);
+	int64_t iAttrExpected = nRows*uStride;
+	if ( iAttrExpected > iAttrElemCount )
+		tReporter.Fail ( "rowitems count mismatch (expected=" INT64_FMT ", loaded=" INT64_FMT ")", iAttrExpected, iAttrElemCount );
+
+	CSphVector<CSphAttrLocator> dFloatItems;
+	for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
+	{
+		const CSphColumnInfo & tAttr = tSchema.GetAttr(i);
+		if ( tAttr.m_eAttrType==SPH_ATTR_FLOAT )
+			dFloatItems.Add	( tAttr.m_tLocator );
+	}
+
+	CSphFixedVector<CSphRowitem> dRow ( tSchema.GetRowSize() );
+	const CSphRowitem * pRow = dRow.Begin();
+	tAttrs.SeekTo ( 0, dRow.GetLengthBytes() );
+
+	for ( int64_t iRow=0; iRow<nRows; iRow++ )
+	{
+		// a failed read leaves stale data in the row buffer; checking that would only produce bogus reports
+		if ( !tAttrs.GetBytes ( dRow.Begin(), dRow.GetLengthBytes() ) )
+		{
+			tReporter.Fail ( "unable to read row " INT64_FMT " of " INT64_FMT, iRow, nRows );
+			break;
+		}
+
+		DocID_t tDocID = sphGetDocID(pRow);
+
+		///////////////////////////
+		// check blobs
+		///////////////////////////
+
+		if ( pBlobLocator )
+		{
+			int64_t iBlobOffset1 = sphGetBlobRowOffset(pRow);
+			int64_t iBlobOffset2 = sphGetRowAttr ( pRow, pBlobLocator->m_tLocator );
+
+			if ( iBlobOffset1!=iBlobOffset2 )
+				tReporter.Fail ( "blob row locator mismatch (row=" INT64_FMT ", docid=" INT64_FMT ", offset1=" INT64_FMT ", offset2=" INT64_FMT ", rowid=" INT64_FMT " of " INT64_FMT ")",
+					iRow, tDocID, iBlobOffset1, iBlobOffset2, iRow, nRows );
+
+			CSphString sError;
+			if ( !sphCheckBlobRow ( iBlobOffset1, tBlobs, tSchema, sError ) )
+				tReporter.Fail ( "%s at offset " INT64_FMT ", docid=" INT64_FMT ", rowid=" INT64_FMT " of " INT64_FMT, sError.cstr(), iBlobOffset1, tDocID, iRow, nRows );
+		}
+
+		///////////////////////////
+		// check floats
+		///////////////////////////
+
+		ARRAY_FOREACH ( iItem, dFloatItems )
+		{
+			// IEEE 754 single: 1 sign bit, 8 exponent bits (23..30), 23 mantissa bits (0..22)
+			const DWORD uValue = (DWORD)sphGetRowAttr ( pRow, dFloatItems[ iItem ] );
+			const DWORD uExp = ( uValue >> 23 ) & 0xff;
+			const DWORD uMantissa = uValue & 0x007fffff;
+
+			// check normalized
+			if ( uExp==0 && uMantissa!=0 )
+				tReporter.Fail ( "float attribute value is unnormalized (row=" INT64_FMT ", attr=%d, id=" INT64_FMT ", raw=0x%x, value=%f)", 	iRow, iItem, tDocID, uValue, sphDW2F ( uValue ) );
+
+			// check +-inf
+			if ( uExp==0xff && uMantissa==0 )
+				tReporter.Fail ( "float attribute is infinity (row=" INT64_FMT ", attr=%d, id=" INT64_FMT ", raw=0x%x, value=%f)", iRow, iItem, tDocID, uValue, sphDW2F ( uValue ) );
+
+			// check NaN; with the full mantissa mask these no longer get mistaken for infinity
+			if ( uExp==0xff && uMantissa!=0 )
+				tReporter.Fail ( "float attribute is NaN (row=" INT64_FMT ", attr=%d, id=" INT64_FMT ", raw=0x%x, value=%f)", iRow, iItem, tDocID, uValue, sphDW2F ( uValue ) );
+		}
+	}
+}
+
+
+// Verifies the size of the dead row map.
+// iSizeBytes is the actual size of the map, nRows is the number of rows it should cover.
+// The expected size is ceil(nRows/32) DWORD entries; any other size is reported via tReporter.
+void DebugCheckHelper_c::DebugCheck_DeadRowMap ( int64_t iSizeBytes, int64_t nRows, DebugCheckError_c & tReporter ) const
+{
+	tReporter.Msg ( "checking dead row map..." );
+
+	// keep the whole computation in 64 bits; narrowing the entry count to int could truncate it
+	const int64_t nExpectedEntries = ( nRows+31 ) / 32;
+	const int64_t iExpectedSize = nExpectedEntries*(int64_t)sizeof(DWORD);
+	if ( iSizeBytes!=iExpectedSize )
+		tReporter.Fail ( "unexpected dead row map: " INT64_FMT ", expected: " INT64_FMT " bytes", iSizeBytes, iExpectedSize );
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+/// Debug checker for an on-disk index.
+/// Usage as seen in this file: OpenFiles() reads the header and opens the index files,
+/// Setup() supplies the row/block counts, Check() runs the individual checks.
+/// All problems are reported through the DebugCheckError_c passed to the constructor.
+class DiskIndexChecker_c : public DiskIndexChecker_i, public DebugCheckHelper_c
+{
+public:
+			DiskIndexChecker_c ( CSphIndex & tIndex, DebugCheckError_c & tReporter );
+
+	bool	OpenFiles ( CSphString & sError ) final;
+	void	Setup ( int64_t iNumRows, int64_t iDocinfoIndex, int64_t iMinMaxIndex, bool bCheckIdDups ) final;
+	CSphVector<SphWordID_t> & GetHitlessWords() final { return m_dHitlessWords; }
+
+	void	Check() final;
+
+private:
+	CSphIndex &				m_tIndex;			///< index being checked (source of settings and dictionary)
+	CSphAutoreader			m_tDictReader;		///< dictionary file (SPH_EXT_SPI)
+	DataReaderFactoryPtr_c	m_pDocsReader;		///< doclist file (SPH_EXT_SPD)
+	DataReaderFactoryPtr_c	m_pHitsReader;		///< hitlist file (SPH_EXT_SPP)
+	CSphAutoreader			m_tSkipsReader;		///< skiplist file (SPH_EXT_SPE)
+	CSphAutoreader			m_tDeadRowReader;	///< dead-row map file (SPH_EXT_SPM)
+	CSphAutoreader			m_tAttrReader;		///< attribute file (SPH_EXT_SPA)
+	CSphAutoreader			m_tBlobReader;		///< blob file (SPH_EXT_SPB), opened only when the schema has a blob locator
+	CSphAutoreader			m_tDocstoreReader;	///< docstore file (SPH_EXT_SPDS), opened only for v.57+ with stored fields
+	CSphVector<SphWordID_t> m_dHitlessWords;	///< hitless word ids; filled by the caller via GetHitlessWords()
+
+	DebugCheckError_c &		m_tReporter;		///< sink for messages, failures and progress
+
+	bool					m_bHasBlobs = false;	///< set by OpenFiles() when the blob file was opened
+	bool					m_bHasDocstore = false;	///< set by OpenFiles() when the docstore file was opened
+	bool					m_bIsEmpty = false;		///< set by OpenFiles() when the attribute file has zero size
+	DWORD					m_uVersion = 0;			///< index format version read from the header
+	int64_t					m_iNumRows = 0;			///< number of rows, supplied via Setup()
+	int64_t					m_iDocinfoIndex = 0;	///< number of min-max blocks, supplied via Setup()
+	int64_t					m_iMinMaxIndex = 0;		///< min-max index start (in DWORDs) inside the attribute file, supplied via Setup()
+	bool					m_bCheckIdDups = false;	///< whether Check() should also run CheckDocids()
+	CSphSchema				m_tSchema;				///< schema read from the header
+	CWordlist				m_tWordlist;			///< wordlist checkpoints read from the header and the dictionary
+
+	// individual checks, invoked from Check()
+	void	CheckDictionary();
+	void	CheckDocs();
+	void	CheckAttributes();
+	void	CheckKillList() const;
+	void	CheckBlockIndex();
+	void	CheckDocidLookup();
+	void	CheckDocids();
+	void	CheckDocstore();
+	
+	// helpers
+	bool		ReadHeader ( CSphString & sError );
+	CSphString	GetFilename ( ESphExt eExt ) const;
+};
+
+
+DiskIndexChecker_c::DiskIndexChecker_c ( CSphIndex & tIndex, DebugCheckError_c & tReporter )
+	: m_tIndex ( tIndex )
+	, m_tReporter ( tReporter )
+{}
+
+
+// Reads the index header (SPH_EXT_SPH): magic, format version, schema and the dictionary header,
+// then prereads the wordlist checkpoints from the dictionary file (SPH_EXT_SPI).
+// Returns false and fills sError when the header can not be opened, is of an unsupported version,
+// the header reader reported an error, or the wordlist preread failed.
+bool DiskIndexChecker_c::ReadHeader ( CSphString & sError )
+{
+	CSphAutoreader tHeaderReader;
+	if ( !tHeaderReader.Open ( GetFilename(SPH_EXT_SPH), sError ) )
+		return false;
+
+	const char * szHeader = tHeaderReader.GetFilename().cstr();
+
+	// magic header
+	const char * szFmt = CheckFmtMagic ( tHeaderReader.GetDword() );
+	if ( szFmt )
+	{
+		sError.SetSprintf ( szFmt, szHeader );
+		return false;
+	}
+
+	// version
+	m_uVersion = tHeaderReader.GetDword();
+	if ( m_uVersion<=1 || m_uVersion>INDEX_FORMAT_VERSION )
+	{
+		sError.SetSprintf ( "%s is v.%d, binary is v.%d", szHeader, m_uVersion, INDEX_FORMAT_VERSION );
+		return false;
+	}
+
+	// we don't support anything prior to v54
+	DWORD uMinFormatVer = 54;
+	if ( m_uVersion<uMinFormatVer )
+	{
+		sError.SetSprintf ( "indexes prior to v.%d are no longer supported (use index_converter tool); %s is v.%d", uMinFormatVer, szHeader, m_uVersion );
+		return false;
+	}
+
+	// schema
+	ReadSchema ( tHeaderReader, m_tSchema, m_uVersion );
+
+	// dictionary header (wordlist checkpoints, infix blocks, etc)
+	m_tWordlist.m_iDictCheckpointsOffset = tHeaderReader.GetOffset();
+	m_tWordlist.m_iDictCheckpoints = tHeaderReader.GetDword();
+	m_tWordlist.m_iInfixCodepointBytes = tHeaderReader.GetByte();
+	m_tWordlist.m_iInfixBlocksOffset = tHeaderReader.GetDword();
+	m_tWordlist.m_iInfixBlocksWordsSize = tHeaderReader.GetDword();
+
+	// do not size the checkpoints or preread the wordlist using values that came from a failed read
+	if ( tHeaderReader.GetErrorFlag() )
+	{
+		sError.SetSprintf ( "error reading %s: %s", szHeader, tHeaderReader.GetErrorMessage().cstr() );
+		return false;
+	}
+
+	m_tWordlist.m_dCheckpoints.Reset ( m_tWordlist.m_iDictCheckpoints );
+
+	if ( !m_tWordlist.Preread ( GetFilename(SPH_EXT_SPI).cstr(), m_tIndex.GetDictionary()->GetSettings().m_bWordDict, m_tIndex.GetSettings().m_iSkiplistBlockSize, sError ) )
+		return false;
+
+	// FIXME! add more header checks
+
+	return true;
+}
+
+
+// Reads the index header and opens every index file needed by the checks.
+// Blob and docstore files are opened only when the schema requires them.
+// Every failure is reported through m_tReporter and returned via its Fail() result; sError receives the reason.
+bool DiskIndexChecker_c::OpenFiles ( CSphString & sError )
+{
+	if ( !ReadHeader(sError) )
+		return m_tReporter.Fail ( "error reading index header: %s", sError.cstr() );
+
+	if ( !m_tDictReader.Open ( GetFilename(SPH_EXT_SPI), sError ) )
+		return m_tReporter.Fail ( "unable to open dictionary: %s", sError.cstr() );
+
+	// use file reader during debug check to lower memory pressure
+	m_pDocsReader = NewProxyReader ( GetFilename(SPH_EXT_SPD), sError, DataReaderFactory_c::DOCS, m_tIndex.GetMemorySettings().m_iReadBufferDocList, FileAccess_e::FILE );
+	if ( !m_pDocsReader )
+		return m_tReporter.Fail ( "unable to open doclist: %s", sError.cstr() );
+
+	// use file reader during debug check to lower memory pressure
+	m_pHitsReader = NewProxyReader ( GetFilename(SPH_EXT_SPP), sError, DataReaderFactory_c::HITS, m_tIndex.GetMemorySettings().m_iReadBufferHitList, FileAccess_e::FILE );
+	if ( !m_pHitsReader )
+		return m_tReporter.Fail ( "unable to open hitlist: %s", sError.cstr() );
+
+	if ( !m_tSkipsReader.Open ( GetFilename(SPH_EXT_SPE), sError ) )
+		return m_tReporter.Fail ( "unable to open skiplist: %s", sError.cstr () );
+
+	if ( !m_tDeadRowReader.Open ( GetFilename(SPH_EXT_SPM).cstr(), sError ) )
+		return m_tReporter.Fail ( "unable to open dead-row map: %s", sError.cstr() );
+
+	if ( !m_tAttrReader.Open ( GetFilename(SPH_EXT_SPA).cstr(), sError ) )
+		return m_tReporter.Fail ( "unable to open attributes: %s", sError.cstr() );
+
+	// blobs are present only when the schema carries the blob locator attribute
+	if ( m_tSchema.GetAttr ( sphGetBlobLocatorName() ) )
+	{
+		if ( !m_tBlobReader.Open ( GetFilename(SPH_EXT_SPB), sError ) )
+			return m_tReporter.Fail ( "unable to open blobs: %s", sError.cstr() );
+
+		m_bHasBlobs = true;
+	}
+
+	// docstore exists starting from v.57 and only when there are stored fields
+	if ( m_uVersion>=57 && m_tSchema.HasStoredFields() )
+	{
+		if ( !m_tDocstoreReader.Open ( GetFilename(SPH_EXT_SPDS).cstr(), sError ) )
+			return m_tReporter.Fail ( "unable to open docstore: %s", sError.cstr() );
+
+		m_bHasDocstore = true;
+	}
+
+	// report this failure like all the others instead of silently returning false
+	CSphAutofile tDocinfo ( GetFilename(SPH_EXT_SPA), SPH_O_READ, sError );
+	if ( tDocinfo.GetFD()<0 )
+		return m_tReporter.Fail ( "unable to open attributes: %s", sError.cstr() );
+
+	m_bIsEmpty = m_tAttrReader.GetFilesize()==0;
+
+	return true;
+}
+
+
+// Stores the parameters used by the subsequent checks:
+// the row count, the number of min-max blocks, the min-max index start and the docid duplicates check flag.
+void DiskIndexChecker_c::Setup ( int64_t iNumRows, int64_t iDocinfoIndex, int64_t iMinMaxIndex, bool bCheckIdDups )
+{
+	// what to check
+	m_bCheckIdDups = bCheckIdDups;
+
+	// attribute storage layout
+	m_iMinMaxIndex = iMinMaxIndex;
+	m_iDocinfoIndex = iDocinfoIndex;
+	m_iNumRows = iNumRows;
+}
+
+
+// Runs all the disk index checks one after another; problems are reported through m_tReporter.
+void DiskIndexChecker_c::Check()
+{
+	CheckDictionary();
+	CheckDocs();
+	CheckAttributes();
+	CheckBlockIndex();
+	CheckKillList();
+	CheckDocstore();
+
+	// dead row map size is validated by the code shared via DebugCheckHelper_c
+	const int64_t iDeadRowMapBytes = m_tDeadRowReader.GetFilesize();
+	DebugCheck_DeadRowMap ( iDeadRowMapBytes, m_iNumRows, m_tReporter );
+
+	CheckDocidLookup();
+
+	// docid duplicates check is optional
+	if ( !m_bCheckIdDups )
+		return;
+
+	CheckDocids();
+}
+
+
+// Walks the dictionary file entry by entry and validates it:
+// word (or wordid) ordering, doclist offsets, docs/hits counters, skiplist offsets,
+// and finally compares the checkpoints rebuilt during the walk against the ones stored in the wordlist.
+// All problems are reported through m_tReporter.
+void DiskIndexChecker_c::CheckDictionary()
+{
+	m_tReporter.Msg ( "checking dictionary..." );
+
+	const CSphIndexSettings & tIndexSettings = m_tIndex.GetSettings();
+
+	SphWordID_t uWordid = 0;
+	int64_t iDoclistOffset = 0;
+	int iWordsTotal = 0;
+
+	char sWord[MAX_KEYWORD_BYTES], sLastWord[MAX_KEYWORD_BYTES];
+	memset ( sWord, 0, sizeof(sWord) );
+	memset ( sLastWord, 0, sizeof(sLastWord) );
+
+	const int iWordPerCP = SPH_WORDLIST_CHECKPOINT;
+	const bool bWordDict = m_tIndex.GetDictionary()->GetSettings().m_bWordDict;
+
+	// checkpoints recalculated while walking the dictionary; keyword texts live in dCheckpointWords
+	CSphVector<CSphWordlistCheckpoint> dCheckpoints;
+	dCheckpoints.Reserve ( m_tWordlist.m_iDictCheckpoints );
+	CSphVector<char> dCheckpointWords;
+
+	CSphAutoreader & tDictReader = m_tDictReader;
+
+	tDictReader.GetByte();
+	int64_t iLastSkipsOffset = 0;
+	SphOffset_t iWordsEnd = m_tWordlist.GetWordsEnd();
+
+	while ( tDictReader.GetPos()!=iWordsEnd && !m_bIsEmpty )
+	{
+		// sanity checks
+		if ( tDictReader.GetPos()>=iWordsEnd )
+		{
+			m_tReporter.Fail ( "reading past checkpoints" );
+			break;
+		}
+
+		// store current entry pos (for checkpointing later), read next delta
+		const int64_t iDictPos = tDictReader.GetPos();
+		SphWordID_t iDeltaWord = 0;
+		if ( bWordDict )
+			iDeltaWord = tDictReader.GetByte();
+		else
+			iDeltaWord = tDictReader.UnzipWordid();
+
+		// checkpoint encountered, handle it
+		if ( !iDeltaWord )
+		{
+			tDictReader.UnzipOffset();
+
+			if ( ( iWordsTotal%iWordPerCP )!=0 && tDictReader.GetPos()!=iWordsEnd )
+				m_tReporter.Fail ( "unexpected checkpoint (pos=" INT64_FMT ", word=%d, words=%d, expected=%d)", iDictPos, iWordsTotal, ( iWordsTotal%iWordPerCP ), iWordPerCP );
+
+			uWordid = 0;
+			iDoclistOffset = 0;
+			continue;
+		}
+
+		SphWordID_t uNewWordid = 0;
+		SphOffset_t iNewDoclistOffset = 0;
+		int iDocs = 0;
+		int iHits = 0;
+		bool bHitless = false;
+
+		if ( bWordDict )
+		{
+			// unpack next word
+			// must be in sync with DictEnd()!
+			BYTE uPack = (BYTE)iDeltaWord;
+			int iMatch, iDelta;
+			if ( uPack & 0x80 )
+			{
+				iDelta = ( ( uPack>>4 ) & 7 ) + 1;
+				iMatch = uPack & 15;
+			} else
+			{
+				iDelta = uPack & 127;
+				iMatch = tDictReader.GetByte();
+			}
+			const int iLastWordLen = strlen(sLastWord);
+			if ( iMatch+iDelta>=(int)sizeof(sLastWord)-1 || iMatch>iLastWordLen )
+			{
+				m_tReporter.Fail ( "wrong word-delta (pos=" INT64_FMT ", word=%s, len=%d, begin=%d, delta=%d)", iDictPos, sLastWord, iLastWordLen, iMatch, iDelta );
+				tDictReader.SkipBytes ( iDelta );
+			} else
+			{
+				tDictReader.GetBytes ( sWord+iMatch, iDelta );
+				sWord [ iMatch+iDelta ] = '\0';
+			}
+
+			iNewDoclistOffset = tDictReader.UnzipOffset();
+			iDocs = tDictReader.UnzipInt();
+			iHits = tDictReader.UnzipInt();
+			int iHint = 0;
+			if ( iDocs>=DOCLIST_HINT_THRESH )
+				iHint = tDictReader.GetByte();
+
+			iHint = DoclistHintUnpack ( iDocs, (BYTE)iHint );
+
+			if ( m_tIndex.GetSettings().m_eHitless==SPH_HITLESS_SOME && ( iDocs & HITLESS_DOC_FLAG )!=0 )
+			{
+				iDocs = ( iDocs & HITLESS_DOC_MASK );
+				bHitless = true;
+			}
+
+			const int iNewWordLen = strlen(sWord);
+
+			if ( iNewWordLen==0 )
+				m_tReporter.Fail ( "empty word in dictionary (pos=" INT64_FMT ")", iDictPos );
+
+			// the current word goes into 'word', the previous one into 'prev'
+			if ( iLastWordLen && iNewWordLen )
+				if ( sphDictCmpStrictly ( sWord, iNewWordLen, sLastWord, iLastWordLen )<=0 )
+					m_tReporter.Fail ( "word order decreased (pos=" INT64_FMT ", word=%s, prev=%s)", iDictPos, sWord, sLastWord );
+
+			if ( iHint<0 )
+				m_tReporter.Fail ( "invalid word hint (pos=" INT64_FMT ", word=%s, hint=%d)", iDictPos, sWord, iHint );
+
+			if ( iDocs<=0 || iHits<=0 || iHits<iDocs )
+				m_tReporter.Fail ( "invalid docs/hits (pos=" INT64_FMT ", word=%s, docs=" INT64_FMT ", hits=" INT64_FMT ")", (int64_t)iDictPos, sWord, (int64_t)iDocs, (int64_t)iHits );
+
+			memcpy ( sLastWord, sWord, sizeof(sLastWord) );
+		} else
+		{
+			// finish reading the entire entry
+			uNewWordid = uWordid + iDeltaWord;
+			iNewDoclistOffset = iDoclistOffset + tDictReader.UnzipOffset();
+			iDocs = tDictReader.UnzipInt();
+			iHits = tDictReader.UnzipInt();
+			bHitless = ( m_dHitlessWords.BinarySearch ( uNewWordid )!=NULL );
+			if ( bHitless )
+				iDocs = ( iDocs & HITLESS_DOC_MASK );
+
+			if ( uNewWordid<=uWordid )
+				m_tReporter.Fail ( "wordid decreased (pos=" INT64_FMT ", wordid=" UINT64_FMT ", previd=" UINT64_FMT ")", (int64_t)iDictPos, (uint64_t)uNewWordid, (uint64_t)uWordid );
+
+			if ( iNewDoclistOffset<=iDoclistOffset )
+				m_tReporter.Fail ( "doclist offset decreased (pos=" INT64_FMT ", wordid=" UINT64_FMT ")", (int64_t)iDictPos, (uint64_t)uNewWordid );
+
+			if ( iDocs<=0 || iHits<=0 || iHits<iDocs )
+				m_tReporter.Fail ( "invalid docs/hits (pos=" INT64_FMT ", wordid=" UINT64_FMT ", docs=" INT64_FMT ", hits=" INT64_FMT ", hitless=%s)",
+					(int64_t)iDictPos, (uint64_t)uNewWordid, (int64_t)iDocs, (int64_t)iHits, ( bHitless?"true":"false" ) );
+		}
+
+		assert ( tIndexSettings.m_iSkiplistBlockSize>0 );
+
+		// skiplist
+		if ( iDocs>tIndexSettings.m_iSkiplistBlockSize && !bHitless )
+		{
+			// decode the offset the same way CheckDocs() does:
+			// a zipped int up to v.57, a zipped 64-bit offset afterwards
+			int64_t iSkipsOffset = 0;
+			if ( m_uVersion<=57 )
+				iSkipsOffset = (int)tDictReader.UnzipInt();
+			else
+				iSkipsOffset = tDictReader.UnzipOffset();
+
+			if ( !bWordDict && iSkipsOffset<iLastSkipsOffset )
+				m_tReporter.Fail ( "descending skiplist pos (last=" INT64_FMT ", cur=" INT64_FMT ", wordid=%llu)", iLastSkipsOffset, iSkipsOffset, UINT64 ( uNewWordid ) );
+
+			iLastSkipsOffset = iSkipsOffset;
+		}
+
+		// update stats, add checkpoint
+		if ( ( iWordsTotal%iWordPerCP )==0 )
+		{
+			CSphWordlistCheckpoint & tCP = dCheckpoints.Add();
+			tCP.m_iWordlistOffset = iDictPos;
+
+			if ( bWordDict )
+			{
+				// store the keyword in the arena and keep its offset for now;
+				// the arena may reallocate, so the pointer is resolved only when comparing
+				const int iLen = strlen ( sWord );
+				char * sArenaWord = dCheckpointWords.AddN ( iLen + 1 );
+				memcpy ( sArenaWord, sWord, iLen );
+				sArenaWord[iLen] = '\0';
+				tCP.m_uWordID = sArenaWord - dCheckpointWords.Begin();
+			} else
+				tCP.m_uWordID = uNewWordid;
+		}
+
+		// TODO add back infix checking
+
+		uWordid = uNewWordid;
+		iDoclistOffset = iNewDoclistOffset;
+		iWordsTotal++;
+	}
+
+	// check the checkpoints
+	if ( dCheckpoints.GetLength()!=m_tWordlist.m_iDictCheckpoints )
+		m_tReporter.Fail ( "checkpoint count mismatch (read=%d, calc=%d)", m_tWordlist.m_iDictCheckpoints, dCheckpoints.GetLength() );
+
+	m_tWordlist.DebugPopulateCheckpoints();
+	for ( int i=0; i < Min ( dCheckpoints.GetLength(), m_tWordlist.m_iDictCheckpoints ); i++ )
+	{
+		// tRefCP is the recalculated checkpoint, tCP is the one read from the index
+		CSphWordlistCheckpoint tRefCP = dCheckpoints[i];
+		const CSphWordlistCheckpoint & tCP = m_tWordlist.m_dCheckpoints[i];
+		const int iLen = bWordDict ? strlen ( tCP.m_sWord ) : 0;
+		if ( bWordDict )
+			tRefCP.m_sWord = dCheckpointWords.Begin() + tRefCP.m_uWordID;
+		if ( bWordDict && ( tRefCP.m_sWord[0]=='\0' || tCP.m_sWord[0]=='\0' ) )
+		{
+			m_tReporter.Fail ( "empty checkpoint %d (read_word=%s, read_len=%u, readpos=" INT64_FMT ", calc_word=%s, calc_len=%u, calcpos=" INT64_FMT ")",
+				i, tCP.m_sWord, (DWORD)strlen ( tCP.m_sWord ), (int64_t)tCP.m_iWordlistOffset,
+				tRefCP.m_sWord, (DWORD)strlen ( tRefCP.m_sWord ), (int64_t)tRefCP.m_iWordlistOffset );
+
+		} else if ( sphCheckpointCmpStrictly ( tCP.m_sWord, iLen, tCP.m_uWordID, bWordDict, tRefCP ) || tRefCP.m_iWordlistOffset!=tCP.m_iWordlistOffset )
+		{
+			if ( bWordDict )
+			{
+				m_tReporter.Fail ( "checkpoint %d differs (read_word=%s, readpos=" INT64_FMT ", calc_word=%s, calcpos=" INT64_FMT ")",
+					i,
+					tCP.m_sWord,
+					(int64_t)tCP.m_iWordlistOffset,
+					tRefCP.m_sWord,
+					(int64_t)tRefCP.m_iWordlistOffset );
+			} else
+			{
+				m_tReporter.Fail ( "checkpoint %d differs (readid=" UINT64_FMT ", readpos=" INT64_FMT ", calcid=" UINT64_FMT ", calcpos=" INT64_FMT ")",
+					i,
+					(uint64_t)tCP.m_uWordID,
+					(int64_t)tCP.m_iWordlistOffset,
+					(uint64_t)tRefCP.m_uWordID,
+					(int64_t)tRefCP.m_iWordlistOffset );
+			}
+		}
+	}
+
+	dCheckpoints.Reset();
+	dCheckpointWords.Reset();
+}
+
+
+// Walks the dictionary once more and, for every word, decodes its doclist and hitlist:
+// checks doclist offsets, rowid bounds, hit ordering, field masks, docs/hits counters
+// against the dictionary entry, and the skiplist entries against the ones recalculated on the fly.
+// All problems are reported through m_tReporter.
+void DiskIndexChecker_c::CheckDocs()
+{
+	const CSphIndexSettings & tIndexSettings = m_tIndex.GetSettings();
+
+	m_tReporter.Msg ( "checking data..." );
+
+	int64_t iDocsSize = m_pDocsReader->GetFilesize();
+	int64_t iSkiplistLen = m_tSkipsReader.GetFilesize();
+
+	m_tDictReader.SeekTo ( 1, READ_NO_SIZE_HINT );
+	m_pDocsReader->SeekTo ( 1 );
+	m_pHitsReader->SeekTo ( 1 );
+
+	SphWordID_t uWordid = 0;
+	int64_t iDoclistOffset = 0;
+	int iDictDocs, iDictHits;
+	bool bHitless = false;
+
+	const bool bWordDict = m_tIndex.GetDictionary()->GetSettings().m_bWordDict;
+
+	char sWord[MAX_KEYWORD_BYTES];
+	memset ( sWord, 0, sizeof(sWord) );
+
+	int iWordsChecked = 0;
+	// NOTE(review): iWordsTotal is never updated in this function, so the progress total is always printed as 0
+	int iWordsTotal = 0;
+
+	SphOffset_t iWordsEnd = m_tWordlist.GetWordsEnd();
+	while ( m_tDictReader.GetPos()<iWordsEnd )
+	{
+		bHitless = false;
+		SphWordID_t iDeltaWord = 0;
+		if ( bWordDict )
+			iDeltaWord = m_tDictReader.GetByte();
+		else
+			iDeltaWord = m_tDictReader.UnzipWordid();
+
+		// zero delta marks a checkpoint: skip it and restart the deltas
+		if ( !iDeltaWord )
+		{
+			m_tDictReader.UnzipOffset();
+
+			uWordid = 0;
+			iDoclistOffset = 0;
+			continue;
+		}
+
+		if ( bWordDict )
+		{
+			// unpack next word
+			// must be in sync with DictEnd()!
+			BYTE uPack = (BYTE)iDeltaWord;
+
+			int iMatch, iDelta;
+			if ( uPack & 0x80 )
+			{
+				iDelta = ( ( uPack>>4 ) & 7 ) + 1;
+				iMatch = uPack & 15;
+			} else
+			{
+				iDelta = uPack & 127;
+				iMatch = m_tDictReader.GetByte();
+			}
+			const int iLastWordLen = strlen(sWord);
+			if ( iMatch+iDelta>=(int)sizeof(sWord)-1 || iMatch>iLastWordLen )
+				m_tDictReader.SkipBytes ( iDelta );
+			else
+			{
+				m_tDictReader.GetBytes ( sWord+iMatch, iDelta );
+				sWord [ iMatch+iDelta ] = '\0';
+			}
+
+			iDoclistOffset = m_tDictReader.UnzipOffset();
+			iDictDocs = m_tDictReader.UnzipInt();
+			iDictHits = m_tDictReader.UnzipInt();
+			if ( iDictDocs>=DOCLIST_HINT_THRESH )
+				m_tDictReader.GetByte();
+
+			if ( tIndexSettings.m_eHitless==SPH_HITLESS_SOME && ( iDictDocs & HITLESS_DOC_FLAG ) )
+			{
+				iDictDocs = ( iDictDocs & HITLESS_DOC_MASK );
+				bHitless = true;
+			}
+		} else
+		{
+			// finish reading the entire entry
+			uWordid = uWordid + iDeltaWord;
+			bHitless = ( m_dHitlessWords.BinarySearch ( uWordid )!=NULL );
+			iDoclistOffset = iDoclistOffset + m_tDictReader.UnzipOffset();
+			iDictDocs = m_tDictReader.UnzipInt();
+			if ( bHitless )
+				iDictDocs = ( iDictDocs & HITLESS_DOC_MASK );
+			iDictHits = m_tDictReader.UnzipInt();
+		}
+
+		// skiplist offset is stored as a zipped int up to v.57 and as a zipped offset afterwards
+		int64_t iSkipsOffset = 0;
+		if ( iDictDocs>tIndexSettings.m_iSkiplistBlockSize && !bHitless )
+		{
+			if ( m_uVersion<=57 )
+				iSkipsOffset = (int)m_tDictReader.UnzipInt();
+			else
+				iSkipsOffset = m_tDictReader.UnzipOffset();
+		}
+
+		// check whether the offset is as expected
+		if ( iDoclistOffset!=m_pDocsReader->GetPos() )
+		{
+			if ( !bWordDict )
+				m_tReporter.Fail ( "unexpected doclist offset (wordid=" UINT64_FMT "(%s)(%d), dictpos=" INT64_FMT ", doclistpos=" INT64_FMT ")",
+					(uint64_t)uWordid, sWord, iWordsChecked, iDoclistOffset, (int64_t) m_pDocsReader->GetPos() );
+
+			if ( iDoclistOffset>=iDocsSize || iDoclistOffset<0 )
+			{
+				m_tReporter.Fail ( "unexpected doclist offset, off the file (wordid=" UINT64_FMT "(%s)(%d), dictpos=" INT64_FMT ", doclistsize=" INT64_FMT ")",
+					(uint64_t)uWordid, sWord, iWordsChecked, iDoclistOffset, iDocsSize );
+				iWordsChecked++;
+				continue;
+			} else
+				m_pDocsReader->SeekTo ( iDoclistOffset );
+		}
+
+		// create and manually setup doclist reader
+		DiskIndexQwordTraits_c * pQword = sphCreateDiskIndexQword ( tIndexSettings.m_eHitFormat==SPH_HIT_FORMAT_INLINE );
+
+		pQword->m_tDoc.Reset ( m_tSchema.GetDynamicSize() );
+		pQword->m_tDoc.m_tRowID = INVALID_ROWID;
+		pQword->m_iDocs = 0;
+		pQword->m_iHits = 0;
+		pQword->SetDocReader ( m_pDocsReader );
+//		pQword->m_rdDoclist.SeekTo ( tDocsReader.GetPos(), READ_NO_SIZE_HINT );
+		pQword->SetHitReader ( m_pHitsReader );
+//		pQword->m_rdHitlist.SeekTo ( tHitsReader.GetPos(), READ_NO_SIZE_HINT );
+
+		// loop the doclist
+		int iDoclistDocs = 0;
+		int iDoclistHits = 0;
+		int iHitlistHits = 0;
+
+		bHitless |= ( tIndexSettings.m_eHitless==SPH_HITLESS_ALL ||
+			( tIndexSettings.m_eHitless==SPH_HITLESS_SOME && m_dHitlessWords.BinarySearch ( uWordid ) ) );
+		pQword->m_bHasHitlist = !bHitless;
+
+		// skiplist entries recalculated while decoding the doclist
+		CSphVector<SkiplistEntry_t> dDoclistSkips;
+		while (true)
+		{
+			// skiplist state is saved just *before* decoding those boundary entries
+			if ( ( iDoclistDocs & ( tIndexSettings.m_iSkiplistBlockSize-1 ) )==0 )
+			{
+				SkiplistEntry_t & tBlock = dDoclistSkips.Add();
+				tBlock.m_tBaseRowIDPlus1 = pQword->m_tDoc.m_tRowID+1;
+				tBlock.m_iOffset = pQword->m_rdDoclist->GetPos();
+				tBlock.m_iBaseHitlistPos = pQword->m_uHitPosition;
+			}
+
+			// FIXME? this can fail on a broken entry (eg fieldid over 256)
+			const CSphMatch & tDoc = pQword->GetNextDoc();
+			if ( tDoc.m_tRowID==INVALID_ROWID )
+				break;
+
+			// checks!
+			// rowids are 0-based, so a rowid equal to the row count is already out of bounds
+			if ( tDoc.m_tRowID>=m_iNumRows )
+				m_tReporter.Fail ( "rowid out of bounds (wordid=" UINT64_FMT "(%s), rowid=%u)",	uint64_t(uWordid), sWord, tDoc.m_tRowID );
+
+			iDoclistDocs++;
+			iDoclistHits += pQword->m_uMatchHits;
+
+			// check position in case of regular (not-inline) hit
+			if (!( pQword->m_iHitlistPos>>63 ))
+			{
+				if ( !bWordDict && pQword->m_iHitlistPos!=pQword->m_rdHitlist->GetPos() )
+					m_tReporter.Fail ( "unexpected hitlist offset (wordid=" UINT64_FMT "(%s), rowid=%u, expected=" INT64_FMT ", actual=" INT64_FMT ")",
+						(uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID, (int64_t)pQword->m_iHitlistPos, (int64_t)pQword->m_rdHitlist->GetPos() );
+			}
+
+			// aim
+			pQword->SeekHitlist ( pQword->m_iHitlistPos );
+
+			// loop the hitlist
+			int iDocHits = 0;
+			FieldMask_t dFieldMask;
+			dFieldMask.UnsetAll();
+			Hitpos_t uLastHit = EMPTY_HIT;
+
+			while ( !bHitless )
+			{
+				Hitpos_t uHit = pQword->GetNextHit();
+				if ( uHit==EMPTY_HIT )
+					break;
+
+				if ( !( uLastHit<uHit ) )
+					m_tReporter.Fail ( "hit entries sorting order decreased (wordid=" UINT64_FMT "(%s), rowid=%u, hit=%u, last=%u)", (uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID, uHit, uLastHit );
+
+				if ( HITMAN::GetField ( uLastHit )==HITMAN::GetField ( uHit ) )
+				{
+					if ( !( HITMAN::GetPos ( uLastHit )<HITMAN::GetPos ( uHit ) ) )
+						m_tReporter.Fail ( "hit decreased (wordid=" UINT64_FMT "(%s), rowid=%u, hit=%u, last=%u)",	(uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID, HITMAN::GetPos ( uHit ), HITMAN::GetPos ( uLastHit ) );
+
+					if ( HITMAN::IsEnd ( uLastHit ) )
+						m_tReporter.Fail ( "multiple tail hits (wordid=" UINT64_FMT "(%s), rowid=%u, hit=0x%x, last=0x%x)", (uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID, uHit, uLastHit );
+				} else
+				{
+					if ( !( HITMAN::GetField ( uLastHit )<HITMAN::GetField ( uHit ) ) )
+						m_tReporter.Fail ( "hit field decreased (wordid=" UINT64_FMT "(%s), rowid=%u, hit field=%u, last field=%u)", (uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID, HITMAN::GetField ( uHit ), HITMAN::GetField ( uLastHit ) );
+				}
+
+				uLastHit = uHit;
+
+				int iField = HITMAN::GetField ( uHit );
+				if ( iField<0 || iField>=SPH_MAX_FIELDS )
+					m_tReporter.Fail ( "hit field out of bounds (wordid=" UINT64_FMT "(%s), rowid=%u, field=%d)", (uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID, iField );
+				else if ( iField>=m_tSchema.GetFieldsCount() )
+					m_tReporter.Fail ( "hit field out of schema (wordid=" UINT64_FMT "(%s), rowid=%u, field=%d)", (uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID, iField );
+				else
+					dFieldMask.Set(iField);
+
+				iDocHits++; // to check doclist entry
+				iHitlistHits++; // to check dictionary entry
+			}
+
+			// check hit count
+			if ( iDocHits!=(int)pQword->m_uMatchHits && !bHitless )
+				m_tReporter.Fail ( "doc hit count mismatch (wordid=" UINT64_FMT "(%s), rowid=%u, doclist=%d, hitlist=%d)", (uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID, pQword->m_uMatchHits, iDocHits );
+
+			if ( m_tSchema.GetFieldsCount()>32 )
+				pQword->CollectHitMask();
+
+			// check the mask
+			if ( memcmp ( dFieldMask.m_dMask, pQword->m_dQwordFields.m_dMask, sizeof(dFieldMask.m_dMask) ) && !bHitless )
+				m_tReporter.Fail ( "field mask mismatch (wordid=" UINT64_FMT "(%s), rowid=%u)", (uint64_t)uWordid, sWord, pQword->m_tDoc.m_tRowID );
+
+			// update my hitlist reader
+			m_pHitsReader->SeekTo ( pQword->m_rdHitlist->GetPos() );
+		}
+
+		// do checks
+		if ( iDictDocs!=iDoclistDocs )
+			m_tReporter.Fail ( "doc count mismatch (wordid=" UINT64_FMT "(%s), dict=%d, doclist=%d, hitless=%s)", uint64_t(uWordid), sWord, iDictDocs, iDoclistDocs, ( bHitless?"true":"false" ) );
+
+		if ( ( iDictHits!=iDoclistHits || iDictHits!=iHitlistHits ) && !bHitless )
+			m_tReporter.Fail ( "hit count mismatch (wordid=" UINT64_FMT "(%s), dict=%d, doclist=%d, hitlist=%d)", uint64_t(uWordid), sWord, iDictHits, iDoclistHits, iHitlistHits );
+
+		// skiplist check; the loop runs at most once and is only used to be able to break out early
+		while ( iDoclistDocs>tIndexSettings.m_iSkiplistBlockSize && !bHitless )
+		{
+			if ( iSkipsOffset<=0 || iSkipsOffset>iSkiplistLen )
+			{
+				m_tReporter.Fail ( "invalid skiplist offset (wordid=%llu(%s), off=" INT64_FMT ", max=" INT64_FMT ")", UINT64 ( uWordid ), sWord, iSkipsOffset, iSkiplistLen );
+				break;
+			}
+
+			// boundary adjustment
+			if ( ( iDoclistDocs & ( tIndexSettings.m_iSkiplistBlockSize-1 ) )==0 )
+				dDoclistSkips.Pop();
+
+			SkiplistEntry_t t;
+			t.m_tBaseRowIDPlus1 = 0;
+			t.m_iOffset = iDoclistOffset;
+			t.m_iBaseHitlistPos = 0;
+
+			// hint is: dDoclistSkips * ZIPPED( sizeof(int64_t) * 3 ) == dDoclistSkips * 8
+			m_tSkipsReader.SeekTo ( iSkipsOffset, dDoclistSkips.GetLength ()*8 );
+			int i = 0;
+			while ( ++i<dDoclistSkips.GetLength() )
+			{
+				const SkiplistEntry_t & r = dDoclistSkips[i];
+
+				RowID_t tRowIDDelta = m_tSkipsReader.UnzipRowid();
+				uint64_t uOff = m_tSkipsReader.UnzipOffset();
+				uint64_t uPosDelta = m_tSkipsReader.UnzipOffset();
+
+				if ( m_tSkipsReader.GetErrorFlag () )
+				{
+					m_tReporter.Fail ( "skiplist reading error (wordid=%llu(%s), exp=%d, got=%d, error='%s')", UINT64 ( uWordid ), sWord, i, dDoclistSkips.GetLength (), m_tSkipsReader.GetErrorMessage ().cstr () );
+					m_tSkipsReader.ResetError();
+					break;
+				}
+
+				// stored entries are deltas over the previous entry
+				t.m_tBaseRowIDPlus1 += tIndexSettings.m_iSkiplistBlockSize + tRowIDDelta;
+				t.m_iOffset += 4*tIndexSettings.m_iSkiplistBlockSize + uOff;
+				t.m_iBaseHitlistPos += uPosDelta;
+				if ( t.m_tBaseRowIDPlus1!=r.m_tBaseRowIDPlus1 || t.m_iOffset!=r.m_iOffset || t.m_iBaseHitlistPos!=r.m_iBaseHitlistPos )
+				{
+					m_tReporter.Fail ( "skiplist entry %d mismatch (wordid=%llu(%s), exp={%u, %llu, %llu}, got={%u, %llu, %llu})",
+						i, UINT64 ( uWordid ), sWord,
+						r.m_tBaseRowIDPlus1, UINT64 ( r.m_iOffset ), UINT64 ( r.m_iBaseHitlistPos ),
+						t.m_tBaseRowIDPlus1, UINT64 ( t.m_iOffset ), UINT64 ( t.m_iBaseHitlistPos ) );
+					break;
+				}
+			}
+			break;
+		}
+
+		// move my reader instance forward too
+		m_pDocsReader->SeekTo ( pQword->m_rdDoclist->GetPos() );
+
+		// cleanup
+		SafeDelete ( pQword );
+
+		// progress bar
+		if ( (++iWordsChecked)%1000==0 )
+			m_tReporter.Progress ( "%d/%d", iWordsChecked, iWordsTotal );
+	}
+}
+
+
+// Checks the attribute (and, when present, blob) storage using the code shared with the RT index.
+void DiskIndexChecker_c::CheckAttributes()
+{
+	// size of the min-max index in bytes: ( m_iDocinfoIndex+1 ) entries, 2 rows of DWORDs each
+	const int64_t iMinMaxBytes = sizeof(DWORD) * ( m_iDocinfoIndex+1 ) * m_tSchema.GetRowSize() * 2;
+
+	// no blob reader when the index has no blobs
+	CSphAutoreader * pBlobs = nullptr;
+	if ( m_bHasBlobs )
+		pBlobs = &m_tBlobReader;
+
+	FileDebugCheckReader_c tAttrs ( &m_tAttrReader );
+	FileDebugCheckReader_c tBlobs ( pBlobs );
+
+	// common code with RT index
+	DebugCheck_Attributes ( tAttrs, tBlobs, m_iNumRows, iMinMaxBytes, m_tSchema, m_tReporter );
+}
+
+
+// Checks the kill-list file (SPH_EXT_SPK), if it exists and is readable:
+// the list of target indexes with their flags, followed by the delta-encoded docids.
+// The check stops at the first problem found; problems are reported through m_tReporter.
+void DiskIndexChecker_c::CheckKillList() const
+{
+	m_tReporter.Msg ( "checking kill-list..." );
+
+	// a missing kill-list is not an error
+	CSphString sSPK = GetFilename(SPH_EXT_SPK);
+	if ( !sphIsReadable ( sSPK.cstr() ) )
+		return;
+
+	CSphString sError;
+	CSphAutoreader tReader;
+	if ( !tReader.Open ( sSPK.cstr(), sError ) )
+	{
+		m_tReporter.Fail ( "unable to open kill-list: %s", sError.cstr() );
+		return;
+	}
+
+	// target indexes
+	DWORD nIndexes = tReader.GetDword();
+	for ( int i = 0; i < (int)nIndexes; i++ )
+	{
+		CSphString sIndex = tReader.GetString();
+		if ( tReader.GetErrorFlag() )
+		{
+			m_tReporter.Fail ( "error reading index name from kill-list: %s", tReader.GetErrorMessage().cstr() );
+			return;
+		}
+
+		DWORD uFlags = tReader.GetDword();
+		DWORD uMask = KillListTarget_t::USE_KLIST | KillListTarget_t::USE_DOCIDS;
+		if ( uFlags & (~uMask) )
+		{
+			// report the offending flags, not the mask of the known ones
+			m_tReporter.Fail ( "unknown index flags in kill-list: %u", uFlags );
+			return;
+		}
+	}
+
+	// killed docids
+	DWORD nKills = tReader.GetDword();
+	if ( tReader.GetErrorFlag() )
+	{
+		m_tReporter.Fail ( "error reading kill-list" );
+		return;
+	}
+
+	for ( DWORD i = 0; i<nKills; i++ )
+	{
+		DocID_t tDelta = tReader.UnzipOffset();
+
+		// check for the read error first, so that a failed read is not misreported as a bad delta
+		if ( tReader.GetErrorFlag() )
+		{
+			m_tReporter.Fail ( "error docids from kill-list" );
+			return;
+		}
+
+		// docids are stored as deltas, so every delta must be positive
+		if ( tDelta<=0 )
+		{
+			m_tReporter.Fail ( "descending docids found in kill-list" );
+			return;
+		}
+	}
+}
+
+
+// Checks the attribute min-max blocks index stored in the attribute file:
+// total size, number of blocks, validity of every block's min/max range,
+// and that every integer/float attribute value fits into the range of its block.
+// All problems are reported through m_tReporter.
+void DiskIndexChecker_c::CheckBlockIndex()
+{
+	m_tReporter.Msg ( "checking attribute blocks index..." );
+
+	int64_t iAllRowsTotal = m_iNumRows + (m_iDocinfoIndex+1)*2;
+	DWORD uStride = m_tSchema.GetRowSize();
+	int64_t iLoadedRowItems = m_tAttrReader.GetFilesize() / sizeof(CSphRowitem);
+	if ( iAllRowsTotal*uStride>iLoadedRowItems && m_iNumRows )
+		m_tReporter.Fail ( "rowitems count mismatch (expected=" INT64_FMT ", loaded=" INT64_FMT ")", iAllRowsTotal*uStride, iLoadedRowItems );
+
+	// check size
+	const int64_t iTempDocinfoIndex = ( m_iNumRows+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ;
+	if ( iTempDocinfoIndex!=m_iDocinfoIndex )
+		m_tReporter.Fail ( "block count differs (expected=" INT64_FMT ", got=" INT64_FMT ")", iTempDocinfoIndex, m_iDocinfoIndex );
+
+	CSphFixedVector<CSphRowitem> dRow ( m_tSchema.GetRowSize() );
+	const CSphRowitem * pRow = dRow.Begin();
+	m_tAttrReader.SeekTo ( 0, dRow.GetLengthBytes() );
+
+	// every block entry is a pair of rows: min values followed by max values
+	const int64_t iMinMaxEnd = sizeof(DWORD) * m_iMinMaxIndex + sizeof(DWORD) * ( m_iDocinfoIndex+1 ) * uStride * 2;
+	CSphFixedVector<DWORD> dMinMax ( uStride*2 );
+	const DWORD * pMinEntry = dMinMax.Begin();
+	const DWORD * pMinAttrs = pMinEntry;
+	const DWORD * pMaxAttrs = pMinAttrs + uStride;
+
+	for ( int64_t iIndexEntry=0; iIndexEntry<m_iNumRows; iIndexEntry++ )
+	{
+		const int64_t iBlock = iIndexEntry / DOCINFO_INDEX_FREQ;
+
+		// we have to do some checks in border cases, for example: when move from 1st to 2nd block
+		// the very first row starts a block too; comparing against the block of row -1 would miss it,
+		// because integer division truncates towards zero
+		const bool bIsBordersCheckTime = ( iIndexEntry % DOCINFO_INDEX_FREQ )==0;
+		if ( bIsBordersCheckTime )
+		{
+			// load min/max of the new block, then get back to the current row
+			int64_t iPos = m_tAttrReader.GetPos();
+
+			int64_t iBlockPos = sizeof(DWORD) * m_iMinMaxIndex + sizeof(DWORD) * iBlock * uStride * 2;
+			// check docid vs global range
+			if ( int64_t( iBlockPos + sizeof(DWORD) * uStride) > iMinMaxEnd )
+				m_tReporter.Fail ( "unexpected block index end (row=" INT64_FMT ", block=" INT64_FMT ")", iIndexEntry, iBlock );
+
+			m_tAttrReader.SeekTo ( iBlockPos, dMinMax.GetLengthBytes() );
+			m_tAttrReader.GetBytes ( dMinMax.Begin(), dMinMax.GetLengthBytes() );
+			if ( m_tAttrReader.GetErrorFlag() )
+				m_tReporter.Fail ( "unexpected block index (row=" INT64_FMT ", block=" INT64_FMT ")", iIndexEntry, iBlock );
+
+			m_tAttrReader.SeekTo ( iPos, dRow.GetLengthBytes() );
+		}
+
+		m_tAttrReader.GetBytes ( dRow.Begin(), dRow.GetLengthBytes() );
+		const DocID_t tDocID = sphGetDocID(pRow);
+
+		// check values vs blocks range
+		for ( int iItem=0; iItem < m_tSchema.GetAttrsCount(); iItem++ )
+		{
+			const CSphColumnInfo & tCol = m_tSchema.GetAttr(iItem);
+			if ( tCol.m_sName==sphGetBlobLocatorName() )
+				continue;
+
+			switch ( tCol.m_eAttrType )
+			{
+			case SPH_ATTR_INTEGER:
+			case SPH_ATTR_TIMESTAMP:
+			case SPH_ATTR_BOOL:
+			case SPH_ATTR_BIGINT:
+			{
+				const SphAttr_t uVal = sphGetRowAttr ( pRow, tCol.m_tLocator );
+				const SphAttr_t uMin = sphGetRowAttr ( pMinAttrs, tCol.m_tLocator );
+				const SphAttr_t uMax = sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator );
+
+				// checks is attribute min max range valid
+				if ( uMin > uMax && bIsBordersCheckTime )
+					m_tReporter.Fail ( "invalid attribute range (row=" INT64_FMT ", block=" INT64_FMT ", min=" INT64_FMT ", max=" INT64_FMT ")", iIndexEntry, iBlock, uMin, uMax );
+
+				if ( uVal < uMin || uVal > uMax )
+					m_tReporter.Fail ( "unexpected attribute value (row=" INT64_FMT ", attr=%u, docid=" INT64_FMT ", block=" INT64_FMT ", value=0x" UINT64_FMT ", min=0x" UINT64_FMT ", max=0x" UINT64_FMT ")",
+						iIndexEntry, iItem, tDocID, iBlock, uint64_t(uVal), uint64_t(uMin), uint64_t(uMax) );
+			}
+			break;
+
+			case SPH_ATTR_FLOAT:
+			{
+				const float fVal = sphDW2F ( (DWORD)sphGetRowAttr ( pRow, tCol.m_tLocator ) );
+				const float fMin = sphDW2F ( (DWORD)sphGetRowAttr ( pMinAttrs, tCol.m_tLocator ) );
+				const float fMax = sphDW2F ( (DWORD)sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator ) );
+
+				// checks is attribute min max range valid
+				if ( fMin > fMax && bIsBordersCheckTime )
+					m_tReporter.Fail ( "invalid attribute range (row=" INT64_FMT ", block=" INT64_FMT ", min=%f, max=%f)", iIndexEntry, iBlock, fMin, fMax );
+
+				if ( fVal < fMin || fVal > fMax )
+					m_tReporter.Fail ( "unexpected attribute value (row=" INT64_FMT ", attr=%u, docid=" INT64_FMT ", block=" INT64_FMT ", value=%f, min=%f, max=%f)", iIndexEntry, iItem, tDocID, iBlock, fVal, fMin, fMax );
+			}
+			break;
+
+			default:
+				break;
+			}
+		}
+
+		// progress bar
+		if ( iIndexEntry%1000==0 )
+			m_tReporter.Progress ( INT64_FMT"/" INT64_FMT, iIndexEntry, m_iNumRows );
+	}
+}
+
+
+/// Cross-checks the docid lookup file (SPH_EXT_SPT) against the attribute rows.
+/// Walks every checkpoint, decodes the delta-coded docids stored under it and reports:
+/// descending checkpoint docids, negative deltas, out-of-range or doubly mapped rowids,
+/// and docids that disagree with the docid stored in the attribute row.
+/// Finally reports every row that no lookup entry points to.
+void DiskIndexChecker_c::CheckDocidLookup()
+{
+	CSphString sError;
+	m_tReporter.Msg ( "checking doc-id lookup..." );
+
+	CSphAutoreader tLookup;
+	if ( !tLookup.Open ( GetFilename(SPH_EXT_SPT), sError ) )
+	{
+		m_tReporter.Fail ( "unable to lookup file: %s", sError.cstr() );
+		return;
+	}
+	int64_t iLookupEnd = tLookup.GetFilesize();
+
+	CSphFixedVector<CSphRowitem> dRow ( m_tSchema.GetRowSize() );
+	m_tAttrReader.SeekTo ( 0, dRow.GetLengthBytes() );
+	CSphBitvec dRowids ( m_iNumRows );
+
+	// lookup header: total docs, docs per checkpoint, max docid
+	int iDocs = tLookup.GetDword();
+	int iDocsPerCheckpoint = tLookup.GetDword();
+	tLookup.GetOffset(); // max docid
+	int64_t iLookupBase = tLookup.GetPos();
+
+	// the header comes straight from a possibly damaged file;
+	// a zero (or negative) value would crash the divisions below
+	if ( iDocsPerCheckpoint<=0 )
+	{
+		m_tReporter.Fail ( "invalid docs per checkpoint value %d in lookup header", iDocsPerCheckpoint );
+		return;
+	}
+
+	int iCheckpoints = ( iDocs + iDocsPerCheckpoint - 1 ) / iDocsPerCheckpoint;
+
+	DocidLookupCheckpoint_t tCp;
+	DocID_t tLastDocID = 0;
+	int iCp = 0;
+	while ( tLookup.GetPos()<iLookupEnd && iCp<iCheckpoints )
+	{
+		// checkpoints are fixed-size records that follow the header
+		tLookup.SeekTo ( sizeof(DocidLookupCheckpoint_t) * iCp + iLookupBase, sizeof(DocidLookupCheckpoint_t) );
+
+		DocidLookupCheckpoint_t tPrevCp = tCp;
+		tCp.m_tBaseDocID = tLookup.GetOffset();
+		tCp.m_tOffset = tLookup.GetOffset();
+		tLastDocID = tCp.m_tBaseDocID;
+
+		if ( tPrevCp.m_tBaseDocID>=tCp.m_tBaseDocID )
+			m_tReporter.Fail ( "descending docid at checkpoint %d, previous docid " INT64_FMT " docid " INT64_FMT, iCp, tPrevCp.m_tBaseDocID, tCp.m_tBaseDocID );
+
+		// jump to the docs stored under this checkpoint
+		tLookup.SeekTo ( tCp.m_tOffset, sizeof(DWORD) * 3 * iDocsPerCheckpoint );
+
+		int iCpDocs = iDocsPerCheckpoint;
+		// last checkpoint might have less docs
+		if ( iCp==iCheckpoints-1 )
+		{
+			int iLefover = ( iDocs % iDocsPerCheckpoint );
+			iCpDocs = ( iLefover ? iLefover : iDocsPerCheckpoint );
+		}
+
+		for ( int i=0; i<iCpDocs; i++ )
+		{
+			DocID_t tDelta = 0;
+			DocID_t tDocID = 0;
+			RowID_t tRowID = INVALID_ROWID;
+
+			if ( !( i % iCpDocs ) )
+			{
+				// first doc of a checkpoint: docid is the checkpoint base, only the rowid is stored
+				tDocID = tLastDocID;
+				tRowID = tLookup.GetDword();
+			} else
+			{
+				// every following doc is stored as a docid delta plus a rowid
+				tDelta = tLookup.UnzipOffset();
+				tRowID = tLookup.GetDword();
+				if ( tDelta<0 )
+					m_tReporter.Fail ( "invalid docid delta " INT64_FMT " at row %u, checkpoint %d, doc %d, last docid " INT64_FMT,
+						tDelta, tRowID, iCp, i, tLastDocID );
+				else
+					tDocID = tLastDocID + tDelta;
+
+			}
+
+			if ( tRowID>=m_iNumRows )
+				m_tReporter.Fail ( "rowid %u out of bounds " INT64_FMT, tRowID, m_iNumRows );
+			else
+			{
+				// read only docid
+				m_tAttrReader.SeekTo ( dRow.GetLengthBytes() * tRowID, sizeof(DocID_t) );
+				m_tAttrReader.GetBytes ( dRow.Begin(), sizeof(DocID_t) );
+
+				// each row must be referenced by exactly one lookup entry
+				if ( dRowids.BitGet ( tRowID ) )
+					m_tReporter.Fail ( "row %u already mapped, current docid" INT64_FMT " checkpoint %d, doc %d", tRowID, tDocID, iCp, i );
+
+				dRowids.BitSet ( tRowID );
+
+				if ( tDocID!=sphGetDocID ( dRow.Begin() ) )
+					m_tReporter.Fail ( "invalid docid " INT64_FMT "(" INT64_FMT ") at row %u, checkpoint %d, doc %d, last docid " INT64_FMT,
+						tDocID, sphGetDocID ( dRow.Begin() ), tRowID, iCp, i, tLastDocID );
+			}
+
+			tLastDocID = tDocID;
+		}
+
+		iCp++;
+	}
+
+	// report rows that no lookup entry referenced
+	for ( int i=0; i<m_iNumRows; i++ )
+	{
+		if ( dRowids.BitGet ( i ) )
+			continue;
+
+		m_tAttrReader.SeekTo ( dRow.GetLengthBytes() * i, sizeof(DocID_t) );
+		m_tAttrReader.GetBytes ( dRow.Begin(), sizeof(DocID_t) );
+
+		DocID_t tDocID = sphGetDocID ( dRow.Begin() );
+
+		m_tReporter.Fail ( "row %u(" INT64_FMT ") not mapped at lookup, docid " INT64_FMT, i, m_iNumRows, tDocID );
+	}
+}
+
+
+/// Ordering functor for DocidRowidPair_t: ascending docid, ties broken by ascending rowid.
+struct DocRow_fn
+{
+	// both operands are read-only, so both are taken by const reference
+	bool IsLess ( const DocidRowidPair_t & tA, const DocidRowidPair_t & tB ) const
+	{
+		if ( tA.m_tDocID!=tB.m_tDocID )
+			return ( tA.m_tDocID<tB.m_tDocID );
+
+		return ( tA.m_tRowID<tB.m_tRowID );
+	}
+};
+
+
+/// Looks for docids that occur in more than one attribute row.
+/// Collects the (docid, rowid) pair of every row, sorts the pairs with DocRow_fn
+/// and reports each pair of neighbours that share a docid.
+void DiskIndexChecker_c::CheckDocids()
+{
+	m_tReporter.Msg ( "checking docid duplicates ..." );
+
+	CSphFixedVector<CSphRowitem> dRow ( m_tSchema.GetRowSize() );
+	m_tAttrReader.SeekTo ( 0, dRow.GetLengthBytes() );
+
+	CSphFixedVector<DocidRowidPair_t> dRows ( m_iNumRows );
+	for ( int i=0; i<m_iNumRows; i++ )
+	{
+		// only the leading docid of each row is read
+		m_tAttrReader.SeekTo ( dRow.GetLengthBytes() * i, sizeof(DocID_t) );
+		m_tAttrReader.GetBytes ( dRow.Begin(), sizeof(DocID_t) );
+
+		dRows[i].m_tRowID = i;
+		dRows[i].m_tDocID = sphGetDocID ( dRow.Begin() );
+	}
+
+	// after sorting, equal docids are adjacent
+	dRows.Sort ( DocRow_fn() );
+	for ( int i=1; i<dRows.GetLength(); i++ )
+	{
+		if ( dRows[i].m_tDocID==dRows[i-1].m_tDocID )
+			m_tReporter.Fail ( "duplicate of docid " INT64_FMT " found at rows %u %u", dRows[i].m_tDocID, dRows[i-1].m_tRowID, dRows[i].m_tRowID );
+	}
+}
+
+
+/// Runs the shared docstore check on this index; does nothing when the index has no docstore.
+void DiskIndexChecker_c::CheckDocstore()
+{
+	if ( m_bHasDocstore )
+	{
+		m_tReporter.Msg ( "checking docstore..." );
+
+		// the global helper does the actual validation
+		::CheckDocstore ( m_tDocstoreReader, m_tReporter );
+	}
+}
+
+
+/// Builds the name of one index file: the index base filename followed by the extension for eExt.
+CSphString DiskIndexChecker_c::GetFilename ( ESphExt eExt ) const
+{
+	const auto & tExt = sphGetExt(eExt);
+
+	CSphString sPath;
+	sPath.SetSprintf ( "%s%s", m_tIndex.GetFilename(), tExt.cstr() );
+	return sPath;
+}
+
+
+/// Factory for the disk index checker; the returned object is allocated with new.
+DiskIndexChecker_i * CreateDiskIndexChecker ( CSphIndex & tIndex, DebugCheckError_c & tReporter )
+{
+	DiskIndexChecker_i * pChecker = new DiskIndexChecker_c ( tIndex, tReporter );
+	return pChecker;
+}

+ 78 - 0
src/indexcheck.h

@@ -0,0 +1,78 @@
+//
+// Copyright (c) 2017-2019, Manticore Software LTD (http://manticoresearch.com)
+// Copyright (c) 2001-2016, Andrew Aksyonoff
+// Copyright (c) 2008-2016, Sphinx Technologies Inc
+// All rights reserved
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License. You should have
+// received a copy of the GPL license along with this program; if you
+// did not, you can find it at http://www.gnu.org/
+//
+
+#ifndef _indexcheck_
+#define _indexcheck_
+
+#include "sphinx.h"
+
+/// Abstract seekable byte reader consumed by the debug checks.
+class DebugCheckReader_i
+{
+public:
+	virtual			~DebugCheckReader_i () = default;
+
+	/// length of the underlying data, in bytes
+	virtual int64_t	GetLengthBytes () = 0;
+	/// reads iSize bytes into pData
+	virtual bool	GetBytes ( void * pData, int iSize ) = 0;
+	/// moves the read position to iOff; iHint is passed through to the implementation
+	virtual bool	SeekTo ( int64_t iOff, int iHint ) = 0;
+};
+
+
+// simple error reporter for debug checks
+// all reporting methods take a printf-style format string followed by its arguments
+class DebugCheckError_c
+{
+public:
+			DebugCheckError_c ( FILE * pFile );
+
+	// reports a check failure
+	// NOTE(review): the meaning of the bool result is not visible in this header - confirm in the implementation
+	bool	Fail ( const char * szFmt, ... );
+	// reports an informational message
+	void	Msg ( const char * szFmt, ... );
+	// reports progress of a long-running check
+	void	Progress ( const char * szFmt, ... );
+	void	Done();
+
+	void	SetSegment ( int iSegment );
+	int64_t	GetNumFails() const;
+
+private:
+	FILE *	m_pFile {nullptr};
+	bool	m_bProgress {false};
+	int64_t m_tStartTime {0};
+	int64_t	m_nFails {0};
+	int64_t	m_nFailsPrinted {0};
+	int		m_iSegment {-1};
+};
+
+
+// common code for debug checks in RT and disk indexes
+// both helpers are protected, so they are reachable only from derived checkers;
+// problems are reported through tReporter
+class DebugCheckHelper_c
+{
+protected:
+	// checks attribute storage (tAttrs) and blob storage (tBlobs) for nRows rows described by tSchema
+	void	DebugCheck_Attributes ( DebugCheckReader_i & tAttrs, DebugCheckReader_i & tBlobs, int64_t nRows, int64_t iMinMaxBytes, const CSphSchema & tSchema, DebugCheckError_c & tReporter ) const;
+	// checks a dead row map of iSizeBytes bytes against the expected number of rows
+	void	DebugCheck_DeadRowMap (  int64_t iSizeBytes, int64_t nRows, DebugCheckError_c & tReporter ) const;
+};
+
+
+// disk index checker
+// interface only; instances are obtained from CreateDiskIndexChecker()
+class DiskIndexChecker_i
+{
+public:
+	virtual			~DiskIndexChecker_i() = default;
+
+	// opens the index files needed by the checks; an error description goes to sError
+	// NOTE(review): presumably returns false on failure - confirm in the implementation
+	virtual bool	OpenFiles ( CSphString & sError ) = 0;
+	// supplies index-wide parameters the checks depend on
+	virtual void	Setup ( int64_t iNumRows, int64_t iDocinfoIndex, int64_t iMinMaxIndex, bool bCheckIdDups ) = 0;
+	// mutable access to the list of hitless word ids
+	virtual CSphVector<SphWordID_t> & GetHitlessWords() = 0;
+
+	// runs the checks
+	virtual void	Check() = 0;
+};
+
+
+DiskIndexChecker_i * CreateDiskIndexChecker ( CSphIndex & tIndex, DebugCheckError_c & tReporter );
+
+#endif // _indexcheck_

+ 690 - 0
src/indexformat.cpp

@@ -0,0 +1,690 @@
+//
+// Copyright (c) 2017-2019, Manticore Software LTD (http://manticoresearch.com)
+// Copyright (c) 2001-2016, Andrew Aksyonoff
+// Copyright (c) 2008-2016, Sphinx Technologies Inc
+// All rights reserved
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License. You should have
+// received a copy of the GPL license along with this program; if you
+// did not, you can find it at http://www.gnu.org/
+//
+
+#include "indexformat.h"
+
+// uDocs is deliberately a DWORD: a hitless word has its highest bit set,
+// which would overflow a signed int
+// the result is clamped to INT_MAX
+int DoclistHintUnpack ( DWORD uDocs, BYTE uHint )
+{
+	const int64_t iDocs = (int64_t)uDocs;
+	const bool bBelowThresh = ( uDocs<(DWORD)DOCLIST_HINT_THRESH );
+
+	// below the threshold: flat 8 per doc; otherwise 4 per doc plus uHint/64 per doc
+	const int64_t iEstimate = bBelowThresh
+		? 8*iDocs
+		: 4*iDocs+( iDocs*uHint/64 );
+
+	return (int)Min ( iEstimate, INT_MAX );
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+/// Stores the excluded flag and, when bUseMini is set, points the reader buffers
+/// at the built-in mini-buffers.
+DiskIndexQwordTraits_c::DiskIndexQwordTraits_c ( bool bUseMini, bool bExcluded )
+{
+	m_bExcluded = bExcluded;
+
+	// without mini-buffers the buffer pointers are left untouched
+	if ( !bUseMini )
+		return;
+
+	m_pHitsBuf = m_dHitlistBuf;
+	m_pDocsBuf = m_dDoclistBuf;
+}
+
+
+/// Creates the doclist reader from the given factory; a null factory leaves the reader unchanged.
+void DiskIndexQwordTraits_c::SetDocReader ( DataReaderFactory_c * pReader )
+{
+	if ( pReader )
+		m_rdDoclist = pReader->MakeReader ( m_pDocsBuf, MINIBUFFER_LEN );
+}
+
+
+/// Creates the hitlist reader from the given factory; a null factory leaves the reader unchanged.
+void DiskIndexQwordTraits_c::SetHitReader ( DataReaderFactory_c * pReader )
+{
+	if ( pReader )
+		m_rdHitlist = pReader->MakeReader ( m_pHitsBuf, MINIBUFFER_LEN );
+}
+
+
+/// Resets the base qword and then returns the decoder fields of this class to their initial values.
+void DiskIndexQwordTraits_c::ResetDecoderState ()
+{
+	// base class state goes first
+	ISphQword::Reset();
+
+	// current document
+	m_tDoc.m_tRowID = INVALID_ROWID;
+
+	// hit decoding state
+	m_iHitPos = EMPTY_HIT;
+	m_uHitState = 0;
+	m_uInlinedHit = 0;
+	m_uHitPosition = 0;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+/// Decodes fixed-size checkpoint records (word id followed by wordlist offset,
+/// each stored as a SphOffset_t) directly from a raw byte buffer.
+class CheckpointReader_c
+{
+public:
+	/// reads one record at pBuf into tCP and returns the address right after it
+	const BYTE * ReadEntry ( const BYTE * pBuf, CSphWordlistCheckpoint & tCP ) const
+	{
+		const BYTE * pWordID = pBuf;
+		const BYTE * pOffset = pWordID + sizeof(SphOffset_t);
+
+		// the buffer carries no alignment guarantee, hence the unaligned reads
+		tCP.m_uWordID = (SphWordID_t)sphUnalignedRead ( *(SphOffset_t *)pWordID );
+		tCP.m_iWordlistOffset = sphUnalignedRead ( *(SphOffset_t *)pOffset );
+
+		return pOffset + sizeof(SphOffset_t);
+	}
+
+	/// size of one record, in bytes
+	int GetStride() const { return m_iSrcStride; }
+
+private:
+	int m_iSrcStride = 2*sizeof(SphOffset_t);
+};
+
+
+
+/// Accessor that turns a pointer into the in-memory checkpoint array into the
+/// checkpoint decoded from the raw record with the same index.
+struct MappedCheckpoint_fn : public ISphNoncopyable
+{
+	const CSphWordlistCheckpoint *	m_pDstStart;	///< first element of the in-memory checkpoint array
+	const BYTE *					m_pSrcStart;	///< first raw checkpoint record
+	const CheckpointReader_c *		m_pReader;		///< decoder for the raw records
+
+	MappedCheckpoint_fn ( const CSphWordlistCheckpoint * pDstStart, const BYTE * pSrcStart, const CheckpointReader_c * pReader )
+		: m_pDstStart ( pDstStart )
+		, m_pSrcStart ( pSrcStart )
+		, m_pReader ( pReader )
+	{}
+
+	/// pCP only selects the index; the returned value is decoded from the raw records
+	CSphWordlistCheckpoint operator() ( const CSphWordlistCheckpoint * pCP ) const
+	{
+		assert ( m_pDstStart<=pCP );
+
+		const auto iIndex = pCP - m_pDstStart;
+		const BYTE * pRecord = iIndex * m_pReader->GetStride() + m_pSrcStart;
+
+		CSphWordlistCheckpoint tDecoded;
+		m_pReader->ReadEntry ( pRecord, tDecoded );
+		return tDecoded;
+	}
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+/// one expanded keyword kept by name (see DictEntryDiskPayload_t::Add)
+struct DiskExpandedEntry_t
+{
+	int		m_iNameOff;	///< offset in the word buffer: one length byte, then the keyword bytes
+	int		m_iDocs;	///< docs count as read from the dictionary entry
+	int		m_iHits;	///< hits count as read from the dictionary entry
+};
+
+
+/// one expanded keyword kept as a doclist reference instead of by name (see DictEntryDiskPayload_t::Add)
+struct DiskExpandedPayload_t
+{
+	int			m_iDocs;		///< docs count as read from the dictionary entry
+	int			m_iHits;		///< hits count as read from the dictionary entry
+	uint64_t	m_uDoclistOff;	///< doclist offset of the dictionary entry
+	int			m_iDoclistHint;	///< doclist hint of the dictionary entry
+};
+
+
+/// offset + length pair; DictEntryDiskPayload_t::Convert fills it with a doclist offset and its hint
+struct Slice64_t
+{
+	uint64_t	m_uOff;	///< start offset
+	int			m_iLen;	///< length
+};
+
+
+/// substring payload made of doclist slices, one per expanded keyword
+struct DiskSubstringPayload_t : public ISphSubstringPayload
+{
+	/// iDoclists is the number of slices to allocate
+	explicit DiskSubstringPayload_t ( int iDoclists )
+		: m_dDoclist ( iDoclists )
+	{}
+
+	CSphFixedVector<Slice64_t>	m_dDoclist;	///< doclist slices
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+
+/// Collects dictionary entries matched during a wildcard expansion and converts them
+/// into the results expected by ISphWordlist::Args_t.
+/// Each added word lands in one of two lists: expanded keywords kept by name
+/// (m_dWordExpand + m_dWordBuf) or payload entries kept as doclist references (m_dWordPayload).
+struct DictEntryDiskPayload_t
+{
+	/// bPayload enables the payload list; eHitless is the hitless mode of the index
+	DictEntryDiskPayload_t ( bool bPayload, ESphHitless eHitless )
+	{
+		m_bPayload = bPayload;
+		m_eHitless = eHitless;
+		if ( bPayload )
+			m_dWordPayload.Reserve ( 1000 );
+
+		m_dWordExpand.Reserve ( 1000 );
+		m_dWordBuf.Reserve ( 8096 );
+	}
+
+	/// adds one matched dictionary entry; iWordLen is the keyword length in bytes
+	void Add ( const CSphDictEntry & tWord, int iWordLen )
+	{
+		// keep the word by name when payloads are off, when it does not qualify as a payload,
+		// or when it is hitless (all words, or this word in hitless-some mode)
+		if ( !m_bPayload || !sphIsExpandedPayload ( tWord.m_iDocs, tWord.m_iHits ) ||
+			m_eHitless==SPH_HITLESS_ALL || ( m_eHitless==SPH_HITLESS_SOME && ( tWord.m_iDocs & HITLESS_DOC_FLAG )!=0 ) ) // FIXME!!! do we need hitless=some as payloads?
+		{
+			DiskExpandedEntry_t & tExpand = m_dWordExpand.Add();
+
+			// name is stored as one length byte followed by the keyword bytes (no terminator)
+			int iOff = m_dWordBuf.GetLength();
+			tExpand.m_iNameOff = iOff;
+			tExpand.m_iDocs = tWord.m_iDocs;
+			tExpand.m_iHits = tWord.m_iHits;
+			m_dWordBuf.Resize ( iOff + iWordLen + 1 );
+			memcpy ( m_dWordBuf.Begin() + iOff + 1, tWord.m_sKeyword, iWordLen );
+			m_dWordBuf[iOff] = (BYTE)iWordLen;
+
+		} else
+		{
+			// otherwise only the doclist location is remembered
+			DiskExpandedPayload_t & tExpand = m_dWordPayload.Add();
+			tExpand.m_iDocs = tWord.m_iDocs;
+			tExpand.m_iHits = tWord.m_iHits;
+			tExpand.m_uDoclistOff = tWord.m_iDoclistOffset;
+			tExpand.m_iDoclistHint = tWord.m_iDoclistHint;
+		}
+	}
+
+	/// moves everything collected so far into tArgs: expanded keywords via AddExpanded(),
+	/// payload entries as a newly allocated DiskSubstringPayload_t, plus the docs/hits totals
+	void Convert ( ISphWordlist::Args_t & tArgs )
+	{
+		if ( !m_dWordExpand.GetLength() && !m_dWordPayload.GetLength() )
+			return;
+
+		// totals accumulate over both lists
+		int iTotalDocs = 0;
+		int iTotalHits = 0;
+		if ( m_dWordExpand.GetLength() )
+		{
+			LimitExpanded ( tArgs.m_iExpansionLimit, m_dWordExpand );
+
+			const BYTE * sBase = m_dWordBuf.Begin();
+			ARRAY_FOREACH ( i, m_dWordExpand )
+			{
+				const DiskExpandedEntry_t & tCur = m_dWordExpand[i];
+				int iDocs = tCur.m_iDocs;
+
+				// strip the hitless flag bit from the docs count
+				if ( m_eHitless==SPH_HITLESS_SOME )
+					iDocs = ( tCur.m_iDocs & HITLESS_DOC_MASK );
+
+				tArgs.AddExpanded ( sBase + tCur.m_iNameOff + 1, sBase[tCur.m_iNameOff], iDocs, tCur.m_iHits );
+
+				iTotalDocs += iDocs;
+				iTotalHits += tCur.m_iHits;
+			}
+		}
+
+		if ( m_dWordPayload.GetLength() )
+		{
+			LimitExpanded ( tArgs.m_iExpansionLimit, m_dWordPayload );
+
+			DiskSubstringPayload_t * pPayload = new DiskSubstringPayload_t ( m_dWordPayload.GetLength() );
+			// sorting by ascending doc-list offset gives some (15%) speed-up too
+			sphSort ( m_dWordPayload.Begin(), m_dWordPayload.GetLength(), bind ( &DiskExpandedPayload_t::m_uDoclistOff ) );
+
+			ARRAY_FOREACH ( i, m_dWordPayload )
+			{
+				const DiskExpandedPayload_t & tCur = m_dWordPayload[i];
+				assert ( m_eHitless==SPH_HITLESS_NONE || ( m_eHitless==SPH_HITLESS_SOME && ( tCur.m_iDocs & HITLESS_DOC_FLAG )==0 ) );
+
+				iTotalDocs += tCur.m_iDocs;
+				iTotalHits += tCur.m_iHits;
+				pPayload->m_dDoclist[i].m_uOff = tCur.m_uDoclistOff;
+				pPayload->m_dDoclist[i].m_iLen = tCur.m_iDoclistHint;
+			}
+
+			// the payload totals include the expanded keywords counted above
+			pPayload->m_iTotalDocs = iTotalDocs;
+			pPayload->m_iTotalHits = iTotalHits;
+			tArgs.m_pPayload = pPayload;
+		}
+		tArgs.m_iTotalDocs = iTotalDocs;
+		tArgs.m_iTotalHits = iTotalHits;
+	}
+
+	// sort expansions by frequency desc
+	// clip the less frequent ones if needed, as they are likely misspellings
+	// a zero limit means no clipping
+	template < typename T >
+	void LimitExpanded ( int iExpansionLimit, CSphVector<T> & dVec ) const
+	{
+		if ( !iExpansionLimit || dVec.GetLength()<=iExpansionLimit )
+			return;
+
+		sphSort ( dVec.Begin(), dVec.GetLength(), ExpandedOrderDesc_T<T>() );
+		dVec.Resize ( iExpansionLimit );
+	}
+
+	bool								m_bPayload;		///< whether payload entries may be produced
+	ESphHitless							m_eHitless;		///< hitless mode passed to the constructor
+	CSphVector<DiskExpandedEntry_t>		m_dWordExpand;	///< keywords kept by name
+	CSphVector<DiskExpandedPayload_t>	m_dWordPayload;	///< keywords kept as doclist references
+	CSphVector<BYTE>					m_dWordBuf;		///< storage for the names of m_dWordExpand entries
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+/// releases everything the wordlist owns via Reset()
+CWordlist::~CWordlist ()
+{
+	Reset();
+}
+
+
+/// drops the mapped buffer, the checkpoints, both word arenas and the checkpoint reader
+void CWordlist::Reset ()
+{
+	m_tBuf.Reset ();
+	m_dCheckpoints.Reset ( 0 );
+	m_pWords.Reset ( 0 );
+	// the infix words arena is a raw array taken over from a vector in Preread()
+	SafeDeleteArray ( m_pInfixBlocksWords );
+	SafeDelete ( m_pCpReader );
+}
+
+
+/// Loads dictionary metadata from file sName and maps the file into m_tBuf.
+/// For a CRC dictionary (bWordDict==false) the file is only mapped and a checkpoint reader is created.
+/// For a keywords dictionary the checkpoints and, when present, the infix blocks are read into memory.
+/// Returns false and fills sError on failure.
+/// NOTE(review): relies on m_dCheckpoints and the DictHeader_t fields being set up before the call
+/// (m_dCheckpoints.GetLength() is used but never set here) - confirm against the caller.
+bool CWordlist::Preread ( const CSphString & sName, bool bWordDict, int iSkiplistBlockSize, CSphString & sError )
+{
+	assert ( m_iDictCheckpointsOffset>0 );
+
+	m_bWordDict = bWordDict;
+	m_iWordsEnd = m_iDictCheckpointsOffset; // set wordlist end
+	m_iSkiplistBlockSize = iSkiplistBlockSize;
+
+	////////////////////////////
+	// preload word checkpoints
+	////////////////////////////
+
+	////////////////////////////
+	// fast path for CRC checkpoints - just maps data and use inplace CP reader
+	if ( !bWordDict )
+	{
+		if ( !m_tBuf.Setup ( sName, sError ) )
+			return false;
+
+		m_pCpReader = new CheckpointReader_c;
+		return true;
+	}
+
+	////////////////////////////
+	// regular path that loads checkpoints data
+
+	CSphAutoreader tReader;
+	if ( !tReader.Open ( sName, sError ) )
+		return false;
+
+	int64_t iFileSize = tReader.GetFilesize();
+
+	// checkpoints run up to the end of file, or up to the infix blocks tag when infixes are stored
+	// NOTE(review): the cast to int happens before the UINT_MAX check below, so sizes between
+	// INT_MAX and UINT_MAX pass that check but do not fit an int
+	int iCheckpointOnlySize = (int)(iFileSize-m_iDictCheckpointsOffset);
+	if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
+		iCheckpointOnlySize = (int)(m_iInfixBlocksOffset - strlen ( g_sTagInfixBlocks ) - m_iDictCheckpointsOffset);
+
+	if ( iFileSize-m_iDictCheckpointsOffset>=UINT_MAX )
+	{
+		sError.SetSprintf ( "dictionary meta overflow: meta size=" INT64_FMT ", total size=" INT64_FMT ", meta offset=" INT64_FMT,
+			iFileSize-m_iDictCheckpointsOffset, iFileSize, (int64_t)m_iDictCheckpointsOffset );
+		return false;
+	}
+
+	tReader.SeekTo ( m_iDictCheckpointsOffset, iCheckpointOnlySize );
+
+	// on disk each checkpoint is a DWORD length, the word bytes and a SphOffset_t;
+	// the arena keeps only the word bytes plus one terminator byte per checkpoint
+	assert ( m_bWordDict );
+	int iArenaSize = iCheckpointOnlySize
+		- (sizeof(DWORD)+sizeof(SphOffset_t))*m_dCheckpoints.GetLength()
+		+ sizeof(BYTE)*m_dCheckpoints.GetLength();
+	assert ( iArenaSize>=0 );
+	m_pWords.Reset ( iArenaSize );
+
+	BYTE * pWord = m_pWords.Begin();
+	for ( auto & dCheckpoint : m_dCheckpoints )
+	{
+		// the checkpoint word points into the arena
+		dCheckpoint.m_sWord = (char *)pWord;
+
+		const int iLen = tReader.GetDword();
+		assert ( iLen>0 );
+		assert ( iLen + 1 + ( pWord - m_pWords.Begin() )<=iArenaSize );
+		tReader.GetBytes ( pWord, iLen );
+		pWord[iLen] = '\0';
+		pWord += iLen+1;
+
+		dCheckpoint.m_iWordlistOffset = tReader.GetOffset();
+	}
+
+	////////////////////////
+	// preload infix blocks
+	////////////////////////
+
+	if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
+	{
+		// reading to vector as old version doesn't store total infix words length
+		CSphTightVector<BYTE> dInfixWords;
+		dInfixWords.Reserve ( (int)m_iInfixBlocksWordsSize );
+
+		tReader.SeekTo ( m_iInfixBlocksOffset, (int)(iFileSize-m_iInfixBlocksOffset) );
+		m_dInfixBlocks.Resize ( tReader.UnzipInt() );
+		for ( auto & dInfixBlock : m_dInfixBlocks )
+		{
+			int iBytes = tReader.UnzipInt();
+
+			// remember the offset for now; the vector may still reallocate while it grows
+			int iOff = dInfixWords.GetLength();
+			dInfixBlock.m_iInfixOffset = (DWORD) iOff; /// FIXME! name convention of m_iInfixOffset
+			dInfixWords.Resize ( iOff+iBytes+1 );
+
+			tReader.GetBytes ( dInfixWords.Begin()+iOff, iBytes );
+			dInfixWords[iOff+iBytes] = '\0';
+
+			dInfixBlock.m_iOffset = tReader.UnzipInt();
+		}
+
+		// fix-up offset to pointer
+		m_pInfixBlocksWords = dInfixWords.LeakData();
+		ARRAY_FOREACH ( i, m_dInfixBlocks )
+			m_dInfixBlocks[i].m_sInfix = (const char *)m_pInfixBlocksWords + m_dInfixBlocks[i].m_iInfixOffset;
+
+		// FIXME!!! store and load that explicitly
+		if ( m_dInfixBlocks.GetLength() )
+			m_iWordsEnd = m_dInfixBlocks.Begin()->m_iOffset - strlen ( g_sTagInfixEntries );
+		else
+			m_iWordsEnd -= strlen ( g_sTagInfixEntries );
+	}
+
+	// read errors are checked once, after all the reads above
+	if ( tReader.GetErrorFlag() )
+	{
+		sError = tReader.GetErrorMessage();
+		return false;
+	}
+
+	tReader.Close();
+
+	// mapping up only wordlist without meta (checkpoints, infixes, etc)
+	if ( !m_tBuf.Setup ( sName, sError ) )
+		return false;
+
+	return true;
+}
+
+
+/// Decodes every raw checkpoint record of the mapped file into m_dCheckpoints and
+/// then drops the in-place checkpoint reader. Does nothing when there is no such reader.
+void CWordlist::DebugPopulateCheckpoints()
+{
+	if ( m_pCpReader )
+	{
+		// ReadEntry() returns the address of the next record
+		const BYTE * pRecord = m_tBuf.GetWritePtr() + m_iDictCheckpointsOffset;
+		for ( auto & tCheckpoint : m_dCheckpoints )
+			pRecord = m_pCpReader->ReadEntry ( pRecord, tCheckpoint );
+
+		SafeDelete ( m_pCpReader );
+	}
+}
+
+
+/// Searches the checkpoints for the given word / word id.
+/// While the in-place checkpoint reader exists, checkpoints are decoded from the mapped file on the fly.
+const CSphWordlistCheckpoint * CWordlist::FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const
+{
+	auto pFirst = m_dCheckpoints.Begin();
+	auto pLast = &m_dCheckpoints.Last();
+
+	// plain search over the in-memory checkpoints
+	if ( !m_pCpReader )
+		return sphSearchCheckpoint ( sWord, iWordLen, iWordID, bStarMode, m_bWordDict, pFirst, pLast );
+
+	// FIXME!!! fall to regular checkpoints after data got read
+	MappedCheckpoint_fn tMapped ( pFirst, m_tBuf.GetWritePtr() + m_iDictCheckpointsOffset, m_pCpReader );
+	return sphSearchCheckpoint ( sWord, iWordLen, iWordID, bStarMode, m_bWordDict, pFirst, pLast, tMapped );
+}
+
+
+/// Scans one delta-coded wordlist chunk starting at pBuf for word id iWordID.
+/// Returns true and fills tWord when the id is found; returns false when the chunk ends
+/// or an id greater than iWordID is met first.
+bool CWordlist::GetWord ( const BYTE * pBuf, SphWordID_t iWordID, CSphDictEntry & tWord ) const
+{
+	// word ids and doclist offsets are stored as deltas from the previous entry
+	SphWordID_t iLastID = 0;
+	SphOffset_t uLastOff = 0;
+
+	while (true)
+	{
+		// unpack next word ID
+		const SphWordID_t iDeltaWord = sphUnzipWordid ( pBuf ); // FIXME! slow with 32bit wordids
+
+		if ( iDeltaWord==0 ) // wordlist chunk is over
+			return false;
+
+		iLastID += iDeltaWord;
+
+		// list is sorted, so if there was no match, there's no such word
+		if ( iLastID>iWordID )
+			return false;
+
+		// unpack next offset
+		const SphOffset_t iDeltaOffset = sphUnzipOffset ( pBuf );
+		uLastOff += iDeltaOffset;
+
+		// unpack doc/hit count
+		const int iDocs = sphUnzipInt ( pBuf );
+		const int iHits = sphUnzipInt ( pBuf );
+		// the skiplist offset is stored only for words with more docs than the skiplist block size
+		SphOffset_t iSkiplistPos = 0;
+		if ( iDocs > m_iSkiplistBlockSize )
+			iSkiplistPos = sphUnzipOffset ( pBuf );
+
+		assert ( iDeltaOffset );
+		assert ( iDocs );
+		assert ( iHits );
+
+		// it matches?!
+		if ( iLastID==iWordID )
+		{
+			// peek at the following entry: its offset delta is taken as this word's doclist length
+			sphUnzipWordid ( pBuf ); // might be 0 at checkpoint
+			const SphOffset_t iDoclistLen = sphUnzipOffset ( pBuf );
+
+			tWord.m_iDoclistOffset = uLastOff;
+			tWord.m_iDocs = iDocs;
+			tWord.m_iHits = iHits;
+			tWord.m_iDoclistHint = (int)iDoclistLen;
+			tWord.m_iSkiplistOffset = iSkiplistPos;
+			return true;
+		}
+	}
+}
+
+
+/// Returns the address, inside the mapped dictionary, of the wordlist block that pCheckpoint refers to.
+const BYTE * CWordlist::AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint ) const
+{
+	assert ( pCheckpoint );
+	assert ( m_dCheckpoints.GetLength() );
+	assert ( pCheckpoint>=m_dCheckpoints.Begin() && pCheckpoint<=&m_dCheckpoints.Last() );
+
+	SphOffset_t iBlockOff = 0;
+	if ( m_pCpReader )
+	{
+		// in-place mode: the offset is decoded from the raw checkpoint record
+		MappedCheckpoint_fn tMapped ( m_dCheckpoints.Begin(), m_tBuf.GetWritePtr() + m_iDictCheckpointsOffset, m_pCpReader );
+		iBlockOff = tMapped ( pCheckpoint ).m_iWordlistOffset;
+	} else
+	{
+		iBlockOff = pCheckpoint->m_iWordlistOffset;
+	}
+
+	assert ( !m_tBuf.IsEmpty() );
+	assert ( iBlockOff>0 && iBlockOff<(int64_t)m_tBuf.GetLengthBytes() );
+
+	return m_tBuf.GetWritePtr()+iBlockOff;
+}
+
+
+/// Collects the dictionary keywords that start with sSubstring and match sWildcard,
+/// and hands them over to tArgs. Does nothing when there are no checkpoints.
+void CWordlist::GetPrefixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const
+{
+	assert ( sSubstring && *sSubstring && iSubLen>0 );
+
+	// nothing to scan in an empty index
+	if ( !m_dCheckpoints.GetLength() )
+		return;
+
+	DictEntryDiskPayload_t tCollector ( tArgs.m_bPayload, tArgs.m_eHitless );
+
+	// wide-char copy of the wildcard; used only when the wildcard is UTF-8 and converts
+	int dWide [ SPH_MAX_WORD_LEN + 1 ];
+	int * pWide = nullptr;
+	if ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWide, SPH_MAX_WORD_LEN ) )
+		pWide = dWide;
+
+	const CSphWordlistCheckpoint * pCP = FindCheckpoint ( sSubstring, iSubLen, 0, true );
+
+	// a prefix starting with a byte below 0x20 carries a magic head char that the wildcard match skips
+	const int iMagicLen = ( BYTE(*sSubstring)<0x20 ) ? 1 : 0;
+	const CSphWordlistCheckpoint * pLastCP = &m_dCheckpoints.Last();
+
+	while ( pCP )
+	{
+		// walk the keywords of this block
+		KeywordsBlockReader_c tBlock ( AcquireDict ( pCP ), m_iSkiplistBlockSize );
+		while ( tBlock.UnpackWord() )
+		{
+			// keywords are sorted, so the first one past the prefix ends the block
+			const int iCmp = sphDictCmp ( sSubstring, iSubLen, (const char *)tBlock.m_sKeyword, tBlock.GetWordLen() );
+			if ( iCmp<0 || sphInterrupted() )
+				break;
+
+			// keep it only when both the prefix and the whole wildcard match
+			if ( iCmp==0 && sphWildcardMatch ( (const char *)tBlock.m_sKeyword + iMagicLen, sWildcard, pWide ) )
+				tCollector.Add ( tBlock, tBlock.GetWordLen() );
+		}
+
+		if ( sphInterrupted () )
+			break;
+
+		// move on; stop after the last checkpoint or when the next block starts past the prefix
+		++pCP;
+		if ( pCP > pLastCP || sphDictCmp ( sSubstring, iSubLen, pCP->m_sWord, strlen ( pCP->m_sWord ) )<0 )
+			break;
+	}
+
+	tCollector.Convert ( tArgs );
+}
+
+
+/// Collects the dictionary keywords that match sWildcard, using the infix blocks to pick
+/// the checkpoints worth scanning, and hands the matches over to tArgs.
+void CWordlist::GetInfixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const
+{
+	// needs a mapped keywords dictionary with checkpoints
+	if ( m_tBuf.IsEmpty() || !m_dCheckpoints.GetLength() )
+		return;
+
+	assert ( !m_pCpReader );
+
+	// length of the lookup key taken from the start of the infix
+	const int iKeyBytes = sphGetInfixLength ( sSubstring, iSubLen, m_iInfixCodepointBytes );
+
+	// find the checkpoints that may contain the key
+	// OPTIMIZE? maybe lookup key2 and reduce checkpoint set size, if possible?
+	CSphVector<DWORD> dCandidates;
+	if ( !sphLookupInfixCheckpoints ( sSubstring, iKeyBytes, m_tBuf.GetWritePtr(), m_dInfixBlocks, m_iInfixCodepointBytes, dCandidates ) )
+		return;
+
+	DictEntryDiskPayload_t tCollector ( tArgs.m_bPayload, tArgs.m_eHitless );
+
+	// with exact forms every accepted keyword starts with a magic head char that the wildcard match skips
+	const int iMagicLen = tArgs.m_bHasExactForms ? 1 : 0;
+
+	// wide-char copy of the wildcard; used only when the wildcard is UTF-8 and converts
+	int dWide [ SPH_MAX_WORD_LEN + 1 ];
+	int * pWide = nullptr;
+	if ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWide, SPH_MAX_WORD_LEN ) )
+		pWide = dWide;
+
+	// scan every keyword of every candidate checkpoint
+	for ( DWORD uPoint : dCandidates )
+	{
+		// candidate numbers are one-based
+		// OPTIMIZE? add a quicker path than a generic wildcard for "*infix*" case?
+		KeywordsBlockReader_c tBlock ( m_tBuf.GetWritePtr() + m_dCheckpoints[uPoint-1].m_iWordlistOffset, m_iSkiplistBlockSize );
+		while ( tBlock.UnpackWord() )
+		{
+			if ( sphInterrupted () )
+				break;
+
+			// stemmed terms should not match suffixes
+			const bool bStemmedForm = ( tArgs.m_bHasExactForms && *tBlock.m_sKeyword!=MAGIC_WORD_HEAD_NONSTEMMED );
+			if ( !bStemmedForm && sphWildcardMatch ( (const char *)tBlock.m_sKeyword+iMagicLen, sWildcard, pWide ) )
+				tCollector.Add ( tBlock, tBlock.GetWordLen() );
+		}
+
+		if ( sphInterrupted () )
+			break;
+	}
+
+	tCollector.Convert ( tArgs );
+}
+
+
+/// fills dCheckpoints with the checkpoints found for sSuffix in the infix blocks;
+/// the SuggestResult_t argument is not used
+void CWordlist::SuffixGetChekpoints ( const SuggestResult_t & , const char * sSuffix, int iLen, CSphVector<DWORD> & dCheckpoints ) const
+{
+	sphLookupInfixCheckpoints ( sSuffix, iLen, m_tBuf.GetWritePtr(), m_dInfixBlocks, m_iInfixCodepointBytes, dCheckpoints );
+}
+
+
+/// Points the word reader stored in tRes at the wordlist block of checkpoint iCP (one-based).
+void CWordlist::SetCheckpoint ( SuggestResult_t & tRes, DWORD iCP ) const
+{
+	assert ( tRes.m_pWordReader );
+
+	const BYTE * pBlock = m_tBuf.GetWritePtr() + m_dCheckpoints[iCP-1].m_iWordlistOffset;
+	( (KeywordsBlockReader_c *)tRes.m_pWordReader )->Reset ( pBlock );
+}
+
+
+/// Unpacks the next keyword from the reader stored in tRes.
+/// On success fills tWord and returns true; returns false once the block is over.
+bool CWordlist::ReadNextWord ( SuggestResult_t & tRes, DictWord_t & tWord ) const
+{
+	auto * pWords = (KeywordsBlockReader_c *)tRes.m_pWordReader;
+
+	const bool bUnpacked = pWords->UnpackWord();
+	if ( bUnpacked )
+	{
+		tWord.m_sWord = pWords->GetWord();
+		tWord.m_iLen = pWords->GetWordLen();
+		tWord.m_iDocs = pWords->m_iDocs;
+	}
+
+	return bUnpacked;
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+/// stores the skiplist block size and positions the reader at the start of the block at pBuf
+KeywordsBlockReader_c::KeywordsBlockReader_c ( const BYTE * pBuf, int iSkiplistBlockSize )
+	: m_iSkiplistBlockSize ( iSkiplistBlockSize )
+{
+	Reset ( pBuf );
+}
+
+
+/// Restarts decoding at pBuf with an empty current keyword.
+void KeywordsBlockReader_c::Reset ( const BYTE * pBuf )
+{
+	// empty keyword; the entry's keyword pointer refers to the internal buffer
+	m_sWord[0] = '\0';
+	m_iLen = 0;
+	m_sKeyword = m_sWord;
+
+	// next byte to decode
+	m_pBuf = pBuf;
+}
+
+
+/// Decodes the next keyword entry of the block into this object.
+/// Returns false when there is no buffer or the block terminator (a zero byte) is reached;
+/// after the terminator the buffer pointer is cleared, so further calls keep returning false.
+bool KeywordsBlockReader_c::UnpackWord()
+{
+	if ( !m_pBuf )
+		return false;
+
+	assert ( m_iSkiplistBlockSize>0 );
+
+	// unpack next word
+	// must be in sync with DictEnd()!
+	BYTE uPack = *m_pBuf++;
+	if ( !uPack )
+	{
+		// ok, this block is over
+		m_pBuf = NULL;
+		m_iLen = 0;
+		return false;
+	}
+
+	// keywords are prefix-compressed against the previous keyword:
+	// iMatch bytes are kept from it and iDelta new bytes follow
+	int iMatch, iDelta;
+	if ( uPack & 0x80 )
+	{
+		// short form, everything in one byte: bits 4-6 hold iDelta-1, bits 0-3 hold iMatch
+		iDelta = ( ( uPack>>4 ) & 7 ) + 1;
+		iMatch = uPack & 15;
+	} else
+	{
+		// long form: low 7 bits hold iDelta, the next byte holds iMatch
+		iDelta = uPack & 127;
+		iMatch = *m_pBuf++;
+	}
+
+	assert ( iMatch+iDelta<(int)sizeof(m_sWord)-1 );
+	assert ( iMatch<=(int)strlen ( (char *)m_sWord ) );
+
+	memcpy ( m_sWord + iMatch, m_pBuf, iDelta );
+	m_pBuf += iDelta;
+
+	m_iLen = iMatch + iDelta;
+	m_sWord[m_iLen] = '\0';
+
+	m_iDoclistOffset = sphUnzipOffset ( m_pBuf );
+	m_iDocs = sphUnzipInt ( m_pBuf );
+	m_iHits = sphUnzipInt ( m_pBuf );
+	// the hint byte is stored only for words with at least DOCLIST_HINT_THRESH docs
+	m_uHint = ( m_iDocs>=DOCLIST_HINT_THRESH ) ? *m_pBuf++ : 0;
+	m_iDoclistHint = DoclistHintUnpack ( m_iDocs, m_uHint );
+	// the skiplist offset is stored only for words with more docs than the skiplist block size
+	if ( m_iDocs > m_iSkiplistBlockSize )
+		m_iSkiplistOffset = sphUnzipOffset ( m_pBuf );
+	else
+		m_iSkiplistOffset = 0;
+
+	assert ( m_iLen>0 );
+	return true;
+}

+ 157 - 0
src/indexformat.h

@@ -0,0 +1,157 @@
+//
+// Copyright (c) 2017-2019, Manticore Software LTD (http://manticoresearch.com)
+// Copyright (c) 2001-2016, Andrew Aksyonoff
+// Copyright (c) 2008-2016, Sphinx Technologies Inc
+// All rights reserved
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License. You should have
+// received a copy of the GPL license along with this program; if you
+// did not, you can find it at http://www.gnu.org/
+//
+#ifndef _indexformat_
+#define _indexformat_
+
+#include "sphinxstd.h"
+#include "sphinxsearch.h"
+#include "datareader.h"
+
+// words with at least this many docs store an explicit doclist hint byte in the dictionary
+const int	DOCLIST_HINT_THRESH = 256;
+// low 31 bits of a docs count: the count itself, with the hitless flag removed
+const DWORD HITLESS_DOC_MASK = 0x7FFFFFFF;
+// highest bit of a docs count: marks the word as hitless
+const DWORD	HITLESS_DOC_FLAG = 0x80000000;
+
+// word ids are unpacked with the same routine as offsets
+#define sphUnzipWordid sphUnzipOffset
+
+class DiskIndexQwordSetup_c;
+
+/// query word from the searcher's point of view
+/// holds the doclist/hitlist readers and the decoder state shared by disk index qwords
+class DiskIndexQwordTraits_c : public ISphQword
+{
+public:
+	/// tricky bit
+	/// m_uHitPosition is always a current position in the .spp file
+	/// base ISphQword::m_iHitlistPos carries the inlined hit data when m_iDocs==1
+	/// but this one is always a real position, used for delta coding
+	SphOffset_t		m_uHitPosition = 0;
+	CSphMatch		m_tDoc;			///< current match (partial)
+
+	FileBlockReaderPtr_c	m_rdDoclist;	///< my doclist accessor
+	FileBlockReaderPtr_c	m_rdHitlist;	///< my hitlist accessor
+
+
+					/// bUseMini makes the readers use the built-in mini-buffers; bExcluded is stored in m_bExcluded
+					DiskIndexQwordTraits_c ( bool bUseMini, bool bExcluded );
+
+	/// create the doclist reader from the factory (no-op on a null factory)
+	void			SetDocReader ( DataReaderFactory_c * pReader );
+	/// create the hitlist reader from the factory (no-op on a null factory)
+	void			SetHitReader ( DataReaderFactory_c * pReader );
+	/// reset the base qword and the decoder fields of this class
+	void			ResetDecoderState();
+	virtual bool	Setup ( const DiskIndexQwordSetup_c * pSetup ) = 0;
+
+protected:
+	Hitpos_t		m_uInlinedHit {0};
+	DWORD			m_uHitState = 0;
+	Hitpos_t		m_iHitPos {EMPTY_HIT};	///< current hit position, from hitlist
+
+	static const int MINIBUFFER_LEN = 1024;	///< size of each mini-buffer, in bytes
+	BYTE			m_dHitlistBuf[MINIBUFFER_LEN];
+	BYTE			m_dDoclistBuf[MINIBUFFER_LEN];
+
+	/// buffers handed to the readers; stay null unless the mini-buffers are enabled
+	BYTE *			m_pHitsBuf = nullptr;
+	BYTE *			m_pDocsBuf = nullptr;
+
+#ifndef NDEBUG
+	bool			m_bHitlistOver = true;
+#endif
+
+};
+
+
+/// dictionary checkpoint: the first word of a wordlist block and where the block starts
+struct CSphWordlistCheckpoint
+{
+	/// which member is valid depends on the dictionary type:
+	/// the word id for CRC dictionaries, the word itself for keywords dictionaries
+	union
+	{
+		SphWordID_t		m_uWordID;
+		const char *	m_sWord;
+	};
+	SphOffset_t			m_iWordlistOffset;	///< offset of the wordlist block in the dictionary file
+};
+
+
+int DoclistHintUnpack ( DWORD uDocs, BYTE uHint );
+
+
+// dictionary header
+struct DictHeader_t
+{
+	int				m_iDictCheckpoints = 0;			///< how many dict checkpoints (keyword blocks) are there
+	SphOffset_t		m_iDictCheckpointsOffset = 0;	///< dict checkpoints file position
+
+	int				m_iInfixCodepointBytes = 0;		///< max bytes per infix codepoint (0 means no infixes)
+	// NOTE(review): the comment below mentions a 32bit value while the field itself is int64_t - confirm the on-disk width
+	int64_t			m_iInfixBlocksOffset = 0;		///< infix blocks file position (stored as unsigned 32bit int as keywords dictionary is pretty small)
+	int				m_iInfixBlocksWordsSize = 0;	///< infix checkpoints size
+};
+
+
+class CheckpointReader_c;
+
+// FIXME: eliminate this, move it to proper dict impls
+// on-disk dictionary: the mapped dictionary file plus its checkpoints and infix blocks
+class CWordlist : public ISphWordlist, public DictHeader_t, public ISphWordlistSuggest
+{
+public:
+	// !COMMIT slow data
+	CSphMappedBuffer<BYTE>						m_tBuf;					///< my cache
+	CSphFixedVector<CSphWordlistCheckpoint>		m_dCheckpoints {0};		///< checkpoint offsets
+
+
+										~CWordlist () override;
+
+	/// release everything owned by the wordlist
+	void								Reset();
+	/// load dictionary metadata from sName and map the file; false + sError on failure
+	bool								Preread ( const CSphString & sName, bool bWordDict, int iSkiplistBlockSize, CSphString & sError );
+
+	/// find the checkpoint to start scanning from for a word / word id
+	const CSphWordlistCheckpoint *		FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const;
+	/// look up iWordID in the delta-coded wordlist chunk at pBuf
+	bool								GetWord ( const BYTE * pBuf, SphWordID_t iWordID, CSphDictEntry & tWord ) const;
+
+	/// address of the wordlist block the checkpoint refers to
+	const BYTE *						AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint ) const;
+	void								GetPrefixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const override;
+	void								GetInfixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const override;
+
+	void								SuffixGetChekpoints ( const SuggestResult_t & tRes, const char * sSuffix, int iLen, CSphVector<DWORD> & dCheckpoints ) const override;
+	void								SetCheckpoint ( SuggestResult_t & tRes, DWORD iCP ) const override;
+	bool								ReadNextWord ( SuggestResult_t & tRes, DictWord_t & tWord ) const override;
+	// NOTE(review): m_iWordsEnd is a SphOffset_t but is returned as int here - check for truncation on large dictionaries
+	int									GetWordsEnd() const { return m_iWordsEnd; }
+
+	/// decode the raw checkpoint records into m_dCheckpoints and drop the in-place reader
+	void								DebugPopulateCheckpoints();
+
+private:
+	bool								m_bWordDict = false;
+	CSphVector<InfixBlock_t>			m_dInfixBlocks {0};
+	CSphFixedVector<BYTE>				m_pWords {0};			///< arena for checkpoint's words
+	BYTE *								m_pInfixBlocksWords = nullptr;	///< arena for infix checkpoint's words
+	int									m_iSkiplistBlockSize {0};
+
+	SphOffset_t							m_iWordsEnd = 0;		///< end of wordlist
+	CheckpointReader_c *				m_pCpReader = nullptr;	///< in-place checkpoint reader; set only for CRC dictionaries
+};
+
+
+/// dict=keywords block reader
+/// decodes the entries of one wordlist block one at a time; the decoded entry is
+/// exposed through the CSphDictEntry base
+class KeywordsBlockReader_c : public CSphDictEntry
+{
+public:
+					/// start reading the block at pBuf
+					KeywordsBlockReader_c ( const BYTE * pBuf, int iSkiplistBlockSize );
+
+	/// restart at another block
+	void			Reset ( const BYTE * pBuf );
+	/// decode the next entry; false when the block is over
+	bool			UnpackWord();
+
+	const char *	GetWord() const			{ return (const char*)m_sWord; }
+	int				GetWordLen() const		{ return m_iLen; }
+
+private:
+	const BYTE *	m_pBuf;								///< next byte to decode; null once the block is over
+	BYTE			m_sWord [ MAX_KEYWORD_BYTES ];		///< current keyword, zero-terminated
+	int				m_iLen;								///< current keyword length, in bytes
+	BYTE			m_uHint = 0;						///< doclist hint byte of the current entry
+	int				m_iSkiplistBlockSize = 0;
+};
+
+
+#endif // _indexformat_

File diff suppressed because it is too large
+ 59 - 1477
src/sphinx.cpp


+ 7 - 25
src/sphinx.h

@@ -123,6 +123,8 @@ STATIC_ASSERT ( ( 1 << ROWITEM_SHIFT )==ROWITEM_BITS, INVALID_ROWITEM_SHIFT );
 #define SPH_MAX_FILENAME_LEN	512
 #define SPH_MAX_FIELDS			256
 
+const int MAX_KEYWORD_BYTES = SPH_MAX_WORD_LEN*3+4;	///< size in bytes of keyword buffers; replaces the repeated literal SPH_MAX_WORD_LEN*3+4 used for BYTE arrays such as sForm
+
 /////////////////////////////////////////////////////////////////////////////
 
 extern int64_t g_iIndexerCurrentDocID;
@@ -3290,33 +3292,11 @@ private:
 };
 
 
-// simple error reporter for debug checks
-class DebugCheckError_c
-{
-public:
-			DebugCheckError_c ( FILE * pFile );
-
-	void	Fail ( const char * szFmt, ... );
-	void	Msg ( const char * szFmt, ... );
-	void	Progress ( const char * szFmt, ... );
-	void	Done();
-
-	void	SetSegment ( int iSegment );
-	int64_t	GetNumFails() const;
-
-private:
-	FILE *	m_pFile {nullptr};
-	bool	m_bProgress {false};
-	int64_t m_tStartTime {0};
-	int64_t	m_nFails {0};
-	int64_t	m_nFailsPrinted {0};
-	int		m_iSegment {-1};
-};
-
-
 class DocstoreFields_i;
 void SetupDocstoreFields ( DocstoreFields_i & tFields, const CSphSchema & tSchema );
 
+class DiskIndexQwordTraits_c;
+DiskIndexQwordTraits_c * sphCreateDiskIndexQword ( bool bInlineHits );
 
 struct DocstoreDoc_t
 {
@@ -3327,7 +3307,8 @@ struct DocstoreDoc_t
 enum DocstoreDataType_e
 {
 	DOCSTORE_TEXT,
-	DOCSTORE_BIN
+	DOCSTORE_BIN,
+	DOCSTORE_TOTAL
 };
 
 
@@ -3506,6 +3487,7 @@ public:
 	virtual CSphFixedVector<SphAttr_t> BuildDocList () const;
 
 	virtual void				SetMemorySettings ( const FileAccessSettings_t & tFileAccessSettings ) = 0;
+	virtual const FileAccessSettings_t & GetMemorySettings() const = 0;
 
 	virtual void				GetFieldFilterSettings ( CSphFieldFilterSettings & tSettings );
 

+ 7 - 7
src/sphinxaot.cpp

@@ -932,7 +932,7 @@ void sphAotLemmatizeRu1251 ( BYTE * pWord, int iLen )
 	// do lemmatizing
 	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
 	// we will generate results using sForm into pWord; so we need this extra copy
-	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
+	BYTE sForm[MAX_KEYWORD_BYTES];
 	int iFormLen = 0;
 
 	// faster than strlen and strcpy..
@@ -987,7 +987,7 @@ void sphAotLemmatize ( BYTE * pWord, int iLang )
 	// do lemmatizing
 	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
 	// we will generate results using sForm into pWord; so we need this extra copy
-	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
+	BYTE sForm[MAX_KEYWORD_BYTES];
 	int iFormLen = 0;
 
 	// faster than strlen and strcpy..
@@ -1063,7 +1063,7 @@ void sphAotLemmatizeDe1252 ( BYTE * pWord, int iLen )
 	// do lemmatizing
 	// input keyword moves into sForm; LemmatizeWord() will also case fold sForm
 	// we will generate results using sForm into pWord; so we need this extra copy
-	BYTE sForm [ SPH_MAX_WORD_LEN*3+4 ]; // aka MAX_KEYWORD_BYTES
+	BYTE sForm[MAX_KEYWORD_BYTES];
 	int iFormLen = 0;
 
 	// faster than strlen and strcpy..
@@ -1425,13 +1425,13 @@ class CSphAotTokenizerTmpl : public CSphTokenFilter
 {
 protected:
 	using Base = CSphTokenFilter;
-	BYTE		m_sForm [ SPH_MAX_WORD_LEN*3+4 ];	///< aka MAX_KEYWORD_BYTES
+	BYTE		m_sForm[MAX_KEYWORD_BYTES];
 	int			m_iFormLen = 0;						///< in bytes, but in windows-1251 that is characters, too
 	bool		m_bFound = false;					///< found or predicted?
 	DWORD		m_FindResults[12];					///< max results is like 6
-	int			m_iCurrent = -1;							///< index in m_FindResults that was just returned, -1 means no blending
-	BYTE		m_sToken [ SPH_MAX_WORD_LEN*3+4 ];	///< to hold generated lemmas
-	BYTE		m_sOrigToken [ SPH_MAX_WORD_LEN*3+4 ];	///< to hold original token
+	int			m_iCurrent = -1;					///< index in m_FindResults that was just returned, -1 means no blending
+	BYTE		m_sToken[MAX_KEYWORD_BYTES];		///< to hold generated lemmas
+	BYTE		m_sOrigToken[MAX_KEYWORD_BYTES];	///< to hold original token
 	bool		m_bIndexExact;
 
 	const CSphWordforms *	m_pWordforms = nullptr;

+ 33 - 20
src/sphinxint.h

@@ -491,10 +491,15 @@ public:
 		return m_pData;
 	}
 
+	int GetLength() const
+	{
+		return m_iLen;
+	}
+
 protected:
-	const BYTE * m_pData = nullptr;
-	const int m_iLen = 0;
-	const BYTE * m_pCur = nullptr;
+	const BYTE *	m_pData = nullptr;
+	const int		m_iLen = 0;
+	const BYTE *	m_pCur = nullptr;
 };
 
 class MemoryWriter_c
@@ -598,23 +603,6 @@ namespace sph
 	int rename ( const char * sOld, const char * sNew );
 }
 
-class DebugCheckReader_i
-{
-public:
-	virtual ~DebugCheckReader_i () {};
-	virtual int64_t GetLengthBytes () = 0;
-	virtual bool GetBytes ( void * pData, int iSize ) = 0;
-	virtual bool SeekTo ( int64_t iOff, int iHint ) = 0;
-};
-
-// common code for debug checks
-class DebugCheckHelper_c
-{
-protected:
-	void				DebugCheck_Attributes ( DebugCheckReader_i & tAttrs, DebugCheckReader_i & tBlobs, int64_t nRows, int64_t iMinMaxBytes, const CSphSchema & tSchema, DebugCheckError_c & tReporter );
-	void				DebugCheck_DeadRowMap (  int64_t iSizeBytes, int64_t nRows, DebugCheckError_c & tReporter ) const;
-};
-
 //////////////////////////////////////////////////////////////////////////
 
 /// generic COM-like uids
@@ -2490,6 +2478,31 @@ BYTE PrereadMapping ( const char * sIndexName, const char * sFor, bool bMlock, b
 	return g_uHash;
 }
 
+#if PARANOID // checked variant: same 7-bits-per-byte decode, but counts the bytes read and asserts the count against the bit width of _type
+
+#define SPH_VARINT_DECODE(_type,_getexpr) \
+	register DWORD b = 0; \
+	register _type v = 0; \
+	int it = 0; \
+	do { b = _getexpr; v = ( v<<7 ) + ( b&0x7f ); it++; } while ( b&0x80 ); \
+	assert ( (it-1)*7<=sizeof(_type)*8 ); \
+	return v;
+
+#else // regular variant: accumulate 7 bits per byte while the 0x80 continuation bit is set; no length check. Both variants end in 'return', so the macro is meant to be an entire function body
+
+#define SPH_VARINT_DECODE(_type,_getexpr) \
+	register DWORD b = _getexpr; \
+	register _type res = 0; \
+	while ( b & 0x80 ) \
+	{ \
+		res = ( res<<7 ) + ( b & 0x7f ); \
+		b = _getexpr; \
+	} \
+	res = ( res<<7 ) + b; \
+	return res;
+
+#endif // PARANOID
+
 // crash related code
 struct CrashQuery_t
 {

+ 3 - 2
src/sphinxpq.cpp

@@ -69,6 +69,7 @@ static bool operator < ( int64_t iQUID, const StoredQueryKey_t & tKey )
 }
 
 static int g_iPercolateThreads = 1;
+static FileAccessSettings_t g_tDummyFASettings;
 
 class PercolateIndex_c : public PercolateIndex_i
 {
@@ -83,8 +84,7 @@ public:
 	bool Commit ( int * pDeleted, RtAccum_t * pAccExt ) override;
 	void RollBack ( RtAccum_t * pAccExt ) override;
 
-	StoredQuery_i * AddQuery ( const PercolateQueryArgs_t & tArgs, const ISphTokenizer * pTokenizer, CSphDict * pDict, CSphString & sError )
-		REQUIRES (!m_tLock);
+	StoredQuery_i * AddQuery ( const PercolateQueryArgs_t & tArgs, const ISphTokenizer * pTokenizer, CSphDict * pDict, CSphString & sError ) REQUIRES (!m_tLock);
 	StoredQuery_i * Query ( const PercolateQueryArgs_t & tArgs, CSphString & sError ) override REQUIRES (!m_tLock);
 
 	bool Prealloc ( bool bStripPath ) override;
@@ -140,6 +140,7 @@ public:
 	void				DebugDumpDict ( FILE * ) override {}
 	void				SetProgressCallback ( CSphIndexProgress::IndexingProgress_fn ) override {}
 	void				SetMemorySettings ( const FileAccessSettings_t & ) override {}
+	const FileAccessSettings_t & GetMemorySettings() const override { return g_tDummyFASettings; }
 
 	void				ProhibitSave() override { m_bSaveDisabled = true; }
 	void				EnableSave() override { m_bSaveDisabled = false; }

+ 5 - 4
src/sphinxrt.cpp

@@ -24,6 +24,7 @@
 #include "killlist.h"
 #include "secondaryindex.h"
 #include "accumulator.h"
+#include "indexcheck.h"
 
 #include <sys/stat.h>
 #include <fcntl.h>
@@ -1066,15 +1067,14 @@ public:
 	bool				DeleteDocument ( const DocID_t * pDocs, int iDocs, CSphString & sError, RtAccum_t * pAccExt ) final;
 	bool				Commit ( int * pDeleted, RtAccum_t * pAccExt ) final;
 	void				RollBack ( RtAccum_t * pAccExt ) final;
-	bool				CommitReplayable ( RtSegment_t * pNewSeg, const CSphVector<DocID_t> & dAccKlist,
-			int * pTotalKilled, bool bForceDump ) EXCLUDES (m_tChunkLock ); // FIXME? protect?
+	bool				CommitReplayable ( RtSegment_t * pNewSeg, const CSphVector<DocID_t> & dAccKlist, int * pTotalKilled, bool bForceDump ) EXCLUDES (m_tChunkLock ); // FIXME? protect?
 	void				ForceRamFlush ( bool bPeriodic=false ) final;
 	bool				IsFlushNeed() const final;
 	bool				ForceDiskChunk() final;
 	bool				AttachDiskIndex ( CSphIndex * pIndex, bool bTruncate, bool & bFatal, CSphString & sError ) 			final  EXCLUDES (m_tReading );
 	bool				Truncate ( CSphString & sError ) final;
 	void				Optimize () final;
-	virtual void				ProgressiveMerge ();
+	void				ProgressiveMerge();
 	CSphIndex *			GetDiskChunk ( int iChunk ) final { return m_dDiskChunks.GetLength()>iChunk ? m_dDiskChunks[iChunk] : nullptr; }
 	ISphTokenizer *		CloneIndexingTokenizer() const final { return m_pTokenizerIndexing->Clone ( SPH_CLONE_INDEX ); }
 
@@ -1094,6 +1094,7 @@ public:
 	void				Dealloc () final {}
 	void				Preread () final;
 	void				SetMemorySettings ( const FileAccessSettings_t & tFileAccessSettings ) final;
+	const FileAccessSettings_t & GetMemorySettings() const final { return m_tFiles; }
 	void				SetBase ( const char * ) final {}
 	bool				Rename ( const char * ) final { return true; }
 	bool				Lock () final { return true; }
@@ -7046,7 +7047,7 @@ void RtIndex_c::Optimize()
 {
 	if ( g_bProgressiveMerge )
 	{
-		ProgressiveMerge ( );
+		ProgressiveMerge();
 		return;
 	}
 

+ 5 - 2
src/sphinxsearch.cpp

@@ -20,8 +20,11 @@
 
 #include <math.h>
 
-//////////////////////////////////////////////////////////////////////////
-// EXTENDED MATCHING V2
+
+bool operator < ( const SkiplistEntry_t & a, RowID_t b )	{ return a.m_tBaseRowIDPlus1<b; }	// orders a skiplist entry against a bare row id via its m_tBaseRowIDPlus1 field
+bool operator == ( const SkiplistEntry_t & a, RowID_t b )	{ return a.m_tBaseRowIDPlus1==b; }	// equality looks only at m_tBaseRowIDPlus1; other entry fields are ignored
+bool operator < ( RowID_t a, const SkiplistEntry_t & b )	{ return a<b.m_tBaseRowIDPlus1; }	// mirrored operand order of the entry-vs-row-id comparison
+
 //////////////////////////////////////////////////////////////////////////
 
 #define SPH_TREE_DUMP			0

+ 3 - 0
src/sphinxsearch.h

@@ -39,6 +39,9 @@ struct SkiplistEntry_t
 	int64_t		m_iBaseHitlistPos;	///< delta decoder hitlist offset base
 };
 
+bool operator < ( const SkiplistEntry_t & a, RowID_t b );
+bool operator == ( const SkiplistEntry_t & a, RowID_t b );
+bool operator < ( RowID_t a, const SkiplistEntry_t & b );
 
 /// term, searcher view
 class ISphQword

Some files were not shown because too many files changed in this diff