suggest.php 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. <?php
  /// Tunables for dictionary building and suggestion lookup.

  /// dictionary rows whose frequency is below this value are skipped by BuildDictionarySQL()
  const FREQ_THRESHOLD = 40;

  /// non-zero makes MakeSuggestion() print every candidate match it received
  const SUGGEST_DEBUG = 0;

  /// MakeSuggestion() only considers candidates whose "len" attribute is within
  /// +/- this many units of the length of the queried keyword
  const LENGTH_THRESHOLD = 2;

  /// maximum edit distance at which MakeSuggestion() accepts a candidate
  const LEVENSHTEIN_THRESHOLD = 2;

  /// how many best-ranked candidates MakeSuggestion() requests from Sphinx
  const TOP_COUNT = 10;

  // error_reporting ( E_ALL ^ E_NOTICE );

  /// all mb_* calls in this file (trigram building) work on UTF-8 text
  mb_internal_encoding ( "utf-8" );
  9. require ( "../../api/sphinxapi.php" );
  /// Build the list of trigrams for a given keyword.
  ///
  /// The keyword is padded with two underscores on each side, then every
  /// 3-character window (characters as counted by the mb_* functions) is
  /// emitted, each one followed by a single space. The result therefore
  /// always ends with a trailing space.
  ///
  /// Example: "abc" yields "__a _ab abc bc_ c__ ".
  function BuildTrigrams ( $keyword )
  {
      $padded = "__" . $keyword . "__";
      $lastStart = mb_strlen ( $padded ) - 3; // start offset of the final window

      $windows = array ();
      for ( $pos=0; $pos<=$lastStart; $pos++ )
          $windows[] = mb_substr ( $padded, $pos, 3 );

      // the padding alone is 4 characters long, so there are always at least
      // two windows and the joined list is never empty
      return implode ( " ", $windows ) . " ";
  }
  /// Create an SQL dump of the dictionary from a Sphinx stopwords file.
  /// Expects open files as parameters.
  ///
  /// $out  writable stream; receives the complete dump (nothing is printed
  ///       anywhere else)
  /// $in   readable stream with one entry per line: a keyword and its
  ///       frequency, separated by whitespace and/or commas
  ///
  /// The dump drops and re-creates the "suggest" table, then inserts one row
  /// per accepted keyword using multi-row INSERT statements of at most 10000
  /// rows each. A line is skipped when it
  ///   - does not carry both a keyword and an all-digits frequency,
  ///   - has a frequency below FREQ_THRESHOLD,
  ///   - has a keyword containing "_" (the trigram padding character), "'"
  ///     or "\" (either of the latter would break the quoted SQL literal).
  function BuildDictionarySQL ( $out, $in )
  {
      fwrite ( $out, "DROP TABLE IF EXISTS suggest;
  CREATE TABLE suggest (
      id INTEGER PRIMARY KEY AUTO_INCREMENT NOT NULL,
      keyword VARCHAR(255) NOT NULL,
      trigrams VARCHAR(255) NOT NULL,
      freq INTEGER NOT NULL,
      UNIQUE(keyword)
  );
  " );

      $batch = 0; // rows already written into the currently open INSERT statement

      // compare against false explicitly: a final line consisting of just "0"
      // is falsy and must not end the loop early
      while ( ( $line = fgets ( $in, 1024 ) )!==false )
      {
          $fields = preg_split ( "/[\s,]+/", trim ( $line ) );
          if ( count ( $fields )<2 || $fields[0]==="" || !ctype_digit ( $fields[1] ) )
              continue; // blank or malformed line

          $keyword = $fields[0];
          $freq = (int)$fields[1];

          if ( $freq<FREQ_THRESHOLD
              || strpos ( $keyword, "_" )!==false
              || strpos ( $keyword, "'" )!==false
              || strpos ( $keyword, "\\" )!==false )
          {
              continue;
          }

          $trigrams = BuildTrigrams ( $keyword );

          // statement header for the first row of a batch, separator otherwise;
          // both go to $out so that the dump stays in one stream, in order
          fwrite ( $out, $batch ? ",\n" : "INSERT INTO suggest VALUES\n" );
          fwrite ( $out, "( 0, '$keyword', '$trigrams', $freq )" );

          if ( ++$batch==10000 )
          {
              fwrite ( $out, ";\n" );
              $batch = 0;
          }
      }

      // terminate the last, partially filled INSERT statement
      if ( $batch )
          fwrite ( $out, ";\n" );
  }
  /// Character-wise Levenshtein distance between two UTF-8 strings.
  ///
  /// The built-in levenshtein() compares bytes, so one differing non-ASCII
  /// character can cost two or more edits. This helper splits both strings
  /// into UTF-8 characters first. If either string is not valid UTF-8 it
  /// falls back to the byte-wise built-in.
  function Utf8Levenshtein ( $a, $b )
  {
      $ca = preg_split ( "//u", $a, -1, PREG_SPLIT_NO_EMPTY );
      $cb = preg_split ( "//u", $b, -1, PREG_SPLIT_NO_EMPTY );
      if ( $ca===false || $cb===false )
          return levenshtein ( $a, $b );

      $na = count ( $ca );
      $nb = count ( $cb );
      if ( !$na )
          return $nb;
      if ( !$nb )
          return $na;

      // classic two-row dynamic programming; $prev[$j] is the distance
      // between the first $i-1 characters of $a and the first $j of $b
      $prev = range ( 0, $nb );
      for ( $i=1; $i<=$na; $i++ )
      {
          $cur = array ( $i );
          for ( $j=1; $j<=$nb; $j++ )
          {
              $cost = ( $ca[$i-1]===$cb[$j-1] ) ? 0 : 1;
              $cur[$j] = min ( $prev[$j]+1, $cur[$j-1]+1, $prev[$j-1]+$cost );
          }
          $prev = $cur;
      }
      return $prev[$nb];
  }

  /// Search for a suggestion for the given keyword.
  ///
  /// Queries the "suggest" Sphinx index with the keyword's trigrams, takes the
  /// TOP_COUNT best-ranked candidates and returns the first one whose edit
  /// distance to the keyword does not exceed LEVENSHTEIN_THRESHOLD.
  ///
  /// Returns
  ///   - false when the query failed or produced no matches,
  ///   - the suggested keyword when a close enough candidate exists,
  ///   - the original keyword otherwise.
  function MakeSuggestion ( $keyword )
  {
      $trigrams = BuildTrigrams ( $keyword );
      // quoted list followed by "/1": the quorum operator of the extended
      // query syntax, i.e. match entries sharing at least one trigram
      $query = "\"$trigrams\"/1";
      // NOTE(review): byte length; presumably the index's "len" attribute is
      // computed in bytes as well - confirm against the index configuration
      $len = strlen($keyword);

      $delta = LENGTH_THRESHOLD;
      $cl = new SphinxClient ();
      $cl->SetMatchMode ( SPH_MATCH_EXTENDED2 );
      $cl->SetRankingMode ( SPH_RANK_WORDCOUNT );
      $cl->SetFilterRange ( "len", $len-$delta, $len+$delta );
      // rank by matched trigram count, penalised by the length difference
      $cl->SetSelect ( "*, @weight+$delta-abs(len-$len) AS myrank" );
      $cl->SetSortMode ( SPH_SORT_EXTENDED, "myrank DESC, freq DESC" );
      $cl->SetArrayResult ( true );

      // pull top-N best trigram matches and run them through Levenshtein
      $cl->SetLimits ( 0, TOP_COUNT );
      $res = $cl->Query ( $query, "suggest" );

      // empty() also covers a result that has no "matches" key at all
      if ( !$res || empty ( $res["matches"] ) )
          return false;

      if ( SUGGEST_DEBUG )
      {
          print "--- DEBUG START ---\n";

          foreach ( $res["matches"] as $match )
          {
              $w = $match["attrs"]["keyword"];
              $freq = $match["attrs"]["freq"];
              $rankinfo = isset ( $match["attrs"]["myrank"] )
                  ? ", myrank=" . $match["attrs"]["myrank"]
                  : "";
              // FIXME? add costs?
              $levdist = Utf8Levenshtein ( $keyword, $w );

              print "id=$match[id], weight=$match[weight], freq={$freq}{$rankinfo}, word=$w, levdist=$levdist\n";
          }

          print "--- DEBUG END ---\n";
      }

      // further restrict trigram matches with a sane Levenshtein distance limit
      foreach ( $res["matches"] as $match )
      {
          $suggested = $match["attrs"]["keyword"];
          if ( Utf8Levenshtein ( $keyword, $suggested )<=LEVENSHTEIN_THRESHOLD )
              return $suggested;
      }

      return $keyword;
  }
  /// main
  ///
  /// Command-line entry point:
  ///   --builddict    reads stopwords from stdin, prints the SQL dump to stdout
  ///   --query WORD   queries Sphinx and prints the suggestion for WORD
  /// Any other option is a silent no-op.
  $usage = "usage:\n"
      . "php suggest.php --builddict\treads stopwords from stdin, prints SQL dump of the dictionary to stdout\n"
      . "php suggest.php --query WORD\tqueries Sphinx, prints suggestion\n";

  if ( !isset ( $_SERVER["argc"] ) || $_SERVER["argc"]<2 )
      die ( $usage );

  if ( $_SERVER["argv"][1]==="--builddict" )
  {
      $in = fopen ( "php://stdin", "r" );
      $out = fopen ( "php://stdout", "w" ); // stdout is write-only, plain "w" is enough
      if ( !$in || !$out )
          die ( "failed to open stdin/stdout\n" );

      BuildDictionarySQL ( $out, $in );

      fclose ( $out );
      fclose ( $in );
  }

  if ( $_SERVER["argv"][1]==="--query" )
  {
      // the keyword is mandatory
      if ( !isset ( $_SERVER["argv"][2] ) )
          die ( $usage );

      // no MySQL connection here: MakeSuggestion() talks to Sphinx only, and
      // the mysql_* functions no longer exist as of PHP 7
      $keyword = $_SERVER["argv"][2];
      printf ( "keyword: %s\nsuggestion: %s\n", $keyword, MakeSuggestion($keyword) );
  }