#
# wordbreak.pl
#
# prepare titles for wordbreaker frequency dictionary builder
# extract and cleanup data
#
#
# usage example:
#
# perl wordbreak.pl < raw.xml > titles.xml
# indexer ub --buildstops titles-freq.txt 10000000 --buildfreqs
#
# sphinx.conf:
#
# source ub
# {
#     type = xmlpipe2
#     xmlpipe_field = title
#     xmlpipe_fixup_utf8 = 1
#     xmlpipe_command = cat titles.xml
# }
#
# index ub
# {
#     dict = keywords
#     type = plain
#     source = ub
#     path = ub
#     charset_type = utf-8
#     html_strip = 0
#     charset_table = A..Z->a..z, a..z
# }
#
use strict;
use warnings;

# Final domain suffixes recognised by the "ABC.com" cleanup rule.
# The /i flag must live on the qr// itself: a compiled pattern keeps its own
# flags when it is interpolated into another regex.
my $TLD_RE = qr/(?:com|org|net|it|de|pl|co\.uk|nl|edu|eu|info|fr|ch|br|ru|at|ca|si|tv|es|gov|au|jp|biz|dk|il|se|cz|no)/i;

# Strip tokens that would pollute the frequency dictionary from one line.
#
# Takes the line (a string, normally a whole "<title>...</title>" line
# without its trailing newline) and returns the cleaned-up copy; the
# argument itself is not modified. Removed text is not replaced by anything,
# so the surrounding whitespace is kept as is.
sub cleanup_title {
    my ($title) = @_;

    # cleanup ABC's as in World's
    # Only the possessive suffix goes away; the lookbehind keeps the letter
    # in front of the apostrophe, so "World's" becomes "World", not "Worl".
    $title =~ s/(?<=[a-z])'s\b//ig;

    # cleanup A.B.C. as in S.r.l. and other abbreviations
    # No \b after the last dot: a word boundary there would require a word
    # character to follow, so "U.S. Army" would never match and "S.r.l. x"
    # would leave a stray "l." behind. A final letter without a dot
    # ("U.S.A") is consumed as well.
    $title =~ s/\b(?:[a-z]\.){2,}(?:[a-z]\b)?//ig;

    # cleanup A&B as in H&M
    # Also accepts the XML-escaped spelling "H&amp;M".
    $title =~ s/\b[a-z]&(?:amp;)?[a-z]\b//ig;

    # cleanup ABC.com as in google.com, brisbanetimes.com.au, etc
    $title =~ s/\b\w+(?:\.(?:com|org|net))*\.$TLD_RE\b//ig;

    return $title;
}

# Read the input (STDIN or the files named on the command line), keep only
# the lines that start with a <title> tag, and print them as an xmlpipe2
# docset with sequential document ids starting at 1.
#
# NOTE: only the line holding the opening <title> tag is emitted; a title
# that continues on following lines is not joined.
sub main {
    my $doc_id = 1;

    print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
    print "<sphinx:docset>\n";

    while (my $line = <>) {
        # extract title
        next if $line !~ /^\s*<title>/;
        chomp $line;

        # print out cleaned up document
        print "<sphinx:document id=\"$doc_id\">";
        print cleanup_title($line);
        print "</sphinx:document>\n";
        $doc_id++;
    }

    print "</sphinx:docset>\n";
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (require/do) to reuse cleanup_title() without consuming STDIN.
main() unless caller;