FindGoodSegmentDelimiters.cpp 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. // Searches for good delimiters to cut streams into relatively well sized
  2. // segments.
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <time.h>
  6. #include <sys/time.h>
  7. #include <boost/cstdint.hpp>
  8. #include <boost/array.hpp>
  9. #include <boost/random/mersenne_twister.hpp>
  10. #include <boost/thread.hpp>
  11. #include <boost/bind.hpp>
  12. #include <boost/shared_ptr.hpp>
  13. #include <iostream>
  14. #include <vector>
  15. #include <map>
  16. // Desired size range
  17. #define MIN_DESIRED_SIZE 4096
  18. #define MAX_DESIRED_SIZE 131072
  19. #define DELIMITER_SET_SIZE 1
  20. typedef boost::array<boost::uint16_t,DELIMITER_SET_SIZE> DelimArray;
  21. struct BestEntry
  22. {
  23. DelimArray best;
  24. double bestScore;
  25. std::vector<unsigned char> data;
  26. };
  27. boost::mutex bestLock;
  28. boost::mutex outLock;
  29. std::map<std::string,BestEntry> best;
  30. static void runThread(const std::string &fileName)
  31. {
  32. char tmp[4096];
  33. boost::mt19937 prng;
  34. {
  35. boost::uint32_t seed;
  36. FILE *ur = fopen("/dev/urandom","r");
  37. fread((void *)&seed,1,sizeof(seed),ur);
  38. fclose(ur);
  39. prng.seed(seed);
  40. }
  41. BestEntry *myEntry;
  42. {
  43. boost::mutex::scoped_lock l(bestLock);
  44. myEntry = &(best[fileName]);
  45. myEntry->bestScore = 99999999.0;
  46. }
  47. {
  48. boost::mutex::scoped_lock l(outLock);
  49. std::cout << "*** Reading test data from: " << fileName << std::endl;
  50. FILE *f = fopen(fileName.c_str(),"r");
  51. if (f) {
  52. int n;
  53. while ((n = fread((void *)tmp,1,sizeof(tmp),f)) > 0) {
  54. for(int i=0;i<n;++i)
  55. myEntry->data.push_back((unsigned char)tmp[i]);
  56. }
  57. fclose(f);
  58. }
  59. if (myEntry->data.size() <= 0) {
  60. std::cout << "Error: no data read." << std::endl;
  61. exit(1);
  62. } else std::cout << "*** Read " << myEntry->data.size() << " bytes of test data." << std::endl;
  63. std::cout.flush();
  64. }
  65. DelimArray current;
  66. for(unsigned int i=0;i<DELIMITER_SET_SIZE;++i)
  67. current[i] = (boost::uint16_t)prng();
  68. for(;;) {
  69. unsigned long numTooShort = 0;
  70. unsigned long numTooLong = 0;
  71. unsigned long numGood = 0;
  72. boost::uint32_t shiftRegister = 0;
  73. unsigned long segSize = 0;
  74. for(std::vector<unsigned char>::iterator i=myEntry->data.begin();i!=myEntry->data.end();++i) {
  75. shiftRegister <<= 1;
  76. shiftRegister |= (((boost::uint32_t)*i) & 1);
  77. ++segSize;
  78. boost::uint16_t transformedShiftRegister = (boost::uint16_t)(shiftRegister);
  79. for(DelimArray::iterator d=current.begin();d!=current.end();++d) {
  80. if (transformedShiftRegister == *d) {
  81. if (segSize < MIN_DESIRED_SIZE)
  82. ++numTooShort;
  83. else if (segSize > MAX_DESIRED_SIZE)
  84. ++numTooLong;
  85. else ++numGood;
  86. segSize = 0;
  87. break;
  88. }
  89. }
  90. }
  91. if (segSize) {
  92. if (segSize < MIN_DESIRED_SIZE)
  93. ++numTooShort;
  94. else if (segSize > MAX_DESIRED_SIZE)
  95. ++numTooLong;
  96. else ++numGood;
  97. }
  98. if (numGood) {
  99. double score = ((double)(numTooShort + numTooLong)) / ((double)numGood);
  100. if (score < myEntry->bestScore) {
  101. myEntry->best = current;
  102. myEntry->bestScore = score;
  103. boost::mutex::scoped_lock l(outLock);
  104. std::cout << fileName << ": ";
  105. for(DelimArray::iterator d=current.begin();d!=current.end();++d) {
  106. sprintf(tmp,"0x%.4x",(unsigned int)*d);
  107. if (d != current.begin())
  108. std::cout << ',';
  109. std::cout << tmp;
  110. }
  111. std::cout << ": " << numTooShort << " / " << numGood << " / " << numTooLong << " (" << score << ")" << std::endl;
  112. std::cout.flush();
  113. if ((numTooShort == 0)&&(numTooLong == 0))
  114. break;
  115. }
  116. }
  117. for(DelimArray::iterator i=current.begin();i!=current.end();++i)
  118. *i = (boost::uint16_t)prng();
  119. }
  120. }
  121. int main(int argc,char **argv)
  122. {
  123. std::vector< boost::shared_ptr<boost::thread> > threads;
  124. for(int i=1;i<argc;++i) {
  125. boost::shared_ptr<boost::thread> t(new boost::thread(boost::bind(&runThread,std::string(argv[i]))));
  126. threads.push_back(t);
  127. }
  128. for(std::vector< boost::shared_ptr<boost::thread> >::iterator i=threads.begin();i!=threads.end();++i)
  129. (*i)->join();
  130. return 0;
  131. }