test_data_generator.py 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. import codecs
  2. import os
  3. import re
  def generate_test_files(encoding, output_dir):
      """Write a matching pair of fixture files for one single-byte encoding.

      Every byte value from 0x20 through 0xFF is decoded on its own with
      *encoding*. Bytes the codec accepts are collected twice: unchanged, and
      re-encoded as UTF-8. The two collections are written to
      ``<prefix>_encoded.txt`` and ``<prefix>_utf8.txt`` inside *output_dir*.

      Args:
          encoding: Codec name passed to ``codecs.decode``. Names of the form
              ``cp<digits>`` get the file prefix ``windows_<digits>``; any
              other name is used as the prefix unchanged.
          output_dir: Directory that receives the two files. It is not
              created here.
      """
      # Code-page style names ("cp1252") are stored as "windows_1252_*".
      is_code_page = re.match(r'^cp\d+', encoding) is not None
      prefix = re.sub(r'^cp', 'windows_', encoding) if is_code_page else encoding

      raw_bytes = bytearray()
      utf8_bytes = bytearray()
      for code in range(0x20, 0x100):
          single = bytes([code])
          try:
              transcoded = codecs.decode(single, encoding).encode('utf-8')
          except UnicodeDecodeError:
              # The codec rejected this byte; leave it out of both files so
              # the two outputs stay in step with each other.
              continue
          raw_bytes.extend(single)
          utf8_bytes.extend(transcoded)

      # Same write order as before: the encoded file first, then the UTF-8 one.
      outputs = (
          (f"{prefix}_encoded.txt", raw_bytes),
          (f"{prefix}_utf8.txt", utf8_bytes),
      )
      for filename, payload in outputs:
          with open(os.path.join(output_dir, filename), 'wb') as handle:
              handle.write(payload)
  def main():
      """Generate the fixture file pair for every supported encoding.

      Output goes to the ``test_data`` directory, resolved relative to the
      current working directory; the directory is created when missing.
      """
      encodings = [
          'iso_8859_1', 'iso_8859_2', 'iso_8859_5', 'iso_8859_6',
          'iso_8859_7', 'iso_8859_8', 'iso_8859_9', 'iso_8859_15',
          'cp1252', 'cp1251', 'cp1250', 'cp1254',
          'cp1253', 'cp1257', 'cp1255', 'cp1256',
      ]
      output_dir = 'test_data'
      # exist_ok=True replaces the exists()-then-makedirs() pair, which could
      # fail if the directory appeared between the check and the creation.
      os.makedirs(output_dir, exist_ok=True)
      for encoding in encodings:
          generate_test_files(encoding, output_dir)
  # Generate the fixtures only when run as a script, not when imported.
  if __name__ == "__main__":
      main()