split.pl 832 B

123456789101112131415161718192021222324252627282930313233343536373839404142
  #!/bin/perl
  #
  # Generate a test dictionary of words and phrases from a text file.
  #
  # Usage: split.pl INPUT_FILE OUTPUT_FILE
  #
  # INPUT_FILE is read as raw bytes and split on every single character that
  # is not a word character, an apostrophe or a hyphen.  Each run of up to
  # MAX_WINDOW consecutive fields becomes one phrase: its non-empty fields
  # joined by single spaces.  OUTPUT_FILE receives the number of distinct
  # phrases on its first line, followed by one phrase per line (in hash
  # order, i.e. no particular order).

  use strict;
  use warnings;

  # Widest window, measured in split fields, that is turned into a phrase.
  use constant MAX_WINDOW => 29;

  # collect_phrases($text [, $max_window])
  #
  # Returns a hash reference whose keys are the distinct phrases found in
  # $text (the values are always 1).  $max_window defaults to MAX_WINDOW.
  #
  # The text is split on *single* separator characters, so adjacent
  # separators produce empty fields.  Those empty fields still occupy a slot
  # in the window but contribute nothing to the phrase text.
  sub collect_phrases {
      my ($text, $max_window) = @_;
      $max_window = MAX_WINDOW unless defined $max_window;

      my @fields = split /[^\w'-]/, $text;
      my %seen;

      for my $start (0 .. $#fields) {
          my $last = $start + $max_window - 1;
          $last = $#fields if $last > $#fields;

          # Grow the phrase one field at a time instead of rebuilding it
          # from scratch for every window length.
          my $phrase = '';
          for my $end ($start .. $last) {
              my $field = $fields[$end];

              # An empty field leaves the phrase unchanged, so there is
              # nothing new to record.  Testing the length (not the truth
              # value) keeps the token "0" and guarantees that an empty
              # phrase is never stored.
              next unless length $field;

              $phrase = length $phrase ? "$phrase $field" : $field;
              $seen{$phrase} = 1;
          }
      }

      return \%seen;
  }

  # read_text($path)
  #
  # Returns the whole content of $path as one string of raw bytes.
  # Dies if the file cannot be opened.
  sub read_text {
      my ($path) = @_;

      # Three-argument open: the file name can never be taken for a mode.
      open(my $in, '<', $path)
          or die "Cannot open input file $path: $!";
      binmode $in;
      my $text = do { local $/; <$in> };
      close $in;

      return defined $text ? $text : '';
  }

  # write_dictionary($path, $phrases)
  #
  # Writes the number of keys in the hash reference $phrases, then every key
  # on its own line, to $path as raw bytes.  Dies if the file cannot be
  # opened or if closing it reports an error.
  sub write_dictionary {
      my ($path, $phrases) = @_;

      open(my $out, '>', $path)
          or die "Cannot open output file $path: $!";
      binmode $out;

      print {$out} scalar(keys %$phrases), "\n";
      print {$out} $_, "\n" for keys %$phrases;

      # Buffered write errors only surface when the handle is closed.
      close $out
          or die "Cannot write output file $path: $!";
      return;
  }

  # main(INPUT_FILE, OUTPUT_FILE)
  #
  # Command-line entry point: reads the input, collects the phrases and
  # writes the dictionary.
  sub main {
      my ($input_file, $output_file) = @_;
      die "Usage: $0 INPUT_FILE OUTPUT_FILE\n"
          unless defined $input_file && defined $output_file;

      my $phrases = collect_phrases(read_text($input_file));

      print "Generate test dictionary $output_file ...\n";
      write_dictionary($output_file, $phrases);
      return;
  }

  # Run only when executed as a script, so the helpers above can be loaded
  # (e.g. by a test) without touching any file.
  main(@ARGV) unless caller;

  1;