#!/usr/bin/perl -w # # Multi-language sentence splitter. # # For help, enter `./split-sentences.pl -h` # # Example usage: # echo "This is a test. This is more." | ./split-sentences.pl -l en # # Expected output is one sentence per line. That is, newlines are # inserted after the end of each sentence. The -l en flag uses English # language period-punctuation rules. # # Derived from the moses-smt (Moses Statistica Machine Translation) # sentece splitter; modified slightly for our needs. # # moses-smt and this file is licensed under the LGPLv2.1 # (Lesser GNU Public License Version 2.1) # Based on Preprocessor written by Philipp Koehn binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); use FindBin qw($RealBin); use strict; use utf8; my $mydir = "$RealBin/nonbreaking_prefixes"; my %NONBREAKING_PREFIX = (); my $language = "en"; my $hanzi = 1; my $QUIET = 0; my $HELP = 0; while (@ARGV) { $_ = shift; /^-l$/ && ($language = shift, next); /^-q$/ && ($QUIET = 1, next); /^-h$/ && ($HELP = 1, next); } if ($HELP) { print "Usage ./split-sentences.perl (-l [en|de|...]) < textfile > splitfile\n"; exit; } # Specifying `-l zh` or `-l yue` will pad each and every hanzi # character with whitespace. Specifying `-l zh-pre` or `-l yue-pre` # implies that the input text is pre-split into words, and whitespace # is not added. In either case, splitting into sentences is still # performed. if ($language =~ /-pre$/) { $language =~ s/-pre//; $hanzi = 0; } if (!$QUIET) { print STDERR "Sentence Splitter v3\n"; print STDERR "Language: $language\n"; } my $prefixfile = "$mydir/nonbreaking_prefix.$language"; # Default to English if we don't have a language-specific prefix file. if (!(-e $prefixfile)) { $prefixfile = "$mydir/nonbreaking_prefix.en"; print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); } if (-e "$prefixfile") { open(PREFIX, "<:utf8", "$prefixfile") or die "Cannot open: $!"; while () { my $item = $_; chomp($item); if (($item) && (substr($item,0,1) ne "#")) { if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) { $NONBREAKING_PREFIX{$1} = 2; } else { $NONBREAKING_PREFIX{$item} = 1; } } } close(PREFIX); } ##loop text, add lines together until we get a blank line or a

my $text = ""; while() { chop; # Punt on lines containing stray ISO-8859 chars if (utf8::valid($_) != 1) { next; } if (/^<.+>$/ || /^\s*$/) { #time to process this block, we've hit a blank or

&do_it_for($text,$_); # print "

\n" if (/^\s*$/ && $text); ##if we have text followed by

$text = ""; } else { #append the text, with a space $text .= $_. " "; } } #do the leftover text &do_it_for($text,"") if $text; sub do_it_for { my($text,$markup) = @_; print &preprocess($text) if $text; print "$markup\n" if ($markup =~ /^<.+>$/); #chop($text); } sub preprocess { # This is one paragraph. my($text) = @_; # Double-dashes, without spaces! sometimes get used to concatenate # sentences. I see this in Project Gutenberg books a lot. $text =~ s/--/ -- /g; # Clean up spaces at head and tail of each line as well as any # double-spacing. Get rid of carriage returns. $text =~ s/\r//g; $text =~ s/ +/ /g; $text =~ s/\n /\n/g; $text =~ s/ \n/\n/g; $text =~ s/^ //g; $text =~ s/ $//g; #####add sentence breaks as needed##### # Non-period end of sentence markers (?!) followed by sentence starters. $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; # Multi-dots followed by sentence starters. $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; # A period, followed by a footnote.[42] followed by sentence starters. # Can be [HarvardStyle42] also. $text =~ s/([\.?!][\'\"\)]?\[.*?]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; # Add breaks for sentences that end with some sort of punctuation # inside a quote or parenthetical and are followed by a possible # sentence starter punctuation and upper case. $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g; # Add breaks for sentences that end with some sort of punctuation # are followed by a sentence starter punctuation and upper case. # This differs from above in that starter punctuation is mandatory, # and thus we can take the period, here, too. $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g; # Sentences that end in punctuation, followed by a double-dash. $text =~ s/([?!\.]) +(--[ \'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; # Sentences that end in punctuation, followed by one or more # underscores. These are typically used to denote _emphasized text_. $text =~ s/([?!\.]) *(_+[ \'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; # Sentence-ending punctuation, followed by optional close-quote. $text =~ s/([。.?!♪]”?)/$1\n/g; $text =~ s/([\.?!]) *(\p{InCJK})/$1\n$2/g; if ($hanzi) { $text =~ s/(\p{InCJK})/ $1 /g; } $text =~ s/ +/ /g; # Special punctuation cases are covered. Check all remaining periods. my $word; my $i; my @words = split(/ /,$text); $text = ""; for ($i=0; $i<(scalar(@words)-1); $i++) { if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) { # Check if $1 is a known honorific and $2 is empty, never break. my $prefix = $1; my $starting_punct = $2; if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) { #not breaking; } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) { #not breaking - upper case acronym } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) { # The next word has a bunch of initial quotes, maybe a # space, then either upper case or a number $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/)); #we always add a return for these unless we have a numeric non-breaker and a number start } } $text = $text.$words[$i]." "; } # We stopped one token from the end to allow for easy look-ahead. Append it now. $text = $text.$words[$i]; # Clean up spaces at head and tail of each line as well as any double-spacing $text =~ s/ +/ /g; $text =~ s/\n /\n/g; $text =~ s/ \n/\n/g; $text =~ s/^ //g; $text =~ s/ $//g; # Add trailing break. $text .= "\n" unless $text =~ /\n$/; # Escape back-slashes. This is needed to prevent guile from messing # with them. $text =~ s/\\/\\\\/g; # Escape double-quotes. This is needed to prevent guile from messing # with them. $text =~ s/\"/\\\"/g; return $text; }