Text Cleaning & Character Encoding
Text Cleaning
We used to get book reviews and summaries from a “service” called syndetics. The text was a horrible mess of mixed and broken encodings. This was my attempt to make some sense of it.
sub clean_summary {
my $text = HTML::Entities::decode( shift );
#attempt to make some sense of the mess that is text content from syndetics
$text =~ s/&/&/ig; #ampersands
$text =~ s{#xC3;#xA2;#xC2;#x80;#xC2;#x94;}{—}g;
$text =~ s{#xC3;#xA2;#xC2;#x80;#xC2;#x99;}{’}g;
$text =~ s{#xC3;#xA2;#xC2;#x80;#xC2;#x9C;}{“}g;
$text =~ s{#xC3;#xA2;#xC2;#x80;#xC2;#x9D;}{”}g;
$text =~ s{\xF4([^\xF6]+)\xF6}{“$1”}g; #bizarrely-encoded quotes
$text =~ s{\xC3\xB6}{ö}g; #unconverted UTF-8 ?
$text =~ s{#xE2;#xAC;#x1D;}{—}g;
$text =~ s{#xE2;#xAC;S}{“}g;
$text =~ s{#xE2;#xAC;}{”}g;
$text =~ s/#xC2;#x1[34];/—/ig;
$text =~ s/#xC2;#x18;/‘/ig;
$text =~ s/#xC2;#x19;/’/ig;
$text =~ s/#xC2;#x1C;/“/ig;
$text =~ s/#xC2;#x1D;/”/ig;
$text =~ s/#xC2;&/…/ig;
$text =~ s/(?<!&)#x13;/-/ig; #hyphen
$text =~ s/(?<!&)#x14;/—/ig; #mdash
$text =~ s/(?<!&)#x1[89];/'/ig; #single quote
$text =~ s/(?<!&)#x1C;/“/ig; #left double quote
$text =~ s/(?<!&)#x1D;/”/ig; #right double quote
$text =~ s/(?<!&)#xD1;/—/ig;
$text =~ s/(?<!&)#xD5;/’/ig;
$text =~ s/&?#x([0-9a-f]{2});/chr(hex($1))/ige; #syndetic-encoded hex character entities
$text =~ s/&?#([0-9]{2,4});/chr($1)/ige; #syndetic-encoded decimal character entities
$text =~ s/(?<!&)(lsquo|rsquo|ldquo|rdquo|mdash|ndash|bull|middot|hellip);/&$1;/ig; #entities missing ampersands
HTML::Entities::decode( $text ); #again, to catch the syndetic to standard conversions
$text =~ s/<[^>]+>/ /g; #strip HTML-like tags (so many of which don't have proper spacing around them...)
$text =~ s/\s+([,;:])/$1/gi; #remove space before commas
$text =~ s/([,;:])(?=\w)/$1 /gi; #add a space after commas
$text =~ s/(?<=[a-z])([.?!])(?=[A-Z])/$1 /i; #add a space between sentences
$text =~ s/[\s\xA0]+/ /g; #replace all whitespace (including newlines) with a single space
$text =~ s/^\s//; #remove leading whitespace
$text =~ s/\s$//; #remove trailing whitespace
$text =~ s{ [\x00-\x1F] }{}gxms; #strip unprintable characters
HTML::Entities::encode_numeric($text);
$text =~ s/‘/‘/g;
$text =~ s/’/’/g;
$text =~ s/“/“/g;
$text =~ s/”/”/g;
$text =~ s/•/•/g;
$text =~ s/–/–/g;
$text =~ s/—/—/g;
$text =~ s/[|]/|/g; # don't break pipe files!
return $text;
}
Encoding Translation
A Perl module I used to use to get text out of the library catalogue for use on the web.
package PLCH::MARCtoLatin;
use 5.008007;
use warnings;
use strict;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw( CharConv );
# Within your program, you would pass the function a
# string that you would like to be converted.
# Something like this...
# my $ConvertedAuthor = CharConv($author);
sub CharConv {
my $line = shift;
# Check for high-order characters ( > hex 7F). If none found,
# no conversion needed. (Added by MK, 2005)
# Added check for \x1B - RRI 2007
return ($line) unless ( $line && $line =~ /[\x1B\x80-\xff]/ );
# Convert input string to hexadecimal
$line =~ s/(.)/sprintf ("%%%X", ord($1))/eg;
# This subroutine takes data encoded with the MARC-8
# character set and (as best as can be done) maps it
# to the ISO 8859-1 (Latin-1) character set.
# First, combined MARC-8 characters that don't
# correspond to any Latin-1 characters are removed.
# Then combined MARC characters that correspond to
# extended Latin-1 characters are replaced with the
# appropriate values. This process is then repeated
# for single values.
my @orphan_chars_combined = (
'%1B%67%61%1B%73', '%1B%67%62%1B%73', '%1B%67%63%1B%73', '%1B%62%30%1B%73', '%1B%62%31%1B%73', '%1B%62%32%1B%73',
'%1B%62%33%1B%73', '%1B%62%34%1B%73', '%1B%62%35%1B%73', '%1B%62%36%1B%73', '%1B%62%37%1B%73', '%1B%62%38%1B%73',
'%1B%62%39%1B%73', '%1B%62%28%1B%73', '%1B%62%2B%1B%73', '%1B%62%29%1B%73', '%1B%70%30%1B%73', '%1B%70%34%1B%73',
'%1B%70%35%1B%73', '%1B%70%36%1B%73', '%1B%70%37%1B%73', '%1B%70%38%1B%73', '%1B%70%39%1B%73', '%1B%70%28%1B%73',
'%1B%70%2D%1B%73', '%1B%70%2B%1B%73', '%1B%70%29%1B%73',
'%E0%E6', '%E1%E3', '%E1%E5', '%E1%E6', '%E1%E8', '%E2%A5',
'%E2%B5', '%E2%E4', '%E2%E5', '%E2%E6', '%E2%E8', '%E2%EA',
'%E2%F0', '%E3%E0', '%E3%E1', '%E3%E2', '%E3%F2', '%E4%E3',
'%E4%E6', '%E5%E4', '%E5%E7', '%E5%E8', '%E5%A5', '%E5%B5',
'%E5%F1', '%E5%F2', '%E6%F0', '%E6%F2', '%E7%E2', '%E7%F2',
'%E8%E4', '%E8%E5', '%E9%E8',
);
my @orphan_chars_single = (
'%A1', '%A7', '%A9', '%AE', '%AF', '%B0', '%B1', '%B3', '%B7', '%B8', '%BB',
'%BE', '%BF', '%C1', '%C2', '%C4', '%E0', '%E5', '%E6', '%E7', '%E9', '%EB', '%EC', '%ED', '%EE',
'%EF', '%F1', '%F2', '%F3', '%F4', '%F5', '%F6', '%F7', '%F8', '%F9', '%FA', '%FB', '%FC', '%FD', '%FE',
);
my %marc8_to_latin1_combined = (
'%1B%70%31%1B%73' => 'B9', # Superscript 1
'%1B%70%32%1B%73' => 'B2', # Superscript 2
'%1B%70%33%1B%73' => 'B3', # Superscript 3
'%E1%41' => 'C0', # A grave
'%E2%41' => 'C1', # A acute
'%E3%41' => 'C2', # A circumflex
'%E4%41' => 'C3', # A tilde
'%E8%41' => 'C4', # A umlaut
'%EA%41' => 'C5', # A ring
'%E2%43' => '43', # C
'%E3%43' => '43', # C
'%F0%43' => 'C7', # C cedilla
'%E1%45' => 'C8', # E grave
'%E2%45' => 'C9', # E acute
'%E3%45' => 'CA', # E circumflex
'%E4%45' => '45', # E
'%E8%45' => 'CB', # E umlaut
'%F0%45' => '45', # E
'%E2%47' => '47', # G
'%E3%47' => '47', # G
'%F0%47' => '47', # G
'%E3%48' => '48', # H
'%E8%48' => '48', # H
'%F0%48' => '48', # H
'%E1%49' => 'CC', # I grave
'%E2%49' => 'CD', # I acute
'%E3%49' => 'CE', # I circumflex
'%E4%49' => '49', # I
'%E8%49' => 'CF', # I umlaut
'%E3%4A' => '4A', # J
'%E2%4B' => '4B', # K
'%E3%4B' => '4B', # K
'%F0%4B' => '4B', # K
'%F2%4B' => '4B', # K
'%E2%4C' => '4C', # L
'%E3%4C' => '4C', # L
'%F0%4C' => '4C', # L
'%E2%4D' => '4D', # M
'%E1%4E' => '4E', # N
'%E2%4E' => '4E', # N
'%E4%4E' => 'D1', # N tilde
'%F0%4E' => '4E', # N
'%E1%4F' => 'D2', # O grave
'%E2%4F' => 'D3', # O acute
'%E3%4F' => 'D4', # O circumflex
'%E4%4F' => 'D5', # O tilde
'%E8%4F' => 'D6', # O umlaut
'%E2%50' => '50', # P
'%E2%52' => '52', # R
'%E2%53' => '53', # S
'%E3%53' => '53', # S
'%F0%53' => '53', # S
'%F0%54' => '54', # T
'%E1%55' => 'D9', # U grave
'%E2%55' => 'DA', # U acute
'%E3%55' => 'DB', # U circumflex
'%E4%55' => '55', # U
'%E8%55' => 'DC', # U umlaut
'%EA%55' => '55', # U
'%E4%56' => '56', # V
'%E1%57' => '57', # W
'%E2%57' => '57', # W
'%E3%57' => '57', # W
'%E8%57' => '57', # W
'%E8%58' => '58', # X
'%E1%59' => '59', # Y
'%E2%59' => 'DD', # Y acute
'%E3%59' => '59', # Y
'%E4%59' => '59', # Y
'%E8%59' => '59', # Y
'%E2%5A' => '5A', # Z
'%E3%5A' => '5A', # Z
'%E1%61' => 'E0', # a grave
'%E2%61' => 'E1', # a acute
'%E3%61' => 'E2', # a circumflex
'%E4%61' => 'E3', # a tilde
'%E8%61' => 'E4', # a umlaut
'%EA%61' => 'E5', # a ring
'%E2%63' => '63', # c
'%E3%63' => '63', # c
'%F0%63' => 'E7', # c cedilla
'%E1%65' => 'E8', # e grave
'%E2%65' => 'E9', # e acute
'%E3%65' => 'EA', # e circumflex
'%E4%65' => '65', # e
'%E8%65' => 'EB', # e umlaut
'%F0%65' => '65', # e
'%E2%67' => '67', # g
'%E3%67' => '67', # g
'%F0%67' => '67', # g
'%E3%68' => '68', # h
'%E8%68' => '68', # h
'%F0%68' => '68', # h
'%E1%69' => 'EC', # i grave
'%E2%69' => 'ED', # i acute
'%E3%69' => 'EE', # i circumflex
'%E4%69' => '69', # i
'%E8%69' => 'EF', # i umlaut
'%E3%6A' => '6A', # j
'%E2%6B' => '6B', # k
'%E3%6B' => '6B', # k
'%F0%6B' => '6B', # k
'%F2%6B' => '6B', # k
'%E2%6C' => '6C', # l
'%E3%6C' => '6C', # l
'%F0%6C' => '6C', # l
'%E2%6D' => '6D', # m
'%E1%6E' => '6E', # n
'%E2%6E' => '6E', # n
'%E4%6E' => 'F1', # n tilde
'%F0%6E' => '6E', # n
'%E1%6F' => 'F2', # o grave
'%E2%6F' => 'F3', # o acute
'%E3%6F' => 'F4', # o circumflex
'%E4%6F' => 'F5', # o tilde
'%E8%6F' => 'F6', # o umlaut
'%E2%70' => '70', # p
'%E2%72' => '72', # r
'%E2%73' => '73', # s
'%E3%73' => '73', # s
'%F0%73' => '73', # s
'%E8%74' => '74', # t
'%F0%74' => '74', # t
'%E1%75' => 'F9', # u grave
'%E2%75' => 'FA', # u acute
'%E3%75' => 'FB', # u circumflex
'%E4%75' => '75', # u
'%E8%75' => 'FC', # u umlaut
'%EA%75' => '75', # u
'%E4%76' => '76', # v
'%E1%77' => '77', # w
'%E2%77' => '77', # w
'%E3%77' => '77', # w
'%E8%77' => '77', # w
'%EA%77' => '77', # w
'%E8%78' => '78', # x
'%E1%79' => '79', # y
'%E2%79' => 'FD', # y acute
'%E3%79' => '79', # y
'%E4%79' => '79', # y
'%E8%79' => '79', # y
'%EA%79' => '79', # y
'%E8%79' => 'FF', # y umlaut
'%E2%7A' => '7A', # z
'%E3%7A' => '7A', # z
'%E2%A2' => '4F', # O
'%E1%AC' => '4F', # O
'%E2%AC' => '4F', # O
'%E4%AC' => '4F', # O
'%E1%AD' => '55', # U
'%E2%AD' => '55', # U
'%E4%AD' => '55', # U
'%E2%B2' => '6F', # o
'%E1%BC' => '6F', # o
'%E2%BC' => '6F', # o
'%E4%BC' => '6F', # o
'%E1%BD' => '75', # u
'%E2%BD' => '75', # u
'%E4%BD' => '75', # u
);
my %marc8_to_latin1_single = (
'%A2' => 'D8', # O slash
'%A3' => 'D0', # ETH
'%A4' => 'DE', # THORN
'%A5' => 'C6', # AE ligature
'%A6' => '4F', # OE ligature -> O
'%A8' => 'B7', # Middle dot
'%AA' => 'AE', # Registered trademark
'%AB' => 'B1', # Plus-minus sign
'%AC' => '4F', # O
'%AD' => '55', # U
'%B2' => 'F8', # o slash
'%B4' => 'FE', # thorn
'%B5' => 'E6', # ae ligature
'%B6' => '6F', # oe ligature -> o
'%B9' => 'A3', # Pound sign
'%BA' => 'F0', # eth
'%BC' => '6F', # o
'%BD' => '75', # u
'%C0' => 'B0', # Degree sign
'%C3' => 'A9', # Copyright sign
'%C5' => 'BF', # Inverted question mark
'%C6' => 'A1', # Inverted exclamation
'%C7' => 'DF', # sharp s
'%EA' => 'B0', # Degree sign
);
foreach my $char1 (@orphan_chars_combined) {
$line =~ s/$char1//g;
}
foreach my $marc_char1 ( keys(%marc8_to_latin1_combined) ) {
$line =~ s/$marc_char1/pack("C", hex($marc8_to_latin1_combined{$marc_char1}))/eg;
}
foreach my $char2 (@orphan_chars_single) {
$line =~ s/$char2//g;
}
foreach my $marc_char2 ( keys(%marc8_to_latin1_single) ) {
$line =~ s/$marc_char2/pack("C", hex($marc8_to_latin1_single{$marc_char2}))/eg;
}
$line =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/eg;
return ($line);
}
1;