Text Cleaning & Character Encoding

Text Cleaning

We used to get book reviews and summaries from a “service” called syndetics. The text was a horrible mess of mixed and broken encodings. This was my attempt to make some sense of it.

sub clean_summary {
  my $text = HTML::Entities::decode( shift );

  #attempt to make some sense of the mess that is text content from syndetics
  $text =~ s/&amp;/&/ig;        #ampersands

  $text =~ s{#xC3;#xA2;#xC2;#x80;#xC2;#x94;}{&mdash;}g;
  $text =~ s{#xC3;#xA2;#xC2;#x80;#xC2;#x99;}{&rsquo;}g;
  $text =~ s{#xC3;#xA2;#xC2;#x80;#xC2;#x9C;}{&ldquo;}g;
  $text =~ s{#xC3;#xA2;#xC2;#x80;#xC2;#x9D;}{&rdquo;}g;
  $text =~ s{\xF4([^\xF6]+)\xF6}{&ldquo;$1&rdquo;}g; #bizarrely-encoded quotes

  $text =~ s{\xC3\xB6}{&ouml;}g; #unconverted UTF-8 ?

  $text =~ s{#xE2;#xAC;#x1D;}{&mdash;}g;
  $text =~ s{#xE2;#xAC;S}{&ldquo;}g;
  $text =~ s{#xE2;#xAC;}{&rdquo;}g;

  $text =~ s/#xC2;#x1[34];/&mdash;/ig;
  $text =~ s/#xC2;#x18;/&lsquo;/ig;
  $text =~ s/#xC2;#x19;/&rsquo;/ig;
  $text =~ s/#xC2;#x1C;/&ldquo;/ig;
  $text =~ s/#xC2;#x1D;/&rdquo;/ig;
  $text =~ s/#xC2;&/&hellip;/ig;

  $text =~ s/(?<!&)#x13;/-/ig;       #hyphen
  $text =~ s/(?<!&)#x14;/&mdash;/ig; #mdash
  $text =~ s/(?<!&)#x1[89];/'/ig;    #single quote
  $text =~ s/(?<!&)#x1C;/&ldquo;/ig; #left double quote
  $text =~ s/(?<!&)#x1D;/&rdquo;/ig; #right double quote
  $text =~ s/(?<!&)#xD1;/&mdash;/ig;
  $text =~ s/(?<!&)#xD5;/&rsquo;/ig;

  $text =~ s/&?#x([0-9a-f]{2});/chr(hex($1))/ige; #syndetic-encoded hex character entities
  $text =~ s/&?#([0-9]{2,4});/chr($1)/ige;        #syndetic-encoded decimal character entities

  $text =~ s/(?<!&)(lsquo|rsquo|ldquo|rdquo|mdash|ndash|bull|middot|hellip);/&$1;/ig; #entities missing ampersands

  HTML::Entities::decode( $text ); #again, to catch the syndetic to standard conversions

  $text =~ s/<[^>]+>/ /g;  #strip HTML-like tags (so many of which don't have proper spacing around them...)
  $text =~ s/\s+([,;:])/$1/gi; #remove space before commas
  $text =~ s/([,;:])(?=\w)/$1 /gi; #add a space after commas
  $text =~ s/(?<=[a-z])([.?!])(?=[A-Z])/$1 /i; #add a space between sentences
  $text =~ s/[\s\xA0]+/ /g;     #replace all whitespace (including newlines) with a single space
  $text =~ s/^\s//;       #remove leading whitespace
  $text =~ s/\s$//;       #remove trailing whitespace
  $text =~ s{ [\x00-\x1F] }{}gxms; #strip unprintable characters

  HTML::Entities::encode_numeric($text);
  $text =~ s/&#145;/&#8216;/g;
  $text =~ s/&#146;/&#8217;/g;
  $text =~ s/&#147;/&#8220;/g;
  $text =~ s/&#148;/&#8221;/g;
  $text =~ s/&#149;/&#8226;/g;
  $text =~ s/&#150;/&#8211;/g;
  $text =~ s/&#151;/&#8212;/g;
  $text =~ s/[|]/&#x7C;/g;      # don't break pipe files!

  return $text;
}

Encoding Translation

A Perl module I used to use to get text out of the library catalogue for use on the web.

package PLCH::MARCtoLatin;

use 5.008007;
use warnings;
use strict;

require Exporter;
our @ISA    = qw(Exporter);
our @EXPORT = qw( CharConv );

#  Within your program, you would pass the function a
#  string that you would like to be converted.
#  Something like this...
#   my $ConvertedAuthor = CharConv($author);

sub CharConv {

    my $line = shift;

    # Check for high-order characters ( > hex 7F).  If none found,
    # no conversion needed.  (Added by MK, 2005)
    # Added check for \x1B - RRI 2007
    return ($line) unless ( $line && $line =~ /[\x1B\x80-\xff]/ );

    # Convert input string to hexadecimal
    $line =~ s/(.)/sprintf ("%%%X", ord($1))/eg;

    # This subroutine takes data encoded with the MARC-8
    # character set and (as best as can be done) maps it
    # to the ISO 8859-1 (Latin-1) character set.

    # First, combined MARC-8 characters that don't
    # correspond to any Latin-1 characters are removed.
    # Then combined MARC characters that correspond to
    # extended Latin-1 characters are replaced with the
    # appropriate values. This process is then repeated
    # for single values.

    my @orphan_chars_combined = (
        '%1B%67%61%1B%73', '%1B%67%62%1B%73', '%1B%67%63%1B%73', '%1B%62%30%1B%73', '%1B%62%31%1B%73', '%1B%62%32%1B%73',
        '%1B%62%33%1B%73', '%1B%62%34%1B%73', '%1B%62%35%1B%73', '%1B%62%36%1B%73', '%1B%62%37%1B%73', '%1B%62%38%1B%73',
        '%1B%62%39%1B%73', '%1B%62%28%1B%73', '%1B%62%2B%1B%73', '%1B%62%29%1B%73', '%1B%70%30%1B%73', '%1B%70%34%1B%73',
        '%1B%70%35%1B%73', '%1B%70%36%1B%73', '%1B%70%37%1B%73', '%1B%70%38%1B%73', '%1B%70%39%1B%73', '%1B%70%28%1B%73',
        '%1B%70%2D%1B%73', '%1B%70%2B%1B%73', '%1B%70%29%1B%73',
        '%E0%E6',          '%E1%E3',          '%E1%E5',          '%E1%E6',          '%E1%E8',          '%E2%A5',
        '%E2%B5',          '%E2%E4',          '%E2%E5',          '%E2%E6',          '%E2%E8',          '%E2%EA',
        '%E2%F0',          '%E3%E0',          '%E3%E1',          '%E3%E2',          '%E3%F2',          '%E4%E3',
        '%E4%E6',          '%E5%E4',          '%E5%E7',          '%E5%E8',          '%E5%A5',          '%E5%B5',
        '%E5%F1',          '%E5%F2',          '%E6%F0',          '%E6%F2',          '%E7%E2',          '%E7%F2',
        '%E8%E4',          '%E8%E5',          '%E9%E8',
    );

    my @orphan_chars_single = (
        '%A1', '%A7', '%A9', '%AE', '%AF', '%B0', '%B1', '%B3', '%B7', '%B8', '%BB',
        '%BE', '%BF', '%C1', '%C2', '%C4', '%E0', '%E5', '%E6', '%E7', '%E9', '%EB', '%EC', '%ED', '%EE',
        '%EF', '%F1', '%F2', '%F3', '%F4', '%F5', '%F6', '%F7', '%F8', '%F9', '%FA', '%FB', '%FC', '%FD', '%FE',
    );

    my %marc8_to_latin1_combined = (
        '%1B%70%31%1B%73' => 'B9', # Superscript 1
        '%1B%70%32%1B%73' => 'B2', # Superscript 2
        '%1B%70%33%1B%73' => 'B3', # Superscript 3
        '%E1%41'          => 'C0', # A grave
        '%E2%41'          => 'C1', # A acute
        '%E3%41'          => 'C2', # A circumflex
        '%E4%41'          => 'C3', # A tilde
        '%E8%41'          => 'C4', # A umlaut
        '%EA%41'          => 'C5', # A ring
        '%E2%43'          => '43', # C
        '%E3%43'          => '43', # C
        '%F0%43'          => 'C7', # C cedilla
        '%E1%45'          => 'C8', # E grave
        '%E2%45'          => 'C9', # E acute
        '%E3%45'          => 'CA', # E circumflex
        '%E4%45'          => '45', # E
        '%E8%45'          => 'CB', # E umlaut
        '%F0%45'          => '45', # E
        '%E2%47'          => '47', # G
        '%E3%47'          => '47', # G
        '%F0%47'          => '47', # G
        '%E3%48'          => '48', # H
        '%E8%48'          => '48', # H
        '%F0%48'          => '48', # H
        '%E1%49'          => 'CC', # I grave
        '%E2%49'          => 'CD', # I acute
        '%E3%49'          => 'CE', # I circumflex
        '%E4%49'          => '49', # I
        '%E8%49'          => 'CF', # I umlaut
        '%E3%4A'          => '4A', # J
        '%E2%4B'          => '4B', # K
        '%E3%4B'          => '4B', # K
        '%F0%4B'          => '4B', # K
        '%F2%4B'          => '4B', # K
        '%E2%4C'          => '4C', # L
        '%E3%4C'          => '4C', # L
        '%F0%4C'          => '4C', # L
        '%E2%4D'          => '4D', # M
        '%E1%4E'          => '4E', # N
        '%E2%4E'          => '4E', # N
        '%E4%4E'          => 'D1', # N tilde
        '%F0%4E'          => '4E', # N
        '%E1%4F'          => 'D2', # O grave
        '%E2%4F'          => 'D3', # O acute
        '%E3%4F'          => 'D4', # O circumflex
        '%E4%4F'          => 'D5', # O tilde
        '%E8%4F'          => 'D6', # O umlaut
        '%E2%50'          => '50', # P
        '%E2%52'          => '52', # R
        '%E2%53'          => '53', # S
        '%E3%53'          => '53', # S
        '%F0%53'          => '53', # S
        '%F0%54'          => '54', # T
        '%E1%55'          => 'D9', # U grave
        '%E2%55'          => 'DA', # U acute
        '%E3%55'          => 'DB', # U circumflex
        '%E4%55'          => '55', # U
        '%E8%55'          => 'DC', # U umlaut
        '%EA%55'          => '55', # U
        '%E4%56'          => '56', # V
        '%E1%57'          => '57', # W
        '%E2%57'          => '57', # W
        '%E3%57'          => '57', # W
        '%E8%57'          => '57', # W
        '%E8%58'          => '58', # X
        '%E1%59'          => '59', # Y
        '%E2%59'          => 'DD', # Y acute
        '%E3%59'          => '59', # Y
        '%E4%59'          => '59', # Y
        '%E8%59'          => '59', # Y
        '%E2%5A'          => '5A', # Z
        '%E3%5A'          => '5A', # Z
        '%E1%61'          => 'E0', # a grave
        '%E2%61'          => 'E1', # a acute
        '%E3%61'          => 'E2', # a circumflex
        '%E4%61'          => 'E3', # a tilde
        '%E8%61'          => 'E4', # a umlaut
        '%EA%61'          => 'E5', # a ring
        '%E2%63'          => '63', # c
        '%E3%63'          => '63', # c
        '%F0%63'          => 'E7', # c cedilla
        '%E1%65'          => 'E8', # e grave
        '%E2%65'          => 'E9', # e acute
        '%E3%65'          => 'EA', # e circumflex
        '%E4%65'          => '65', # e
        '%E8%65'          => 'EB', # e umlaut
        '%F0%65'          => '65', # e
        '%E2%67'          => '67', # g
        '%E3%67'          => '67', # g
        '%F0%67'          => '67', # g
        '%E3%68'          => '68', # h
        '%E8%68'          => '68', # h
        '%F0%68'          => '68', # h
        '%E1%69'          => 'EC', # i grave
        '%E2%69'          => 'ED', # i acute
        '%E3%69'          => 'EE', # i circumflex
        '%E4%69'          => '69', # i
        '%E8%69'          => 'EF', # i umlaut
        '%E3%6A'          => '6A', # j
        '%E2%6B'          => '6B', # k
        '%E3%6B'          => '6B', # k
        '%F0%6B'          => '6B', # k
        '%F2%6B'          => '6B', # k
        '%E2%6C'          => '6C', # l
        '%E3%6C'          => '6C', # l
        '%F0%6C'          => '6C', # l
        '%E2%6D'          => '6D', # m
        '%E1%6E'          => '6E', # n
        '%E2%6E'          => '6E', # n
        '%E4%6E'          => 'F1', # n tilde
        '%F0%6E'          => '6E', # n
        '%E1%6F'          => 'F2', # o grave
        '%E2%6F'          => 'F3', # o acute
        '%E3%6F'          => 'F4', # o circumflex
        '%E4%6F'          => 'F5', # o tilde
        '%E8%6F'          => 'F6', # o umlaut
        '%E2%70'          => '70', # p
        '%E2%72'          => '72', # r
        '%E2%73'          => '73', # s
        '%E3%73'          => '73', # s
        '%F0%73'          => '73', # s
        '%E8%74'          => '74', # t
        '%F0%74'          => '74', # t
        '%E1%75'          => 'F9', # u grave
        '%E2%75'          => 'FA', # u acute
        '%E3%75'          => 'FB', # u circumflex
        '%E4%75'          => '75', # u
        '%E8%75'          => 'FC', # u umlaut
        '%EA%75'          => '75', # u
        '%E4%76'          => '76', # v
        '%E1%77'          => '77', # w
        '%E2%77'          => '77', # w
        '%E3%77'          => '77', # w
        '%E8%77'          => '77', # w
        '%EA%77'          => '77', # w
        '%E8%78'          => '78', # x
        '%E1%79'          => '79', # y
        '%E2%79'          => 'FD', # y acute
        '%E3%79'          => '79', # y
        '%E4%79'          => '79', # y
        '%E8%79'          => '79', # y
        '%EA%79'          => '79', # y
        '%E8%79'          => 'FF', # y umlaut
        '%E2%7A'          => '7A', # z
        '%E3%7A'          => '7A', # z
        '%E2%A2'          => '4F', # O
        '%E1%AC'          => '4F', # O
        '%E2%AC'          => '4F', # O
        '%E4%AC'          => '4F', # O
        '%E1%AD'          => '55', # U
        '%E2%AD'          => '55', # U
        '%E4%AD'          => '55', # U
        '%E2%B2'          => '6F', # o
        '%E1%BC'          => '6F', # o
        '%E2%BC'          => '6F', # o
        '%E4%BC'          => '6F', # o
        '%E1%BD'          => '75', # u
        '%E2%BD'          => '75', # u
        '%E4%BD'          => '75', # u
    );

    my %marc8_to_latin1_single = (
        '%A2' => 'D8', # O slash
        '%A3' => 'D0', # ETH
        '%A4' => 'DE', # THORN
        '%A5' => 'C6', # AE ligature
        '%A6' => '4F', # OE ligature -> O
        '%A8' => 'B7', # Middle dot
        '%AA' => 'AE', # Registered trademark
        '%AB' => 'B1', # Plus-minus sign
        '%AC' => '4F', # O
        '%AD' => '55', # U
        '%B2' => 'F8', # o slash
        '%B4' => 'FE', # thorn
        '%B5' => 'E6', # ae ligature
        '%B6' => '6F', # oe ligature -> o
        '%B9' => 'A3', # Pound sign
        '%BA' => 'F0', # eth
        '%BC' => '6F', # o
        '%BD' => '75', # u
        '%C0' => 'B0', # Degree sign
        '%C3' => 'A9', # Copyright sign
        '%C5' => 'BF', # Inverted question mark
        '%C6' => 'A1', # Inverted exclamation
        '%C7' => 'DF', # sharp s
        '%EA' => 'B0', # Degree sign
    );

    foreach my $char1 (@orphan_chars_combined) {
        $line =~ s/$char1//g;
    }
    foreach my $marc_char1 ( keys(%marc8_to_latin1_combined) ) {
        $line =~ s/$marc_char1/pack("C", hex($marc8_to_latin1_combined{$marc_char1}))/eg;
    }
    foreach my $char2 (@orphan_chars_single) {
        $line =~ s/$char2//g;
    }
    foreach my $marc_char2 ( keys(%marc8_to_latin1_single) ) {
        $line =~ s/$marc_char2/pack("C", hex($marc8_to_latin1_single{$marc_char2}))/eg;
    }

    $line =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/eg;

    return ($line);
}

1;