365 lines
9.8 KiB
Perl
365 lines
9.8 KiB
Perl
#! /bin/false
|
|
|
|
# vim: set autoindent shiftwidth=4 tabstop=4:
|
|
|
|
# Portable character conversion for Perl.
|
|
# Copyright (C) 2002-2017 Guido Flohr <guido.flohr@cantanea.com>,
|
|
# all rights reserved.
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
package Locale::Recode;
|
|
|
|
use strict;
|
|
|
|
require Locale::Recode::_Conversions;
|
|
|
|
my $loaded = {};
|
|
my $has_encode;
|
|
|
|
sub new
|
|
{
|
|
my $class = ref($_[0]) || $_[0];
|
|
shift;
|
|
my %args = @_;
|
|
|
|
my $self = bless {}, $class;
|
|
|
|
my ($from_codeset, $to_codeset) = @args{qw (from to)};
|
|
|
|
unless ($from_codeset && $to_codeset) {
|
|
require Carp;
|
|
Carp::croak (<<EOF);
|
|
Usage: $class->new (from => FROM_CODESET, to => TO_CODESET);
|
|
EOF
|
|
}
|
|
|
|
# Find a conversion path.
|
|
my $path = Locale::Recode::_Conversions->findPath ($from_codeset,
|
|
$to_codeset);
|
|
unless ($path) {
|
|
$self->{__error} = 'EINVAL';
|
|
return $self;
|
|
}
|
|
|
|
my @conversions = ();
|
|
foreach (@$path) {
|
|
my ($module, $from, $to) = @$_;
|
|
|
|
unless ($loaded->{$module}) {
|
|
eval "require Locale::RecodeData::$module";
|
|
if ($@) {
|
|
$self->{__error} = $@;
|
|
return $self;
|
|
}
|
|
|
|
$loaded->{$module} = 1;
|
|
}
|
|
|
|
my $module_name = "Locale::RecodeData::$module";
|
|
my $method = 'new';
|
|
my $object = $module_name->$method (from => $from,
|
|
to => $to);
|
|
|
|
push @conversions, $object;
|
|
}
|
|
|
|
$self->{__conversions} = \@conversions;
|
|
|
|
return $self;
|
|
}
|
|
|
|
sub resolveAlias
|
|
{
|
|
my ($class, $alias) = @_;
|
|
|
|
return Locale::Recode::_Conversions->resolveAlias ($alias);
|
|
}
|
|
|
|
sub getSupported
|
|
{
|
|
return [ Locale::Recode::_Conversions->listSupported ];
|
|
}
|
|
|
|
sub getCharsets
|
|
{
|
|
my $self = shift;
|
|
my %all = map { $_ => 1 } @{&getSupported};
|
|
|
|
require Locale::Recode::_Aliases;
|
|
|
|
my $conversions = Locale::Recode::_Conversions->listSupported;
|
|
foreach my $charset (keys %{Locale::Recode::_Aliases::ALIASES()}) {
|
|
my $mime_name = $self->resolveAlias ($charset);
|
|
next unless exists $all{$mime_name};
|
|
$all{$charset} = 1;
|
|
}
|
|
|
|
return [ keys %all ];
|
|
}
|
|
|
|
sub recode
|
|
{
|
|
my $self = $_[0];
|
|
|
|
return if $self->{__error};
|
|
|
|
return 1 unless defined $_[1];
|
|
|
|
my $chain = $self->{__conversions};
|
|
|
|
foreach my $module (@$chain) {
|
|
my $success = $module->_recode ($_[1]);
|
|
|
|
unless ($success) {
|
|
$self->{__error} = $module->_getError;
|
|
return;
|
|
}
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
sub getError
|
|
{
|
|
my $self = shift;
|
|
my $error = $self->{__error} or return;
|
|
|
|
if ('EINVAL' eq $error) {
|
|
return 'Invalid conversion';
|
|
} else {
|
|
return $error;
|
|
}
|
|
}
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
Locale::Recode - Object-Oriented Portable Charset Conversion
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use Locale::Recode;
|
|
|
|
$cd = Locale::Recode->new (from => 'UTF-8',
|
|
to => 'ISO-8859-1');
|
|
|
|
die $cd->getError if $cd->getError;
|
|
|
|
$cd->recode ($text) or die $cd->getError;
|
|
|
|
$mime_name = Locale::Recode->resolveAlias ('latin-1');
|
|
|
|
$supported = Locale::Recode->getSupported;
|
|
|
|
$complete = Locale::Recode->getCharsets;
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This module provides routines that convert textual data from one
|
|
codeset to another in a portable way. The module has been started
|
|
before Encode(3) was written. It's main purpose today is to provide
|
|
charset conversion even when Encode(3) is not available on the system.
|
|
It should also work for older Perl versions without Unicode support.
|
|
|
|
Internally Locale::Recode(3) will use Encode(3) whenever possible,
|
|
to allow for a faster conversion and for a wider range of supported
|
|
charsets, and will only fall back to the Perl implementation when
|
|
Encode(3) is not available or does not support a particular charset
|
|
that Locale::Recode(3) does.
|
|
|
|
Locale::Recode(3) is part of libintl-perl, and it's main purpose is
|
|
actually to implement a portable charset conversion framework for
|
|
the message translation facilities described in Locale::TextDomain(3).
|
|
|
|
=head1 CONSTRUCTOR
|
|
|
|
The constructor C<new()> requires two named arguments:
|
|
|
|
=over 4
|
|
|
|
=item B<from>
|
|
|
|
The encoding of the original data. Case doesn't matter, aliases
|
|
are resolved.
|
|
|
|
=item B<to>
|
|
|
|
The target encoding. Again, case doesn't matter, and aliases
|
|
are resolved.
|
|
|
|
=back
|
|
|
|
The constructor will never fail. In case of an error, the object's
|
|
internal state is set to bad and it will refuse to do any conversions.
|
|
You can inquire the reason for the failure with the method
|
|
getError().
|
|
|
|
=head1 OBJECT METHODS
|
|
|
|
The following object methods are available.
|
|
|
|
=over 4
|
|
|
|
=item B<recode (STRING)>
|
|
|
|
Converts B<STRING> from the source encoding into the destination
|
|
encoding. In case of success, a truth value is returned, false
|
|
otherwise. You can inquire the reason for the failure with the
|
|
method getError().
|
|
|
|
=item B<getError>
|
|
|
|
Returns either false if the object is not in an error state or
|
|
an error message.
|
|
|
|
=back
|
|
|
|
=head1 CLASS METHODS
|
|
|
|
The object provides some additional class methods:
|
|
|
|
=over 4
|
|
|
|
=item B<getSupported>
|
|
|
|
Returns a reference to a list of all supported charsets. This
|
|
may implicitly load additional Encode(3) conversions like
|
|
Encode::HanExtra(3) which may produce considerable load on your
|
|
system.
|
|
|
|
The method is therefore not intended for regular use but rather
|
|
for getting resp. displaying I<once> a list of available encodings.
|
|
|
|
The members of the list are all converted to uppercase!
|
|
|
|
=item B<getCharsets>
|
|
|
|
Like getSupported() but also returns all available aliases.
|
|
|
|
=back
|
|
|
|
=head1 SUPPORTED CHARSETS
|
|
|
|
The range of supported charsets is system-dependent. The following
|
|
somewhat special charsets are always available:
|
|
|
|
=over 4
|
|
|
|
=item B<UTF-8>
|
|
|
|
UTF-8 is available independently of your Perl version. For Perl 5.6
|
|
or better or in the presence of Encode(3), conversions are not done
|
|
in Perl but with the interfaces provided by these facilities which
|
|
are written in C, hence much faster.
|
|
|
|
Encoding data I<into> UTF-8 is fast, even if it is done in Perl.
|
|
Decoding it in Perl may become quite slow. If you frequently have
|
|
to decode UTF-8 with B<Locale::Recode> you will probably want to
|
|
make sure that you do that with Perl 5.6 or beter, or install Encode(3) to
|
|
speed up things.
|
|
|
|
=item B<INTERNAL>
|
|
|
|
UTF-8 is fast to write but hard to read for applications. It is
|
|
therefore not the worst for internal string representation but not
|
|
far from that. Locale::Recode(3) stores strings internally as a
|
|
reference to an array of integer values like most programming languages
|
|
(Perl is an exception) do, trading memory for performance.
|
|
|
|
The integer values are the UCS-4 codes of the characters in host
|
|
byte order.
|
|
|
|
The encoding B<INTERNAL> is directly available via Locale::Recode(3)
|
|
but of course you should not really use it for data exchange, unless
|
|
you know what you are doing.
|
|
|
|
=back
|
|
|
|
Locale::Recode(3) has native support for a plethora of other encodings,
|
|
most of them 8 bit encodings that are fast to decode, including most
|
|
encodings used on popular micros like the ISO-8859-* series of encodings,
|
|
most Windows-* encodings (also known as CP*), Macintosh, Atari, etc.
|
|
|
|
=head1 NAMES AND ALIASES
|
|
|
|
Each charset resp. encoding is available internally under a unique
|
|
name. Whenever the information was available, the preferred MIME name
|
|
(see L<http://www.iana.org/assignments/character-sets/>) was chosen as
|
|
the internal name.
|
|
|
|
Alias handling is quite strict. The module does not make wild guesses
|
|
at what you mean ("What's the meaning of the acronym JIS" is a valid
|
|
alias for "7bit-jis" in Encode(3) ....) but aims at providing common
|
|
aliases only. The same applies to so-called aliases that are really
|
|
mistakes, like "utf8" for UTF-8.
|
|
|
|
The module knows all aliases that are listed with the IANA character
|
|
set registry (L<http://www.iana.org/assignments/character-sets/>), plus
|
|
those known to libiconv version 1.8, and a bunch of additional ones.
|
|
|
|
=head1 CONVERSION TABLES
|
|
|
|
The conversion tables have either been taken from official sources
|
|
like the IANA or the Unicode Consortium, from Bruno Haible's libiconv,
|
|
or from the sources of the GNU libc and the regression tests for
|
|
libintl-perl will check for conformance here. For some encodings this data
|
|
differs from Encode(3)'s data which would cause these tests to fail.
|
|
In these cases, the module will not invoke the Encode(3) methods, but
|
|
will fall back to the internal implementation for the sake of
|
|
consistency.
|
|
|
|
The few encodings that are affected are so simple that you will not
|
|
experience any real performance penalty unless you convert large chunks
|
|
of data. But the package is not really intended for such use anyway, and
|
|
since Encode(3) is relatively new, I rather think that the differences
|
|
are bugs in Encode which will be fixed soon.
|
|
|
|
=head1 BUGS
|
|
|
|
The module should provide fall back conversions for other Unicode
|
|
encoding schemes like UCS-2, UCS-4 (big- and little-endian).
|
|
|
|
The pure Perl UTF-8 decoder will not always handle corrupt UTF-8
|
|
correctly, especially at the end and at the beginning of the string.
|
|
This is not likely to be fixed, since the module's intention is not
|
|
to be a consistency checker for UTF-8 data.
|
|
|
|
=head1 AUTHOR
|
|
|
|
Copyright (C) 2002-2017 L<Guido Flohr|http://www.guido-flohr.net/>
|
|
(L<mailto:guido.flohr@cantanea.com>), all rights reserved. See the source
|
|
code for details!code for details!
|
|
|
|
=head1 SEE ALSO
|
|
|
|
Encode(3), iconv(3), iconv(1), recode(1), perl(1)
|
|
|
|
=cut
|
|
Local Variables:
|
|
mode: perl
|
|
perl-indent-level: 4
|
|
perl-continued-statement-offset: 4
|
|
perl-continued-brace-offset: 0
|
|
perl-brace-offset: -4
|
|
perl-brace-imaginary-offset: 0
|
|
perl-label-offset: -4
|
|
cperl-indent-level: 4
|
|
cperl-continued-statement-offset: 2
|
|
tab-width: 4
|
|
End:
|