# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is machine-generated by lib/unicore/mktables from the Unicode
# database, Version 13.0.0. Any changes made here will be lost!
# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
# This file is for internal use by core Perl only. The format and even the
# name or existence of this file are subject to change without notice. Don't
# use it directly. Use Unicode::UCD to access the Unicode character data
# base.
package charnames;
# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names. The following
# routines can be used to translate between name and code point and vice versa
{ # Closure
# Both patterns match a legal code point written as 4 to 6 upper-case hex
# digits.  If there are 6 digits, the first two must be "10"; if there are
# 5, the first must not be "0".  The alternatives are ordered longest-first
# to decrease backtracking, and each pattern ends with \b so the digits must
# run up to a word boundary.
#
# $run_on_code_point_re has no leading \b, so it allows the code point to
# be glued onto the end of a word (needed for loose matching, where blanks
# and dashes have been squeezed out).  To work properly, the word itself
# shouldn't end with a valid hex character.
my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
# $code_point_re additionally requires a word boundary in front of the
# digits, so it won't match a code point run onto the end of a word, and
# therefore doesn't have the run-on issue.
my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;
# Names that embed their own code point, like CJK UNIFIED IDEOGRAPH-4E01.
# Each key is the part of the name in front of the "-XXXX" suffix.  Each
# value holds two parallel arrays: 'low'->[$i] and 'high'->[$i] are the
# inclusive ends of the $i-th range of code points carrying that name.
# Ranges are listed in ascending order.
my %names_ending_in_code_point = (
    'CJK COMPATIBILITY IDEOGRAPH' => {
        # U+F900..U+FA6D, U+FA70..U+FAD9, U+2F800..U+2FA1D
        'low'  => [ 63744, 64112, 194560 ],
        'high' => [ 64109, 64217, 195101 ],
    },
    'CJK UNIFIED IDEOGRAPH' => {
        # U+3400..U+4DBF,   U+4E00..U+9FFC,   U+20000..U+2A6DD,
        # U+2A700..U+2B734, U+2B740..U+2B81D, U+2B820..U+2CEA1,
        # U+2CEB0..U+2EBE0, U+30000..U+3134A
        'low'  => [
            13312,  19968,  131072, 173824,
            177984, 178208, 183984, 196608,
        ],
        'high' => [
            19903,  40956,  173789, 177972,
            178205, 183969, 191456, 201546,
        ],
    },
    'KHITAN SMALL SCRIPT CHARACTER' => {
        # U+18B00..U+18CD5
        'low'  => [ 101120 ],
        'high' => [ 101589 ],
    },
    'NUSHU CHARACTER' => {
        # U+1B170..U+1B2FB
        'low'  => [ 110960 ],
        'high' => [ 111355 ],
    },
    'TANGUT IDEOGRAPH' => {
        # U+17000..U+187F7
        'low'  => [ 94208 ],
        'high' => [ 100343 ],
    },
    'TANGUT IDEOGRAPH SUPPLEMENT' => {
        # U+18D00..U+18D08
        'low'  => [ 101632 ],
        'high' => [ 101640 ],
    },
);
# Loose-matching counterpart of the table of names that end in a code
# point: the range data is the same, but each key has its blanks and dashes
# squeezed out.  'low'->[$i] and 'high'->[$i] are the inclusive ends of the
# $i-th range, in ascending order.
my %loose_names_ending_in_code_point = (
    'CJKCOMPATIBILITYIDEOGRAPH' => {
        # U+F900..U+FA6D, U+FA70..U+FAD9, U+2F800..U+2FA1D
        'low'  => [ 63744, 64112, 194560 ],
        'high' => [ 64109, 64217, 195101 ],
    },
    'CJKUNIFIEDIDEOGRAPH' => {
        # U+3400..U+4DBF,   U+4E00..U+9FFC,   U+20000..U+2A6DD,
        # U+2A700..U+2B734, U+2B740..U+2B81D, U+2B820..U+2CEA1,
        # U+2CEB0..U+2EBE0, U+30000..U+3134A
        'low'  => [
            13312,  19968,  131072, 173824,
            177984, 178208, 183984, 196608,
        ],
        'high' => [
            19903,  40956,  173789, 177972,
            178205, 183969, 191456, 201546,
        ],
    },
    'KHITANSMALLSCRIPTCHARACTER' => {
        # U+18B00..U+18CD5
        'low'  => [ 101120 ],
        'high' => [ 101589 ],
    },
    'NUSHUCHARACTER' => {
        # U+1B170..U+1B2FB
        'low'  => [ 110960 ],
        'high' => [ 111355 ],
    },
    'TANGUTIDEOGRAPH' => {
        # U+17000..U+187F7
        'low'  => [ 94208 ],
        'high' => [ 100343 ],
    },
    'TANGUTIDEOGRAPHSUPPLEMENT' => {
        # U+18D00..U+18D08
        'low'  => [ 101632 ],
        'high' => [ 101640 ],
    },
);
# And the following array gives the inverse mapping from code points to
# names.  Each element describes one contiguous range:
#   'low', 'high' - inclusive ends of the range of code points
#   'name'        - the base name; the full name is this, a '-', and the
#                   code point in hex
#   'legal'       - looks like the set of characters that can occur in
#                   such a name (the letters of 'name', '-', the hex
#                   digits, and a leading whitespace character); it is not
#                   read by the routines in this file -- presumably it is
#                   for external consumers, TODO confirm
# Lowest code points are first; the lookup routine relies on that ordering
# to stop searching early.
# This is deliberately a package variable (no 'my') so that it is visible
# outside this closure.
@code_points_ending_in_code_point = (
{
'high' => 19903,
'legal' =>
'
-0123456789ABCDEFGHIJKNOPRU',
'low' => 13312,
'name' => 'CJK UNIFIED IDEOGRAPH',
},
{
'high' => 40956,
'legal' =>
'
-0123456789ABCDEFGHIJKNOPRU',
'low' => 19968,
'name' => 'CJK UNIFIED IDEOGRAPH',
},
{
'high' => 64109,
'legal' =>
'
-0123456789ABCDEFGHIJKLMOPRTY',
'low' => 63744,
'name' => 'CJK COMPATIBILITY IDEOGRAPH',
},
{
'high' => 64217,
'legal' =>
'
-0123456789ABCDEFGHIJKLMOPRTY',
'low' => 64112,
'name' => 'CJK COMPATIBILITY IDEOGRAPH',
},
{
'high' => 100343,
'legal' =>
'
-0123456789ABCDEFGHINOPRTU',
'low' => 94208,
'name' => 'TANGUT IDEOGRAPH',
},
{
'high' => 101589,
'legal' =>
'
-0123456789ABCDEFHIKLMNPRST',
'low' => 101120,
'name' => 'KHITAN SMALL SCRIPT CHARACTER',
},
{
'high' => 101640,
'legal' =>
'
-0123456789ABCDEFGHILMNOPRSTU',
'low' => 101632,
'name' => 'TANGUT IDEOGRAPH SUPPLEMENT',
},
{
'high' => 111355,
'legal' =>
'
-0123456789ABCDEFHNRSTU',
'low' => 110960,
'name' => 'NUSHU CHARACTER',
},
{
'high' => 173789,
'legal' =>
'
-0123456789ABCDEFGHIJKNOPRU',
'low' => 131072,
'name' => 'CJK UNIFIED IDEOGRAPH',
},
{
'high' => 177972,
'legal' =>
'
-0123456789ABCDEFGHIJKNOPRU',
'low' => 173824,
'name' => 'CJK UNIFIED IDEOGRAPH',
},
{
'high' => 178205,
'legal' =>
'
-0123456789ABCDEFGHIJKNOPRU',
'low' => 177984,
'name' => 'CJK UNIFIED IDEOGRAPH',
},
{
'high' => 183969,
'legal' =>
'
-0123456789ABCDEFGHIJKNOPRU',
'low' => 178208,
'name' => 'CJK UNIFIED IDEOGRAPH',
},
{
'high' => 191456,
'legal' =>
'
-0123456789ABCDEFGHIJKNOPRU',
'low' => 183984,
'name' => 'CJK UNIFIED IDEOGRAPH',
},
{
'high' => 195101,
'legal' =>
'
-0123456789ABCDEFGHIJKLMOPRTY',
'low' => 194560,
'name' => 'CJK COMPATIBILITY IDEOGRAPH',
},
{
'high' => 201546,
'legal' =>
'
-0123456789ABCDEFGHIJKNOPRU',
'low' => 196608,
'name' => 'CJK UNIFIED IDEOGRAPH',
},
# The lone comma below is an artifact of generation; repeated commas are
# legal in a Perl list and add no element.
,
);
# The array is exportable, so make it read-only to guard against accidental
# modification by its users.
Internals::SvREADONLY(@code_points_ending_in_code_point, 1);
# Maps the code point of a conjoining Jamo to its short name, for use in
# composing Hangul syllable names.  The table is filled in as three
# contiguous runs of code points, each paired positionally with its names.
my %Jamo;

# Leading consonants, 4352 .. 4370 (U+1100 .. U+1112).  The short name of
# 4363 is the empty string.
@Jamo{ 4352 .. 4370 } = (
    qw( G GG N D DD R M B BB S SS ),
    '',
    qw( J JJ C K T P H ),
);

# Vowels, 4449 .. 4469 (U+1161 .. U+1175).
@Jamo{ 4449 .. 4469 } = qw(
    A   AE  YA  YAE EO  E   YEO
    YE  O   WA  WAE OE  YO  U
    WEO WE  WI  YU  EU  YI  I
);

# Trailing consonants, 4520 .. 4546 (U+11A8 .. U+11C2).
@Jamo{ 4520 .. 4546 } = qw(
    G  GG GS N  NJ NH D  L  LG
    LM LB LS LT LP LH M  B  BS
    S  SS NG J  C  K  T  P  H
);
# Leading consonant: short name => index used in the Hangul syllable
# arithmetic.  The name can be null (index 11).  Listed in index order.
my %Jamo_L = (
    'G'  => 0,
    'GG' => 1,
    'N'  => 2,
    'D'  => 3,
    'DD' => 4,
    'R'  => 5,
    'M'  => 6,
    'B'  => 7,
    'BB' => 8,
    'S'  => 9,
    'SS' => 10,
    ''   => 11,
    'J'  => 12,
    'JJ' => 13,
    'C'  => 14,
    'K'  => 15,
    'T'  => 16,
    'P'  => 17,
    'H'  => 18,
);
# Vowel: short name => index used in the Hangul syllable arithmetic.
# Listed in index order.
my %Jamo_V = (
    'A'   => 0,
    'AE'  => 1,
    'YA'  => 2,
    'YAE' => 3,
    'EO'  => 4,
    'E'   => 5,
    'YEO' => 6,
    'YE'  => 7,
    'O'   => 8,
    'WA'  => 9,
    'WAE' => 10,
    'OE'  => 11,
    'YO'  => 12,
    'U'   => 13,
    'WEO' => 14,
    'WE'  => 15,
    'WI'  => 16,
    'YU'  => 17,
    'EU'  => 18,
    'YI'  => 19,
    'I'   => 20,
);
# Optional trailing consonant: short name => index used in the Hangul
# syllable arithmetic.  Indices start at 1; the routine that uses this
# table supplies 0 itself when a syllable has no trailing consonant.
# Listed in index order.
my %Jamo_T = (
    'G'  => 1,
    'GG' => 2,
    'GS' => 3,
    'N'  => 4,
    'NJ' => 5,
    'NH' => 6,
    'D'  => 7,
    'L'  => 8,
    'LG' => 9,
    'LM' => 10,
    'LB' => 11,
    'LS' => 12,
    'LT' => 13,
    'LP' => 14,
    'LH' => 15,
    'M'  => 16,
    'B'  => 17,
    'BS' => 18,
    'S'  => 19,
    'SS' => 20,
    'NG' => 21,
    'J'  => 22,
    'C'  => 23,
    'K'  => 24,
    'T'  => 25,
    'P'  => 26,
    'H'  => 27,
);
# Computed re that splits up a Hangul name into LVT or LV syllables: $1 is
# the leading consonant (possibly empty), $2 the vowel, and $3 the optional
# trailing consonant.  It is not anchored; callers add their own anchors.
my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

# Prefix of every Hangul syllable name, in strict form (note the trailing
# blank) and in loose form (blanks squeezed out).
my ($HANGUL_SYLLABLE, $loose_HANGUL_SYLLABLE)
    = ("HANGUL SYLLABLE ", "HANGULSYLLABLE");

# These constant names and values were taken from the Unicode standard,
# version 5.1, section 3.12.  They are used in conjunction with Hangul
# syllables.
# First code point of the syllables, and of each kind of Jamo
my ($SBase, $LBase, $VBase, $TBase) = (0xAC00, 0x1100, 0x1161, 0x11A7);

# Number of leading consonants, vowels, and trailing-consonant slots
my ($LCount, $VCount, $TCount) = (19, 21, 28);

# Number of syllables sharing one leading consonant
my $NCount = $VCount * $TCount;

# Total number of Hangul syllables
my $SCount = 11172;
sub name_to_code_point_special {
    my ($name, $loose) = @_;

    # Returns undef if not one of the specially handled names; otherwise
    # returns the code point equivalent to the input name.
    #
    # $name  is the character name to look up.
    # $loose is non-zero if to use loose matching; 'name' in that case
    #        must be input as upper case with all blanks and dashes
    #        squeezed out.
    #
    # Two families of names are handled: Hangul syllables, whose code
    # point is computed from the Jamo short names making up the name; and
    # names that end in their own code point, which are validated against
    # the tables of legal ranges.

    # A Hangul syllable name must *start* with the fixed prefix, so the
    # removal is anchored; otherwise a name with junk in front of the
    # prefix could collapse into something that looks like a syllable.
    my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE : $HANGUL_SYLLABLE;
    if ($name =~ s/\A\Q$hangul_prefix\E//) {

        # What remains must be exactly one LV or LVT syllable.  \z (rather
        # than $) keeps a trailing newline from being silently accepted.
        my ($lead, $vowel, $trail) = $name =~ /\A$syllable_re\z/
            or return;

        my $L = $Jamo_L{$lead};
        my $V = $Jamo_V{$vowel};
        my $T = defined $trail ? $Jamo_T{$trail} : 0;   # 0 means no trailing
        return ($L * $VCount + $V) * $TCount + $T + $SBase;
    }

    # Name must end in 'code_point' for this to handle.  Loose names have
    # the code point run on to the base, so the base is matched minimally
    # and no word boundary is required before the digits.
    my ($base, $hex) = $loose
        ? $name =~ /\A (.*?) ($run_on_code_point_re) \z/x
        : $name =~ /\A (.*) ($code_point_re) \z/x;
    return if ! defined $hex;

    my $code_point = CORE::hex $hex;

    my $names_ref;
    if ($loose) {
        $names_ref = \%loose_names_ending_in_code_point;
    }
    else {

        # In strict matching a '-' must separate the base from the digits
        return if $base !~ s/-\z//;
        $names_ref = \%names_ending_in_code_point;
    }

    # Name must be one of the ones which has the code point in it.
    my $ranges = $names_ref->{$base} or return;
    my ($lows, $highs) = @{$ranges}{ 'low', 'high' };

    # Look through the list of ranges that apply to this name to see if
    # the code point is in one of them.  The ranges are in ascending
    # order, so once one starts above the code point, none can match.
    for my $i (0 .. $#{$lows}) {
        return if $lows->[$i] > $code_point;
        return $code_point if $highs->[$i] >= $code_point;
    }

    # Here, looked like the name had a code point number in it, but
    # did not match one of the valid ones.
    return;
}
sub code_point_to_name_special {
    my $code_point = shift;

    # Returns the name of a code point if algorithmically determinable;
    # undef if not.
    #
    # $code_point is the numeric code point to name.

    # Nothing can be determined for a missing code point
    return if ! defined $code_point;

    # If in the Hangul range, calculate the name based on Unicode's
    # algorithm
    if ($code_point >= $SBase && $code_point <= $SBase + $SCount - 1) {
        use integer;    # The divisions below must truncate
        my $SIndex = $code_point - $SBase;
        my $L = $LBase + $SIndex / $NCount;
        my $V = $VBase + ($SIndex % $NCount) / $TCount;
        my $T = $TBase + $SIndex % $TCount;

        # $name is lexical so that it doesn't clobber a package variable
        # of the same name.
        my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";

        # $T equal to $TBase means there is no trailing consonant
        $name .= $Jamo{$T} if $T != $TBase;
        return $name;
    }

    # Look through list of these code points for one in range.  The list
    # is in ascending order, so the search stops as soon as a range starts
    # above the code point.
    foreach my $range (@code_points_ending_in_code_point) {
        return if $code_point < $range->{'low'};
        if ($code_point <= $range->{'high'}) {
            return sprintf("%s-%04X", $range->{'name'}, $code_point);
        }
    }

    return;    # None found
}
} # End closure
1;