3030import sys
3131import zipfile
3232
33- from textwrap import dedent
3433from functools import partial
34+ from textwrap import dedent
35+ from typing import *
3536
3637SCRIPT = sys .argv [0 ]
3738VERSION = "3.3"
@@ -903,6 +904,32 @@ def open_data(template, version):
903904 return open (local , 'rb' )
904905
905906
class UcdFile:
    '''
    Reader for a data file laid out in the standard UCD format.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    As that document describes, the Unihan data files follow a
    separate format of their own.

    Iterating over an instance is the same as iterating over
    records().
    '''

    def __init__(self, template: str, version: str) -> None:
        # Both values are stored as given and handed to open_data()
        # unchanged whenever the records are read.
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        '''
        Lazily yield one list of fields per data line.

        For each line, everything from the first '#' onward is dropped
        and the remainder is stripped; lines left empty are skipped.
        What remains is split on ';' and each field is stripped of
        surrounding whitespace.
        '''
        with open_data(self.template, self.version) as source:
            for raw in source:
                content = raw.partition('#')[0].strip()
                if content:
                    yield [part.strip() for part in content.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        # Each iteration starts a fresh generator over the file.
        return self.records()
932+
906933# --------------------------------------------------------------------
907934# the following support code is taken from the unidb utilities
908935# Copyright (c) 1999-2000 by Secret Labs AB
@@ -922,14 +949,9 @@ def __init__(self, version,
922949 cjk_check = True ):
923950 self .changed = []
924951 table = [None ] * 0x110000
925- with open_data (UNICODE_DATA , version ) as file :
926- while 1 :
927- s = file .readline ()
928- if not s :
929- break
930- s = s .strip ().split (";" )
931- char = int (s [0 ], 16 )
932- table [char ] = s
952+ for s in UcdFile (UNICODE_DATA , version ):
953+ char = int (s [0 ], 16 )
954+ table [char ] = s
933955
934956 cjk_ranges_found = []
935957
@@ -968,17 +990,12 @@ def __init__(self, version,
968990 # in order to take advantage of the compression and lookup
969991 # algorithms used for the other characters
970992 pua_index = NAME_ALIASES_START
971- with open_data (NAME_ALIASES , version ) as file :
972- for s in file :
973- s = s .strip ()
974- if not s or s .startswith ('#' ):
975- continue
976- char , name , abbrev = s .split (';' )
977- char = int (char , 16 )
978- self .aliases .append ((name , char ))
979- # also store the name in the PUA 1
980- self .table [pua_index ][1 ] = name
981- pua_index += 1
993+ for char , name , abbrev in UcdFile (NAME_ALIASES , version ):
994+ char = int (char , 16 )
995+ self .aliases .append ((name , char ))
996+ # also store the name in the PUA 1
997+ self .table [pua_index ][1 ] = name
998+ pua_index += 1
982999 assert pua_index - NAME_ALIASES_START == len (self .aliases )
9831000
9841001 self .named_sequences = []
@@ -988,50 +1005,32 @@ def __init__(self, version,
9881005
9891006 assert pua_index < NAMED_SEQUENCES_START
9901007 pua_index = NAMED_SEQUENCES_START
991- with open_data (NAMED_SEQUENCES , version ) as file :
992- for s in file :
993- s = s .strip ()
994- if not s or s .startswith ('#' ):
995- continue
996- name , chars = s .split (';' )
997- chars = tuple (int (char , 16 ) for char in chars .split ())
998- # check that the structure defined in makeunicodename is OK
999- assert 2 <= len (chars ) <= 4 , "change the Py_UCS2 array size"
1000- assert all (c <= 0xFFFF for c in chars ), ("use Py_UCS4 in "
1001- "the NamedSequence struct and in unicodedata_lookup" )
1002- self .named_sequences .append ((name , chars ))
1003- # also store these in the PUA 1
1004- self .table [pua_index ][1 ] = name
1005- pua_index += 1
1008+ for name , chars in UcdFile (NAMED_SEQUENCES , version ):
1009+ chars = tuple (int (char , 16 ) for char in chars .split ())
1010+ # check that the structure defined in makeunicodename is OK
1011+ assert 2 <= len (chars ) <= 4 , "change the Py_UCS2 array size"
1012+ assert all (c <= 0xFFFF for c in chars ), ("use Py_UCS4 in "
1013+ "the NamedSequence struct and in unicodedata_lookup" )
1014+ self .named_sequences .append ((name , chars ))
1015+ # also store these in the PUA 1
1016+ self .table [pua_index ][1 ] = name
1017+ pua_index += 1
10061018 assert pua_index - NAMED_SEQUENCES_START == len (self .named_sequences )
10071019
10081020 self .exclusions = {}
1009- with open_data (COMPOSITION_EXCLUSIONS , version ) as file :
1010- for s in file :
1011- s = s .strip ()
1012- if not s :
1013- continue
1014- if s [0 ] == '#' :
1015- continue
1016- char = int (s .split ()[0 ],16 )
1017- self .exclusions [char ] = 1
1021+ for char , in UcdFile (COMPOSITION_EXCLUSIONS , version ):
1022+ char = int (char , 16 )
1023+ self .exclusions [char ] = 1
10181024
10191025 widths = [None ] * 0x110000
1020- with open_data (EASTASIAN_WIDTH , version ) as file :
1021- for s in file :
1022- s = s .strip ()
1023- if not s :
1024- continue
1025- if s [0 ] == '#' :
1026- continue
1027- s = s .split ()[0 ].split (';' )
1028- if '..' in s [0 ]:
1029- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1030- chars = list (range (first , last + 1 ))
1031- else :
1032- chars = [int (s [0 ], 16 )]
1033- for char in chars :
1034- widths [char ] = s [1 ]
1026+ for s in UcdFile (EASTASIAN_WIDTH , version ):
1027+ if '..' in s [0 ]:
1028+ first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1029+ chars = list (range (first , last + 1 ))
1030+ else :
1031+ chars = [int (s [0 ], 16 )]
1032+ for char in chars :
1033+ widths [char ] = s [1 ]
10351034
10361035 for i in range (0 , 0x110000 ):
10371036 if table [i ] is not None :
@@ -1041,38 +1040,27 @@ def __init__(self, version,
10411040 if table [i ] is not None :
10421041 table [i ].append (set ())
10431042
1044- with open_data (DERIVED_CORE_PROPERTIES , version ) as file :
1045- for s in file :
1046- s = s .split ('#' , 1 )[0 ].strip ()
1047- if not s :
1048- continue
1049-
1050- r , p = s .split (";" )
1051- r = r .strip ()
1052- p = p .strip ()
1053- if ".." in r :
1054- first , last = [int (c , 16 ) for c in r .split ('..' )]
1055- chars = list (range (first , last + 1 ))
1056- else :
1057- chars = [int (r , 16 )]
1058- for char in chars :
1059- if table [char ]:
1060- # Some properties (e.g. Default_Ignorable_Code_Point)
1061- # apply to unassigned code points; ignore them
1062- table [char ][- 1 ].add (p )
1063-
1064- with open_data (LINE_BREAK , version ) as file :
1065- for s in file :
1066- s = s .partition ('#' )[0 ]
1067- s = [i .strip () for i in s .split (';' )]
1068- if len (s ) < 2 or s [1 ] not in MANDATORY_LINE_BREAKS :
1069- continue
1070- if '..' not in s [0 ]:
1071- first = last = int (s [0 ], 16 )
1072- else :
1073- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1074- for char in range (first , last + 1 ):
1075- table [char ][- 1 ].add ('Line_Break' )
1043+ for r , p in UcdFile (DERIVED_CORE_PROPERTIES , version ):
1044+ if ".." in r :
1045+ first , last = [int (c , 16 ) for c in r .split ('..' )]
1046+ chars = list (range (first , last + 1 ))
1047+ else :
1048+ chars = [int (r , 16 )]
1049+ for char in chars :
1050+ if table [char ]:
1051+ # Some properties (e.g. Default_Ignorable_Code_Point)
1052+ # apply to unassigned code points; ignore them
1053+ table [char ][- 1 ].add (p )
1054+
1055+ for s in UcdFile (LINE_BREAK , version ):
1056+ if len (s ) < 2 or s [1 ] not in MANDATORY_LINE_BREAKS :
1057+ continue
1058+ if '..' not in s [0 ]:
1059+ first = last = int (s [0 ], 16 )
1060+ else :
1061+ first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1062+ for char in range (first , last + 1 ):
1063+ table [char ][- 1 ].add ('Line_Break' )
10761064
10771065 # We only want the quickcheck properties
10781066 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1083,23 +1071,19 @@ def __init__(self, version,
10831071 # for older versions, and no delta records will be created.
10841072 quickchecks = [0 ] * 0x110000
10851073 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC' .split ()
1086- with open_data (DERIVEDNORMALIZATION_PROPS , version ) as file :
1087- for s in file :
1088- if '#' in s :
1089- s = s [:s .index ('#' )]
1090- s = [i .strip () for i in s .split (';' )]
1091- if len (s ) < 2 or s [1 ] not in qc_order :
1092- continue
1093- quickcheck = 'MN' .index (s [2 ]) + 1 # Maybe or No
1094- quickcheck_shift = qc_order .index (s [1 ])* 2
1095- quickcheck <<= quickcheck_shift
1096- if '..' not in s [0 ]:
1097- first = last = int (s [0 ], 16 )
1098- else :
1099- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1100- for char in range (first , last + 1 ):
1101- assert not (quickchecks [char ]>> quickcheck_shift )& 3
1102- quickchecks [char ] |= quickcheck
1074+ for s in UcdFile (DERIVEDNORMALIZATION_PROPS , version ):
1075+ if len (s ) < 2 or s [1 ] not in qc_order :
1076+ continue
1077+ quickcheck = 'MN' .index (s [2 ]) + 1 # Maybe or No
1078+ quickcheck_shift = qc_order .index (s [1 ])* 2
1079+ quickcheck <<= quickcheck_shift
1080+ if '..' not in s [0 ]:
1081+ first = last = int (s [0 ], 16 )
1082+ else :
1083+ first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1084+ for char in range (first , last + 1 ):
1085+ assert not (quickchecks [char ]>> quickcheck_shift )& 3
1086+ quickchecks [char ] |= quickcheck
11031087 for i in range (0 , 0x110000 ):
11041088 if table [i ] is not None :
11051089 table [i ].append (quickchecks [i ])
@@ -1122,34 +1106,26 @@ def __init__(self, version,
11221106 # Patch the numeric field
11231107 if table [i ] is not None :
11241108 table [i ][8 ] = value
1109+
11251110 sc = self .special_casing = {}
1126- with open_data (SPECIAL_CASING , version ) as file :
1127- for s in file :
1128- s = s [:- 1 ].split ('#' , 1 )[0 ]
1129- if not s :
1130- continue
1131- data = s .split ("; " )
1132- if data [4 ]:
1133- # We ignore all conditionals (since they depend on
1134- # languages) except for one, which is hardcoded. See
1135- # handle_capital_sigma in unicodeobject.c.
1136- continue
1137- c = int (data [0 ], 16 )
1138- lower = [int (char , 16 ) for char in data [1 ].split ()]
1139- title = [int (char , 16 ) for char in data [2 ].split ()]
1140- upper = [int (char , 16 ) for char in data [3 ].split ()]
1141- sc [c ] = (lower , title , upper )
1111+ for data in UcdFile (SPECIAL_CASING , version ):
1112+ if data [4 ]:
1113+ # We ignore all conditionals (since they depend on
1114+ # languages) except for one, which is hardcoded. See
1115+ # handle_capital_sigma in unicodeobject.c.
1116+ continue
1117+ c = int (data [0 ], 16 )
1118+ lower = [int (char , 16 ) for char in data [1 ].split ()]
1119+ title = [int (char , 16 ) for char in data [2 ].split ()]
1120+ upper = [int (char , 16 ) for char in data [3 ].split ()]
1121+ sc [c ] = (lower , title , upper )
1122+
11421123 cf = self .case_folding = {}
11431124 if version != '3.2.0' :
1144- with open_data (CASE_FOLDING , version ) as file :
1145- for s in file :
1146- s = s [:- 1 ].split ('#' , 1 )[0 ]
1147- if not s :
1148- continue
1149- data = s .split ("; " )
1150- if data [1 ] in "CF" :
1151- c = int (data [0 ], 16 )
1152- cf [c ] = [int (char , 16 ) for char in data [2 ].split ()]
1125+ for data in UcdFile (CASE_FOLDING , version ):
1126+ if data [1 ] in "CF" :
1127+ c = int (data [0 ], 16 )
1128+ cf [c ] = [int (char , 16 ) for char in data [2 ].split ()]
11531129
11541130 def uselatin1 (self ):
11551131 # restrict character range to ISO Latin 1
0 commit comments