#!/usr/bin/python3
'''
Print the information needed to map an OpenType/CID (Adobe-Identity-0) font
onto Adobe-Japan1-*, based on Unicode and CID data.

(1) Extract the cmap information from the source CID font:
    $ spot -t cmap=7 [FONTFILE] > cmap.txt

(2) Download the following from
    https://github.com/adobe-type-tools/cmap-resources :
    - Adobe-Japan1-7/CMap/UniJIS2004-UTF32-H

(3) Download the following from
    https://github.com/adobe-type-tools/Adobe-Japan1 :
    - Adobe-Japan1_sequences.txt

(4) Convert:
    $ ./remap-unicode.py cmap.txt [UniJIS2004-UTF32-H] [Adobe-Japan1_sequences.txt] > out.txt
'''

import sys
import re


class CmapConv:
    '''Match destination CIDs to source CIDs through shared Unicode values.

    The methods are meant to be called in this order: read_src(), read_dst(),
    remap(), remap_seq(), write().

    Attributes (each is created by the method named):
        src: dict built by read_src(). Keys are Unicode code points (int) or,
            for variation sequences, the raw key text containing a space
            (str). Values are source CIDs (int).
        dst: list built by read_dst(), indexed by destination CID. Each slot
            is None, a list of Unicode code points, or -- after remap_seq() --
            a variation-sequence string.
        dstnum: highest populated index of dst plus one (at least 1).
        res: list built by remap(), indexed by destination CID, holding the
            matched source CID or None.
    '''

    def read_src(self, fname):
        '''Read the source cmap dump (text of the form "[KEY]=<\\CID>").

        Lines that do not start with '[' are ignored, and only the first
        entry of a line is read. An entry is stored only when the text
        between the angle brackets starts with a backslash followed by a
        decimal CID; any other entry is skipped.
        '''
        entry_re = re.compile(r'\[(.+?)\]\=\s*<(.+?)>')
        self.src = {}
        with open(fname, 'rt') as f:
            for line in f:
                line = line.strip()
                if not line.startswith('['):
                    continue
                m = entry_re.match(line)
                if not m:
                    continue
                key, glyph = m.group(1), m.group(2)
                # A key containing a space is a variation sequence and is
                # kept as text; anything else is a hexadecimal code point.
                uc = key if ' ' in key else int(key, base=16)
                if glyph.startswith('\\'):
                    self.src[uc] = int(glyph[1:])

    def _add_dst(self, cid, uc):
        '''Append code point uc to the list stored for destination CID cid.'''
        if self.dst[cid]:
            self.dst[cid].append(uc)
        else:
            self.dst[cid] = [uc]

    def read_dst(self, fname):
        '''Read the conversion CMap and build dst and dstnum.

        Only the bodies of "begincidchar ... endcidchar" and
        "begincidrange ... endcidrange" sections are parsed; every other
        line is ignored.
        '''
        char_re = re.compile(r'<(.+?)>\s+(\d+)$')
        range_re = re.compile(r'<(.+?)>\s+<(.+?)>\s+(\d+)$')
        self.dst = [None] * 65536
        section = None  # None, 'cidchar' or 'cidrange'
        with open(fname, 'rt') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if section == 'cidchar':
                    if line == 'endcidchar':
                        section = None
                        continue
                    m = char_re.match(line)
                    if m:
                        uc = int(m.group(1), base=16)
                        self._add_dst(int(m.group(2)), uc)
                elif section == 'cidrange':
                    if line == 'endcidrange':
                        section = None
                        continue
                    m = range_re.match(line)
                    if m:
                        first = int(m.group(1), base=16)
                        last = int(m.group(2), base=16)
                        cid = int(m.group(3))
                        # Consecutive code points get consecutive CIDs.
                        for offset, uc in enumerate(range(first, last + 1)):
                            self._add_dst(cid + offset, uc)
                elif line.endswith('begincidchar'):
                    section = 'cidchar'
                elif line.endswith('begincidrange'):
                    section = 'cidrange'

        # Count: find the highest populated slot (index 0 is the floor).
        last_cid = len(self.dst) - 1
        while last_cid > 0 and not self.dst[last_cid]:
            last_cid -= 1
        self.dstnum = last_cid + 1

    def remap(self):
        '''Fill res: for each destination CID, the matching source CID.

        For every destination CID from 1 up, the first of its code points
        that is present in src decides the match. A source CID of 0 is
        falsy and is therefore left unrecorded.
        '''
        self.res = [None] * self.dstnum
        for cid_dst in range(1, self.dstnum):
            dlist = self.dst[cid_dst]
            if not dlist:
                continue
            # Look for a source CID with the same Unicode value.
            cid_src = next(
                (self.src[uc] for uc in dlist if uc in self.src), None)
            if cid_src:
                self.res[cid_dst] = cid_src

    def remap_seq(self, fname):
        '''Apply variation-sequence mappings from the sequences file.

        Lines look like "SEQUENCE; COLLECTION; CID+N"; blank lines and lines
        starting with '#' are skipped. For each line, dst[N] is replaced by
        the sequence text, and when that sequence is a key of src, res[N] is
        set to its source CID. Must be called after remap().
        '''
        seq_re = re.compile(r'(.+?)\; .+?\; CID\+(\d+)$')
        with open(fname, 'rt') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                m = seq_re.match(line)
                if not m:
                    continue
                uc = m.group(1)
                cid = int(m.group(2))
                self.dst[cid] = uc
                if uc not in self.src:
                    continue
                # res is only dstnum entries long; a sequence may name a CID
                # past the last one seen in the CMap, so grow the table
                # instead of failing with IndexError.
                if cid >= len(self.res):
                    self.res.extend([None] * (cid + 1 - len(self.res)))
                if cid >= self.dstnum:
                    self.dstnum = cid + 1
                self.res[cid] = self.src[uc]

    def write(self):
        '''Print one line per destination CID (from 1) to stdout.

        Line format: five-digit CID, then "[source CID]" when one was
        matched, then "#" followed by either the variation-sequence text or
        the code points in upper-case hex joined by "/". A summary
        "matched/total (percent)" is printed to stderr.
        '''
        num = 0
        for cid in range(1, self.dstnum):
            sid = self.res[cid]
            uclist = self.dst[cid]
            out = '{0:05d}'.format(cid)
            # Source assignment.
            if sid:
                out += '[{0}]'.format(sid)
                num += 1
            # Unicode.
            if uclist:
                out += '#'
                if isinstance(uclist, str):
                    # Variation sequence.
                    out += uclist
                else:
                    out += '/'.join('{0:X}'.format(uc) for uc in uclist)
            print(out)
        print('{0}/{1} ({2:.2%})'.format(num, self.dstnum, num / self.dstnum),
              file=sys.stderr)


def main(argv=None):
    '''Run the conversion for the given argument list (default: sys.argv).

    argv[1] is the source cmap dump; argv[2] and argv[3] optionally override
    the CMap file and the sequences file. With no arguments a usage line is
    printed to stderr (stdout is reserved for the mapping output) and the
    process exits with status 0.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print('[usage] ' + argv[0]
              + ' cmap.txt [UniJIS2004-UTF32-H] [Adobe-Japan1_sequences.txt]',
              file=sys.stderr)
        sys.exit(0)

    fname_spot = argv[1]
    fname_cmap = argv[2] if len(argv) >= 3 else 'UniJIS2004-UTF32-H'
    fname_seq = argv[3] if len(argv) >= 4 else 'Adobe-Japan1_sequences.txt'

    cv = CmapConv()
    cv.read_src(fname_spot)
    cv.read_dst(fname_cmap)
    cv.remap()
    cv.remap_seq(fname_seq)
    cv.write()


if __name__ == '__main__':
    main()