Advance Wayland and KDE package bring-up

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
2026-04-14 10:51:06 +01:00
parent 51f3c21121
commit cf12defd28
15214 changed files with 20594243 additions and 269 deletions
@@ -0,0 +1,859 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# This script generates a data file containing all Unicode information needed
# by KCharSelect.
#
##############################################################################
# SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
# SPDX-FileCopyrightText: 2016 John Zaitseff <J.Zaitseff@zap.org.au>
#
# SPDX-License-Identifier: LGPL-2.0-or-later
##############################################################################
#
# The current directory must contain the following files that can be found at
# http://www.unicode.org/Public/UNIDATA/:
# - UnicodeData.txt
# - Unihan_Readings.txt (you need to uncompress it from Unihan.zip)
# - NamesList.txt
# - Blocks.txt
#
# The generated file is named "kcharselect-data" and has to be put in
# kwidgetsaddons/src. Additionally a translation dummy named
# "kcharselect-translation.cpp" is generated and has to be placed in the same
# directory.
#
# FILE STRUCTURE
#
# The generated file is a binary file. The first 40 bytes are the header and
# contain the position of each part of the file. Each entry is uint32.
#
# pos content
# 0 names strings begin
# 4 names offsets begin
# 8 details strings begin
# 12 details offsets begin
# 16 block strings begin
# 20 block offsets begin
# 24 section strings begin
# 28 section offsets begin
# 32 unihan strings begin
# 36 unihan offsets begin
#
# The string parts always contain all strings in a row, followed by a 0x00
# byte. There is one exception: The data for seeAlso in details is only 2
# bytes (as it is always _one_ unicode character) and _not_ followed by a 0x00
# byte.
#
# The offset parts contain entries with a fixed length. Unicode characters
# are always uint16 and offsets uint32. Offsets are positions in the data
# file.
#
# names_offsets:
# each entry 6 bytes
# 16bit: unicode
# 32bit: offset to name in names_strings
#
# names_strings:
# the first byte is the category (same values as QChar::Category),
# directly followed by the character name (terminated by 0x00)
#
# details_offsets (entries parsed from NamesList.txt):
# char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count
# 16 32 8 32 8 32 8 32 8 32 8
# => each entry 27 bytes
#
# blocks_offsets:
# each entry 4 bytes
# 16bit: start unicode
# 16bit: end unicode
# Note that there is no string offset.
#
# section_offsets:
# each entry 4 bytes
# 16bit: section offset
# 16bit: block offset
# Note that these offsets are _not_ positions in the data file but zero-based
# indexes. For example the pair (4, 3) means that the section with index 4
# includes the block with index 3.
#
# unihan_offsets:
# each entry 30 bytes
# 16bit: unicode
# 32bit: offset to unihan_strings for Definition
# 32bit: offset to unihan_strings for Cantonese
# 32bit: offset to unihan_strings for Mandarin
# 32bit: offset to unihan_strings for Tang
# 32bit: offset to unihan_strings for Korean
# 32bit: offset to unihan_strings for JapaneseKun
# 32bit: offset to unihan_strings for JapaneseOn
from struct import *
import sys
import re
import io
# Based on http://www.unicode.org/charts/, updated for Unicode 9.0
#
# Section table fed to Parser.parseSections() through io.StringIO.
# Format, one item per line:
#   - a line whose first space-separated word is "SECTION" starts a new
#     section; the section name is the text after "SECTION ".
#   - every other non-empty line is a block name belonging to the most
#     recently started section.
# SectionsBlocks.writeBlockStrings() aborts the run when a parsed block has no
# entry with a matching name in this table.
sectiondata = '''
SECTION European Scripts
Basic Latin
Latin-1 Supplement
Latin Extended-A
Latin Extended-B
Latin Extended-C
Latin Extended-D
Latin Extended-E
Latin Extended Additional
Armenian
Coptic
Cyrillic
Cyrillic Supplement
Cyrillic Extended-A
Cyrillic Extended-B
Cyrillic Extended-C
Georgian
Georgian Supplement
Georgian Extended
Glagolitic
Greek and Coptic
Greek Extended
Ogham
Runic
SECTION African Scripts
Bamum
Ethiopic
Ethiopic Supplement
Ethiopic Extended
Ethiopic Extended-A
NKo
Tifinagh
Vai
SECTION Middle Eastern Scripts
Arabic
Arabic Supplement
Arabic Extended-A
Arabic Extended-B
Arabic Presentation Forms-A
Arabic Presentation Forms-B
Hebrew
Mandaic
Samaritan
Syriac
Syriac Supplement
SECTION Central Asian Scripts
Mongolian
Phags-pa
Tibetan
SECTION South Asian Scripts
Bengali
Common Indic Number Forms
Devanagari
Devanagari Extended
Gujarati
Gurmukhi
Kannada
Lepcha
Limbu
Malayalam
Meetei Mayek
Meetei Mayek Extensions
Ol Chiki
Oriya
Saurashtra
Sinhala
Syloti Nagri
Tamil
Telugu
Thaana
Vedic Extensions
SECTION Southeast Asian Scripts
Cham
Kayah Li
Khmer
Khmer Symbols
Lao
Myanmar
Myanmar Extended-A
Myanmar Extended-B
New Tai Lue
Tai Le
Tai Tham
Tai Viet
Thai
SECTION Indonesia and Oceania Scripts
Balinese
Batak
Buginese
Buhid
Hanunoo
Javanese
Rejang
Sundanese
Sundanese Supplement
Tagalog
Tagbanwa
SECTION East Asian Scripts
Bopomofo
Bopomofo Extended
CJK Unified Ideographs
CJK Unified Ideographs Extension A
CJK Compatibility
CJK Compatibility Ideographs
CJK Compatibility Forms
CJK Radicals Supplement
CJK Strokes
CJK Symbols and Punctuation
Enclosed CJK Letters and Months
Hangul Jamo
Hangul Jamo Extended-A
Hangul Jamo Extended-B
Hangul Compatibility Jamo
Hangul Syllables
Hiragana
Ideographic Description Characters
Kanbun
Kangxi Radicals
Katakana
Katakana Phonetic Extensions
Lisu
Yi Radicals
Yi Syllables
SECTION American Scripts
Cherokee
Cherokee Supplement
Unified Canadian Aboriginal Syllabics
Unified Canadian Aboriginal Syllabics Extended
SECTION Symbols
General Punctuation
Alchemical Symbols
Braille Patterns
Chess Symbols
Control Pictures
Currency Symbols
Dingbats
Domino Tiles
Emoticons
Enclosed Alphanumerics
Enclosed Alphanumeric Supplement
Enclosed Ideographic Supplement
Mahjong Tiles
Miscellaneous Symbols
Miscellaneous Symbols and Pictographs
Miscellaneous Technical
Optical Character Recognition
Ornamental Dingbats
Playing Cards
Small Form Variants
Supplemental Punctuation
Supplemental Symbols and Pictographs
Symbols and Pictographs Extended-A
Symbols for Legacy Computing
Transport and Map Symbols
Vertical Forms
Yijing Hexagram Symbols
SECTION Mathematical Symbols
Arrows
Block Elements
Box Drawing
Geometric Shapes
Geometric Shapes Extended
Letterlike Symbols
Mathematical Operators
Miscellaneous Mathematical Symbols-A
Miscellaneous Mathematical Symbols-B
Miscellaneous Symbols and Arrows
Number Forms
Superscripts and Subscripts
Supplemental Arrows-A
Supplemental Arrows-B
Supplemental Arrows-C
Supplemental Mathematical Operators
SECTION Phonetic Symbols
IPA Extensions
Modifier Tone Letters
Phonetic Extensions
Phonetic Extensions Supplement
Spacing Modifier Letters
SECTION Combining Diacritics
Combining Diacritical Marks
Combining Diacritical Marks Extended
Combining Diacritical Marks Supplement
Combining Diacritical Marks for Symbols
Combining Half Marks
SECTION Other
Alphabetic Presentation Forms
Halfwidth and Fullwidth Forms
High Private Use Surrogates
High Surrogates
Low Surrogates
Private Use Area
Specials
Variation Selectors
'''
# Maps a two-letter General_Category abbreviation to the number stored in the
# data file (same values as QChar::Category).  The abbreviations are listed in
# numeric order, so each one's value is simply its 1-based position.
categoryMap = {
    abbreviation: number
    for number, abbreviation in enumerate(
        (
            "Mn", "Mc", "Me",
            "Nd", "Nl", "No",
            "Zs", "Zl", "Zp",
            "Cc", "Cf", "Cs", "Co", "Cn",
            "Lu", "Ll", "Lt", "Lm", "Lo",
            "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
            "Sm", "Sc", "Sk", "So",
        ),
        start=1,
    )
}
# Temporary code point remapping
#
# Initial SMP support without needing a new data file format
# - BMP U+Fxxx are remapped to U+Exxx
# - SMP symbols U+1Fxxx are remapped to U+Fxxx
# - Private Use Area is limited to U+F000 ... U+F8FF
def remap(char):
    """Fold a hexadecimal code point string into the range the data file uses.

    char is a string of hex digits.  If its value lies in 0xE000..0xFFFF the
    first character is replaced by "E"; if it lies in 0x1F000..0x1FFFF the
    first character is removed.  Any other string is returned unchanged.
    """
    value = int(char, 16)
    tail = char[1:]
    if 0xE000 <= value <= 0xFFFF:
        return "E" + tail
    if 0x1F000 <= value <= 0x1FFFF:
        return tail
    return char
class Names:
    """Character names and categories, plus their serialisation.

    Each entry of self.names is a list [uni, name, category] where uni is a
    hexadecimal string.  A names_strings record is one category byte, the
    UTF-8 encoded name and a terminating 0x00.  All "<control>" entries share
    a single record (the first one written).

    writeStrings() replaces every entry's name with the file position of its
    record, so the calculate*() methods must be called before it and
    writeOffsets() after it.
    """

    def __init__(self):
        self.names = []
        # File position of the shared "<control>" record; -1 until written.
        self.controlpos = -1

    def addName(self, uni, name, category):
        """Append one character: hex code point string, name, category number."""
        self.names.append([uni, name, category])

    def calculateStringSize(self):
        """Return the number of bytes writeStrings() will emit."""
        size = 0
        hadcontrol = False
        for entry in self.names:
            name = entry[1]
            if name == "<control>":
                if hadcontrol:
                    continue  # shares the first "<control>" record
                hadcontrol = True
            # Count encoded bytes (not characters) so the result matches what
            # is actually written; +2 for the category byte and the 0x00.
            size += len(name.encode("utf-8")) + 2
        return size

    def calculateOffsetSize(self):
        """Return the size of the offset table: 6 bytes per entry."""
        return len(self.names) * 6

    def writeStrings(self, out, pos):
        """Write all name records to out, starting at file position pos.

        Replaces each entry's name with the position of its record and
        returns the position just past the last record written.
        """
        hadcontrol = False
        for entry in self.names:
            name = entry[1]
            is_control = name == "<control>"
            if is_control and hadcontrol:
                entry[1] = self.controlpos
                continue
            encoded = name.encode("utf-8")
            out.write(pack("=b", entry[2]))
            out.write(encoded + b"\0")
            entry[1] = pos
            if is_control:
                self.controlpos = pos
                hadcontrol = True
            pos += len(encoded) + 2
        return pos

    def writeOffsets(self, out, pos):
        """Write one (uint16 code point, uint32 record position) pair per entry.

        Must run after writeStrings().  Returns the position after the table.
        """
        for entry in self.names:
            out.write(pack("=HI", int(entry[0], 16), entry[1]))
            pos += 6
        return pos
class Details:
    """Per-character detail entries (alias, note, equivalents, cross refs).

    self.details maps a character (int) to a dict that maps a category name
    to the list of values added for it.  String values are written as UTF-8
    followed by 0x00; any other value is written as a bare uint16.

    writeStrings() replaces every stored value with its file position, so the
    calculate*() methods must be called before it and writeOffsets() after it.
    """

    # Order of the (position, count) pairs inside one 27-byte offset record.
    _CATEGORIES = ("alias", "note", "approxEquiv", "equiv", "seeAlso")

    def __init__(self):
        self.details = {}

    def addEntry(self, char, category, text):
        """Append text to the list kept for (char, category)."""
        self.details.setdefault(char, {}).setdefault(category, []).append(text)

    def calculateStringSize(self):
        """Return the number of bytes writeStrings() will emit."""
        size = 0
        for categories in self.details.values():
            for entries in categories.values():
                for item in entries:
                    if isinstance(item, str):
                        size += len(item.encode("utf-8")) + 1
                    else:
                        size += 2
        return size

    def calculateOffsetSize(self):
        """Return the size of the offset table: 27 bytes per character."""
        return len(self.details) * 27

    def writeStrings(self, out, pos):
        """Write every stored value to out, starting at file position pos.

        Each value is replaced in place by the position it was written at.
        Returns the position just past the last value.
        """
        for categories in self.details.values():
            for entries in categories.values():
                for i, item in enumerate(entries):
                    if isinstance(item, str):
                        data = item.encode("utf-8") + b"\0"
                    else:
                        data = pack("=H", item)
                    out.write(data)
                    entries[i] = pos
                    pos += len(data)
        return pos

    def writeOffsets(self, out, pos):
        """Write one 27-byte record per character.

        A record is the uint16 character followed, for each category in
        _CATEGORIES, by the uint32 position of its first value and an 8-bit
        count; both are 0 when the character has no value in that category.
        Must run after writeStrings().  Returns the position after the table.
        """
        for char, categories in self.details.items():
            fields = [char]
            for category in self._CATEGORIES:
                entries = categories.get(category)
                if entries is None:
                    fields.extend((0, 0))
                else:
                    fields.extend((entries[0], len(entries)))
            out.write(pack("=HIbIbIbIbIb", *fields))
            pos += 27
        return pos
class SectionsBlocks:
    """Unicode blocks and the sections that group them, plus serialisation.

    self.blocks holds [begin, end, name] lists (begin/end are hex strings);
    self.sections holds [section name, block name] pairs.  blockList and
    sectionList keep the plain names for the translation dummy.

    The write*Strings() methods replace names by zero-based indexes in place,
    so the calculate*() methods must be called before them, and
    writeBlockStrings() must run before writeSectionOffsets().
    """

    def __init__(self):
        self.sections = []
        self.blocks = []
        self.blockList = []
        self.sectionList = []

    def addBlock(self, begin, end, name):
        """Register a block covering the hex code point range begin..end."""
        self.blocks.append([begin, end, name])
        self.blockList.append(name)

    def addSection(self, section, block):
        """Register that the named block belongs to the named section."""
        self.sections.append([section, block])
        if section not in self.sectionList:
            self.sectionList.append(section)

    def calculateBlockStringSize(self):
        """Return the number of bytes writeBlockStrings() will emit."""
        # Encoded length, so the result agrees with what is actually written.
        return sum(len(block[2].encode("utf-8")) + 1 for block in self.blocks)

    def calculateBlockOffsetSize(self):
        """Return the size of the block table: 4 bytes per block."""
        return len(self.blocks) * 4

    def calculateSectionStringSize(self):
        """Return the number of bytes writeSectionStrings() will emit.

        A section name is counted once per run of consecutive entries that
        carry it, mirroring writeSectionStrings().
        """
        size = 0
        lastsection = ""
        for section in self.sections:
            if section[0] != lastsection:
                lastsection = section[0]
                size += len(lastsection.encode("utf-8")) + 1
        return size

    def calculateSectionOffsetSize(self):
        """Return the size of the section table: 4 bytes per entry."""
        return len(self.sections) * 4

    def writeBlockStrings(self, out, pos):
        """Write every block name (UTF-8 + 0x00) and resolve names to indexes.

        For each block, every section entry naming it gets the block's index
        stored instead of the name, and the block's own name is replaced by
        that index.  Prints an error and exits with status 1 if a block is
        not named by any section entry.  Returns the position after the
        last name.
        """
        for index, block in enumerate(self.blocks):
            name = block[2]
            encoded = name.encode("utf-8") + b"\0"
            out.write(encoded)
            found = False
            for section in self.sections:
                if section[1] == name:
                    print("found", section)
                    section[1] = index
                    found = True
            if not found:
                print("Error: Did not find any category for block \""+name+"\"")
                sys.exit(1)
            block[2] = index
            pos += len(encoded)
        return pos

    def writeBlockOffsets(self, out, pos):
        """Write (uint16 begin, uint16 end) per block; return the new position."""
        for block in self.blocks:
            out.write(pack("=HH", int(block[0], 16), int(block[1], 16)))
            pos += 4
        return pos

    def writeSectionStrings(self, out, pos):
        """Write each section name once and replace names by section indexes.

        A name is written whenever it differs from the previous entry's
        name; every entry's name is replaced by the index of the most
        recently written name.  Returns the position after the last name.
        """
        lastsection = ""
        index = -1
        for section in self.sections:
            name = section[0]
            if name != lastsection:
                index += 1
                lastsection = name
                encoded = name.encode("utf-8") + b"\0"
                out.write(encoded)
                pos += len(encoded)
            section[0] = index
        return pos

    def writeSectionOffsets(self, out, pos):
        """Write (uint16 section index, uint16 block index) per entry.

        Requires both write*Strings() methods to have run so that every
        entry holds indexes.  Returns the position after the table.
        """
        for section in self.sections:
            out.write(pack("=HH", section[0], section[1]))
            pos += 4
        return pos

    def getBlockList(self):
        """Return the block names in the order they were added."""
        return self.blockList

    def getSectionList(self):
        """Return the distinct section names in first-seen order."""
        return self.sectionList
class Unihan:
    """Unihan readings per character, plus their serialisation.

    self.unihan maps a character (int) to a 7-element list, one slot per
    category in _FIELDS order; a slot is None until a value is added.

    writeStrings() replaces every stored value with its file position, so the
    calculate*() methods must be called before it and writeOffsets() after it.
    """

    # Slot order inside each per-character list and each offset record.
    _FIELDS = (
        "kDefinition",
        "kCantonese",
        "kMandarin",
        "kTang",
        "kKorean",
        "kJapaneseKun",
        "kJapaneseOn",
    )

    def __init__(self):
        self.unihan = {}

    def addUnihan(self, uni, category, value):
        """Store value for the character given as hex string uni.

        Categories not listed in _FIELDS are ignored.  A later value for the
        same character and category replaces the earlier one.
        """
        uni = int(uni, 16)
        if category not in self._FIELDS:
            return
        if uni not in self.unihan:
            self.unihan[uni] = [None] * len(self._FIELDS)
        self.unihan[uni][self._FIELDS.index(category)] = value

    def calculateStringSize(self):
        """Return the number of bytes writeStrings() will emit."""
        size = 0
        for slots in self.unihan.values():
            for entry in slots:
                if entry is not None:
                    size += len(entry.encode("utf-8")) + 1
        return size

    def calculateOffsetSize(self):
        """Return the size of the offset table: 30 bytes per character."""
        return len(self.unihan) * 30

    def writeStrings(self, out, pos):
        """Write every stored value as UTF-8 + 0x00, starting at position pos.

        Each value is replaced in place by the position it was written at.
        Returns the position just past the last value.
        """
        for slots in self.unihan.values():
            for i, entry in enumerate(slots):
                if entry is None:
                    continue
                data = entry.encode("utf-8") + b"\0"
                out.write(data)
                slots[i] = pos
                pos += len(data)
        return pos

    def writeOffsets(self, out, pos):
        """Write one 30-byte record per character.

        A record is the uint16 character followed by seven uint32 positions,
        0 standing for an empty slot.  Must run after writeStrings().
        Returns the position after the table.
        """
        for char, slots in self.unihan.items():
            out.write(pack("=H", char))
            for entry in slots:
                out.write(pack("=I", 0 if entry is None else entry))
            pos += 30
        return pos
class Parser:
    """Line-oriented parsers that feed the collector objects.

    Every method takes an iterable of text lines and the collector to fill.
    Only the line terminator is stripped from each line, so a final line
    without a trailing newline is parsed in full.
    """

    def parseUnicodeData(self, inUnicodeData, names):
        """Add each character's name and category number to names.

        Uses the first three ';'-separated fields of each line.  Characters
        whose remapped code point string is longer than four digits are
        skipped.
        """
        regexp = re.compile(r'^([^;]+);([^;]+);([^;]+)')
        for line in inUnicodeData:
            m = regexp.match(line.rstrip("\n"))
            if not m:
                continue
            uni = remap(m.group(1))
            if len(uni) > 4:
                continue
            names.addName(uni, m.group(2), categoryMap[m.group(3)])

    def parseDetails(self, inNamesList, details):
        """Add alias/note/equivalent/cross-reference entries to details.

        A line starting with hex digits selects the current character; the
        indented lines that follow it are attached to that character.  Lines
        starting with '@' are ignored.  Entries for a character whose
        remapped code point string is longer than four digits are dropped,
        and dropping stops again at the next character that fits (e.g.
        U+1Fxxx, which remap() folds into four digits).  Cross references
        are stored as integers and only when below 0x10000 after remapping.
        """
        invalidRegexp = re.compile(r'^@')
        unicodeRegexp = re.compile(r'^([0-9A-F]+)')
        # Tried in this order; the first pattern that matches wins.
        textRegexps = (
            ("alias", re.compile(r'^\s+=\s+(.+)$')),        # equal
            ("note", re.compile(r'^\s+\*\s+(.+)$')),        # star
            ("approxEquiv", re.compile(r'^\s+#\s+(.+)$')),  # pound
            ("equiv", re.compile(r'^\s+:\s+(.+)$')),        # colon
        )
        seeAlsoRegexp1 = re.compile(r'^\s+x\s+.*\s([0-9A-F]{4,6})\)$')  # ex
        seeAlsoRegexp2 = re.compile(r'^\s+x\s+([0-9A-F]{4,6})$')        # ex
        drop = False
        currChar = 0
        for line in inNamesList:
            line = line.rstrip("\n")
            if invalidRegexp.match(line):
                continue
            m = unicodeRegexp.match(line)
            if m:
                code = remap(m.group(1))
                currChar = int(code, 16)
                # Re-evaluated for every character so that an out-of-range
                # character does not silence all characters after it.
                drop = len(code) > 4
                continue
            if drop:
                continue
            for category, regexp in textRegexps:
                m = regexp.match(line)
                if m:
                    details.addEntry(currChar, category, m.group(1))
                    break
            else:
                m = seeAlsoRegexp1.match(line) or seeAlsoRegexp2.match(line)
                if m:
                    value = int(remap(m.group(1)), 16)
                    if value < 0x10000:
                        details.addEntry(currChar, "seeAlso", value)

    def parseBlocks(self, inBlocks, sectionsBlocks):
        """Add every "BEGIN..END; Name" line as a block.

        Both bounds are remapped; a block is skipped when its remapped begin
        is longer than four digits.
        """
        regexp = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$')
        for line in inBlocks:
            m = regexp.match(line.rstrip("\n"))
            if not m:
                continue
            begin = remap(m.group(1))
            end = remap(m.group(2))
            if len(begin) > 4:
                continue
            sectionsBlocks.addBlock(begin, end, m.group(3))

    def parseSections(self, inSections, sectionsBlocks):
        """Add every block line to the section announced before it.

        A line whose first space-separated word is "SECTION" starts a new
        section named by the rest of the line.  Empty lines are skipped.
        Prints an error and exits with status 1 if a block line appears
        while no section is active.
        """
        currSection = ""
        for line in inSections:
            line = line.rstrip("\n")
            if not line:
                continue
            if line.split(" ")[0] == "SECTION":
                currSection = line[8:]
            elif currSection != "":
                sectionsBlocks.addSection(currSection, line)
            else:
                print("error in data file")
                sys.exit(1)

    def parseUnihan(self, inUnihan, unihan):
        """Pass every "U+XXXX category value" line to unihan.addUnihan().

        Characters whose remapped code point string is longer than four
        digits are skipped.  Prints a progress marker every 100000 lines.
        """
        regexp = re.compile(r'^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$')
        count = 0
        for line in inUnihan:
            if count % 100000 == 0:
                print("\b."); sys.stdout.flush()
            count += 1
            m = regexp.match(line.rstrip("\n"))
            if not m:
                continue
            uni = remap(m.group(1))
            if len(uni) <= 4:
                unihan.addUnihan(uni, m.group(2), m.group(3))
def writeTranslationDummy(out, data):
    """Write a C++ file of QT_TRANSLATE_NOOP3 lines to the binary stream out.

    data is a sequence of groups; group[0] is the text written as the third
    macro argument and group[1] is the list of strings to mark for
    translation.  Backslashes and double quotes are escaped so that every
    string stays a valid C++ string literal.
    """
    def literal(text):
        # Escape the backslash first so the quote escapes are not doubled.
        return text.replace("\\", "\\\\").replace("\"", "\\\"").encode("utf-8")

    out.write(b"""/* This file is part of the KDE libraries
SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
SPDX-FileCopyrightText: 2016 John Zaitseff <J.Zaitseff@zap.org.au>
SPDX-License-Identifier: LGPL-2.0-or-later
This file is autogenerated by kcharselect/kcharselect-generate-datafile.py
*/\n\n""")
    for group in data:
        comment = literal(group[0])
        for entry in group[1]:
            out.write(b"QT_TRANSLATE_NOOP3(\"KCharSelectData\", \"" + literal(entry) + b"\", \"" + comment + b"\");\n")
# Main script: parse the Unicode input files from the current directory, then
# write the binary data file and the translation dummy.
#
# All files are opened in one "with" statement so that they are flushed and
# closed on every exit path, including sys.exit().  The text inputs are read
# as UTF-8 explicitly instead of relying on the locale's default encoding.
with open("kcharselect-data", "wb") as out, \
        open("kcharselect-translation.cpp", "wb") as outTranslationDummy, \
        open("UnicodeData.txt", "r", encoding="utf-8") as inUnicodeData, \
        open("NamesList.txt", "r", encoding="utf-8") as inNamesList, \
        open("Blocks.txt", "r", encoding="utf-8") as inBlocks, \
        open("Unihan_Readings.txt", "r", encoding="utf-8") as inUnihan:
    inSections = io.StringIO(sectiondata)

    # The record layouts rely on these exact widths.
    if calcsize('=H') != 2 or calcsize('=I') != 4:
        print("Error: Sizes of ushort and uint are not 16 and 32 bit as expected")
        sys.exit(1)

    names = Names()
    details = Details()
    sectionsBlocks = SectionsBlocks()
    unihan = Unihan()
    parser = Parser()

    print("========== parsing files ===================")
    parser.parseUnicodeData(inUnicodeData, names)
    print("."); sys.stdout.flush()
    parser.parseDetails(inNamesList, details)
    print("\b."); sys.stdout.flush()
    parser.parseBlocks(inBlocks, sectionsBlocks)
    print("\b."); sys.stdout.flush()
    parser.parseSections(inSections, sectionsBlocks)
    print("\b."); sys.stdout.flush()
    parser.parseUnihan(inUnihan, unihan)
    print("\b."); sys.stdout.flush()
    print("done.")

    pos = 0

    # write header, size: 40 bytes
    # Each header field is the start position of one part; a part starts
    # where the previous one ends, so the positions are a running sum of the
    # calculated part sizes.  The sizes must be computed before any write*()
    # call because those calls overwrite the stored strings with positions.
    print("========== writing header ==================")
    out.write(pack("=I", 40))
    print("names strings begin", 40)

    namesOffsetBegin = names.calculateStringSize() + 40
    out.write(pack("=I", namesOffsetBegin))
    print("names offsets begin", namesOffsetBegin)

    detailsStringBegin = namesOffsetBegin + names.calculateOffsetSize()
    out.write(pack("=I", detailsStringBegin))
    print("details strings begin", detailsStringBegin)

    detailsOffsetBegin = detailsStringBegin + details.calculateStringSize()
    out.write(pack("=I", detailsOffsetBegin))
    print("details offsets begin", detailsOffsetBegin)

    blocksStringBegin = detailsOffsetBegin + details.calculateOffsetSize()
    out.write(pack("=I", blocksStringBegin))
    print("block strings begin", blocksStringBegin)

    blocksOffsetBegin = blocksStringBegin + sectionsBlocks.calculateBlockStringSize()
    out.write(pack("=I", blocksOffsetBegin))
    print("block offsets begin", blocksOffsetBegin)

    sectionStringBegin = blocksOffsetBegin + sectionsBlocks.calculateBlockOffsetSize()
    out.write(pack("=I", sectionStringBegin))
    print("section strings begin", sectionStringBegin)

    sectionOffsetBegin = sectionStringBegin + sectionsBlocks.calculateSectionStringSize()
    out.write(pack("=I", sectionOffsetBegin))
    print("section offsets begin", sectionOffsetBegin)

    unihanStringBegin = sectionOffsetBegin + sectionsBlocks.calculateSectionOffsetSize()
    out.write(pack("=I", unihanStringBegin))
    print("unihan strings begin", unihanStringBegin)

    unihanOffsetBegin = unihanStringBegin + unihan.calculateStringSize()
    out.write(pack("=I", unihanOffsetBegin))
    print("unihan offsets begin", unihanOffsetBegin)

    end = unihanOffsetBegin + unihan.calculateOffsetSize()
    print("end should be", end)

    pos += 40

    # The parts are written in header order; pos is threaded through every
    # call so the positions stored in the offset tables are absolute.
    print("========== writing data ====================")
    pos = names.writeStrings(out, pos)
    print("names strings written, position", pos)
    pos = names.writeOffsets(out, pos)
    print("names offsets written, position", pos)
    pos = details.writeStrings(out, pos)
    print("details strings written, position", pos)
    pos = details.writeOffsets(out, pos)
    print("details offsets written, position", pos)
    pos = sectionsBlocks.writeBlockStrings(out, pos)
    print("block strings written, position", pos)
    pos = sectionsBlocks.writeBlockOffsets(out, pos)
    print("block offsets written, position", pos)
    pos = sectionsBlocks.writeSectionStrings(out, pos)
    print("section strings written, position", pos)
    pos = sectionsBlocks.writeSectionOffsets(out, pos)
    print("section offsets written, position", pos)
    pos = unihan.writeStrings(out, pos)
    print("unihan strings written, position", pos)
    pos = unihan.writeOffsets(out, pos)
    print("unihan offsets written, position", pos)

    print("========== writing translation dummy ======")
    translationData = [["KCharSelect section name", sectionsBlocks.getSectionList()], ["KCharselect unicode block name",sectionsBlocks.getBlockList()]]
    writeTranslationDummy(outTranslationDummy, translationData)

print("done. make sure to copy both kcharselect-data and kcharselect-translation.cpp.")