(DEFINE-FILE-INFO PACKAGE "INTERLISP" READTABLE "INTERLISP" BASE 10) (FILECREATED "19-Jul-2023 09:26:13" {WMEDLEY}UNICODE.;199 65282 :EDIT-BY rmk :CHANGES-TO (VARS UNICODECOMS) :PREVIOUS-DATE "19-Jul-2022 15:36:40" {WMEDLEY}UNICODE.;198) (PRETTYCOMPRINT UNICODECOMS) (RPAQQ UNICODECOMS [(COMS (* ;; "External formats") (FNS UTF8.OUTCHARFN UTF8.INCCODEFN UTF8.PEEKCCODEFN \UTF8.BACKCCODEFN) (FNS UTF16BE.OUTCHARFN UTF16BE.INCCODEFN UTF16BE.PEEKCCODEFN \UTF16BE.BACKCCODEFN) (INITVARS (EXTERNALEOL 'LF)) (FNS MAKE-UNICODE-FORMATS) (P (MAKE-UNICODE-FORMATS EXTERNALEOL)) (ADDVARS (*DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8))) (FNS UNICODE.UNMAPPED) (FNS XCCS-UTF8-AFTER-OPEN) (DECLARE%: EVAL@COMPILE DONTCOPY (MACROS UNICODE.TRANSLATE)) (FNS XTOUCODE UTOXCODE)) (COMS (* ;; "Unicode mapping files") (FNS READ-UNICODE-MAPPING-FILENAMES READ-UNICODE-MAPPING WRITE-UNICODE-MAPPING WRITE-UNICODE-INCLUDED WRITE-UNICODE-MAPPING-HEADER WRITE-UNICODE-MAPPING-FILENAME ) (VARS XCCS-SET-NAMES) (* ;; "Automate dumping of a documentation prefix") [DECLARE%: EVAL@COMPILE DONTCOPY (CONSTANTS (MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16)) (UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16] (VARS UNICODE-MAPPING-HEADER) (INITVARS (UNICODEDIRECTORIES NIL))) (COMS (* ;; "Set up translation tables for UTF8 and UTFBE external formats") (FNS MAKE-UNICODE-TRANSLATION-TABLES) [INITVARS (DEFAULT-XCCS-CHARSETS '(LATIN SYMBOLS1 SYMBOLS2 EXTENDED-LATIN FORMS SYMBOLS3 SYMBOLS4 ACCENTED-LATIN GREEK] [DECLARE%: DONTEVAL@LOAD DOCOPY (P (MAKE-UNICODE-TRANSLATION-TABLES ( READ-UNICODE-MAPPING DEFAULT-XCCS-CHARSETS T) '*XCCSTOUNICODE* '*UNICODETOXCCS*] (GLOBALVARS *XCCSTOUNICODE* *UNICODETOXCCS*)) (FNS HEXSTRING UTF8HEXSTRING NUTF8CODEBYTES NUTF8STRINGBYTES XTOUSTRING XCCSSTRING) (FNS SHOWCHARS) (DECLARE%: EVAL@COMPILE DONTCOPY (FILES (FROM LOADUPS) EXPORTS.ALL) (* ;; "These control the layout of the translation tables. Since many of the upper panels are sparse, doing it per-panel (128) seems more space-efficient, and residual alists can be shorter") (CONSTANTS (TRANSLATION-SEGMENT-SIZE 128) (MAX-ALIST-LENGTH 10) (N-TRANSLATION-SEGMENTS (IQUOTIENT 65536 TRANSLATION-SEGMENT-SIZE)) (TRANSLATION-SHIFT (INTEGERLENGTH (SUB1 TRANSLATION-SEGMENT-SIZE))) (TRANSLATION-MASK (SUB1 TRANSLATION-SEGMENT-SIZE]) (* ;; "External formats") (DEFINEQ (UTF8.OUTCHARFN [LAMBDA (STREAM CHARCODE RAW) (* ; "Edited 8-Aug-2021 13:02 by rmk:") (* ; "Edited 17-Aug-2020 08:45 by rmk:") (* ; "Edited 30-Jan-2020 23:08 by rmk:") (* ;; "Perhaps the translation table should already do the mapping for EOL to LF, but that seems to be a separate property of the stream. Also, CRLF=2 bytes.") (* ;; "Print UTF8 sequence for CHARCODE. Do not do XCCS to Unicode translation if RAW.") (IF (EQ CHARCODE (CHARCODE EOL)) THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0) (\BOUTEOL STREAM) ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM) (IPLUS16 1 DATUM)) (FOR C INSIDE (CL:IF RAW CHARCODE (UNICODE.TRANSLATE CHARCODE *XCCSTOUNICODE*)) DO (IF (ILESSP C 128) THEN (\BOUT STREAM C) ELSEIF (ILESSP C 2048) THEN (* ; "x800") (\BOUT STREAM (LOGOR (LLSH 3 6) (LRSH C 6))) (\BOUT STREAM (LOGOR (LLSH 2 6) (LOADBYTE C 0 6))) ELSEIF (ILESSP C 65536) THEN (* ; "x10000") (\BOUT STREAM (LOGOR (LLSH 7 5) (LRSH C 12))) (\BOUT STREAM (LOGOR (LLSH 2 6) (LOADBYTE C 6 6))) (\BOUT STREAM (LOGOR (LLSH 2 6) (LOADBYTE C 0 6))) ELSEIF (ILESSP C 2097152) THEN (* ; "x200000") (\BOUT STREAM (LOGOR (LLSH 15 4) (LRSH C 18))) (\BOUT STREAM (LOGOR (LLSH 2 6) (LOADBYTE C 12 6))) (\BOUT STREAM (LOGOR (LLSH 2 6) (LOADBYTE C 6 6))) (\BOUT STREAM (LOGOR (LLSH 2 6) (LOADBYTE C 0 6))) ELSE (ERROR "CHARCODE too big for UTF8" C]) (UTF8.INCCODEFN [LAMBDA (STREAM COUNTP RAW) (* ; "Edited 6-Aug-2021 16:02 by rmk:") (* ; "Edited 6-Aug-2020 17:13 by rmk:") (* ;; "Do not do UNICODE to XCSS translation if RAW.") (* ;; "Test for smallp because the stream's End-of-file operation may suppress the error") (DECLARE (USEDFREE *BYTECOUNTER*)) (LET (BYTE1 BYTE2 BYTE3 BYTE4 CODE (COUNT 1)) (SETQ BYTE1 (\BIN STREAM)) (* ;; "Distinguish on header bytes (modulo peculiar EOF behavior--the caller will get whatever ended up in BYTE1") (CL:WHEN (SMALLP BYTE1) [SETQ CODE (IF (ILESSP BYTE1 128) THEN (* ;;  "Test first: Ascii is the common case. EOL requires its own translation") (SELCHARQ BYTE1 (CR (SELECTC (FETCH (STREAM EOLCONVENTION) OF STREAM) (CR.EOLC (* ; "Also eq BYTE1") (CHARCODE EOL)) (CRLF.EOLC (IF (EQ (CHARCODE LF) (\PEEKBIN STREAM T)) THEN (\BIN STREAM) (CL:WHEN COUNTP (SETQ COUNT 2)) (CHARCODE EOL) ELSE BYTE1)) BYTE1)) (LF (CL:IF (EQ LF.EOLC (FETCH (STREAM EOLCONVENTION) OF STREAM)) (CHARCODE EOL) BYTE1)) BYTE1) ELSEIF (IGEQ BYTE1 (LLSH 15 4)) THEN (* ; "4 bytes") (SETQ BYTE2 (\BIN STREAM)) (CL:WHEN (OR (NOT (SMALLP BYTE2)) (ILESSP BYTE2 128)) (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2))) (SETQ BYTE3 (\BIN STREAM)) (CL:WHEN (OR (NOT (SMALLP BYTE3)) (ILESSP BYTE3 128)) (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3))) (SETQ BYTE4 (\BIN STREAM)) (CL:WHEN (OR (NOT (SMALLP BYTE4)) (ILESSP BYTE4 128)) (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3 BYTE4))) (SETQ COUNT 4) (LOGOR (LLSH (LOADBYTE BYTE1 0 3) 18) (LLSH (LOADBYTE BYTE2 0 6) 12) (LLSH (LOADBYTE BYTE3 0 6) 6) (LOADBYTE BYTE4 0 6)) ELSEIF (IGEQ BYTE1 (LLSH 7 5)) THEN (* ; "3 bytes") (SETQ BYTE2 (\BIN STREAM)) (CL:WHEN (OR (NOT (SMALLP BYTE2)) (ILESSP BYTE2 128)) (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2))) (SETQ BYTE3 (\BIN STREAM)) (CL:WHEN (OR (NOT (SMALLP BYTE3)) (ILESSP BYTE3 128)) (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3))) (SETQ COUNT 3) (LOGOR (LLSH (LOADBYTE BYTE1 0 4) 12) (LLSH (LOADBYTE BYTE2 0 6) 6) (LOADBYTE BYTE3 0 6)) ELSE (* ; "Must be 2 bytes") (SETQ COUNT 2) (SETQ BYTE2 (\BIN STREAM)) (CL:WHEN (OR (NOT (SMALLP BYTE2)) (ILESSP BYTE2 128)) (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2))) (LOGOR (LLSH (LOADBYTE BYTE1 0 5) 6) (LOADBYTE BYTE2 0 6]) (CL:UNLESS (OR RAW (NOT (SMALLP CODE))) (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*))) (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT)) CODE]) (UTF8.PEEKCCODEFN [LAMBDA (STREAM NOERROR RAW) (* ; "Edited 14-Jun-2021 22:53 by rmk:") (* ;; "Modeled this after \EUCPEEK on LLREAD. In the multi-byte (non-ASCII) case, backs the file pointer to the beginning by the proper number of \BACKFILEPTRs, and returns a count of 0. Returns NIL if NOERROR and either invalid UTF8 or end of file.") (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.") (* ;; "Do not do UNICODE to XCCS translation if RAW") (PROG (BYTE1 BYTE2 BYTE3 BYTE4 CODE) (SETQ BYTE1 (\PEEKBIN STREAM NOERROR)) (* ;; "Distinguish on header bytex") (CL:UNLESS BYTE1 (RETURN NIL)) [IF (ILESSP BYTE1 128) THEN (* ;;  "Test first: Ascii is the common case. No need to back up, since we peeked.") (SETQ CODE BYTE1) ELSEIF (IGEQ BYTE1 (LLSH 15 4)) THEN (* ; "4 bytes") (\BIN STREAM) (CL:UNLESS (AND (SETQ BYTE2 (\PEEKBIN STREAM NOERROR)) (IGEQ BYTE2 128)) (\BACKFILEPTR STREAM) (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2))) (RETURN CODE)) (\BIN STREAM) (CL:UNLESS (AND (SETQ BYTE3 (\PEEKBIN STREAM NOERROR)) (IGEQ BYTE3 128)) (\BACKFILEPTR STREAM) (\BACKFILEPTR STREAM) (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3))) (RETURN CODE)) (\BIN STREAM) (SETQ BYTE4 (\PEEKBIN STREAM NOERROR)) (* ;  "PEEK the last, no need to back it up") (\BACKFILEPTR STREAM) (\BACKFILEPTR STREAM) (\BACKFILEPTR STREAM) (IF (AND BYTE4 (IGEQ BYTE4 128)) THEN (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 3) 18) (LLSH (LOADBYTE BYTE2 0 6) 12) (LLSH (LOADBYTE BYTE3 0 6) 6) (LOADBYTE BYTE4 0 6))) ELSEIF NOERROR ELSE (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3 BYTE4))) ELSEIF (IGEQ BYTE1 (LLSH 7 5)) THEN (* ; "3 bytes") (\BIN STREAM) (CL:UNLESS (AND (SETQ BYTE2 (\PEEKBIN STREAM NOERROR)) (IGEQ BYTE2 128)) (\BACKFILEPTR STREAM) (OR NOERROR (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2))) (RETURN CODE)) (\BIN STREAM) (SETQ BYTE3 (\PEEKBIN STREAM NOERROR)) (\BACKFILEPTR STREAM) (\BACKFILEPTR STREAM) (IF (AND BYTE3 (IGEQ BYTE3 128)) THEN (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 4) 12) (LLSH (LOADBYTE BYTE2 0 6) 6) (LOADBYTE BYTE3 0 6))) ELSEIF NOERROR ELSE (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2 BYTE3))) ELSE (* ; "Must be 2 bytes") (\BIN STREAM) (SETQ BYTE2 (\PEEKBIN STREAM NOERROR)) (\BACKFILEPTR STREAM) (IF (AND BYTE2 (IGEQ BYTE2 128)) THEN (SETQ CODE (LOGOR (LLSH (LOADBYTE BYTE1 0 5) 6) (LOADBYTE BYTE2 0 6))) ELSEIF NOERROR ELSE (ERROR "INVALID UTF8 SEQUENCE" (LIST BYTE1 BYTE2] (CL:WHEN (AND CODE (NOT RAW)) (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*))) (RETURN CODE]) (\UTF8.BACKCCODEFN [LAMBDA (STREAM COUNTP RAW) (* ; "Edited 19-Jul-2022 15:30 by rmk") (* ; "Edited 6-Aug-2021 16:04 by rmk:") (* ;; "\BACKFILEPTR is NIL at beginning of FILE. Presumably a little bit more efficient if we decoded the UTF8 bytes backwards and didn't do the peek, but probably not worth the complexity. ") (DECLARE (USEDFREE *BYTECOUNTER*)) (BIND (C _ 0) WHILE (IF (\BACKFILEPTR STREAM) THEN (ADD C -1) (EQ 2 (LRSH (\PEEKBIN STREAM) 6)) ELSE (CL:WHEN COUNTP (SETQ *BYTECOUNTER* C)) (RETURN NIL)) REPEATUNTIL (EQ C -4) FINALLY (CL:WHEN COUNTP (SETQ *BYTECOUNTER* C)) (RETURN (UTF8.PEEKCCODEFN STREAM NIL RAW]) ) (DEFINEQ (UTF16BE.OUTCHARFN [LAMBDA (STREAM CHARCODE RAW) (* ; "Edited 8-Aug-2021 13:09 by rmk:") (* ; "Edited 30-Jan-2020 23:08 by rmk:") (* ;; "PRINT UTF16 sequence for CHARCODE. Do not do XCCS to UNICODE translation if RAW.") (* ;; "Not sure about EOL conversion if truly %"raw%"") (IF (EQ CHARCODE (CHARCODE EOL)) THEN (FREPLACE (STREAM CHARPOSITION) OF STREAM WITH 0) ELSE (CHANGE (FFETCH (STREAM CHARPOSITION) OF STREAM) (IPLUS16 1 DATUM))) (FOR C INSIDE (CL:IF RAW CHARCODE (UNICODE.TRANSLATE CHARCODE *XCCSTOUNICODE*)) DO (\WOUT STREAM C]) (UTF16BE.INCCODEFN [LAMBDA (STREAM COUNTP RAW) (* ; "Edited 6-Aug-2021 16:05 by rmk:") (* ;;  "Do not do UNICODE to XCCS translation if RAW. Test for SMALLPin case of funky EOF behavior") (DECLARE (USEDFREE *BYTECOUNTER*)) (LET (CODE BYTE1 BYTE2 COUNT) (IF [AND (SMALLP (SETQ BYTE1 (\BIN STREAM))) (SMALLP (SETQ BYTE2 (\BIN STREAM] THEN (SETQ COUNT 2) (SETQ CODE (LOGOR (LLSH (\BIN STREAM) 8) (\BIN STREAM))) (CL:UNLESS RAW (SETQ CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*))) (CL:WHEN COUNTP (SETQ *BYTECOUNTER* COUNT)) CODE ELSE (ERROR "ODD NUMBER OF BYTES IN UTF16 FILE" STREAM]) (UTF16BE.PEEKCCODEFN [LAMBDA (STREAM NOERROR RAW) (* ; "Edited 14-Jun-2021 22:58 by rmk:") (* ;; "Could be that the caller takes care of backing up the file position if the number of binned-bytes is returned.") (* ;; "Do not do UNICODE to XCCS translation if RAW") (LET (BYTE1 BYTE2 CODE) (SETQ BYTE1 (\PEEKBIN STREAM NOERROR)) (IF BYTE1 THEN (\BIN STREAM) (SETQ BYTE2 (\PEEKBIN STREAM NOERROR)) (\BACKFILEPTR STREAM) (IF BYTE2 THEN (SETQ CODE (LOGOR (LLSH BYTE1 8) BYTE2)) (CL:IF RAW CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*)) ELSEIF NOERROR THEN NIL) ELSEIF NOERROR THEN NIL ELSE (ERROR "INVALID UTF16 CHARACTER" (LIST BYTE1 BYTE2]) (\UTF16BE.BACKCCODEFN [LAMBDA (STREAM COUNTP RAW) (* ; "Edited 19-Jul-2022 15:14 by rmk") (* ; "Edited 6-Aug-2021 16:07 by rmk:") (* ;; "\BACKFILEPTR is NIL at beginning of FILE, do nothing.") (* ;; "Common for big-ending and little-ending") (DECLARE (USEDFREE *BYTECOUNTER*)) (CL:WHEN (\BACKFILEPTR STREAM) (LET (CODE (BYTE2 (\PEEKBIN STREAM))) (IF (\BACKFILEPTR STREAM) THEN (CL:WHEN COUNTP (SETQ *BYTECOUNTER* -2)) (SETQ CODE (LOGOR (LLSH BYTE2 8) (\PEEKBIN STREAM))) (CL:IF RAW CODE (UNICODE.TRANSLATE CODE *UNICODETOXCCS*)) ELSEIF COUNTP THEN (SETQ *BYTECOUNTER* -1) NIL)))]) ) (RPAQ? EXTERNALEOL 'LF) (DEFINEQ (MAKE-UNICODE-FORMATS [LAMBDA (EXTERNALEOL) (* ; "Edited 19-Jul-2022 15:36 by rmk") (* ; "Edited 6-Aug-2021 16:08 by rmk:") (* ;; "RAW formats do not do XCCS/Unicode translation, just deal with the byte encoding.") (* ;; "The EXTERNALEOL specifies the EOLCONVENTION of the stream, particularly to produce output files with the desired convention. On input the macro \CHECKEOLC (LLREAD) coerces only that coding to the internal EOL, which is a mistake.") (MAKE-EXTERNALFORMAT :UTF-8 (FUNCTION UTF8.INCCODEFN) (FUNCTION UTF8.PEEKCCODEFN) (FUNCTION \UTF8.BACKCCODEFN) (FUNCTION UTF8.OUTCHARFN) NIL EXTERNALEOL) (MAKE-EXTERNALFORMAT :UTF-8-RAW [FUNCTION (LAMBDA (STREAM COUNTP) (UTF8.INCCODEFN STREAM COUNTP T] [FUNCTION (LAMBDA (STREAM NOERROR) (UTF8.PEEKCCODEFN STREAM NOERROR T] [FUNCTION (LAMBDA (STREAM COUNTP) (\UTF8.BACKCCODEFN STREAM COUNTP T] [FUNCTION (LAMBDA (STREAM CHARCODE) (UTF8.OUTCHARFN STREAM CHARCODE T] NIL EXTERNALEOL) (MAKE-EXTERNALFORMAT :UTF-16BE (FUNCTION UTF16BE.INCCODEFN) (FUNCTION UTF16BE.PEEKCCODEFN) (FUNCTION \UTF16BE.BACKCCODEFN) (FUNCTION UTF16BE.OUTCHARFN) NIL EXTERNALEOL) (MAKE-EXTERNALFORMAT :UTF-16BE-RAW [FUNCTION (LAMBDA (STREAM COUNTP) (UTF16BE.INCCODEFN STREAM COUNTP T] [FUNCTION (LAMBDA (STREAM NOERROR) (UTF16BE.PEEKCCODEFN STREAM NOERROR T] [FUNCTION (LAMBDA (STREAM COUNTP) (\UTF16BE.BACKCCODEFN STREAM COUNTP T] [FUNCTION (LAMBDA (STREAM CHARCODE) (UTF16BE.OUTCHARFN STREAM CHARCODE T] NIL EXTERNALEOL]) ) (MAKE-UNICODE-FORMATS EXTERNALEOL) (ADDTOVAR *DEFAULT-EXTERNALFORMATS* (UNIX :UTF-8)) (DEFINEQ (UNICODE.UNMAPPED [LAMBDA (CODE TRANSLATION-TABLE) (* ; "Edited 11-Aug-2020 20:23 by rmk:") (* ;; "This is the slow fall-out when UNICODE.TRANSLATE determines that CODED has no fast mapping in TRANSLATION-TABLE.") (* ;; "We return an existing entry in the hash array of the table. If CODE has not previously been seen, we allocate a new code in the forward unmapped hasharray and put the inverse in the backward array.") (LET ((FORWARD (CL:SVREF TRANSLATION-TABLE N-TRANSLATION-SEGMENTS)) INVERSE NEXTCODE) (IF (GETHASH CODE (CAR FORWARD)) ELSEIF (AND (ILEQ CODE (CADDR FORWARD)) (IGEQ CODE (CADDDR FORWARD))) THEN (ERROR "UNMAPPED CODE IS EITHER XCCS-UNUSED OR UNICODE-PRIVATE" CODE) ELSE (SETQ INVERSE (CL:SVREF TRANSLATION-TABLE (ADD1 N-TRANSLATION-SEGMENTS))) (SETQ NEXTCODE (ADD (CADR INVERSE) 1)) (CL:WHEN (IGREATERP NEXTCODE (CADDR INVERSE)) (ERROR "EXHAUSTED RANGE FOR UNMAPPED CODES" CODE)) (PUTHASH CODE NEXTCODE (CAR FORWARD)) (PUTHASH NEXTCODE CODE (CAR INVERSE)) NEXTCODE]) ) (DEFINEQ (XCCS-UTF8-AFTER-OPEN [LAMBDA (STREAM ACCESS PARAMETERS) (* ; "Edited 13-Aug-2020 11:54 by rmk:") (* ;; "If added to STREAM-AFTER-OPEN-FNS, causes mapping files to be opened as UTF8.") (CL:WHEN (AND (STRPOS "XCCS-" (U-CASE (FULLNAME STREAM))) [EQ 'TXT (U-CASE (FILENAMEFIELD (FULLNAME STREAM) 'EXTENSION] (NOT (ASSOC 'EXTERNALFORMAT PARAMETERS))) (STREAMPROP STREAM 'EXTERNALFORMAT :UTF8))]) ) (DECLARE%: EVAL@COMPILE DONTCOPY (DECLARE%: EVAL@COMPILE (PUTPROPS UNICODE.TRANSLATE MACRO [OPENLAMBDA (CODE TRANSLATION-TABLE) (LET [(X (CL:SVREF TRANSLATION-TABLE (LRSH CODE TRANSLATION-SHIFT ] (COND ((LISTP X) (OR (CDR (FASSOC (LOGAND CODE TRANSLATION-SHIFT) X)) CODE)) [(AND X (CL:SVREF X (LOGAND CODE TRANSLATION-MASK] (T (UNICODE.UNMAPPED CODE TRANSLATION-TABLE]) ) ) (DEFINEQ (XTOUCODE [LAMBDA (XCCSCODE) (* ; "Edited 9-Aug-2020 09:04 by rmk:") (UNICODE.TRANSLATE XCCSCODE *XCCSTOUNICODE*]) (UTOXCODE [LAMBDA (UNICODE) (* ; "Edited 9-Aug-2020 09:04 by rmk:") (UNICODE.TRANSLATE UNICODE *UNICODETOXCCS*]) ) (* ;; "Unicode mapping files") (DEFINEQ (READ-UNICODE-MAPPING-FILENAMES [LAMBDA (FILESPEC) (* ; "Edited 5-Aug-2020 15:59 by kaplan") (* ; "Edited 4-Aug-2020 17:31 by rmk:") (FOR F X CSI INSIDE FILESPEC COLLECT (IF (FINDFILE (PACKFILENAME 'BODY F 'EXTENSION 'TXT) T UNICODEDIRECTORIES) ELSEIF [SETQ CSI (OR (SASSOC F XCCS-SET-NAMES) (FIND N IN XCCS-SET-NAMES SUCHTHAT (EQ F (CADR N] THEN (FINDFILE (PACKFILENAME 'BODY (CONCAT 'XCCS- (CAR CSI) '= (CADR CSI)) 'EXTENSION 'TXT) T UNICODEDIRECTORIES) ELSE F]) (READ-UNICODE-MAPPING [LAMBDA (FILESPEC NOPRINT NOERROR) (* ; "Edited 3-Jul-2021 13:37 by rmk:") (* ;; "Combines the char-mapping tables from FILES coded in the Uncode-CDROM format. Comments prefixed by # and") (* ;; " Column 1: Input hex code in the format 0xXXXX") (* ;; " Column 2: Corresponding Unicode code-sequence in the format") (* ;; " 0xXXXX ... 0xYYYY") (* ;;  " Column 3: (after #) Character name in some mapping files, utf-8 character") (* ;; " for XCCS mapping files") (* ;; "") (* ;; "Result is a list of (fromcode tocode1 ... tocoden) integer lists (almost always with only a single tocode") (FOR FILE [SEPBITTABLE _ (MAKEBITTABLE (CHARCODE (TAB SPACE] IN (  READ-UNICODE-MAPPING-FILENAMES FILESPEC) JOIN (CL:WITH-OPEN-FILE (STREAM FILE :DIRECTION :INPUT :EXTERNAL-FORMAT :UTF-8-RAW) (BIND LINE START FIRST (CL:UNLESS (FILEPOS "Name:" STREAM NIL NIL NIL T) (ERROR "NOT A UNICODE MAPPING FILE" (FULLNAME STREAM))) (SETQ LINE (CL:READ-LINE STREAM NIL NIL)) (CL:UNLESS NOPRINT (PRINTOUT T T "Unicode mapping: " (CL:STRING-TRIM " " LINE) T)) WHILE (SETQ LINE (CL:READ-LINE STREAM NIL NIL)) WHEN (SETQ START (STRPOSL SEPBITTABLE LINE 1 T)) UNLESS (EQ (CHARCODE %#) (NTHCHARCODE LINE START)) COLLECT (BIND END WHILE [SETQ END (OR (STRPOSL SEPBITTABLE LINE START) (ADD1 (NCHARS LINE] COLLECT [CHARCODE.DECODE (SUBSTRING LINE START (SUB1 END) (CONSTANT (CONCAT] REPEATWHILE (AND (SETQ START (STRPOSL SEPBITTABLE LINE END T)) (NEQ (CHARCODE %#) (NTHCHARCODE LINE START]) (WRITE-UNICODE-MAPPING [LAMBDA (MAPPING INCLUDECHARSETS FILE EMPTYOK) (* ; "Edited 16-Aug-2020 16:56 by rmk:") (* ;; "Writes a symbol unicode mapping file. Mapping is a list of (XCCS-code Unicode) pairs, which may contain codes in multiple character sets.") (* ;; "If FILE is NIL, it defaults to a name XCCS- followed by the octal character sets in the mapping, in the unicode/XEROX directory.") (* ;; "The output lines are of the form x0XXXx0UUUU# Unicode-char") (* ;;  "If INCLUDECHARSETS=T then the mappings are split up into separate per-character set files.") (* ;; "Otherwise, all and only mappings included in thos charsets are included in a single output file--an implicit subset.") (IF (AND (EQ INCLUDECHARSETS T) (NULL FILE)) THEN (IF MAPPING THEN (FOR CSI F IN XCCS-SET-NAMES WHEN (SETQ F (WRITE-UNICODE-MAPPING MAPPING (CAR CSI) NIL T)) COLLECT F) ELSE (PRINTOUT T "THERE ARE NO MAPPINGS" T) NIL) ELSE (LET (IMAPPING CSETINFO RANGES) (CL:MULTIPLE-VALUE-SETQ (IMAPPING CSETINFO RANGES) (WRITE-UNICODE-INCLUDED MAPPING INCLUDECHARSETS)) (IF IMAPPING THEN (CL:WITH-OPEN-FILE (STREAM (WRITE-UNICODE-MAPPING-FILENAME FILE CSETINFO RANGES) :DIRECTION :OUTPUT :IF-EXISTS :NEW-VERSION :EXTERNAL-FORMAT :UTF8-RAW) (WRITE-UNICODE-MAPPING-HEADER STREAM CSETINFO RANGES) (SORT IMAPPING T) (FOR M CSET LEFTC FIRSTRIGHTC CSI IN IMAPPING DO (SETQ LEFTC (CAR M)) (SETQ FIRSTRIGHTC (CADR M)) (CL:UNLESS (EQ CSET (LRSH LEFTC 8)) (SETQ CSET (LRSH LEFTC 8)) (SETQ CSI (ASSOC CSET CSETINFO)) (PRINTOUT STREAM T "# " .P2 (CADR CSI) " " (CADDR CSI) T)) (PRINTOUT STREAM "0x" (HEXSTRING LEFTC 4) %# (FOR RIGHTC IN (CDR M) DO (PRINTOUT NIL " " "0x" (HEXSTRING RIGHTC 4))) " # " (SELECTC FIRSTRIGHTC (UNDEFINEDCODE (* ;; "FFFF") "UNDEFINED") (MISSINGCODE (* ;; "FFFE") "MISSING") (IF (ILESSP FIRSTRIGHTC 32) THEN (* ; "Control chars") [CONCAT "^" (CHARACTER (IPLUS FIRSTRIGHTC (CHARCODE @] ELSE (CHARACTER FIRSTRIGHTC))) T)) (FULLNAME STREAM)) ELSEIF (NOT EMPTYOK) THEN (PRINTOUT T "THERE ARE NO MAPPINGS") (CL:WHEN INCLUDECHARSETS (PRINTOUT T " FOR " .PPVTL (MKLIST INCLUDECHARSETS) T)) NIL]) (WRITE-UNICODE-INCLUDED [LAMBDA (MAPPING INCLUDECHARSETS) (* ; "Edited 4-Aug-2020 17:47 by rmk:") (* ;; "CSETINFO is a list of (num string name) for each included character set.") (LET (CHARSETS CSETINFO RANGES ICSETS IMAPPING) (* ;; "Normalize the INCLUDECHARSETS, then reduce MAPPING to the included mappings") [SETQ ICSETS (FOR C POS KNOWN INSIDE (OR INCLUDECHARSETS (FOR CSI IN XCCS-SET-NAMES COLLECT (CAR CSI))) JOIN [SETQ KNOWN (OR (SASSOC C XCCS-SET-NAMES) (FIND N IN XCCS-SET-NAMES SUCHTHAT (EQ C (CADR N))) (HELP "UNKNOWN CHARACTER SET" (OCTALSTRING C] (IF (SETQ POS (STRPOS "-" (CAR KNOWN))) THEN (FOR I FROM (CL:PARSE-INTEGER (SUBSTRING (CAR KNOWN) 1 (SUB1 POS)) :RADIX 8) TO (CL:PARSE-INTEGER (SUBSTRING (CAR KNOWN) (ADD1 POS)) :RADIX 8) COLLECT (LIST I (OCTALSTRING I) (CADR KNOWN))) ELSE (CONS (CONS (CL:PARSE-INTEGER (CAR KNOWN) :RADIX 8) KNOWN] (SETQ IMAPPING (FOR M CSI IN MAPPING WHEN (SETQ CSI (ASSOC (LRSH (CAR M) 8) ICSETS)) COLLECT (* ;; "The attested subset of INCLUDED") (CL:UNLESS (MEMB CSI CSETINFO) (PUSH CSETINFO CSI)) M)) (* ;; "Sort as numbers, not octal strings, then group into consecutive ranges") (SETQ CSETINFO (SORT CSETINFO T)) [SETQ RANGES (FOR CTAIL C START END ON (FOR CSI IN CSETINFO COLLECT (CAR CSI)) WHILE CTAIL COLLECT (SETQ START (CAR CTAIL)) (SETQ END START) (CONS START (WHILE [AND (CDR CTAIL) (EQ END (SUB1 (CADR CTAIL] COLLECT (SETQ CTAIL (CDR CTAIL)) (SETQ END (CAR CTAIL] (* ;; "Split out groups of less than 3. But if a range exhaustively covers a known subset (like JIS), replace by the name") [SETQ RANGES (FOR R STR KNOWN LAST IN RANGES JOIN (SETQ LAST (CAR (LAST R))) (IF (EQ (CAR R) LAST) THEN (CONS (OCTALSTRING (CAR R))) ELSEIF (SETQ KNOWN (SASSOC (SETQ STR (CONCAT (OCTALSTRING (CAR R)) "-" (OCTALSTRING LAST))) XCCS-SET-NAMES)) THEN (CONS (CADR KNOWN)) ELSEIF (CDDR R) THEN (CONS STR) ELSE (LIST (OCTALSTRING (CAR R)) (OCTALSTRING LAST] (CL:VALUES IMAPPING CSETINFO RANGES]) (WRITE-UNICODE-MAPPING-HEADER [LAMBDA (STREAM CSETINFO RANGES) (* ; "Edited 4-Aug-2020 17:38 by rmk:") (* ;; "Writes the standard per-file header information") (FOR LINE IN UNICODE-MAPPING-HEADER DO (PRINTOUT STREAM "#" 2) (SELECTQ LINE (XCCSCHARACTERSETS (PRINTOUT STREAM " XCCS charset") (IF (CDR CSETINFO) THEN (PRINTOUT STREAM "s:" -4) (FOR R IN RANGES DO (PRINTOUT STREAM R " ")) (TERPRI STREAM) ELSE (* ; "Singleton") (PRINTOUT STREAM ": " -4 (CADAR CSETINFO) " " (CADDAR CSETINFO))) (TERPRI STREAM)) (DATE (PRINTOUT STREAM " Date:" -13 (DATE (DATEFORMAT NO.TIME NO.LEADING.SPACES)) T)) (PRINTOUT STREAM LINE T))) (TERPRI STREAM]) (WRITE-UNICODE-MAPPING-FILENAME [LAMBDA (FILE CSETINFO RANGES) (* ; "Edited 4-Aug-2020 19:34 by rmk:") (PACKFILENAME 'BODY [OR FILE (CONCATLIST (CONS 'XCCS- (IF (CDR CSETINFO) THEN (FOR RTAIL R ON RANGES JOIN (SETQ R (CAR RTAIL)) (SETQ R (CL:IF (LISTP R) (LIST (CAR R) "-" (CDR R)) (CONS R))) (CL:IF (CDR RTAIL) (NCONC1 R ",")) R) ELSE (LIST (CADAR CSETINFO) "=" (CADDAR CSETINFO] 'DIRECTORY (CAR UNICODEDIRECTORIES) 'EXTENSION 'TXT]) ) (RPAQQ XCCS-SET-NAMES (("0" LATIN) ("41" SYMBOLS1) ("42" SYMBOLS2) ("43" EXTENDED-LATIN) ("44" HIRAGANA) ("45" KATAKANA) ("46" GREEK) ("47" CYRILLIC) ("50" FORMS) ("60-172" JIS) ("340" ARABIC) ("341" HEBREW) ("342" IPA) ("343" HANGUL) ("344" GEORGIAN-ARMENIAN) ("356" SYMBOLS3) ("357" SYMBOLS4) ("360" LIGATURES) ("361" ACCENTED-LATIN) ("365" MORE-ARABIC) ("375" GRAPHIC-VARIANTS))) (* ;; "Automate dumping of a documentation prefix") (DECLARE%: EVAL@COMPILE DONTCOPY (DECLARE%: EVAL@COMPILE (RPAQ MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16)) (RPAQ UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16)) (CONSTANTS (MISSINGCODE (CL:PARSE-INTEGER "FFFE" :RADIX 16)) (UNDEFINEDCODE (CL:PARSE-INTEGER "FFFF" :RADIX 16))) ) ) (RPAQQ UNICODE-MAPPING-HEADER ("" " Name: XCCS (XC-3-1-1-0) to Unicode" " Unicode version: 3.0" XCCSCHARACTERSETS " Table version: 0.1" " Table format: Format A" DATE " Author: Ron Kaplan " "" "This file contains mappings from the Xerox Character Code Standard (version" "XC1-3-3-0, 1987) into Unicode 3.0. standard codes. That is the version of" "XCCS corresponding to the fonts in the Medley system." "" "The format of this file conforms to the format of the other Unicode-supplied" "mapping files:" " Three white-space (tab or spaces) separated columns:" " Column 1 is the XCCS code (as hex 0xXXXX)" " Column 2 is the corresponding Unicode (as hex 0xXXXX)" " Column 3 (after #) is a comment column. For convenience, it contains the" " Unicode character itself (since the Unicode character names" " are not available)" "Unicode FFFF is used for undefined XCCS codes (Column 3 = UNDEFINED" "Unicode FFFE is used for XCCS codes that have not yet been filled in." "(Column 3 = MISSING)" "" "This file is encoded in UTF8, so that the Unicode characters" "are properly displayed in Column 3 and can be edited by standard" "Unicode-enabled editors (e.g. Mac Textedit)." "" "This file can also be read by the function" "READ-UNICODE-MAPPING in the UNICODE Medley library package." "" "The entries are in XCCS order and grouped by character sets. In front of" "the mappings, for convenience, there is a line with the octal XCCS" "character set, after #." "" "Note that a given XCCS code might map to codes in several different Unicode" "positions, since there are repetitions in the Unicode standard." "" "For more details, see the associated README.TXT file." "" "Any comments or problems, contact ")) (RPAQ? UNICODEDIRECTORIES NIL) (* ;; "Set up translation tables for UTF8 and UTFBE external formats") (DEFINEQ (MAKE-UNICODE-TRANSLATION-TABLES [LAMBDA (MAPPING LTORVAR RTOLVAR) (* ; "Edited 21-Aug-2021 13:12 by rmk:") (* ; "Edited 17-Aug-2020 08:46 by rmk:") (* ;; "MAPPING is the list of numeric code correspondence pairs constructed by applying READ-UNICODE-MAPPING to a Unicode mapping file.") (* ;; "This produces two recoding arrays, one maps left-side codes into right-side codes (e.g. XCCS or ISO8859-1 to Unicode), for printing, the other maps right-side (Unicode) codes to corresponding right-side codes (e.g. XCCS).") (* ;; "") (* ;; "We assume that the left-to-right mapping into Unicode is functional, so that each left code maps to a unique right (Unicode) code, because Unicode is presumably the most refined coding scheme. But several Unicode codes may map to the same left code, for logically different codes that happen to have the same glyphs. In that case the heuristic is to map each %"from%" code to the lowest of the possible %"to%" codes. This means that round-trip reading/writing or writing/reading from one or both starting points may not always be lossless.") (* ;; " ") (* ;; " Each recoding array has 256 elements, one for each possible high-order byte of a character code. An array entry is either NIL, a 256-array of codes indexed by low-order bytes, or an alist of (lower-order-bytes . codes). The latter is used to save space for sparsely populated character sets.") (* ;; "") (* ;; "The element 256 of each array contains a hash table for characters that might be encountered in XCCS memory or Unicode files for which there is no mapping. Element 257 contains the corresponding inverse unmapped hash-array, so that UNICODE.TRANSLATE can update them consistently.") (* ;; "") (* ;; "UNICODE.TRANSLATE assigns an unmapped Unicode character to a %"not used%" XCCS code position (from 5,0 to 40,FF, leaving other low not-used sets for other internal uses (TEDIT?).") (* ;; "") (* ;;  "An unmapped XCCS character is assigned a code in the %"private use%" code blocks 0xE000-F8FF") (* ;; "") (* ;; "For the convenience of not having to deal with the multiple values, if LTORVAR or RTOLVAR are given, they are set to the constructed arrays before return.") (* ;; "") (LET ((LTORARRAY (CL:MAKE-ARRAY (IPLUS 2 N-TRANSLATION-SEGMENTS) :INITIAL-ELEMENT NIL)) (RTOLARRAY (CL:MAKE-ARRAY (IPLUS 2 N-TRANSLATION-SEGMENTS) :INITIAL-ELEMENT NIL))) (* ;; "The left-to-right direction (into Unicode). We start by distributing the mappings into alists in arrays indexed by the higher-order (charaset set byte). The second loop converts long alists into arrays.") [FOR M LEFTC RBASE RCODES IN MAPPING EACHTIME (SETQ RCODES (CDR M)) (SETQ RBASE (CAR RCODES)) UNLESS (IGEQ RBASE MISSINGCODE) DO (SETQ LEFTC (CAR M)) (* ;;  "(CDR RCODES) contains combiners on the base") (CL:PUSH (CONS (LOGAND LEFTC TRANSLATION-MASK) (CL:IF (CDR RCODES) RCODES RBASE)) (CL:SVREF LTORARRAY (LRSH LEFTC TRANSLATION-SHIFT ] (FOR I CSA FROM 0 TO (SUB1 N-TRANSLATION-SEGMENTS) WHEN (IGREATERP (LENGTH (CL:SVREF LTORARRAY I)) MAX-ALIST-LENGTH) DO (* ;; "Leave it alone if the alist is short") (SETQ CSA (CL:MAKE-ARRAY TRANSLATION-SEGMENT-SIZE :INITIAL-ELEMENT NIL)) (FOR P IN (CL:SVREF LTORARRAY I) DO (CL:SETF (CL:SVREF CSA (LOGAND (CAR P) TRANSLATION-MASK)) (CDR P))) (CL:SETF (CL:SVREF LTORARRAY I) CSA)) (* ;; "") (* ;; "Now the right-to-left direction (from Unicode). Here we have to detect and compensate for ambiguity.") (FOR M LEFTC RBASE RCOMBINERS PREV IN MAPPING EACHTIME (SETQ RBASE (CADR M)) (SETQ RCOMBINERS (CDDR M)) UNLESS (OR (IGEQ RBASE MISSINGCODE) RCOMBINERS) DO (* ;;  "Have we already seen an explicit mapping from right to left?") (SETQ LEFTC (CAR M)) [SETQ PREV (ASSOC (LOGAND RBASE TRANSLATION-MASK) (CL:SVREF RTOLARRAY (LRSH RBASE TRANSLATION-SHIFT ] (IF (NULL PREV) THEN (CL:PUSH (CONS (LOGAND RBASE TRANSLATION-MASK) LEFTC) (CL:SVREF RTOLARRAY (LRSH RBASE TRANSLATION-SHIFT ))) ELSEIF (IGREATERP (CDR PREV) LEFTC) THEN (RPLACD PREV LEFTC))) (FOR I CSA FROM 0 TO (SUB1 N-TRANSLATION-SEGMENTS) WHEN (IGREATERP (LENGTH (CL:SVREF RTOLARRAY I)) MAX-ALIST-LENGTH) DO (* ;; "Long list, make an array") (SETQ CSA (CL:MAKE-ARRAY TRANSLATION-SEGMENT-SIZE :INITIAL-ELEMENT NIL)) (FOR P IN (CL:SVREF RTOLARRAY I) DO (CL:SETF (CL:SVREF CSA (LOGAND (CAR P) TRANSLATION-MASK)) (CDR P))) (CL:SETF (CL:SVREF RTOLARRAY I) CSA)) (* ;; "") (* ;; "Allocate the hash arrays for future out-of-map codes. We we have to keep track of the next available and last possible codes, as well as the first available, for error checking.") (CL:SETF (CL:SVREF LTORARRAY N-TRANSLATION-SEGMENTS) (LIST (HASHARRAY 10) (CHARCODE.DECODE "5,0") (CHARCODE.DECODE "40,0") (CHARCODE.DECODE "5,0"))) (CL:SETF (CL:SVREF RTOLARRAY N-TRANSLATION-SEGMENTS) (LIST (HASHARRAY 10) (CHARCODE.DECODE "U+E000") (CHARCODE.DECODE "U+F8FF") (CHARCODE.DECODE "U+E000"))) (* ;; "Now put in the inverse unmapped hash arrays") (CL:SETF (CL:SVREF LTORARRAY (ADD1 N-TRANSLATION-SEGMENTS)) (CL:SVREF RTOLARRAY N-TRANSLATION-SEGMENTS)) (CL:SETF (CL:SVREF RTOLARRAY (ADD1 N-TRANSLATION-SEGMENTS)) (CL:SVREF LTORARRAY N-TRANSLATION-SEGMENTS)) (* ;; "") (CL:WHEN LTORVAR (SETATOMVAL LTORVAR LTORARRAY)) (CL:WHEN RTOLVAR (SETATOMVAL RTOLVAR RTOLARRAY)) (LIST LTORARRAY RTOLARRAY]) ) (RPAQ? DEFAULT-XCCS-CHARSETS '(LATIN SYMBOLS1 SYMBOLS2 EXTENDED-LATIN FORMS SYMBOLS3 SYMBOLS4 ACCENTED-LATIN GREEK)) (DECLARE%: DONTEVAL@LOAD DOCOPY (MAKE-UNICODE-TRANSLATION-TABLES (READ-UNICODE-MAPPING DEFAULT-XCCS-CHARSETS T) '*XCCSTOUNICODE* '*UNICODETOXCCS*) ) (DECLARE%: DOEVAL@COMPILE DONTCOPY (GLOBALVARS *XCCSTOUNICODE* *UNICODETOXCCS*) ) (DEFINEQ (HEXSTRING [LAMBDA (N WIDTH) (* ; "Edited 23-Jul-2020 08:28 by rmk:") (* ; "Edited 20-Dec-93 17:51 by rmk:") (* ;;  "Converts positive numbers to Hex strings, padding on the right with 0 up to WIDTH if given.") (CL:UNLESS (FIXP N) (SETQ N (CHARCODE.DECODE N))) (LET [CHAR (STR (ALLOCSTRING [IMAX (OR WIDTH 0) (FOR I (LEFT _ N) FROM 0 UNTIL (EQ LEFT 0) DO (SETQ LEFT (LRSH LEFT 4)) FINALLY (RETURN (MAX I 1] (CHARCODE 0] (FOR I FROM -1 BY -1 UNTIL (EQ N 0) DO (SETQ CHAR (LOGAND N 15)) [RPLCHARCODE STR I (IF (ILESSP CHAR 10) THEN (+ CHAR (CHARCODE 0)) ELSE (+ (- CHAR 10) (CHARCODE A] (SETQ N (LRSH N 4))) STR]) (UTF8HEXSTRING [LAMBDA (CHARCODE) (* ; "Edited 10-Aug-2020 08:33 by rmk:") (* ;; "Utility to produces the UTF8 hexstring representing CODE") (HEXSTRING (IF (ILESSP CHARCODE 128) THEN CHARCODE ELSEIF (ILESSP CHARCODE 2048) THEN (* ; "x800") (LOGOR (LLSH (LOGOR (LLSH 3 6) (LRSH CHARCODE 6)) 8) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 0 6))) ELSEIF (ILESSP CHARCODE 65536) THEN (* ; "x10000") (LOGOR (LLSH (LOGOR (LLSH 7 5) (LRSH CHARCODE 12)) 16) (LLSH (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 6 6)) 8) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 0 6))) ELSEIF (ILESSP CHARCODE 2097152) THEN (* ; "x200000") (LOGOR (LLSH (LOGOR (LLSH 15 4) (LRSH CHARCODE 18)) 24) (LLSH (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 12 6)) 16) (LLSH (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 6 6)) 8) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 0 6))) ELSE (ERROR "CHARCODE too big for UTF8" CHARCODE]) (NUTF8CODEBYTES [LAMBDA (N) (* ; "Edited 28-Jun-2022 00:02 by rmk") (* ; "Edited 10-Aug-2020 12:35 by rmk:") (* ;; "Returns the number of bytes needed to encode N in UTF8, ") (IF (ILESSP N 128) THEN 1 ELSEIF (ILESSP N 2048) THEN (* ; "x800") 2 ELSEIF (ILESSP N 65536) THEN (* ; "x10000") 3 ELSEIF (ILESSP N 2097152) THEN (* ; "x200000") 4 ELSE (SHOULDNT]) (NUTF8STRINGBYTES [LAMBDA (STRING RAWFLG) (* ; "Edited 10-Aug-2020 09:06 by rmk:") (* ;; "Returns the number of bytes it would take to represent STRING in UTF8, assuming it is an XCCS string unless RAWFLG. ") (FOR I C FROM 1 WHILE (SETQ C (NTHCHARCODE STRING I)) SUM (NUTF8CODEBYTES (CL:IF RAWFLG C (XTOUCODE C))]) (XTOUSTRING [LAMBDA (XCCSSTRING RAWFLG) (* ; "Edited 10-Aug-2020 21:42 by rmk:") (* ;; "Produces a string that contains the UTF8 bytes that represent the characters in XCCSSTRING. Applies the XCCSTOUNICODE translation unless RAWFLG. ") (* ;; "The resulting string will not be readable inside Medley.") (LET [(USTR (ALLOCSTRING (NUTF8STRINGBYTES XCCSSTRING RAWFLG] (FOR I CHARCODE (SINDEX _ 0) FROM 1 WHILE (SETQ CHARCODE (NTHCHARCODE XCCSSTRING I)) DO (CL:UNLESS RAWFLG (SETQ CHARCODE (XTOUCODE CHARCODE))) (IF (ILESSP CHARCODE 128) THEN (RPLCHARCODE USTR (ADD SINDEX 1) CHARCODE) ELSEIF (ILESSP CHARCODE 2048) THEN (* ; "x800") (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 3 6) (LRSH CHARCODE 6))) (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 0 6))) ELSEIF (ILESSP CHARCODE 65536) THEN (* ; "x10000") (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 7 5) (LRSH CHARCODE 12))) (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 6 6))) (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 0 6))) ELSEIF (ILESSP CHARCODE 2097152) THEN (* ; "x200000") (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 15 4) (LRSH CHARCODE 18))) (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 12 6))) (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 6 6))) (RPLCHARCODE USTR (ADD SINDEX 1) (LOGOR (LLSH 2 6) (LOADBYTE CHARCODE 0 6))) ELSE (SHOULDNT))) USTR]) (XCCSSTRING [LAMBDA (CODE) (* ; "Edited 13-Aug-2020 12:16 by rmk:") (* ;; "Returns XCCS character representation of string %"cset,char%"") (CL:UNLESS (FIXP CODE) (SETQ CODE (CHCON1 CODE))) (CONCAT (OCTALSTRING (LRSH CODE 8)) "," (OCTALSTRING (LOGAND CODE 255]) ) (DEFINEQ (SHOWCHARS [LAMBDA (FROMCHAR TOCHAR FONT) (* ; "Edited 1-Aug-2020 09:27 by rmk:") (RESETFORM (DSPFONT (OR FONT '(CLASSIC 12)) T) (CL:WHEN (AND (SMALLP FROMCHAR) (NOT TOCHAR)) (* ;;  "If a small number, assume it's an octal (in decimal) character set, no need for string quotes") (SETQ TOCHAR (CONCAT FROMCHAR "," 376)) (SETQ FROMCHAR (CONCAT FROMCHAR "," 41))) (CL:UNLESS (SMALLP FROMCHAR) (SETQ FROMCHAR (CHARCODE.DECODE FROMCHAR))) (CL:UNLESS (SMALLP TOCHAR) (SETQ TOCHAR (CL:IF TOCHAR (CHARCODE.DECODE TOCHAR) FROMCHAR))) (FOR C FROM FROMCHAR TO TOCHAR UNLESS (AND (IGEQ (LOGAND C 255) 127) (ILEQ (LOGAND C 255) (PLUS 128 33))) DO (PRINTOUT T .P2 (CONCAT (OCTALSTRING (LRSH CODE 8)) "," (OCTALSTRING (LOGAND CODE 255))) 10 (CHARACTER C) T]) ) (DECLARE%: EVAL@COMPILE DONTCOPY (FILESLOAD (FROM LOADUPS) EXPORTS.ALL) (DECLARE%: EVAL@COMPILE (RPAQQ TRANSLATION-SEGMENT-SIZE 128) (RPAQQ MAX-ALIST-LENGTH 10) (RPAQ N-TRANSLATION-SEGMENTS (IQUOTIENT 65536 TRANSLATION-SEGMENT-SIZE)) (RPAQ TRANSLATION-SHIFT (INTEGERLENGTH (SUB1 TRANSLATION-SEGMENT-SIZE))) (RPAQ TRANSLATION-MASK (SUB1 TRANSLATION-SEGMENT-SIZE)) (CONSTANTS (TRANSLATION-SEGMENT-SIZE 128) (MAX-ALIST-LENGTH 10) (N-TRANSLATION-SEGMENTS (IQUOTIENT 65536 TRANSLATION-SEGMENT-SIZE)) (TRANSLATION-SHIFT (INTEGERLENGTH (SUB1 TRANSLATION-SEGMENT-SIZE))) (TRANSLATION-MASK (SUB1 TRANSLATION-SEGMENT-SIZE))) ) ) (DECLARE%: DONTCOPY (FILEMAP (NIL (3717 17808 (UTF8.OUTCHARFN 3727 . 6558) (UTF8.INCCODEFN 6560 . 12050) (UTF8.PEEKCCODEFN 12052 . 16826) (\UTF8.BACKCCODEFN 16828 . 17806)) (17809 21590 (UTF16BE.OUTCHARFN 17819 . 18643) ( UTF16BE.INCCODEFN 18645 . 19544) (UTF16BE.PEEKCCODEFN 19546 . 20617) (\UTF16BE.BACKCCODEFN 20619 . 21588)) (21620 23681 (MAKE-UNICODE-FORMATS 21630 . 23679)) (23778 25084 (UNICODE.UNMAPPED 23788 . 25082)) (25085 25621 (XCCS-UTF8-AFTER-OPEN 25095 . 25619)) (26454 26803 (XTOUCODE 26464 . 26632) ( UTOXCODE 26634 . 26801)) (26843 42965 (READ-UNICODE-MAPPING-FILENAMES 26853 . 27954) ( READ-UNICODE-MAPPING 27956 . 31254) (WRITE-UNICODE-MAPPING 31256 . 35473) (WRITE-UNICODE-INCLUDED 35475 . 40197) (WRITE-UNICODE-MAPPING-HEADER 40199 . 41431) (WRITE-UNICODE-MAPPING-FILENAME 41433 . 42963)) (46178 54657 (MAKE-UNICODE-TRANSLATION-TABLES 46188 . 54655)) (55074 63100 (HEXSTRING 55084 . 56245) (UTF8HEXSTRING 56247 . 58452) (NUTF8CODEBYTES 58454 . 59239) (NUTF8STRINGBYTES 59241 . 59722) ( XTOUSTRING 59724 . 62735) (XCCSSTRING 62737 . 63098)) (63101 64570 (SHOWCHARS 63111 . 64568))))) STOP