From 834113aab5c1e5647690f93d9ae25f724095dd40 Mon Sep 17 00:00:00 2001 From: Jungshik Shin <jshin@chromium.org> Date: Fri, 2 Nov 2018 01:24:07 -0700 Subject: [PATCH] Make ISO-2022-JP converter compliant to the WHATWG encoding spec The encoding of 373 code points into ISO-2022-JP failed because they're mapped to SJIS code points for which there's no JIS X 0208 mapping. Using the JIS X 0208 table in EUC-JP solves this issue. Upstream bug: https://unicode-org.atlassian.net/browse/ICU-20251 Bug: 901255 Test: https://www.w3.org/International/tests/repo/run?base=encoding&batch=encoding-dbl-byte&test=legacy-mb-japanese/iso-2022-jp/iso2022jp-encode-form.html Change-Id: I3237fa1269a3ae2e89b22e246aaf2ae43fd56c7d Reviewed-on: https://chromium-review.googlesource.com/c/1314052 Reviewed-by: Joshua Bell <jsbell@chromium.org> --- README.chromium | 6 +- patches/iso2022jp.patch | 134 +++++++++++++++++++++++++++++++++++++ source/common/ucnv2022.cpp | 101 ++++------------------------ 3 files changed, 153 insertions(+), 88 deletions(-) create mode 100644 patches/iso2022jp.patch diff --git a/README.chromium b/README.chromium index 143d973b6..71b7f0e80 100644 --- a/README.chromium +++ b/README.chromium @@ -251,5 +251,9 @@ D. Local Modifications https://unicode-org.atlassian.net/browse/ICU-20246 - Fix: https://github.com/unicode-org/icu/pull/253 - +9. ISO-2022-JP encoding (fromUnicode) change per WHATWG encoding spec. 
+ + - patches/iso2022jp.patch + - upstream bug: + https://unicode-org.atlassian.net/browse/ICU-20251 diff --git a/patches/iso2022jp.patch b/patches/iso2022jp.patch new file mode 100644 index 000000000..b36bcf48f --- /dev/null +++ b/patches/iso2022jp.patch @@ -0,0 +1,134 @@ +diff --git a/source/common/ucnv2022.cpp b/source/common/ucnv2022.cpp +index 2ef5db97..6ac8f62f 100644 +--- a/source/common/ucnv2022.cpp ++++ b/source/common/ucnv2022.cpp +@@ -513,7 +513,7 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ + ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); + } + myConverterData->myConverterArray[JISX208] = +- ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); ++ ucnv_loadSharedData("EUC-JP", &stackPieces, &stackArgs, errorCode); + if(jpCharsetMasks[version]&CSM(JISX212)) { + myConverterData->myConverterArray[JISX212] = + ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); +@@ -1514,79 +1514,6 @@ jisx201FromU(uint32_t value) { + return 0xfffe; + } + +-/* +- * Take a valid Shift-JIS byte pair, check that it is in the range corresponding +- * to JIS X 0208, and convert it to a pair of 21..7E bytes. +- * Return 0 if the byte pair is out of range. +- */ +-static inline uint32_t +-_2022FromSJIS(uint32_t value) { +- uint8_t trail; +- +- if(value > 0xEFFC) { +- return 0; /* beyond JIS X 0208 */ +- } +- +- trail = (uint8_t)value; +- +- value &= 0xff00; /* lead byte */ +- if(value <= 0x9f00) { +- value -= 0x7000; +- } else /* 0xe000 <= value <= 0xef00 */ { +- value -= 0xb000; +- } +- value <<= 1; +- +- if(trail <= 0x9e) { +- value -= 0x100; +- if(trail <= 0x7e) { +- value |= trail - 0x1f; +- } else { +- value |= trail - 0x20; +- } +- } else /* trail <= 0xfc */ { +- value |= trail - 0x7e; +- } +- return value; +-} +- +-/* +- * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. 
+- * If either byte is outside 21..7E make sure that the result is not valid +- * for Shift-JIS so that the converter catches it. +- * Some invalid byte values already turn into equally invalid Shift-JIS +- * byte values and need not be tested explicitly. +- */ +-static inline void +-_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { +- if(c1&1) { +- ++c1; +- if(c2 <= 0x5f) { +- c2 += 0x1f; +- } else if(c2 <= 0x7e) { +- c2 += 0x20; +- } else { +- c2 = 0; /* invalid */ +- } +- } else { +- if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { +- c2 += 0x7e; +- } else { +- c2 = 0; /* invalid */ +- } +- } +- c1 >>= 1; +- if(c1 <= 0x2f) { +- c1 += 0x70; +- } else if(c1 <= 0x3f) { +- c1 += 0xb0; +- } else { +- c1 = 0; /* invalid */ +- } +- bytes[0] = (char)c1; +- bytes[1] = (char)c2; +-} +- + /* + * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) + * Katakana. +@@ -1857,8 +1784,13 @@ getTrail: + converterData->myConverterArray[cs0], + sourceChar, &value, + useFallback, MBCS_OUTPUT_2); +- if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ +- value = _2022FromSJIS(value); ++ // Only accept DBCS char (abs(len2) == 2). ++ // With EUC-JP table for JIS X 208, half-width Kana ++ // represented with DBCS starting with 0x8E has to be ++ // filtered out so that they can be converted with ++ // hwkana_fb table. ++ if((len2 == 2 && ((value & 0xFF00) != 0x8E00)) || (len2 == -2 && len == 0)) { ++ value &= 0x7F7F; + if(value != 0) { + targetValue = value; + len = len2; +@@ -2250,18 +2182,13 @@ getTrailByte: + if (leadIsOk && trailIsOk) { + ++mySource; + tmpSourceChar = (mySourceChar << 8) | trailByte; +- if(cs == JISX208) { +- _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); +- mySourceChar = tmpSourceChar; +- } else { +- /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. 
*/ +- mySourceChar = tmpSourceChar; +- if (cs == KSC5601) { +- tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ +- } +- tempBuf[0] = (char)(tmpSourceChar >> 8); +- tempBuf[1] = (char)(tmpSourceChar); ++ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ ++ mySourceChar = tmpSourceChar; ++ if (cs == JISX208 || cs == KSC5601) { ++ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ + } ++ tempBuf[0] = (char)(tmpSourceChar >> 8); ++ tempBuf[1] = (char)(tmpSourceChar); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { + /* report a pair of illegal bytes if the second byte is not a DBCS starter */ diff --git a/source/common/ucnv2022.cpp b/source/common/ucnv2022.cpp index d1a50d269..6a5dbdbd6 100644 --- a/source/common/ucnv2022.cpp +++ b/source/common/ucnv2022.cpp @@ -513,7 +513,7 @@ _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); } myConverterData->myConverterArray[JISX208] = - ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); + ucnv_loadSharedData("EUC-JP", &stackPieces, &stackArgs, errorCode); if(jpCharsetMasks[version]&CSM(JISX212)) { myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); @@ -1514,79 +1514,6 @@ jisx201FromU(uint32_t value) { return 0xfffe; } -/* - * Take a valid Shift-JIS byte pair, check that it is in the range corresponding - * to JIS X 0208, and convert it to a pair of 21..7E bytes. - * Return 0 if the byte pair is out of range. 
- */ -static inline uint32_t -_2022FromSJIS(uint32_t value) { - uint8_t trail; - - if(value > 0xEFFC) { - return 0; /* beyond JIS X 0208 */ - } - - trail = (uint8_t)value; - - value &= 0xff00; /* lead byte */ - if(value <= 0x9f00) { - value -= 0x7000; - } else /* 0xe000 <= value <= 0xef00 */ { - value -= 0xb000; - } - value <<= 1; - - if(trail <= 0x9e) { - value -= 0x100; - if(trail <= 0x7e) { - value |= trail - 0x1f; - } else { - value |= trail - 0x20; - } - } else /* trail <= 0xfc */ { - value |= trail - 0x7e; - } - return value; -} - -/* - * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. - * If either byte is outside 21..7E make sure that the result is not valid - * for Shift-JIS so that the converter catches it. - * Some invalid byte values already turn into equally invalid Shift-JIS - * byte values and need not be tested explicitly. - */ -static inline void -_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { - if(c1&1) { - ++c1; - if(c2 <= 0x5f) { - c2 += 0x1f; - } else if(c2 <= 0x7e) { - c2 += 0x20; - } else { - c2 = 0; /* invalid */ - } - } else { - if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { - c2 += 0x7e; - } else { - c2 = 0; /* invalid */ - } - } - c1 >>= 1; - if(c1 <= 0x2f) { - c1 += 0x70; - } else if(c1 <= 0x3f) { - c1 += 0xb0; - } else { - c1 = 0; /* invalid */ - } - bytes[0] = (char)c1; - bytes[1] = (char)c2; -} - /* * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) * Katakana. @@ -1857,8 +1784,13 @@ getTrail: converterData->myConverterArray[cs0], sourceChar, &value, useFallback, MBCS_OUTPUT_2); - if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ - value = _2022FromSJIS(value); + // Only accept DBCS char (abs(len2) == 2). + // With EUC-JP table for JIS X 208, half-width Kana + // represented with DBCS starting with 0x8E has to be + // filtered out so that they can be converted with + // hwkana_fb table. 
+ if((len2 == 2 && ((value & 0xFF00) != 0x8E00)) || (len2 == -2 && len == 0)) { + value &= 0x7F7F; if(value != 0) { targetValue = value; len = len2; @@ -2250,18 +2182,13 @@ getTrailByte: if (leadIsOk && trailIsOk) { ++mySource; tmpSourceChar = (mySourceChar << 8) | trailByte; - if(cs == JISX208) { - _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); - mySourceChar = tmpSourceChar; - } else { - /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ - mySourceChar = tmpSourceChar; - if (cs == KSC5601) { - tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ - } - tempBuf[0] = (char)(tmpSourceChar >> 8); - tempBuf[1] = (char)(tmpSourceChar); + /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ + mySourceChar = tmpSourceChar; + if (cs == JISX208 || cs == KSC5601) { + tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ } + tempBuf[0] = (char)(tmpSourceChar >> 8); + tempBuf[1] = (char)(tmpSourceChar); targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { /* report a pair of illegal bytes if the second byte is not a DBCS starter */ -- GitLab