diff --git a/README.chromium b/README.chromium index 7d84c564924d4e0f3ca8f4985d10db6608e610be..dce2db9b1ede4cbf5c3022c4389a570cf5a92c08 100644 --- a/README.chromium +++ b/README.chromium @@ -279,3 +279,42 @@ D. Local Modifications https://unicode-org.atlassian.net/browse/ICU-20654 - upstream PR: https://github.com/unicode-org/icu/pull/674 + +12. Add usePoolBundle option to filters.json + - patches/usePool.patch + - upstream bug: + https://unicode-org.atlassian.net/browse/ICU-20660 + - upstream PR: + https://github.com/unicode-org/icu/pull/682 + +13. Allow ICUPKG to generate dat with missing dependencies. + - patches/icupkg.patch + - upstream bug: + https://unicode-org.atlassian.net/browse/ICU-20774 + +14. Add LocaleMatcher + - patches/trie.patch + - upstream PR: + https://github.com/unicode-org/icu/pull/747 + + - patches/tracing.patch + - upstream PR: + https://github.com/unicode-org/icu/pull/617 + + - patches/localematcher.patch + - upstream PR: + https://github.com/unicode-org/icu/pull/714 + +15. Prevent leak from adoptCalendar + - patches/calendarToAdopt.patch + - upstream bug: + https://unicode-org.atlassian.net/browse/ICU-20799 + - upstream PR: + https://github.com/unicode-org/icu/pull/790 + +16. Fix minimumGroupingDigits in Hungarian locale (hu) from 4 to 1. 
+ - patches/hu_minimumGroupingDigits.patch + - upstream bug: + https://unicode-org.atlassian.net/browse/CLDR-13256 + - upstream PR: + https://github.com/unicode-org/cldr/pull/142 diff --git a/android/icudtl.dat b/android/icudtl.dat index 05f9e0939710837d94d4185667871d070f748a1b..e704c4269975e691921df7b00e30ef6b629fdf2a 100644 Binary files a/android/icudtl.dat and b/android/icudtl.dat differ diff --git a/android_small/icudtl.dat b/android_small/icudtl.dat index ab364f1e09af4b1c6fc6fc1f9198c89a6080aba3..27c6bce7b71e8e4017d696a7de3c88ad0ddaffbf 100644 Binary files a/android_small/icudtl.dat and b/android_small/icudtl.dat differ diff --git a/android_small/icudtl_extra.dat b/android_small/icudtl_extra.dat new file mode 100644 index 0000000000000000000000000000000000000000..fd3f11db24d67085cdcf4a71ac0f451adb8c0128 Binary files /dev/null and b/android_small/icudtl_extra.dat differ diff --git a/cast/icudtl.dat b/cast/icudtl.dat index fcf5fd15715810b43cbbcb4209b7e511436a1d17..743feed2568f67aad2fcfc1864e8c42f1314f9a5 100644 Binary files a/cast/icudtl.dat and b/cast/icudtl.dat differ diff --git a/chromeos/icudtl.dat b/chromeos/icudtl.dat index 67f690db1028f151f6715550c9d79e3bb57bb53b..695bd26a2cd38f956ebfd3f2e3a7e8680be49363 100644 Binary files a/chromeos/icudtl.dat and b/chromeos/icudtl.dat differ diff --git a/common/icudtb.dat b/common/icudtb.dat index 032d473f981bb47084e83994e1787751be925346..245687014b605b5db44d5d714e879733c940c52d 100644 Binary files a/common/icudtb.dat and b/common/icudtb.dat differ diff --git a/common/icudtl.dat b/common/icudtl.dat index aa3f6a71da75d15e8693a53e7578e282ee12f14a..47cb0eb973af004795686fcc7fe2519543e7255a 100644 Binary files a/common/icudtl.dat and b/common/icudtl.dat differ diff --git a/filters/android-extra-removed-resources.txt b/filters/android-extra-removed-resources.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e3c54368af669db968256e630287d163062d4b5 --- /dev/null +++ 
b/filters/android-extra-removed-resources.txt @@ -0,0 +1,7 @@ +res_index.res +coll/res_index.res +curr/res_index.res +lang/res_index.res +region/res_index.res +unit/res_index.res +zone/res_index.res diff --git a/filters/android_extra.json b/filters/android_extra.json new file mode 100644 index 0000000000000000000000000000000000000000..8ceb399f9d205fde4e1cb92c6ce2ab80ec882d77 --- /dev/null +++ b/filters/android_extra.json @@ -0,0 +1,857 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +{ + "collationUCAData": "implicithan", + "usePoolBundle": false, + "localeFilter": { + "whitelist": [ + "bn", "et", "gu", "kn", "ml", "mr", "ms", "ta", "te", "af", + "az", "eu", "gl", "hy", "is", "ka", "kk", "km", "ky", "lo", + "mk", "mn", "my", "ne", "pa", "si", "sq", "ur", "uz", "zu" + ] + }, + "featureFilters": { + "brkitr_dictionaries": { "filterType": "exclude" }, + "brkitr_rules": { "filterType": "exclude" }, + "brkitr_tree": { "filterType": "exclude" }, + "cnvalias": { "filterType": "exclude" }, + "coll_index": { "filterType": "exclude" }, + "coll_ucadata": { "filterType": "exclude" }, + "confusables": { "filterType": "exclude" }, + "conversion_mappings": { "filterType": "exclude" }, + "curr_supplemental": { "filterType": "exclude" }, + "curr_index": { "filterType": "exclude" }, + "locales_index": { "filterType": "exclude" }, + "normalization": { "filterType": "exclude" }, + "rbnf_tree": { "filterType": "exclude" }, + "rbnf_index": { "filterType": "exclude" }, + "region_tree": { + "whitelist": [ + "en" + ] + }, + "region_index": { "filterType": "exclude" }, + "misc": { "filterType": "exclude" }, + "stringprep": { "filterType": "exclude" }, + "translit": { + "whitelist": [ +// Need to explicitly add "root" + "root" + ] + }, + "ulayout": { "filterType": "exclude" }, + "unames": { "filterType": "exclude" }, + "zone_index": { "filterType": "exclude" } + }, + 
"resourceFilters": [ +// Based on filter_locale_data +// # Remove entries currently not used in Chromium/V8. + { + "categories": [ + "locales_tree" + ], + "rules": [ + "-/*/mon", + "-/*/mon-short", + "-/*/mon-narrow", + "-/*/tue", + "-/*/tue-short", + "-/*/tue-narrow", + "-/*/wed", + "-/*/wed-short", + "-/*/wed-narrow", + "-/*/thu", + "-/*/thu-short", + "-/*/thu-narrow", + "-/*/fri", + "-/*/fri-short", + "-/*/fri-narrow", + "-/*/sat", + "-/*/sat-short", + "-/*/sat-narrow", + "-/*/sun", + "-/*/sun-short", + "-/*/sun-narrow", + "-/*/mon-short:alias", + "-/*/mon-narrow:alias", + "-/*/tue-short:alias", + "-/*/tue-narrow:alias", + "-/*/wed-short:alias", + "-/*/wed-narrow:alias", + "-/*/thu-short:alias", + "-/*/thu-narrow:alias", + "-/*/fri-short:alias", + "-/*/fri-narrow:alias", + "-/*/sat-short:alias", + "-/*/sat-narrow:alias", + "-/*/sun-short:alias", + "-/*/sun-narrow:alias", + "-/characterLabel", + "-/AuxExemplarCharacters", + "-/ExemplarCharacters", + "-/ExemplarCharactersNumbers", + "-/ExemplarCharactersPunctuation", + "-/measurementSystemNames", + "-/NumberElements/minimalPairs", + "-/delimiters" + ] + }, +// Based on filter_currency_data +// # Keep only the currencies used by the larget 150 economies in terms of GDP. +// # TODO(jshin): Use ucurr_isAvailable in ICU to drop more currencies. 
+// # See also http://en.wikipedia.org/wiki/List_of_circulating_currencies + { + "categories": [ + "curr_tree" + ], + "rules": [ + "-/Currencies", + "-/Currencies%narrow", + "-/CurrencyPlurals", + "+/*/AED", +// Based on +// https://cs.chromium.org/chromium/src/third_party/icu/android/currencies.list?rcl=797b7c + "+/*/AED", + "+/*/AFN", + "+/*/ALL", + "+/*/AMD", + "+/*/ARS", + "+/*/AUD", + "+/*/AZN", + "+/*/BAM", + "+/*/BDT", + "+/*/BGN", + "+/*/BHD", + "+/*/BND", + "+/*/BRL", + "+/*/BYN", + "+/*/CAD", + "+/*/CHF", + "+/*/CLP", + "+/*/CNY", + "+/*/COP", + "+/*/CRC", + "+/*/CZK", + "+/*/DKK", + "+/*/DZD", + "+/*/EGP", + "+/*/ETB", + "+/*/EUR", + "+/*/GBP", + "+/*/GEL", + "+/*/HKD", + "+/*/HRK", + "+/*/HUF", + "+/*/IDR", + "+/*/ILS", + "+/*/INR", + "+/*/IQD", + "+/*/IRR", + "+/*/JPY", + "+/*/KGS", + "+/*/KHR", + "+/*/KRW", + "+/*/KWD", + "+/*/KZT", + "+/*/LAK", + "+/*/LKR", + "+/*/MAD", + "+/*/MDL", + "+/*/MXN", + "+/*/MYR", + "+/*/NGN", + "+/*/NOK", + "+/*/NPR", + "+/*/NZD", + "+/*/PEN", + "+/*/PHP", + "+/*/PKR", + "+/*/PLN", + "+/*/QAR", + "+/*/RON", + "+/*/RSD", + "+/*/RUB", + "+/*/SAR", + "+/*/SEK", + "+/*/SGD", + "+/*/THB", + "+/*/TJS", + "+/*/TRY", + "+/*/TWD", + "+/*/UAH", + "+/*/USD", + "+/*/VES", + "+/*/VND", + "+/*/XDR", + "+/*/ZAR", + "-/Currencies%formal", + "-/Currencies%variant" + ] + }, +// Based on +// https://cs.chromium.org/chromium/src/third_party/icu/android/patch_locale.sh?rcl=797b7c + { + "categories": [ + "region_tree" + ], + "whitelist": [ +// Need to explicitly add "root" +// "root", +// # Excludes region data. On Android Java API is used to get the data. +// # Due to a bug in ICU, an empty region list always uses 70kB pool.res bundle. +// # As a workaround, include the minimal version of en.txt + "en" + ], + "rules": [ + "-/*", + "+/Countries/US" + ] + }, +// Based on https://cs.chromium.org/chromium/src/third_party/icu/patches/data.build.patch?rcl=797b7c +// +# Chromium : To save space, use '-R' option (exclude collation rules. 
just +// +# include the compiled binary data.) +// +GENRBOPTS=-k -R + { + "categories": [ + "coll_tree" + ], + "rules": [ + "-/UCARules", + "-/collations/*/Sequence" + ] + }, +// Based on filter_unit_data +// # Keep only duration and compound in units* sections. + { + "categories": [ + "unit_tree" + ], + "rules": [ + "-/units", + "-/unitsNarrow", + "-/unitsShort", + "+/*/*/acre", + "-/*/*/acre/dnam", + "+/*/*/bit", + "-/*/*/bit/dnam", + "+/*/*/byte", + "-/*/*/byte/dnam", + "+/*/*/celsius", + "-/*/*/celsius/dnam", + "+/*/*/centimeter", + "-/*/*/centimeter/dnam", + "+/*/*/day", + "-/*/*/day/dnam", + "+/*/*/degree", + "-/*/*/degree/dnam", + "+/*/*/fahrenheit", + "-/*/*/fahrenheit/dnam", + "+/*/*/fluid-ounce", + "-/*/*/fluid-ounce/dnam", + "+/*/*/foot", + "-/*/*/foot/dnam", + "+/*/*/gallon", + "-/*/*/gallon/dnam", + "+/*/*/gigabit", + "-/*/*/gigabit/dnam", + "+/*/*/gigabyte", + "-/*/*/gigabyte/dnam", + "+/*/*/gram", + "-/*/*/gram/dnam", + "+/*/*/hectare", + "-/*/*/hectare/dnam", + "+/*/*/hour", + "-/*/*/hour/dnam", + "+/*/*/inch", + "-/*/*/inch/dnam", + "+/*/*/kilobit", + "-/*/*/kilobit/dnam", + "+/*/*/kilobyte", + "-/*/*/kilobyte/dnam", + "+/*/*/kilogram", + "-/*/*/kilogram/dnam", + "+/*/*/kilometer", + "-/*/*/kilometer/dnam", + "+/*/*/liter", + "-/*/*/liter/dnam", + "+/*/*/megabit", + "-/*/*/megabit/dnam", + "+/*/*/megabyte", + "-/*/*/megabyte/dnam", + "+/*/*/meter", + "-/*/*/meter/dnam", + "+/*/*/mile", + "-/*/*/mile/dnam", + "+/*/*/mile-scandinavian", + "-/*/*/mile-scandinavian/dnam", + "+/*/*/millimeter", + "-/*/*/millimeter/dnam", + "+/*/*/milliliter", + "-/*/*/milliliter/dnam", + "+/*/*/millisecond", + "-/*/*/millisecond/dnam", + "+/*/*/minute", + "-/*/*/minute/dnam", + "+/*/*/month", + "-/*/*/month/dnam", + "+/*/*/ounce", + "-/*/*/ounce/dnam", + "+/*/*/percent", + "-/*/*/percent/dnam", + "+/*/*/petabyte", + "-/*/*/petabyte/dnam", + "+/*/*/pound", + "-/*/*/pound/dnam", + "+/*/*/second", + "-/*/*/second/dnam", + "+/*/*/stone", + "-/*/*/stone/dnam", + 
"+/*/*/terabit", + "-/*/*/terabit/dnam", + "+/*/*/terabyte", + "-/*/*/terabyte/dnam", + "+/*/*/week", + "-/*/*/week/dnam", + "+/*/*/yard", + "-/*/*/yard/dnam", + "+/*/*/year", + "-/*/*/year/dnam", + "+/*/*/kilometer-per-hour", + "-/*/*/kilometer-per-hour/dnam", + "+/*/*/meter-per-second", + "-/*/*/meter-per-second/dnam", + "+/*/*/mile-per-hour", + "-/*/*/mile-per-hour/dnam", + "+/*/*/liter-per-kilometer", + "-/*/*/liter-per-kilometer/dnam", + "+/*/*/mile-per-gallon", + "-/*/*/mile-per-gallon/dnam", + "+/*/compound" + ] + }, +// Based on +// https://cs.chromium.org/chromium/src/third_party/icu/source/data/translit/root_subset.txt?rcl=797b7c + { + "categories": [ + "translit" + ], + "rules": [ + "-/*", +// Need to leave an empty RuleBasedTransliteratorIDs + "+/RuleBasedTransliteratorIDs", + "-/RuleBasedTransliteratorIDs/*" + ] + }, +// # Remove exemplar cities in timezone data. + { + "categories": [ + "zone_tree" + ], + "rules": [ + "-/zoneStrings/Africa:Abidjan", + "-/zoneStrings/Africa:Accra", + "-/zoneStrings/Africa:Addis_Ababa", + "-/zoneStrings/Africa:Algiers", + "-/zoneStrings/Africa:Asmera", + "-/zoneStrings/Africa:Bamako", + "-/zoneStrings/Africa:Bangui", + "-/zoneStrings/Africa:Banjul", + "-/zoneStrings/Africa:Bissau", + "-/zoneStrings/Africa:Blantyre", + "-/zoneStrings/Africa:Brazzaville", + "-/zoneStrings/Africa:Bujumbura", + "-/zoneStrings/Africa:Cairo", + "-/zoneStrings/Africa:Casablanca", + "-/zoneStrings/Africa:Ceuta", + "-/zoneStrings/Africa:Conakry", + "-/zoneStrings/Africa:Dakar", + "-/zoneStrings/Africa:Dar_es_Salaam", + "-/zoneStrings/Africa:Djibouti", + "-/zoneStrings/Africa:Douala", + "-/zoneStrings/Africa:El_Aaiun", + "-/zoneStrings/Africa:Freetown", + "-/zoneStrings/Africa:Gaborone", + "-/zoneStrings/Africa:Harare", + "-/zoneStrings/Africa:Johannesburg", + "-/zoneStrings/Africa:Juba", + "-/zoneStrings/Africa:Kampala", + "-/zoneStrings/Africa:Khartoum", + "-/zoneStrings/Africa:Kigali", + "-/zoneStrings/Africa:Kinshasa", + 
"-/zoneStrings/Africa:Lagos", + "-/zoneStrings/Africa:Libreville", + "-/zoneStrings/Africa:Lome", + "-/zoneStrings/Africa:Luanda", + "-/zoneStrings/Africa:Lubumbashi", + "-/zoneStrings/Africa:Lusaka", + "-/zoneStrings/Africa:Malabo", + "-/zoneStrings/Africa:Maputo", + "-/zoneStrings/Africa:Maseru", + "-/zoneStrings/Africa:Mbabane", + "-/zoneStrings/Africa:Mogadishu", + "-/zoneStrings/Africa:Monrovia", + "-/zoneStrings/Africa:Nairobi", + "-/zoneStrings/Africa:Ndjamena", + "-/zoneStrings/Africa:Niamey", + "-/zoneStrings/Africa:Nouakchott", + "-/zoneStrings/Africa:Ouagadougou", + "-/zoneStrings/Africa:Porto-Novo", + "-/zoneStrings/Africa:Sao_Tome", + "-/zoneStrings/Africa:Tripoli", + "-/zoneStrings/Africa:Tunis", + "-/zoneStrings/Africa:Windhoek", + "-/zoneStrings/America:Adak", + "-/zoneStrings/America:Anchorage", + "-/zoneStrings/America:Anguilla", + "-/zoneStrings/America:Antigua", + "-/zoneStrings/America:Araguaina", + "-/zoneStrings/America:Argentina:La_Rioja", + "-/zoneStrings/America:Argentina:Rio_Gallegos", + "-/zoneStrings/America:Argentina:Salta", + "-/zoneStrings/America:Argentina:San_Juan", + "-/zoneStrings/America:Argentina:San_Luis", + "-/zoneStrings/America:Argentina:Tucuman", + "-/zoneStrings/America:Argentina:Ushuaia", + "-/zoneStrings/America:Aruba", + "-/zoneStrings/America:Asuncion", + "-/zoneStrings/America:Bahia", + "-/zoneStrings/America:Bahia_Banderas", + "-/zoneStrings/America:Barbados", + "-/zoneStrings/America:Belem", + "-/zoneStrings/America:Belize", + "-/zoneStrings/America:Blanc-Sablon", + "-/zoneStrings/America:Boa_Vista", + "-/zoneStrings/America:Bogota", + "-/zoneStrings/America:Boise", + "-/zoneStrings/America:Buenos_Aires", + "-/zoneStrings/America:Cambridge_Bay", + "-/zoneStrings/America:Campo_Grande", + "-/zoneStrings/America:Cancun", + "-/zoneStrings/America:Caracas", + "-/zoneStrings/America:Catamarca", + "-/zoneStrings/America:Cayenne", + "-/zoneStrings/America:Cayman", + "-/zoneStrings/America:Chicago", + 
"-/zoneStrings/America:Chihuahua", + "-/zoneStrings/America:Coral_Harbour", + "-/zoneStrings/America:Cordoba", + "-/zoneStrings/America:Costa_Rica", + "-/zoneStrings/America:Creston", + "-/zoneStrings/America:Cuiaba", + "-/zoneStrings/America:Curacao", + "-/zoneStrings/America:Danmarkshavn", + "-/zoneStrings/America:Dawson", + "-/zoneStrings/America:Dawson_Creek", + "-/zoneStrings/America:Denver", + "-/zoneStrings/America:Detroit", + "-/zoneStrings/America:Dominica", + "-/zoneStrings/America:Edmonton", + "-/zoneStrings/America:Eirunepe", + "-/zoneStrings/America:El_Salvador", + "-/zoneStrings/America:Fortaleza", + "-/zoneStrings/America:Fort_Nelson", + "-/zoneStrings/America:Glace_Bay", + "-/zoneStrings/America:Godthab", + "-/zoneStrings/America:Goose_Bay", + "-/zoneStrings/America:Grand_Turk", + "-/zoneStrings/America:Grenada", + "-/zoneStrings/America:Guadeloupe", + "-/zoneStrings/America:Guatemala", + "-/zoneStrings/America:Guayaquil", + "-/zoneStrings/America:Guyana", + "-/zoneStrings/America:Halifax", + "-/zoneStrings/America:Havana", + "-/zoneStrings/America:Hermosillo", + "-/zoneStrings/America:Indiana:Knox", + "-/zoneStrings/America:Indiana:Marengo", + "-/zoneStrings/America:Indiana:Petersburg", + "-/zoneStrings/America:Indianapolis", + "-/zoneStrings/America:Indiana:Tell_City", + "-/zoneStrings/America:Indiana:Vevay", + "-/zoneStrings/America:Indiana:Vincennes", + "-/zoneStrings/America:Indiana:Winamac", + "-/zoneStrings/America:Inuvik", + "-/zoneStrings/America:Iqaluit", + "-/zoneStrings/America:Jamaica", + "-/zoneStrings/America:Jujuy", + "-/zoneStrings/America:Juneau", + "-/zoneStrings/America:Kentucky:Monticello", + "-/zoneStrings/America:Kralendijk", + "-/zoneStrings/America:La_Paz", + "-/zoneStrings/America:Lima", + "-/zoneStrings/America:Los_Angeles", + "-/zoneStrings/America:Louisville", + "-/zoneStrings/America:Lower_Princes", + "-/zoneStrings/America:Maceio", + "-/zoneStrings/America:Managua", + "-/zoneStrings/America:Manaus", + 
"-/zoneStrings/America:Marigot", + "-/zoneStrings/America:Martinique", + "-/zoneStrings/America:Matamoros", + "-/zoneStrings/America:Mazatlan", + "-/zoneStrings/America:Mendoza", + "-/zoneStrings/America:Menominee", + "-/zoneStrings/America:Merida", + "-/zoneStrings/America:Metlakatla", + "-/zoneStrings/America:Mexico_City", + "-/zoneStrings/America:Miquelon", + "-/zoneStrings/America:Moncton", + "-/zoneStrings/America:Monterrey", + "-/zoneStrings/America:Montevideo", + "-/zoneStrings/America:Montserrat", + "-/zoneStrings/America:Nassau", + "-/zoneStrings/America:New_York", + "-/zoneStrings/America:Nipigon", + "-/zoneStrings/America:Nome", + "-/zoneStrings/America:Noronha", + "-/zoneStrings/America:North_Dakota:Beulah", + "-/zoneStrings/America:North_Dakota:Center", + "-/zoneStrings/America:North_Dakota:New_Salem", + "-/zoneStrings/America:Ojinaga", + "-/zoneStrings/America:Panama", + "-/zoneStrings/America:Pangnirtung", + "-/zoneStrings/America:Paramaribo", + "-/zoneStrings/America:Phoenix", + "-/zoneStrings/America:Port-au-Prince", + "-/zoneStrings/America:Port_of_Spain", + "-/zoneStrings/America:Porto_Velho", + "-/zoneStrings/America:Puerto_Rico", + "-/zoneStrings/America:Punta_Arenas", + "-/zoneStrings/America:Rainy_River", + "-/zoneStrings/America:Rankin_Inlet", + "-/zoneStrings/America:Recife", + "-/zoneStrings/America:Regina", + "-/zoneStrings/America:Resolute", + "-/zoneStrings/America:Rio_Branco", + "-/zoneStrings/America:Santa_Isabel", + "-/zoneStrings/America:Santarem", + "-/zoneStrings/America:Santiago", + "-/zoneStrings/America:Santo_Domingo", + "-/zoneStrings/America:Sao_Paulo", + "-/zoneStrings/America:Scoresbysund", + "-/zoneStrings/America:Sitka", + "-/zoneStrings/America:St_Barthelemy", + "-/zoneStrings/America:St_Johns", + "-/zoneStrings/America:St_Kitts", + "-/zoneStrings/America:St_Lucia", + "-/zoneStrings/America:St_Thomas", + "-/zoneStrings/America:St_Vincent", + "-/zoneStrings/America:Swift_Current", + "-/zoneStrings/America:Tegucigalpa", + 
"-/zoneStrings/America:Thule", + "-/zoneStrings/America:Thunder_Bay", + "-/zoneStrings/America:Tijuana", + "-/zoneStrings/America:Toronto", + "-/zoneStrings/America:Tortola", + "-/zoneStrings/America:Vancouver", + "-/zoneStrings/America:Whitehorse", + "-/zoneStrings/America:Winnipeg", + "-/zoneStrings/America:Yakutat", + "-/zoneStrings/America:Yellowknife", + "-/zoneStrings/Antarctica:Casey", + "-/zoneStrings/Antarctica:Davis", + "-/zoneStrings/Antarctica:DumontDUrville", + "-/zoneStrings/Antarctica:Macquarie", + "-/zoneStrings/Antarctica:Mawson", + "-/zoneStrings/Antarctica:McMurdo", + "-/zoneStrings/Antarctica:Palmer", + "-/zoneStrings/Antarctica:Rothera", + "-/zoneStrings/Antarctica:Syowa", + "-/zoneStrings/Antarctica:Troll", + "-/zoneStrings/Antarctica:Vostok", + "-/zoneStrings/Arctic:Longyearbyen", + "-/zoneStrings/Asia:Aden", + "-/zoneStrings/Asia:Almaty", + "-/zoneStrings/Asia:Amman", + "-/zoneStrings/Asia:Anadyr", + "-/zoneStrings/Asia:Aqtau", + "-/zoneStrings/Asia:Aqtobe", + "-/zoneStrings/Asia:Ashgabat", + "-/zoneStrings/Asia:Atyrau", + "-/zoneStrings/Asia:Baghdad", + "-/zoneStrings/Asia:Bahrain", + "-/zoneStrings/Asia:Baku", + "-/zoneStrings/Asia:Bangkok", + "-/zoneStrings/Asia:Barnaul", + "-/zoneStrings/Asia:Beirut", + "-/zoneStrings/Asia:Bishkek", + "-/zoneStrings/Asia:Brunei", + "-/zoneStrings/Asia:Calcutta", + "-/zoneStrings/Asia:Chita", + "-/zoneStrings/Asia:Choibalsan", + "-/zoneStrings/Asia:Colombo", + "-/zoneStrings/Asia:Damascus", + "-/zoneStrings/Asia:Dhaka", + "-/zoneStrings/Asia:Dili", + "-/zoneStrings/Asia:Dubai", + "-/zoneStrings/Asia:Dushanbe", + "-/zoneStrings/Asia:Famagusta", + "-/zoneStrings/Asia:Gaza", + "-/zoneStrings/Asia:Hebron", + "-/zoneStrings/Asia:Hong_Kong", + "-/zoneStrings/Asia:Hovd", + "-/zoneStrings/Asia:Irkutsk", + "-/zoneStrings/Asia:Jakarta", + "-/zoneStrings/Asia:Jayapura", + "-/zoneStrings/Asia:Jerusalem", + "-/zoneStrings/Asia:Kabul", + "-/zoneStrings/Asia:Kamchatka", + "-/zoneStrings/Asia:Karachi", + 
"-/zoneStrings/Asia:Katmandu", + "-/zoneStrings/Asia:Khandyga", + "-/zoneStrings/Asia:Krasnoyarsk", + "-/zoneStrings/Asia:Kuala_Lumpur", + "-/zoneStrings/Asia:Kuching", + "-/zoneStrings/Asia:Kuwait", + "-/zoneStrings/Asia:Macau", + "-/zoneStrings/Asia:Magadan", + "-/zoneStrings/Asia:Makassar", + "-/zoneStrings/Asia:Manila", + "-/zoneStrings/Asia:Muscat", + "-/zoneStrings/Asia:Nicosia", + "-/zoneStrings/Asia:Novokuznetsk", + "-/zoneStrings/Asia:Novosibirsk", + "-/zoneStrings/Asia:Omsk", + "-/zoneStrings/Asia:Oral", + "-/zoneStrings/Asia:Phnom_Penh", + "-/zoneStrings/Asia:Pontianak", + "-/zoneStrings/Asia:Pyongyang", + "-/zoneStrings/Asia:Qatar", + "-/zoneStrings/Asia:Qostanay", + "-/zoneStrings/Asia:Qyzylorda", + "-/zoneStrings/Asia:Rangoon", + "-/zoneStrings/Asia:Riyadh", + "-/zoneStrings/Asia:Saigon", + "-/zoneStrings/Asia:Sakhalin", + "-/zoneStrings/Asia:Samarkand", + "-/zoneStrings/Asia:Seoul", + "-/zoneStrings/Asia:Shanghai", + "-/zoneStrings/Asia:Singapore", + "-/zoneStrings/Asia:Srednekolymsk", + "-/zoneStrings/Asia:Taipei", + "-/zoneStrings/Asia:Tashkent", + "-/zoneStrings/Asia:Tbilisi", + "-/zoneStrings/Asia:Tehran", + "-/zoneStrings/Asia:Thimphu", + "-/zoneStrings/Asia:Tokyo", + "-/zoneStrings/Asia:Tomsk", + "-/zoneStrings/Asia:Ulaanbaatar", + "-/zoneStrings/Asia:Urumqi", + "-/zoneStrings/Asia:Ust-Nera", + "-/zoneStrings/Asia:Vientiane", + "-/zoneStrings/Asia:Vladivostok", + "-/zoneStrings/Asia:Yakutsk", + "-/zoneStrings/Asia:Yekaterinburg", + "-/zoneStrings/Asia:Yerevan", + "-/zoneStrings/Atlantic:Azores", + "-/zoneStrings/Atlantic:Bermuda", + "-/zoneStrings/Atlantic:Canary", + "-/zoneStrings/Atlantic:Cape_Verde", + "-/zoneStrings/Atlantic:Faeroe", + "-/zoneStrings/Atlantic:Madeira", + "-/zoneStrings/Atlantic:Reykjavik", + "-/zoneStrings/Atlantic:South_Georgia", + "-/zoneStrings/Atlantic:Stanley", + "-/zoneStrings/Atlantic:St_Helena", + "-/zoneStrings/Australia:Adelaide", + "-/zoneStrings/Australia:Brisbane", + "-/zoneStrings/Australia:Broken_Hill", + 
"-/zoneStrings/Australia:Currie", + "-/zoneStrings/Australia:Darwin", + "-/zoneStrings/Australia:Eucla", + "-/zoneStrings/Australia:Hobart", + "-/zoneStrings/Australia:Lindeman", + "-/zoneStrings/Australia:Lord_Howe", + "-/zoneStrings/Australia:Melbourne", + "-/zoneStrings/Australia:Perth", + "-/zoneStrings/Australia:Sydney", + "-/zoneStrings/Etc:Unknown", + "-/zoneStrings/Etc:UTC/ec", + "-/zoneStrings/Europe:Amsterdam", + "-/zoneStrings/Europe:Andorra", + "-/zoneStrings/Europe:Astrakhan", + "-/zoneStrings/Europe:Athens", + "-/zoneStrings/Europe:Belgrade", + "-/zoneStrings/Europe:Berlin", + "-/zoneStrings/Europe:Bratislava", + "-/zoneStrings/Europe:Brussels", + "-/zoneStrings/Europe:Bucharest", + "-/zoneStrings/Europe:Budapest", + "-/zoneStrings/Europe:Busingen", + "-/zoneStrings/Europe:Chisinau", + "-/zoneStrings/Europe:Copenhagen", + "-/zoneStrings/Europe:Dublin/ec", + "-/zoneStrings/Europe:Gibraltar", + "-/zoneStrings/Europe:Guernsey", + "-/zoneStrings/Europe:Helsinki", + "-/zoneStrings/Europe:Isle_of_Man", + "-/zoneStrings/Europe:Istanbul", + "-/zoneStrings/Europe:Jersey", + "-/zoneStrings/Europe:Kaliningrad", + "-/zoneStrings/Europe:Kiev", + "-/zoneStrings/Europe:Kirov", + "-/zoneStrings/Europe:Lisbon", + "-/zoneStrings/Europe:Ljubljana", + "-/zoneStrings/Europe:London/ec", + "-/zoneStrings/Europe:Luxembourg", + "-/zoneStrings/Europe:Madrid", + "-/zoneStrings/Europe:Malta", + "-/zoneStrings/Europe:Mariehamn", + "-/zoneStrings/Europe:Minsk", + "-/zoneStrings/Europe:Monaco", + "-/zoneStrings/Europe:Moscow", + "-/zoneStrings/Europe:Oslo", + "-/zoneStrings/Europe:Paris", + "-/zoneStrings/Europe:Podgorica", + "-/zoneStrings/Europe:Prague", + "-/zoneStrings/Europe:Riga", + "-/zoneStrings/Europe:Rome", + "-/zoneStrings/Europe:Samara", + "-/zoneStrings/Europe:San_Marino", + "-/zoneStrings/Europe:Sarajevo", + "-/zoneStrings/Europe:Saratov", + "-/zoneStrings/Europe:Simferopol", + "-/zoneStrings/Europe:Skopje", + "-/zoneStrings/Europe:Sofia", + 
"-/zoneStrings/Europe:Stockholm", + "-/zoneStrings/Europe:Tallinn", + "-/zoneStrings/Europe:Tirane", + "-/zoneStrings/Europe:Ulyanovsk", + "-/zoneStrings/Europe:Uzhgorod", + "-/zoneStrings/Europe:Vaduz", + "-/zoneStrings/Europe:Vatican", + "-/zoneStrings/Europe:Vienna", + "-/zoneStrings/Europe:Vilnius", + "-/zoneStrings/Europe:Volgograd", + "-/zoneStrings/Europe:Warsaw", + "-/zoneStrings/Europe:Zagreb", + "-/zoneStrings/Europe:Zaporozhye", + "-/zoneStrings/Europe:Zurich", + "-/zoneStrings/Indian:Antananarivo", + "-/zoneStrings/Indian:Chagos", + "-/zoneStrings/Indian:Christmas", + "-/zoneStrings/Indian:Cocos", + "-/zoneStrings/Indian:Comoro", + "-/zoneStrings/Indian:Kerguelen", + "-/zoneStrings/Indian:Mahe", + "-/zoneStrings/Indian:Maldives", + "-/zoneStrings/Indian:Mauritius", + "-/zoneStrings/Indian:Mayotte", + "-/zoneStrings/Indian:Reunion", + "-/zoneStrings/Pacific:Apia", + "-/zoneStrings/Pacific:Auckland", + "-/zoneStrings/Pacific:Bougainville", + "-/zoneStrings/Pacific:Chatham", + "-/zoneStrings/Pacific:Easter", + "-/zoneStrings/Pacific:Efate", + "-/zoneStrings/Pacific:Enderbury", + "-/zoneStrings/Pacific:Fakaofo", + "-/zoneStrings/Pacific:Fiji", + "-/zoneStrings/Pacific:Funafuti", + "-/zoneStrings/Pacific:Galapagos", + "-/zoneStrings/Pacific:Gambier", + "-/zoneStrings/Pacific:Guadalcanal", + "-/zoneStrings/Pacific:Guam", + "-/zoneStrings/Pacific:Honolulu/ec", + "-/zoneStrings/Pacific:Johnston", + "-/zoneStrings/Pacific:Kiritimati", + "-/zoneStrings/Pacific:Kosrae", + "-/zoneStrings/Pacific:Kwajalein", + "-/zoneStrings/Pacific:Majuro", + "-/zoneStrings/Pacific:Marquesas", + "-/zoneStrings/Pacific:Midway", + "-/zoneStrings/Pacific:Nauru", + "-/zoneStrings/Pacific:Niue", + "-/zoneStrings/Pacific:Norfolk", + "-/zoneStrings/Pacific:Noumea", + "-/zoneStrings/Pacific:Pago_Pago", + "-/zoneStrings/Pacific:Palau", + "-/zoneStrings/Pacific:Pitcairn", + "-/zoneStrings/Pacific:Ponape", + "-/zoneStrings/Pacific:Port_Moresby", + "-/zoneStrings/Pacific:Rarotonga", + 
"-/zoneStrings/Pacific:Saipan", + "-/zoneStrings/Pacific:Tahiti", + "-/zoneStrings/Pacific:Tarawa", + "-/zoneStrings/Pacific:Tongatapu", + "-/zoneStrings/Pacific:Truk", + "-/zoneStrings/Pacific:Wake", + "-/zoneStrings/Pacific:Wallis" + ] + }, + { + "categories": [ + "locales_tree" + ], + "rules": [ + "-/calendar/*", + "+/calendar/default", + "+/calendar/gregorian", + "+/calendar/generic" + ] + }, +// Based on +// https://cs.chromium.org/chromium/src/third_party/icu/android/patch_locale.sh?rcl=797b7c +// # On Android Java API is used to get lang data, except for the language and +// # script names for zh_Hans and zh_Hant which are not supported by Java API. +// # Here remove all lang data except those names. +// # See the comments in GetDisplayNameForLocale() (in Chromium's +// # src/ui/base/l10n/l10n_util.cc) about why we need the scripts. + { + "categories": [ + "lang_tree" + ], + "rules": [ + "-/Keys", + "-/Languages", + "-/Languages%long", + "-/Languages%short", + "-/Languages%variant", + "-/LanguagesShort", + "-/Scripts", + "-/Scripts%long", + "-/Scripts%short", + "-/Scripts%variant", + "-/Scripts%stand-alone", + "-/Types", + "-/Types%short", + "-/Variants", + "-/calendar", + "-/characterLabelPattern", + "-/codePatterns", + "-/localeDisplayPattern", + "+/Languages/zh", + "+/Languages%long/zh", + "+/Languages%short/zh", + "+/Languages%variant/zh", + "+/Scripts/Hans", + "+/Scripts%long/Hans", + "+/Scripts%short/Hans", + "+/Scripts%variant/Hans", + "+/Scripts%stand-alone/Hans", + "+/Scripts/Hant", + "+/Scripts%long/Hant", + "+/Scripts%short/Hant", + "+/Scripts%variant/Hant", + "+/Scripts%stand-alone/Hant" + ] + }, +// Remove all the Version + { + "categories": [ + "brkitr_tree", + "coll_tree", + "curr_tree", + "lang_tree", + "rbnf_tree", + "region_tree", + "unit_tree", + "zone_tree" + ], + "rules": [ + "-/Version" + ] + } + ] +} diff --git a/ios/icudtl.dat b/ios/icudtl.dat index 5283ef0074c9fad2c80b2670581088da762f5cc0..cf32bd28a48773acd9c4e59b7856a2e5ecfd1995 
100644 Binary files a/ios/icudtl.dat and b/ios/icudtl.dat differ diff --git a/patches/calendarToAdopt.patch b/patches/calendarToAdopt.patch new file mode 100644 index 0000000000000000000000000000000000000000..3fd51a3b26c117e005887283e50f6b94da75ee46 --- /dev/null +++ b/patches/calendarToAdopt.patch @@ -0,0 +1,12 @@ +diff --git a/source/i18n/smpdtfmt.cpp b/source/i18n/smpdtfmt.cpp +index 98f36b3e..4fd1675e 100644 +--- a/source/i18n/smpdtfmt.cpp ++++ b/source/i18n/smpdtfmt.cpp +@@ -3998,6 +3998,7 @@ void SimpleDateFormat::adoptCalendar(Calendar* calendarToAdopt) + DateFormatSymbols *newSymbols = + DateFormatSymbols::createForLocale(calLocale, status); + if (U_FAILURE(status)) { ++ delete calendarToAdopt; + return; + } + DateFormat::adoptCalendar(calendarToAdopt); diff --git a/patches/hu_minimumGroupingDigits.patch b/patches/hu_minimumGroupingDigits.patch new file mode 100644 index 0000000000000000000000000000000000000000..29f1cc4c4cb7469d32f1016554d118b47cb91c5e --- /dev/null +++ b/patches/hu_minimumGroupingDigits.patch @@ -0,0 +1,13 @@ +diff --git a/source/data/locales/hu.txt b/source/data/locales/hu.txt +index ab73ac6b..b95228c2 100644 +--- a/source/data/locales/hu.txt ++++ b/source/data/locales/hu.txt +@@ -216,7 +216,7 @@ hu{ + other{"A kosár tartalma: {0} X. 
Megveszi őket?"} + } + } +- minimumGroupingDigits{"4"} ++ minimumGroupingDigits{"1"} + native{"latn"} + } + Version{"2.1.48.42"} diff --git a/patches/icupkg.patch b/patches/icupkg.patch new file mode 100644 index 0000000000000000000000000000000000000000..3f8f80ab1564a91186cd17f4c92f3952ba30f588 --- /dev/null +++ b/patches/icupkg.patch @@ -0,0 +1,17 @@ +diff --git a/source/tools/icupkg/icupkg.cpp b/source/tools/icupkg/icupkg.cpp +index ea7be4a9..51a66397 100644 +--- a/source/tools/icupkg/icupkg.cpp ++++ b/source/tools/icupkg/icupkg.cpp +@@ -501,10 +501,8 @@ main(int argc, char *argv[]) { + } + + /* check dependencies between items */ +- if(!pkg->checkDependencies()) { +- /* some dependencies are not fulfilled */ +- return U_MISSING_RESOURCE_ERROR; +- } ++ // Still check the checkDependencies to output warning but not produce error ++ pkg->checkDependencies(); + + /* write the output .dat package if there are any modifications */ + if(isModified) { diff --git a/patches/localematcher.patch b/patches/localematcher.patch new file mode 100644 index 0000000000000000000000000000000000000000..dd0f6a616929e98ab04686d5b2f01616a4e6e661 --- /dev/null +++ b/patches/localematcher.patch @@ -0,0 +1,4027 @@ +diff --git a/source/common/Makefile.in b/source/common/Makefile.in +index 79e371b0..d21f5d06 100644 +--- a/source/common/Makefile.in ++++ b/source/common/Makefile.in +@@ -88,8 +88,9 @@ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \ + ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \ + resource.o uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \ + ucurr.o \ +-localebuilder.o \ ++localebuilder.o localeprioritylist.o \ + messagepattern.o ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o locdspnm.o loclikely.o locresdata.o \ ++lsr.o loclikelysubtags.o locdistance.o localematcher.o \ + bytestream.o stringpiece.o bytesinkutil.o \ + stringtriebuilder.o bytestriebuilder.o \ + bytestrie.o 
bytestrieiterator.o \ +diff --git a/source/common/charstr.cpp b/source/common/charstr.cpp +index 852cc539..dda29dac 100644 +--- a/source/common/charstr.cpp ++++ b/source/common/charstr.cpp +@@ -35,6 +35,17 @@ CharString& CharString::operator=(CharString&& src) U_NOEXCEPT { + return *this; + } + ++char *CharString::cloneData(UErrorCode &errorCode) const { ++ if (U_FAILURE(errorCode)) { return nullptr; } ++ char *p = static_cast<char *>(uprv_malloc(len + 1)); ++ if (p == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return nullptr; ++ } ++ uprv_memcpy(p, buffer.getAlias(), len + 1); ++ return p; ++} ++ + CharString &CharString::copyFrom(const CharString &s, UErrorCode &errorCode) { + if(U_SUCCESS(errorCode) && this!=&s && ensureCapacity(s.len+1, 0, errorCode)) { + len=s.len; +@@ -52,6 +63,18 @@ int32_t CharString::lastIndexOf(char c) const { + return -1; + } + ++bool CharString::contains(StringPiece s) const { ++ if (s.empty()) { return false; } ++ const char *p = buffer.getAlias(); ++ int32_t lastStart = len - s.length(); ++ for (int32_t i = 0; i <= lastStart; ++i) { ++ if (uprv_memcmp(p + i, s.data(), s.length()) == 0) { ++ return true; ++ } ++ } ++ return false; ++} ++ + CharString &CharString::truncate(int32_t newLength) { + if(newLength<0) { + newLength=0; +diff --git a/source/common/charstr.h b/source/common/charstr.h +index 1a97e019..23b950ed 100644 +--- a/source/common/charstr.h ++++ b/source/common/charstr.h +@@ -82,10 +82,24 @@ public: + + const char *data() const { return buffer.getAlias(); } + char *data() { return buffer.getAlias(); } ++ /** ++ * Allocates length()+1 chars and copies the NUL-terminated data(). ++ * The caller must uprv_free() the result. 
++ */ ++ char *cloneData(UErrorCode &errorCode) const; ++ ++ bool operator==(StringPiece other) const { ++ return len == other.length() && (len == 0 || uprv_memcmp(data(), other.data(), len) == 0); ++ } ++ bool operator!=(StringPiece other) const { ++ return !operator==(other); ++ } + + /** @return last index of c, or -1 if c is not in this string */ + int32_t lastIndexOf(char c) const; + ++ bool contains(StringPiece s) const; ++ + CharString &clear() { len=0; buffer[0]=0; return *this; } + CharString &truncate(int32_t newLength); + +diff --git a/source/common/localebuilder.cpp b/source/common/localebuilder.cpp +index fe931fcf..837b92f1 100644 +--- a/source/common/localebuilder.cpp ++++ b/source/common/localebuilder.cpp +@@ -157,13 +157,18 @@ _isKeywordValue(const char* key, const char* value, int32_t value_len) + } + + static void +-_copyExtensions(const Locale& from, Locale* to, bool validate, UErrorCode& errorCode) ++_copyExtensions(const Locale& from, icu::StringEnumeration *keywords, ++ Locale& to, bool validate, UErrorCode& errorCode) + { + if (U_FAILURE(errorCode)) { return; } +- LocalPointer<icu::StringEnumeration> iter(from.createKeywords(errorCode)); +- if (U_FAILURE(errorCode) || iter.isNull()) { return; } ++ LocalPointer<icu::StringEnumeration> ownedKeywords; ++ if (keywords == nullptr) { ++ ownedKeywords.adoptInstead(from.createKeywords(errorCode)); ++ if (U_FAILURE(errorCode) || ownedKeywords.isNull()) { return; } ++ keywords = ownedKeywords.getAlias(); ++ } + const char* key; +- while ((key = iter->next(nullptr, errorCode)) != nullptr) { ++ while ((key = keywords->next(nullptr, errorCode)) != nullptr) { + CharString value; + CharStringByteSink sink(&value); + from.getKeywordValue(key, sink, errorCode); +@@ -176,34 +181,34 @@ _copyExtensions(const Locale& from, Locale* to, bool validate, UErrorCode& error + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } +- to->setKeywordValue(key, value.data(), errorCode); ++ to.setKeywordValue(key, value.data(), 
errorCode); + if (U_FAILURE(errorCode)) { return; } + } + } + + void static +-_clearUAttributesAndKeyType(Locale* locale, UErrorCode& errorCode) ++_clearUAttributesAndKeyType(Locale& locale, UErrorCode& errorCode) + { + // Clear Unicode attributes +- locale->setKeywordValue(kAttributeKey, "", errorCode); ++ locale.setKeywordValue(kAttributeKey, "", errorCode); + + // Clear all Unicode keyword values +- LocalPointer<icu::StringEnumeration> iter(locale->createUnicodeKeywords(errorCode)); ++ LocalPointer<icu::StringEnumeration> iter(locale.createUnicodeKeywords(errorCode)); + if (U_FAILURE(errorCode) || iter.isNull()) { return; } + const char* key; + while ((key = iter->next(nullptr, errorCode)) != nullptr) { +- locale->setUnicodeKeywordValue(key, nullptr, errorCode); ++ locale.setUnicodeKeywordValue(key, nullptr, errorCode); + } + } + + static void +-_setUnicodeExtensions(Locale* locale, const CharString& value, UErrorCode& errorCode) ++_setUnicodeExtensions(Locale& locale, const CharString& value, UErrorCode& errorCode) + { + // Add the unicode extensions to extensions_ + CharString locale_str("und-u-", errorCode); + locale_str.append(value, errorCode); + _copyExtensions( +- Locale::forLanguageTag(locale_str.data(), errorCode), ++ Locale::forLanguageTag(locale_str.data(), errorCode), nullptr, + locale, false, errorCode); + } + +@@ -235,10 +240,10 @@ LocaleBuilder& LocaleBuilder::setExtension(char key, StringPiece value) + status_); + return *this; + } +- _clearUAttributesAndKeyType(extensions_, status_); ++ _clearUAttributesAndKeyType(*extensions_, status_); + if (U_FAILURE(status_)) { return *this; } + if (!value.empty()) { +- _setUnicodeExtensions(extensions_, value_str, status_); ++ _setUnicodeExtensions(*extensions_, value_str, status_); + } + return *this; + } +@@ -401,6 +406,24 @@ Locale makeBogusLocale() { + return bogus; + } + ++void LocaleBuilder::copyExtensionsFrom(const Locale& src, UErrorCode& errorCode) ++{ ++ if (U_FAILURE(errorCode)) { return; } ++ 
LocalPointer<icu::StringEnumeration> keywords(src.createKeywords(errorCode)); ++ if (U_FAILURE(errorCode) || keywords.isNull() || keywords->count(errorCode) == 0) { ++ // Error, or no extensions to copy. ++ return; ++ } ++ if (extensions_ == nullptr) { ++ extensions_ = new Locale(); ++ if (extensions_ == nullptr) { ++ status_ = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ } ++ _copyExtensions(src, keywords.getAlias(), *extensions_, false, errorCode); ++} ++ + Locale LocaleBuilder::build(UErrorCode& errorCode) + { + if (U_FAILURE(errorCode)) { +@@ -425,7 +448,7 @@ Locale LocaleBuilder::build(UErrorCode& errorCode) + } + Locale product(locale_str.data()); + if (extensions_ != nullptr) { +- _copyExtensions(*extensions_, &product, true, errorCode); ++ _copyExtensions(*extensions_, nullptr, product, true, errorCode); + } + if (U_FAILURE(errorCode)) { + return makeBogusLocale(); +diff --git a/source/common/localematcher.cpp b/source/common/localematcher.cpp +new file mode 100644 +index 00000000..d975fe75 +--- /dev/null ++++ b/source/common/localematcher.cpp +@@ -0,0 +1,720 @@ ++// © 2019 and later: Unicode, Inc. and others. ++// License & terms of use: http://www.unicode.org/copyright.html#License ++ ++// localematcher.cpp ++// created: 2019may08 Markus W. Scherer ++ ++#ifndef __LOCMATCHER_H__ ++#define __LOCMATCHER_H__ ++ ++#include "unicode/utypes.h" ++#include "unicode/localebuilder.h" ++#include "unicode/localematcher.h" ++#include "unicode/locid.h" ++#include "unicode/stringpiece.h" ++#include "unicode/uobject.h" ++#include "cstring.h" ++#include "localeprioritylist.h" ++#include "loclikelysubtags.h" ++#include "locdistance.h" ++#include "lsr.h" ++#include "uassert.h" ++#include "uhash.h" ++#include "uvector.h" ++ ++#define UND_LSR LSR("und", "", "") ++ ++/** ++ * Indicator for the lifetime of desired-locale objects passed into the LocaleMatcher. ++ * ++ * @draft ICU 65 ++ */ ++enum ULocMatchLifetime { ++ /** ++ * Locale objects are temporary. 
++ * The matcher will make a copy of a locale that will be used beyond one function call. ++ * ++ * @draft ICU 65 ++ */ ++ ULOCMATCH_TEMPORARY_LOCALES, ++ /** ++ * Locale objects are stored at least as long as the matcher is used. ++ * The matcher will keep only a pointer to a locale that will be used beyond one function call, ++ * avoiding a copy. ++ * ++ * @draft ICU 65 ++ */ ++ ULOCMATCH_STORED_LOCALES // TODO: permanent? cached? clone? ++}; ++#ifndef U_IN_DOXYGEN ++typedef enum ULocMatchLifetime ULocMatchLifetime; ++#endif ++ ++U_NAMESPACE_BEGIN ++ ++LocaleMatcher::Result::Result(LocaleMatcher::Result &&src) U_NOEXCEPT : ++ desiredLocale(src.desiredLocale), ++ supportedLocale(src.supportedLocale), ++ desiredIndex(src.desiredIndex), ++ supportedIndex(src.supportedIndex), ++ desiredIsOwned(src.desiredIsOwned) { ++ if (desiredIsOwned) { ++ src.desiredLocale = nullptr; ++ src.desiredIndex = -1; ++ src.desiredIsOwned = FALSE; ++ } ++} ++ ++LocaleMatcher::Result::~Result() { ++ if (desiredIsOwned) { ++ delete desiredLocale; ++ } ++} ++ ++LocaleMatcher::Result &LocaleMatcher::Result::operator=(LocaleMatcher::Result &&src) U_NOEXCEPT { ++ this->~Result(); ++ ++ desiredLocale = src.desiredLocale; ++ supportedLocale = src.supportedLocale; ++ desiredIndex = src.desiredIndex; ++ supportedIndex = src.supportedIndex; ++ desiredIsOwned = src.desiredIsOwned; ++ ++ if (desiredIsOwned) { ++ src.desiredLocale = nullptr; ++ src.desiredIndex = -1; ++ src.desiredIsOwned = FALSE; ++ } ++ return *this; ++} ++ ++Locale LocaleMatcher::Result::makeResolvedLocale(UErrorCode &errorCode) const { ++ if (U_FAILURE(errorCode) || supportedLocale == nullptr) { ++ return Locale::getRoot(); ++ } ++ const Locale *bestDesired = getDesiredLocale(); ++ if (bestDesired == nullptr || *supportedLocale == *bestDesired) { ++ return *supportedLocale; ++ } ++ LocaleBuilder b; ++ b.setLocale(*supportedLocale); ++ ++ // Copy the region from bestDesired, if there is one. 
++ const char *region = bestDesired->getCountry(); ++ if (*region != 0) { ++ b.setRegion(region); ++ } ++ ++ // Copy the variants from bestDesired, if there are any. ++ // Note that this will override any supportedLocale variants. ++ // For example, "sco-ulster-fonipa" + "...-fonupa" => "sco-fonupa" (replacing ulster). ++ const char *variants = bestDesired->getVariant(); ++ if (*variants != 0) { ++ b.setVariant(variants); ++ } ++ ++ // Copy the extensions from bestDesired, if there are any. ++ // C++ note: The following note, copied from Java, may not be true, ++ // as long as C++ copies by legacy ICU keyword, not by extension singleton. ++ // Note that this will override any supportedLocale extensions. ++ // For example, "th-u-nu-latn-ca-buddhist" + "...-u-nu-native" => "th-u-nu-native" ++ // (replacing calendar). ++ b.copyExtensionsFrom(*bestDesired, errorCode); ++ return b.build(errorCode); ++} ++ ++LocaleMatcher::Builder::Builder(LocaleMatcher::Builder &&src) U_NOEXCEPT : ++ errorCode_(src.errorCode_), ++ supportedLocales_(src.supportedLocales_), ++ thresholdDistance_(src.thresholdDistance_), ++ demotion_(src.demotion_), ++ defaultLocale_(src.defaultLocale_), ++ favor_(src.favor_) { ++ src.supportedLocales_ = nullptr; ++ src.defaultLocale_ = nullptr; ++} ++ ++LocaleMatcher::Builder::~Builder() { ++ delete supportedLocales_; ++ delete defaultLocale_; ++} ++ ++LocaleMatcher::Builder &LocaleMatcher::Builder::operator=(LocaleMatcher::Builder &&src) U_NOEXCEPT { ++ this->~Builder(); ++ ++ errorCode_ = src.errorCode_; ++ supportedLocales_ = src.supportedLocales_; ++ thresholdDistance_ = src.thresholdDistance_; ++ demotion_ = src.demotion_; ++ defaultLocale_ = src.defaultLocale_; ++ favor_ = src.favor_; ++ ++ src.supportedLocales_ = nullptr; ++ src.defaultLocale_ = nullptr; ++ return *this; ++} ++ ++void LocaleMatcher::Builder::clearSupportedLocales() { ++ if (supportedLocales_ != nullptr) { ++ supportedLocales_->removeAllElements(); ++ } ++} ++ ++bool 
LocaleMatcher::Builder::ensureSupportedLocaleVector() { ++ if (U_FAILURE(errorCode_)) { return false; } ++ if (supportedLocales_ != nullptr) { return true; } ++ supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_); ++ if (U_FAILURE(errorCode_)) { return false; } ++ if (supportedLocales_ == nullptr) { ++ errorCode_ = U_MEMORY_ALLOCATION_ERROR; ++ return false; ++ } ++ return true; ++} ++ ++LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListString( ++ StringPiece locales) { ++ LocalePriorityList list(locales, errorCode_); ++ if (U_FAILURE(errorCode_)) { return *this; } ++ clearSupportedLocales(); ++ if (!ensureSupportedLocaleVector()) { return *this; } ++ int32_t length = list.getLengthIncludingRemoved(); ++ for (int32_t i = 0; i < length; ++i) { ++ Locale *locale = list.orphanLocaleAt(i); ++ if (locale == nullptr) { continue; } ++ supportedLocales_->addElement(locale, errorCode_); ++ if (U_FAILURE(errorCode_)) { ++ delete locale; ++ break; ++ } ++ } ++ return *this; ++} ++ ++LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) { ++ if (U_FAILURE(errorCode_)) { return *this; } ++ clearSupportedLocales(); ++ if (!ensureSupportedLocaleVector()) { return *this; } ++ while (locales.hasNext()) { ++ const Locale &locale = locales.next(); ++ Locale *clone = locale.clone(); ++ if (clone == nullptr) { ++ errorCode_ = U_MEMORY_ALLOCATION_ERROR; ++ break; ++ } ++ supportedLocales_->addElement(clone, errorCode_); ++ if (U_FAILURE(errorCode_)) { ++ delete clone; ++ break; ++ } ++ } ++ return *this; ++} ++ ++LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) { ++ if (!ensureSupportedLocaleVector()) { return *this; } ++ Locale *clone = locale.clone(); ++ if (clone == nullptr) { ++ errorCode_ = U_MEMORY_ALLOCATION_ERROR; ++ return *this; ++ } ++ supportedLocales_->addElement(clone, errorCode_); ++ if (U_FAILURE(errorCode_)) { ++ delete clone; ++ } ++ return 
*this; ++} ++ ++LocaleMatcher::Builder &LocaleMatcher::Builder::setDefaultLocale(const Locale *defaultLocale) { ++ if (U_FAILURE(errorCode_)) { return *this; } ++ Locale *clone = nullptr; ++ if (defaultLocale != nullptr) { ++ clone = defaultLocale->clone(); ++ if (clone == nullptr) { ++ errorCode_ = U_MEMORY_ALLOCATION_ERROR; ++ return *this; ++ } ++ } ++ delete defaultLocale_; ++ defaultLocale_ = clone; ++ return *this; ++} ++ ++LocaleMatcher::Builder &LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag subtag) { ++ if (U_FAILURE(errorCode_)) { return *this; } ++ favor_ = subtag; ++ return *this; ++} ++ ++LocaleMatcher::Builder &LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion demotion) { ++ if (U_FAILURE(errorCode_)) { return *this; } ++ demotion_ = demotion; ++ return *this; ++} ++ ++#if 0 ++/** ++ * <i>Internal only!</i> ++ * ++ * @param thresholdDistance the thresholdDistance to set, with -1 = default ++ * @return this Builder object ++ * @internal ++ * @deprecated This API is ICU internal only. 
++ */ ++@Deprecated ++LocaleMatcher::Builder &LocaleMatcher::Builder::internalSetThresholdDistance(int32_t thresholdDistance) { ++ if (U_FAILURE(errorCode_)) { return *this; } ++ if (thresholdDistance > 100) { ++ thresholdDistance = 100; ++ } ++ thresholdDistance_ = thresholdDistance; ++ return *this; ++} ++#endif ++ ++UBool LocaleMatcher::Builder::copyErrorTo(UErrorCode &outErrorCode) const { ++ if (U_FAILURE(outErrorCode)) { return TRUE; } ++ if (U_SUCCESS(errorCode_)) { return FALSE; } ++ outErrorCode = errorCode_; ++ return TRUE; ++} ++ ++LocaleMatcher LocaleMatcher::Builder::build(UErrorCode &errorCode) const { ++ if (U_SUCCESS(errorCode) && U_FAILURE(errorCode_)) { ++ errorCode = errorCode_; ++ } ++ return LocaleMatcher(*this, errorCode); ++} ++ ++namespace { ++ ++LSR getMaximalLsrOrUnd(const XLikelySubtags &likelySubtags, const Locale &locale, ++ UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode) || locale.isBogus() || *locale.getName() == 0 /* "und" */) { ++ return UND_LSR; ++ } else { ++ return likelySubtags.makeMaximizedLsrFrom(locale, errorCode); ++ } ++} ++ ++int32_t hashLSR(const UHashTok token) { ++ const LSR *lsr = static_cast<const LSR *>(token.pointer); ++ return lsr->hashCode; ++} ++ ++UBool compareLSRs(const UHashTok t1, const UHashTok t2) { ++ const LSR *lsr1 = static_cast<const LSR *>(t1.pointer); ++ const LSR *lsr2 = static_cast<const LSR *>(t2.pointer); ++ return *lsr1 == *lsr2; ++} ++ ++bool putIfAbsent(UHashtable *lsrToIndex, const LSR &lsr, int32_t i, UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode)) { return false; } ++ U_ASSERT(i > 0); ++ int32_t index = uhash_geti(lsrToIndex, &lsr); ++ if (index != 0) { ++ return false; ++ } else { ++ uhash_puti(lsrToIndex, const_cast<LSR *>(&lsr), i, &errorCode); ++ return U_SUCCESS(errorCode); ++ } ++} ++ ++} // namespace ++ ++LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) : ++ likelySubtags(*XLikelySubtags::getSingleton(errorCode)), ++ 
localeDistance(*LocaleDistance::getSingleton(errorCode)), ++ thresholdDistance(builder.thresholdDistance_), ++ demotionPerDesiredLocale(0), ++ favorSubtag(builder.favor_), ++ supportedLocales(nullptr), lsrs(nullptr), supportedLocalesLength(0), ++ supportedLsrToIndex(nullptr), ++ supportedLSRs(nullptr), supportedIndexes(nullptr), supportedLSRsLength(0), ++ ownedDefaultLocale(nullptr), defaultLocale(nullptr), defaultLocaleIndex(-1) { ++ if (U_FAILURE(errorCode)) { return; } ++ if (thresholdDistance < 0) { ++ thresholdDistance = localeDistance.getDefaultScriptDistance(); ++ } ++ supportedLocalesLength = builder.supportedLocales_ != nullptr ? ++ builder.supportedLocales_->size() : 0; ++ const Locale *def = builder.defaultLocale_; ++ int32_t idef = -1; ++ if (supportedLocalesLength > 0) { ++ // Store the supported locales in input order, ++ // so that when different types are used (e.g., language tag strings) ++ // we can return those by parallel index. ++ supportedLocales = static_cast<const Locale **>( ++ uprv_malloc(supportedLocalesLength * sizeof(const Locale *))); ++ // Supported LRSs in input order. ++ // In C++, we store these permanently to simplify ownership management ++ // in the hash tables. Duplicate LSRs (if any) are unused overhead. ++ lsrs = new LSR[supportedLocalesLength]; ++ if (supportedLocales == nullptr || lsrs == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ // If the constructor fails partway, we need null pointers for destructibility. ++ uprv_memset(supportedLocales, 0, supportedLocalesLength * sizeof(const Locale *)); ++ // Also find the first supported locale whose LSR is ++ // the same as that for the default locale. 
++ LSR builderDefaultLSR; ++ const LSR *defLSR = nullptr; ++ if (def != nullptr) { ++ builderDefaultLSR = getMaximalLsrOrUnd(likelySubtags, *def, errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ defLSR = &builderDefaultLSR; ++ } ++ for (int32_t i = 0; i < supportedLocalesLength; ++i) { ++ const Locale &locale = *static_cast<Locale *>(builder.supportedLocales_->elementAt(i)); ++ supportedLocales[i] = locale.clone(); ++ if (supportedLocales[i] == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ const Locale &supportedLocale = *supportedLocales[i]; ++ LSR &lsr = lsrs[i] = getMaximalLsrOrUnd(likelySubtags, supportedLocale, errorCode); ++ lsr.setHashCode(); ++ if (U_FAILURE(errorCode)) { return; } ++ if (idef < 0 && defLSR != nullptr && lsr == *defLSR) { ++ idef = i; ++ defLSR = &lsr; // owned pointer to put into supportedLsrToIndex ++ if (*def == supportedLocale) { ++ def = &supportedLocale; // owned pointer to keep ++ } ++ } ++ } ++ ++ // We need an unordered map from LSR to first supported locale with that LSR, ++ // and an ordered list of (LSR, supported index). ++ // We insert the supported locales in the following order: ++ // 1. Default locale, if it is supported. ++ // 2. Priority locales (aka "paradigm locales") in builder order. ++ // 3. Remaining locales in builder order. ++ // In Java, we use a LinkedHashMap for both map & ordered lists. ++ // In C++, we use separate structures. ++ // We over-allocate arrays of LSRs and indexes for simplicity. ++ // We reserve slots at the array starts for the default and paradigm locales, ++ // plus enough for all supported locales. ++ // If there are few paradigm locales and few duplicate supported LSRs, ++ // then the amount of wasted space is small. 
++ supportedLsrToIndex = uhash_openSize(hashLSR, compareLSRs, uhash_compareLong, ++ supportedLocalesLength, &errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ int32_t paradigmLimit = 1 + localeDistance.getParadigmLSRsLength(); ++ int32_t suppLSRsCapacity = paradigmLimit + supportedLocalesLength; ++ supportedLSRs = static_cast<const LSR **>( ++ uprv_malloc(suppLSRsCapacity * sizeof(const LSR *))); ++ supportedIndexes = static_cast<int32_t *>( ++ uprv_malloc(suppLSRsCapacity * sizeof(int32_t))); ++ if (supportedLSRs == nullptr || supportedIndexes == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ int32_t paradigmIndex = 0; ++ int32_t otherIndex = paradigmLimit; ++ if (idef >= 0) { ++ uhash_puti(supportedLsrToIndex, const_cast<LSR *>(defLSR), idef + 1, &errorCode); ++ supportedLSRs[0] = defLSR; ++ supportedIndexes[0] = idef; ++ paradigmIndex = 1; ++ } ++ for (int32_t i = 0; i < supportedLocalesLength; ++i) { ++ if (i == idef) { continue; } ++ const Locale &locale = *supportedLocales[i]; ++ const LSR &lsr = lsrs[i]; ++ if (defLSR == nullptr) { ++ U_ASSERT(i == 0); ++ def = &locale; ++ defLSR = &lsr; ++ idef = 0; ++ uhash_puti(supportedLsrToIndex, const_cast<LSR *>(&lsr), 0 + 1, &errorCode); ++ supportedLSRs[0] = &lsr; ++ supportedIndexes[0] = 0; ++ paradigmIndex = 1; ++ } else if (idef >= 0 && lsr == *defLSR) { ++ // lsr == *defLSR means that this supported locale is ++ // a duplicate of the default locale. ++ // Either an explicit default locale is supported, and we added it before the loop, ++ // or there is no explicit default locale, and this is ++ // a duplicate of the first supported locale. ++ // In both cases, idef >= 0 now, so otherwise we can skip the comparison. ++ // For a duplicate, putIfAbsent() is a no-op, so nothing to do. 
++ } else { ++ if (putIfAbsent(supportedLsrToIndex, lsr, i + 1, errorCode)) { ++ if (localeDistance.isParadigmLSR(lsr)) { ++ supportedLSRs[paradigmIndex] = &lsr; ++ supportedIndexes[paradigmIndex++] = i; ++ } else { ++ supportedLSRs[otherIndex] = &lsr; ++ supportedIndexes[otherIndex++] = i; ++ } ++ } ++ } ++ if (U_FAILURE(errorCode)) { return; } ++ } ++ // Squeeze out unused array slots. ++ if (paradigmIndex < paradigmLimit && paradigmLimit < otherIndex) { ++ uprv_memmove(supportedLSRs + paradigmIndex, supportedLSRs + paradigmLimit, ++ (otherIndex - paradigmLimit) * sizeof(const LSR *)); ++ uprv_memmove(supportedIndexes + paradigmIndex, supportedIndexes + paradigmLimit, ++ (otherIndex - paradigmLimit) * sizeof(int32_t)); ++ } ++ supportedLSRsLength = otherIndex - (paradigmLimit - paradigmIndex); ++ } ++ ++ if (def != nullptr && (idef < 0 || def != supportedLocales[idef])) { ++ ownedDefaultLocale = def->clone(); ++ if (ownedDefaultLocale == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ def = ownedDefaultLocale; ++ } ++ defaultLocale = def; ++ defaultLocaleIndex = idef; ++ ++ if (builder.demotion_ == ULOCMATCH_DEMOTION_REGION) { ++ demotionPerDesiredLocale = localeDistance.getDefaultDemotionPerDesiredLocale(); ++ } ++} ++ ++LocaleMatcher::LocaleMatcher(LocaleMatcher &&src) U_NOEXCEPT : ++ likelySubtags(src.likelySubtags), ++ localeDistance(src.localeDistance), ++ thresholdDistance(src.thresholdDistance), ++ demotionPerDesiredLocale(src.demotionPerDesiredLocale), ++ favorSubtag(src.favorSubtag), ++ supportedLocales(src.supportedLocales), lsrs(src.lsrs), ++ supportedLocalesLength(src.supportedLocalesLength), ++ supportedLsrToIndex(src.supportedLsrToIndex), ++ supportedLSRs(src.supportedLSRs), ++ supportedIndexes(src.supportedIndexes), ++ supportedLSRsLength(src.supportedLSRsLength), ++ ownedDefaultLocale(src.ownedDefaultLocale), defaultLocale(src.defaultLocale), ++ defaultLocaleIndex(src.defaultLocaleIndex) { ++ src.supportedLocales = nullptr; 
++ src.lsrs = nullptr; ++ src.supportedLocalesLength = 0; ++ src.supportedLsrToIndex = nullptr; ++ src.supportedLSRs = nullptr; ++ src.supportedIndexes = nullptr; ++ src.supportedLSRsLength = 0; ++ src.ownedDefaultLocale = nullptr; ++ src.defaultLocale = nullptr; ++ src.defaultLocaleIndex = -1; ++} ++ ++LocaleMatcher::~LocaleMatcher() { ++ for (int32_t i = 0; i < supportedLocalesLength; ++i) { ++ delete supportedLocales[i]; ++ } ++ uprv_free(supportedLocales); ++ delete[] lsrs; ++ uhash_close(supportedLsrToIndex); ++ uprv_free(supportedLSRs); ++ uprv_free(supportedIndexes); ++ delete ownedDefaultLocale; ++} ++ ++LocaleMatcher &LocaleMatcher::operator=(LocaleMatcher &&src) U_NOEXCEPT { ++ this->~LocaleMatcher(); ++ ++ thresholdDistance = src.thresholdDistance; ++ demotionPerDesiredLocale = src.demotionPerDesiredLocale; ++ favorSubtag = src.favorSubtag; ++ supportedLocales = src.supportedLocales; ++ lsrs = src.lsrs; ++ supportedLocalesLength = src.supportedLocalesLength; ++ supportedLsrToIndex = src.supportedLsrToIndex; ++ supportedLSRs = src.supportedLSRs; ++ supportedIndexes = src.supportedIndexes; ++ supportedLSRsLength = src.supportedLSRsLength; ++ ownedDefaultLocale = src.ownedDefaultLocale; ++ defaultLocale = src.defaultLocale; ++ defaultLocaleIndex = src.defaultLocaleIndex; ++ ++ src.supportedLocales = nullptr; ++ src.lsrs = nullptr; ++ src.supportedLocalesLength = 0; ++ src.supportedLsrToIndex = nullptr; ++ src.supportedLSRs = nullptr; ++ src.supportedIndexes = nullptr; ++ src.supportedLSRsLength = 0; ++ src.ownedDefaultLocale = nullptr; ++ src.defaultLocale = nullptr; ++ src.defaultLocaleIndex = -1; ++ return *this; ++} ++ ++class LocaleLsrIterator { ++public: ++ LocaleLsrIterator(const XLikelySubtags &likelySubtags, Locale::Iterator &locales, ++ ULocMatchLifetime lifetime) : ++ likelySubtags(likelySubtags), locales(locales), lifetime(lifetime) {} ++ ++ ~LocaleLsrIterator() { ++ if (lifetime == ULOCMATCH_TEMPORARY_LOCALES) { ++ delete remembered; ++ } ++ } 
++ ++ bool hasNext() const { ++ return locales.hasNext(); ++ } ++ ++ LSR next(UErrorCode &errorCode) { ++ current = &locales.next(); ++ return getMaximalLsrOrUnd(likelySubtags, *current, errorCode); ++ } ++ ++ void rememberCurrent(int32_t desiredIndex, UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode)) { return; } ++ bestDesiredIndex = desiredIndex; ++ if (lifetime == ULOCMATCH_STORED_LOCALES) { ++ remembered = current; ++ } else { ++ // ULOCMATCH_TEMPORARY_LOCALES ++ delete remembered; ++ remembered = new Locale(*current); ++ if (remembered == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ } ++ } ++ } ++ ++ const Locale *orphanRemembered() { ++ const Locale *rem = remembered; ++ remembered = nullptr; ++ return rem; ++ } ++ ++ int32_t getBestDesiredIndex() const { ++ return bestDesiredIndex; ++ } ++ ++private: ++ const XLikelySubtags &likelySubtags; ++ Locale::Iterator &locales; ++ ULocMatchLifetime lifetime; ++ const Locale *current = nullptr, *remembered = nullptr; ++ int32_t bestDesiredIndex = -1; ++}; ++ ++const Locale *LocaleMatcher::getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const { ++ if (U_FAILURE(errorCode)) { return nullptr; } ++ int32_t suppIndex = getBestSuppIndex( ++ getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode), ++ nullptr, errorCode); ++ return U_SUCCESS(errorCode) && suppIndex >= 0 ? supportedLocales[suppIndex] : defaultLocale; ++} ++ ++const Locale *LocaleMatcher::getBestMatch(Locale::Iterator &desiredLocales, ++ UErrorCode &errorCode) const { ++ if (U_FAILURE(errorCode)) { return nullptr; } ++ if (!desiredLocales.hasNext()) { ++ return defaultLocale; ++ } ++ LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES); ++ int32_t suppIndex = getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode); ++ return U_SUCCESS(errorCode) && suppIndex >= 0 ? 
supportedLocales[suppIndex] : defaultLocale; ++} ++ ++const Locale *LocaleMatcher::getBestMatchForListString( ++ StringPiece desiredLocaleList, UErrorCode &errorCode) const { ++ LocalePriorityList list(desiredLocaleList, errorCode); ++ LocalePriorityList::Iterator iter = list.iterator(); ++ return getBestMatch(iter, errorCode); ++} ++ ++LocaleMatcher::Result LocaleMatcher::getBestMatchResult( ++ const Locale &desiredLocale, UErrorCode &errorCode) const { ++ if (U_FAILURE(errorCode)) { ++ return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); ++ } ++ int32_t suppIndex = getBestSuppIndex( ++ getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode), ++ nullptr, errorCode); ++ if (U_FAILURE(errorCode) || suppIndex < 0) { ++ return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); ++ } else { ++ return Result(&desiredLocale, supportedLocales[suppIndex], 0, suppIndex, FALSE); ++ } ++} ++ ++LocaleMatcher::Result LocaleMatcher::getBestMatchResult( ++ Locale::Iterator &desiredLocales, UErrorCode &errorCode) const { ++ if (U_FAILURE(errorCode) || !desiredLocales.hasNext()) { ++ return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); ++ } ++ LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES); ++ int32_t suppIndex = getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode); ++ if (U_FAILURE(errorCode) || suppIndex < 0) { ++ return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); ++ } else { ++ return Result(lsrIter.orphanRemembered(), supportedLocales[suppIndex], ++ lsrIter.getBestDesiredIndex(), suppIndex, TRUE); ++ } ++} ++ ++int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remainingIter, ++ UErrorCode &errorCode) const { ++ if (U_FAILURE(errorCode)) { return -1; } ++ int32_t desiredIndex = 0; ++ int32_t bestSupportedLsrIndex = -1; ++ for (int32_t bestDistance = thresholdDistance;;) { ++ // Quick check for exact maximized LSR. 
++ // Returns suppIndex+1 where 0 means not found. ++ if (supportedLsrToIndex != nullptr) { ++ desiredLSR.setHashCode(); ++ int32_t index = uhash_geti(supportedLsrToIndex, &desiredLSR); ++ if (index != 0) { ++ int32_t suppIndex = index - 1; ++ if (remainingIter != nullptr) { ++ remainingIter->rememberCurrent(desiredIndex, errorCode); ++ } ++ return suppIndex; ++ } ++ } ++ int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance( ++ desiredLSR, supportedLSRs, supportedLSRsLength, bestDistance, favorSubtag); ++ if (bestIndexAndDistance >= 0) { ++ bestDistance = bestIndexAndDistance & 0xff; ++ if (remainingIter != nullptr) { ++ remainingIter->rememberCurrent(desiredIndex, errorCode); ++ if (U_FAILURE(errorCode)) { return -1; } ++ } ++ bestSupportedLsrIndex = bestIndexAndDistance >= 0 ? bestIndexAndDistance >> 8 : -1; ++ } ++ if ((bestDistance -= demotionPerDesiredLocale) <= 0) { ++ break; ++ } ++ if (remainingIter == nullptr || !remainingIter->hasNext()) { ++ break; ++ } ++ desiredLSR = remainingIter->next(errorCode); ++ if (U_FAILURE(errorCode)) { return -1; } ++ ++desiredIndex; ++ } ++ if (bestSupportedLsrIndex < 0) { ++ // no good match ++ return -1; ++ } ++ return supportedIndexes[bestSupportedLsrIndex]; ++} ++ ++double LocaleMatcher::internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const { ++ // Returns the inverse of the distance: That is, 1-distance(desired, supported). 
++ LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, supported, errorCode); ++ if (U_FAILURE(errorCode)) { return 0; } ++ const LSR *pSuppLSR = &suppLSR; ++ int32_t distance = localeDistance.getBestIndexAndDistance( ++ getMaximalLsrOrUnd(likelySubtags, desired, errorCode), ++ &pSuppLSR, 1, ++ thresholdDistance, favorSubtag) & 0xff; ++ return (100 - distance) / 100.0; ++} ++ ++U_NAMESPACE_END ++ ++#endif // __LOCMATCHER_H__ +diff --git a/source/common/localeprioritylist.cpp b/source/common/localeprioritylist.cpp +new file mode 100644 +index 00000000..06442fb4 +--- /dev/null ++++ b/source/common/localeprioritylist.cpp +@@ -0,0 +1,239 @@ ++// © 2019 and later: Unicode, Inc. and others. ++// License & terms of use: http://www.unicode.org/copyright.html#License ++ ++// localeprioritylist.cpp ++// created: 2019jul11 Markus W. Scherer ++ ++#include "unicode/utypes.h" ++#include "unicode/localpointer.h" ++#include "unicode/locid.h" ++#include "unicode/stringpiece.h" ++#include "unicode/uobject.h" ++#include "charstr.h" ++#include "cmemory.h" ++#include "localeprioritylist.h" ++#include "uarrsort.h" ++#include "uassert.h" ++#include "uhash.h" ++ ++U_NAMESPACE_BEGIN ++ ++namespace { ++ ++int32_t hashLocale(const UHashTok token) { ++ auto *locale = static_cast<const Locale *>(token.pointer); ++ return locale->hashCode(); ++} ++ ++UBool compareLocales(const UHashTok t1, const UHashTok t2) { ++ auto *l1 = static_cast<const Locale *>(t1.pointer); ++ auto *l2 = static_cast<const Locale *>(t2.pointer); ++ return *l1 == *l2; ++} ++ ++constexpr int32_t WEIGHT_ONE = 1000; ++ ++struct LocaleAndWeight { ++ Locale *locale; ++ int32_t weight; // 0..1000 = 0.0..1.0 ++ int32_t index; // force stable sort ++ ++ int32_t compare(const LocaleAndWeight &other) const { ++ int32_t diff = other.weight - weight; // descending: other-this ++ if (diff != 0) { return diff; } ++ return index - other.index; ++ } ++}; ++ ++int32_t U_CALLCONV ++compareLocaleAndWeight(const void * /*context*/, const void 
*left, const void *right) { ++ return static_cast<const LocaleAndWeight *>(left)-> ++ compare(*static_cast<const LocaleAndWeight *>(right)); ++} ++ ++const char *skipSpaces(const char *p, const char *limit) { ++ while (p < limit && *p == ' ') { ++p; } ++ return p; ++} ++ ++int32_t findTagLength(const char *p, const char *limit) { ++ // Look for accept-language delimiters. ++ // Leave other validation up to the Locale constructor. ++ const char *q; ++ for (q = p; q < limit; ++q) { ++ char c = *q; ++ if (c == ' ' || c == ',' || c == ';') { break; } ++ } ++ return static_cast<int32_t>(q - p); ++} ++ ++/** ++ * Parses and returns a qvalue weight in millis. ++ * Advances p to after the parsed substring. ++ * Returns a negative value if parsing fails. ++ */ ++int32_t parseWeight(const char *&p, const char *limit) { ++ p = skipSpaces(p, limit); ++ char c; ++ if (p == limit || ((c = *p) != '0' && c != '1')) { return -1; } ++ int32_t weight = (c - '0') * 1000; ++ if (++p == limit || *p != '.') { return weight; } ++ int32_t multiplier = 100; ++ while (++p != limit && '0' <= (c = *p) && c <= '9') { ++ c -= '0'; ++ if (multiplier > 0) { ++ weight += c * multiplier; ++ multiplier /= 10; ++ } else if (multiplier == 0) { ++ // round up ++ if (c >= 5) { ++weight; } ++ multiplier = -1; ++ } // else ignore further fraction digits ++ } ++ return weight <= WEIGHT_ONE ? weight : -1; // bad if > 1.0 ++} ++ ++} // namespace ++ ++/** ++ * Nothing but a wrapper over a MaybeStackArray of LocaleAndWeight. ++ * ++ * This wrapper exists (and is not in an anonymous namespace) ++ * so that we can forward-declare it in the header file and ++ * don't have to expose the MaybeStackArray specialization and ++ * the LocaleAndWeight to code (like the test) that #includes localeprioritylist.h. ++ * Also, otherwise we would have to do a platform-specific ++ * template export declaration of some kind for the MaybeStackArray specialization ++ * to be properly exported from the common DLL. 
++ */ ++struct LocaleAndWeightArray : public UMemory { ++ MaybeStackArray<LocaleAndWeight, 20> array; ++}; ++ ++LocalePriorityList::LocalePriorityList(StringPiece s, UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode)) { return; } ++ list = new LocaleAndWeightArray(); ++ if (list == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ const char *p = s.data(); ++ const char *limit = p + s.length(); ++ while ((p = skipSpaces(p, limit)) != limit) { ++ if (*p == ',') { // empty range field ++ ++p; ++ continue; ++ } ++ int32_t tagLength = findTagLength(p, limit); ++ if (tagLength == 0) { ++ errorCode = U_ILLEGAL_ARGUMENT_ERROR; ++ return; ++ } ++ CharString tag(p, tagLength, errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ Locale locale = Locale(tag.data()); ++ if (locale.isBogus()) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ int32_t weight = WEIGHT_ONE; ++ if ((p = skipSpaces(p + tagLength, limit)) != limit && *p == ';') { ++ if ((p = skipSpaces(p + 1, limit)) == limit || *p != 'q' || ++ (p = skipSpaces(p + 1, limit)) == limit || *p != '=' || ++ (++p, (weight = parseWeight(p, limit)) < 0)) { ++ errorCode = U_ILLEGAL_ARGUMENT_ERROR; ++ return; ++ } ++ p = skipSpaces(p, limit); ++ } ++ if (p != limit && *p != ',') { // trailing junk ++ errorCode = U_ILLEGAL_ARGUMENT_ERROR; ++ return; ++ } ++ add(locale, weight, errorCode); ++ if (p == limit) { break; } ++ ++p; ++ } ++ sort(errorCode); ++} ++ ++LocalePriorityList::~LocalePriorityList() { ++ if (list != nullptr) { ++ for (int32_t i = 0; i < listLength; ++i) { ++ delete list->array[i].locale; ++ } ++ delete list; ++ } ++ uhash_close(map); ++} ++ ++const Locale *LocalePriorityList::localeAt(int32_t i) const { ++ return list->array[i].locale; ++} ++ ++Locale *LocalePriorityList::orphanLocaleAt(int32_t i) { ++ if (list == nullptr) { return nullptr; } ++ LocaleAndWeight &lw = list->array[i]; ++ Locale *l = lw.locale; ++ lw.locale = nullptr; ++ return l; ++} ++ ++bool 
LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode)) { return false; } ++ if (map == nullptr) { ++ if (weight <= 0) { return true; } // do not add q=0 ++ map = uhash_open(hashLocale, compareLocales, uhash_compareLong, &errorCode); ++ if (U_FAILURE(errorCode)) { return false; } ++ } ++ LocalPointer<Locale> clone; ++ int32_t index = uhash_geti(map, &locale); ++ if (index != 0) { ++ // Duplicate: Remove the old item and append it anew. ++ LocaleAndWeight &lw = list->array[index - 1]; ++ clone.adoptInstead(lw.locale); ++ lw.locale = nullptr; ++ lw.weight = 0; ++ ++numRemoved; ++ } ++ if (weight <= 0) { // do not add q=0 ++ if (index != 0) { ++ // Not strictly necessary but cleaner. ++ uhash_removei(map, &locale); ++ } ++ return true; ++ } ++ if (clone.isNull()) { ++ clone.adoptInstead(locale.clone()); ++ if (clone.isNull() || (clone->isBogus() && !locale.isBogus())) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return false; ++ } ++ } ++ if (listLength == list->array.getCapacity()) { ++ int32_t newCapacity = listLength < 50 ? 100 : 4 * listLength; ++ if (list->array.resize(newCapacity, listLength) == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return false; ++ } ++ } ++ uhash_puti(map, clone.getAlias(), listLength + 1, &errorCode); ++ if (U_FAILURE(errorCode)) { return false; } ++ LocaleAndWeight &lw = list->array[listLength]; ++ lw.locale = clone.orphan(); ++ lw.weight = weight; ++ lw.index = listLength++; ++ if (weight < WEIGHT_ONE) { hasWeights = true; } ++ U_ASSERT(uhash_count(map) == getLength()); ++ return true; ++} ++ ++void LocalePriorityList::sort(UErrorCode &errorCode) { ++ // Sort by descending weights if there is a mix of weights. ++ // The comparator forces a stable sort via the item index. 
++ if (U_FAILURE(errorCode) || getLength() <= 1 || !hasWeights) { return; } ++ uprv_sortArray(list->array.getAlias(), listLength, sizeof(LocaleAndWeight), ++ compareLocaleAndWeight, nullptr, FALSE, &errorCode); ++} ++ ++U_NAMESPACE_END +diff --git a/source/common/localeprioritylist.h b/source/common/localeprioritylist.h +new file mode 100644 +index 00000000..80ca38a7 +--- /dev/null ++++ b/source/common/localeprioritylist.h +@@ -0,0 +1,115 @@ ++// © 2019 and later: Unicode, Inc. and others. ++// License & terms of use: http://www.unicode.org/copyright.html#License ++ ++// localeprioritylist.h ++// created: 2019jul11 Markus W. Scherer ++ ++#ifndef __LOCALEPRIORITYLIST_H__ ++#define __LOCALEPRIORITYLIST_H__ ++ ++#include "unicode/utypes.h" ++#include "unicode/locid.h" ++#include "unicode/stringpiece.h" ++#include "unicode/uobject.h" ++ ++struct UHashtable; ++ ++U_NAMESPACE_BEGIN ++ ++struct LocaleAndWeightArray; ++ ++/** ++ * Parses a list of locales from an accept-language string. ++ * We are a bit more lenient than the spec: ++ * We accept extra whitespace in more places, empty range fields, ++ * and any number of qvalue fraction digits. ++ * ++ * https://tools.ietf.org/html/rfc2616#section-14.4 ++ * 14.4 Accept-Language ++ * ++ * Accept-Language = "Accept-Language" ":" ++ * 1#( language-range [ ";" "q" "=" qvalue ] ) ++ * language-range = ( ( 1*8ALPHA *( "-" 1*8ALPHA ) ) | "*" ) ++ * ++ * Each language-range MAY be given an associated quality value which ++ * represents an estimate of the user's preference for the languages ++ * specified by that range. The quality value defaults to "q=1". For ++ * example, ++ * ++ * Accept-Language: da, en-gb;q=0.8, en;q=0.7 ++ * ++ * https://tools.ietf.org/html/rfc2616#section-3.9 ++ * 3.9 Quality Values ++ * ++ * HTTP content negotiation (section 12) uses short "floating point" ++ * numbers to indicate the relative importance ("weight") of various ++ * negotiable parameters. 
A weight is normalized to a real number in ++ * the range 0 through 1, where 0 is the minimum and 1 the maximum ++ * value. If a parameter has a quality value of 0, then content with ++ * this parameter is `not acceptable' for the client. HTTP/1.1 ++ * applications MUST NOT generate more than three digits after the ++ * decimal point. User configuration of these values SHOULD also be ++ * limited in this fashion. ++ * ++ * qvalue = ( "0" [ "." 0*3DIGIT ] ) ++ * | ( "1" [ "." 0*3("0") ] ) ++ */ ++class U_COMMON_API LocalePriorityList : public UMemory { ++public: ++ class Iterator : public Locale::Iterator { ++ public: ++ UBool hasNext() const override { return count < length; } ++ ++ const Locale &next() override { ++ for(;;) { ++ const Locale *locale = list.localeAt(index++); ++ if (locale != nullptr) { ++ ++count; ++ return *locale; ++ } ++ } ++ } ++ ++ private: ++ friend class LocalePriorityList; ++ ++ Iterator(const LocalePriorityList &list) : list(list), length(list.getLength()) {} ++ ++ const LocalePriorityList &list; ++ int32_t index = 0; ++ int32_t count = 0; ++ const int32_t length; ++ }; ++ ++ LocalePriorityList(StringPiece s, UErrorCode &errorCode); ++ ++ ~LocalePriorityList(); ++ ++ int32_t getLength() const { return listLength - numRemoved; } ++ ++ int32_t getLengthIncludingRemoved() const { return listLength; } ++ ++ Iterator iterator() const { return Iterator(*this); } ++ ++ const Locale *localeAt(int32_t i) const; ++ ++ Locale *orphanLocaleAt(int32_t i); ++ ++private: ++ LocalePriorityList(const LocalePriorityList &) = delete; ++ LocalePriorityList &operator=(const LocalePriorityList &) = delete; ++ ++ bool add(const Locale &locale, int32_t weight, UErrorCode &errorCode); ++ ++ void sort(UErrorCode &errorCode); ++ ++ LocaleAndWeightArray *list = nullptr; ++ int32_t listLength = 0; ++ int32_t numRemoved = 0; ++ bool hasWeights = false; // other than 1.0 ++ UHashtable *map = nullptr; ++}; ++ ++U_NAMESPACE_END ++ ++#endif // __LOCALEPRIORITYLIST_H__ 
+diff --git a/source/common/locdistance.cpp b/source/common/locdistance.cpp +new file mode 100644 +index 00000000..800d0eac +--- /dev/null ++++ b/source/common/locdistance.cpp +@@ -0,0 +1,364 @@ ++// © 2019 and later: Unicode, Inc. and others. ++// License & terms of use: http://www.unicode.org/copyright.html#License ++ ++// locdistance.cpp ++// created: 2019may08 Markus W. Scherer ++ ++#include "unicode/utypes.h" ++#include "unicode/bytestrie.h" ++#include "unicode/localematcher.h" ++#include "unicode/locid.h" ++#include "unicode/uobject.h" ++#include "unicode/ures.h" ++#include "cstring.h" ++#include "locdistance.h" ++#include "loclikelysubtags.h" ++#include "uassert.h" ++#include "ucln_cmn.h" ++#include "uinvchar.h" ++#include "umutex.h" ++ ++U_NAMESPACE_BEGIN ++ ++namespace { ++ ++/** ++ * Bit flag used on the last character of a subtag in the trie. ++ * Must be set consistently by the builder and the lookup code. ++ */ ++constexpr int32_t END_OF_SUBTAG = 0x80; ++/** Distance value bit flag, set by the builder. */ ++constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; ++/** Distance value bit flag, set by trieNext(). */ ++constexpr int32_t DISTANCE_IS_FINAL = 0x100; ++constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; ++ ++constexpr int32_t ABOVE_THRESHOLD = 100; ++ ++// Indexes into array of distances. ++enum { ++ IX_DEF_LANG_DISTANCE, ++ IX_DEF_SCRIPT_DISTANCE, ++ IX_DEF_REGION_DISTANCE, ++ IX_MIN_REGION_DISTANCE, ++ IX_LIMIT ++}; ++ ++LocaleDistance *gLocaleDistance = nullptr; ++UInitOnce gInitOnce = U_INITONCE_INITIALIZER; ++ ++UBool U_CALLCONV cleanup() { ++ delete gLocaleDistance; ++ gLocaleDistance = nullptr; ++ gInitOnce.reset(); ++ return TRUE; ++} ++ ++} // namespace ++ ++void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { ++ // This function is invoked only via umtx_initOnce(). 
++ U_ASSERT(gLocaleDistance == nullptr); ++ const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ const LocaleDistanceData &data = likely.getDistanceData(); ++ if (data.distanceTrieBytes == nullptr || ++ data.regionToPartitions == nullptr || data.partitions == nullptr || ++ // ok if no paradigms ++ data.distances == nullptr) { ++ errorCode = U_MISSING_RESOURCE_ERROR; ++ return; ++ } ++ gLocaleDistance = new LocaleDistance(data); ++ if (gLocaleDistance == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); ++} ++ ++const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode)) { return nullptr; } ++ umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); ++ return gLocaleDistance; ++} ++ ++LocaleDistance::LocaleDistance(const LocaleDistanceData &data) : ++ trie(data.distanceTrieBytes), ++ regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), ++ paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), ++ defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), ++ defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), ++ defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), ++ minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { ++ // For the default demotion value, use the ++ // default region distance between unrelated Englishes. ++ // Thus, unless demotion is turned off, ++ // a mere region difference for one desired locale ++ // is as good as a perfect match for the next following desired locale. ++ // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. 
++ LSR en("en", "Latn", "US"); ++ LSR enGB("en", "Latn", "GB"); ++ const LSR *p_enGB = &enGB; ++ defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1, ++ 50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff; ++} ++ ++int32_t LocaleDistance::getBestIndexAndDistance( ++ const LSR &desired, ++ const LSR **supportedLSRs, int32_t supportedLSRsLength, ++ int32_t threshold, ULocMatchFavorSubtag favorSubtag) const { ++ BytesTrie iter(trie); ++ // Look up the desired language only once for all supported LSRs. ++ // Its "distance" is either a match point value of 0, or a non-match negative value. ++ // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. ++ int32_t desLangDistance = trieNext(iter, desired.language, false); ++ uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; ++ // Index of the supported LSR with the lowest distance. ++ int32_t bestIndex = -1; ++ for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { ++ const LSR &supported = *supportedLSRs[slIndex]; ++ bool star = false; ++ int32_t distance = desLangDistance; ++ if (distance >= 0) { ++ U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); ++ if (slIndex != 0) { ++ iter.resetToState64(desLangState); ++ } ++ distance = trieNext(iter, supported.language, true); ++ } ++ // Note: The data builder verifies that there are no rules with "any" (*) language and ++ // real (non *) script or region subtags. ++ // This means that if the lookup for either language fails we can use ++ // the default distances without further lookups. 
++ int32_t flags; ++ if (distance >= 0) { ++ flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; ++ distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; ++ } else { // <*, *> ++ if (uprv_strcmp(desired.language, supported.language) == 0) { ++ distance = 0; ++ } else { ++ distance = defaultLanguageDistance; ++ } ++ flags = 0; ++ star = true; ++ } ++ U_ASSERT(0 <= distance && distance <= 100); ++ // We implement "favor subtag" by reducing the language subtag distance ++ // (unscientifically reducing it to a quarter of the normal value), ++ // so that the script distance is relatively more important. ++ // For example, given a default language distance of 80, we reduce it to 20, ++ // which is below the default threshold of 50, which is the default script distance. ++ if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { ++ distance >>= 2; ++ } ++ if (distance >= threshold) { ++ continue; ++ } ++ ++ int32_t scriptDistance; ++ if (star || flags != 0) { ++ if (uprv_strcmp(desired.script, supported.script) == 0) { ++ scriptDistance = 0; ++ } else { ++ scriptDistance = defaultScriptDistance; ++ } ++ } else { ++ scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), ++ desired.script, supported.script); ++ flags = scriptDistance & DISTANCE_IS_FINAL; ++ scriptDistance &= ~DISTANCE_IS_FINAL; ++ } ++ distance += scriptDistance; ++ if (distance >= threshold) { ++ continue; ++ } ++ ++ if (uprv_strcmp(desired.region, supported.region) == 0) { ++ // regionDistance = 0 ++ } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { ++ distance += defaultRegionDistance; ++ } else { ++ int32_t remainingThreshold = threshold - distance; ++ if (minRegionDistance >= remainingThreshold) { ++ continue; ++ } ++ ++ // From here on we know the regions are not equal. ++ // Map each region to zero or more partitions. (zero = one non-matching string) ++ // (Each array of single-character partition strings is encoded as one string.) 
++ // If either side has more than one, then we find the maximum distance. ++ // This could be optimized by adding some more structure, but probably not worth it. ++ distance += getRegionPartitionsDistance( ++ iter, iter.getState64(), ++ partitionsForRegion(desired), ++ partitionsForRegion(supported), ++ remainingThreshold); ++ } ++ if (distance < threshold) { ++ if (distance == 0) { ++ return slIndex << 8; ++ } ++ bestIndex = slIndex; ++ threshold = distance; ++ } ++ } ++ return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD; ++} ++ ++int32_t LocaleDistance::getDesSuppScriptDistance( ++ BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { ++ // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. ++ int32_t distance = trieNext(iter, desired, false); ++ if (distance >= 0) { ++ distance = trieNext(iter, supported, true); ++ } ++ if (distance < 0) { ++ UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> ++ U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); ++ if (uprv_strcmp(desired, supported) == 0) { ++ distance = 0; // same script ++ } else { ++ distance = iter.getValue(); ++ U_ASSERT(distance >= 0); ++ } ++ if (result == USTRINGTRIE_FINAL_VALUE) { ++ distance |= DISTANCE_IS_FINAL; ++ } ++ } ++ return distance; ++} ++ ++int32_t LocaleDistance::getRegionPartitionsDistance( ++ BytesTrie &iter, uint64_t startState, ++ const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { ++ char desired = *desiredPartitions++; ++ char supported = *supportedPartitions++; ++ U_ASSERT(desired != 0 && supported != 0); ++ // See if we have single desired/supported partitions, from NUL-terminated ++ // partition strings without explicit length. 
++ bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character ++ // equivalent to: if (desLength == 1 && suppLength == 1) ++ if (*desiredPartitions == 0 && !suppLengthGt1) { ++ // Fastpath for single desired/supported partitions. ++ UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); ++ if (USTRINGTRIE_HAS_NEXT(result)) { ++ result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); ++ if (USTRINGTRIE_HAS_VALUE(result)) { ++ return iter.getValue(); ++ } ++ } ++ return getFallbackRegionDistance(iter, startState); ++ } ++ ++ const char *supportedStart = supportedPartitions - 1; // for restart of inner loop ++ int32_t regionDistance = 0; ++ // Fall back to * only once, not for each pair of partition strings. ++ bool star = false; ++ for (;;) { ++ // Look up each desired-partition string only once, ++ // not for each (desired, supported) pair. ++ UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); ++ if (USTRINGTRIE_HAS_NEXT(result)) { ++ uint64_t desState = suppLengthGt1 ? 
iter.getState64() : 0; ++ for (;;) { ++ result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); ++ int32_t d; ++ if (USTRINGTRIE_HAS_VALUE(result)) { ++ d = iter.getValue(); ++ } else if (star) { ++ d = 0; ++ } else { ++ d = getFallbackRegionDistance(iter, startState); ++ star = true; ++ } ++ if (d >= threshold) { ++ return d; ++ } else if (regionDistance < d) { ++ regionDistance = d; ++ } ++ if ((supported = *supportedPartitions++) != 0) { ++ iter.resetToState64(desState); ++ } else { ++ break; ++ } ++ } ++ } else if (!star) { ++ int32_t d = getFallbackRegionDistance(iter, startState); ++ if (d >= threshold) { ++ return d; ++ } else if (regionDistance < d) { ++ regionDistance = d; ++ } ++ star = true; ++ } ++ if ((desired = *desiredPartitions++) != 0) { ++ iter.resetToState64(startState); ++ supportedPartitions = supportedStart; ++ supported = *supportedPartitions++; ++ } else { ++ break; ++ } ++ } ++ return regionDistance; ++} ++ ++int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { ++#if U_DEBUG ++ UStringTrieResult result = ++#endif ++ iter.resetToState64(startState).next(u'*'); // <*, *> ++ U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); ++ int32_t distance = iter.getValue(); ++ U_ASSERT(distance >= 0); ++ return distance; ++} ++ ++int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { ++ uint8_t c; ++ if ((c = *s) == 0) { ++ return -1; // no empty subtags in the distance data ++ } ++ for (;;) { ++ c = uprv_invCharToAscii(c); ++ // EBCDIC: If *s is not an invariant character, ++ // then c is now 0 and will simply not match anything, which is harmless. 
++ uint8_t next = *++s; ++ if (next != 0) { ++ if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { ++ return -1; ++ } ++ } else { ++ // last character of this subtag ++ UStringTrieResult result = iter.next(c | END_OF_SUBTAG); ++ if (wantValue) { ++ if (USTRINGTRIE_HAS_VALUE(result)) { ++ int32_t value = iter.getValue(); ++ if (result == USTRINGTRIE_FINAL_VALUE) { ++ value |= DISTANCE_IS_FINAL; ++ } ++ return value; ++ } ++ } else { ++ if (USTRINGTRIE_HAS_NEXT(result)) { ++ return 0; ++ } ++ } ++ return -1; ++ } ++ c = next; ++ } ++} ++ ++UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { ++ // Linear search for a very short list (length 6 as of 2019). ++ // If there are many paradigm LSRs we should use a hash set. ++ U_ASSERT(paradigmLSRsLength <= 15); ++ for (int32_t i = 0; i < paradigmLSRsLength; ++i) { ++ if (lsr == paradigmLSRs[i]) { return true; } ++ } ++ return false; ++} ++ ++U_NAMESPACE_END +diff --git a/source/common/locdistance.h b/source/common/locdistance.h +new file mode 100644 +index 00000000..7439f51c +--- /dev/null ++++ b/source/common/locdistance.h +@@ -0,0 +1,109 @@ ++// © 2019 and later: Unicode, Inc. and others. ++// License & terms of use: http://www.unicode.org/copyright.html#License ++ ++// locdistance.h ++// created: 2019may08 Markus W. Scherer ++ ++#ifndef __LOCDISTANCE_H__ ++#define __LOCDISTANCE_H__ ++ ++#include "unicode/utypes.h" ++#include "unicode/bytestrie.h" ++#include "unicode/localematcher.h" ++#include "unicode/locid.h" ++#include "unicode/uobject.h" ++#include "lsr.h" ++ ++U_NAMESPACE_BEGIN ++ ++struct LocaleDistanceData; ++ ++/** ++ * Offline-built data for LocaleMatcher. ++ * Mostly but not only the data for mapping locales to their maximized forms. ++ */ ++class LocaleDistance final : public UMemory { ++public: ++ static const LocaleDistance *getSingleton(UErrorCode &errorCode); ++ ++ /** ++ * Finds the supported LSR with the smallest distance from the desired one. 
++ * Equivalent LSR subtags must be normalized into a canonical form. ++ * ++ * <p>Returns the index of the lowest-distance supported LSR in bits 31..8 ++ * (negative if none has a distance below the threshold), ++ * and its distance (0..ABOVE_THRESHOLD) in bits 7..0. ++ */ ++ int32_t getBestIndexAndDistance(const LSR &desired, ++ const LSR **supportedLSRs, int32_t supportedLSRsLength, ++ int32_t threshold, ULocMatchFavorSubtag favorSubtag) const; ++ ++ int32_t getParadigmLSRsLength() const { return paradigmLSRsLength; } ++ ++ UBool isParadigmLSR(const LSR &lsr) const; ++ ++ int32_t getDefaultScriptDistance() const { ++ return defaultScriptDistance; ++ } ++ ++ int32_t getDefaultDemotionPerDesiredLocale() const { ++ return defaultDemotionPerDesiredLocale; ++ } ++ ++private: ++ LocaleDistance(const LocaleDistanceData &data); ++ LocaleDistance(const LocaleDistance &other) = delete; ++ LocaleDistance &operator=(const LocaleDistance &other) = delete; ++ ++ static void initLocaleDistance(UErrorCode &errorCode); ++ ++ static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState, ++ const char *desired, const char *supported); ++ ++ static int32_t getRegionPartitionsDistance( ++ BytesTrie &iter, uint64_t startState, ++ const char *desiredPartitions, const char *supportedPartitions, ++ int32_t threshold); ++ ++ static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState); ++ ++ static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue); ++ ++ const char *partitionsForRegion(const LSR &lsr) const { ++ // ill-formed region -> one non-matching string ++ int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex]; ++ return partitionArrays[pIndex]; ++ } ++ ++ int32_t getDefaultRegionDistance() const { ++ return defaultRegionDistance; ++ } ++ ++ // The trie maps each dlang+slang+dscript+sscript+dregion+sregion ++ // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance. 
++ // There is also a trie value for each subsequence of whole subtags. ++ // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"". ++ BytesTrie trie; ++ ++ /** ++ * Maps each region to zero or more single-character partitions. ++ */ ++ const uint8_t *regionToPartitionsIndex; ++ const char **partitionArrays; ++ ++ /** ++ * Used to get the paradigm region for a cluster, if there is one. ++ */ ++ const LSR *paradigmLSRs; ++ int32_t paradigmLSRsLength; ++ ++ int32_t defaultLanguageDistance; ++ int32_t defaultScriptDistance; ++ int32_t defaultRegionDistance; ++ int32_t minRegionDistance; ++ int32_t defaultDemotionPerDesiredLocale; ++}; ++ ++U_NAMESPACE_END ++ ++#endif // __LOCDISTANCE_H__ +diff --git a/source/common/locid.cpp b/source/common/locid.cpp +index caffdb8b..93f3d3cb 100644 +--- a/source/common/locid.cpp ++++ b/source/common/locid.cpp +@@ -1399,5 +1399,7 @@ Locale::getBaseName() const { + return baseName; + } + ++Locale::Iterator::~Iterator() = default; ++ + //eof + U_NAMESPACE_END +diff --git a/source/common/loclikelysubtags.cpp b/source/common/loclikelysubtags.cpp +new file mode 100644 +index 00000000..d7f5e124 +--- /dev/null ++++ b/source/common/loclikelysubtags.cpp +@@ -0,0 +1,638 @@ ++// © 2019 and later: Unicode, Inc. and others. ++// License & terms of use: http://www.unicode.org/copyright.html#License ++ ++// loclikelysubtags.cpp ++// created: 2019may08 Markus W. 
Scherer ++ ++#include <utility> ++#include "unicode/utypes.h" ++#include "unicode/bytestrie.h" ++#include "unicode/localpointer.h" ++#include "unicode/locid.h" ++#include "unicode/uobject.h" ++#include "unicode/ures.h" ++#include "charstr.h" ++#include "cstring.h" ++#include "loclikelysubtags.h" ++#include "lsr.h" ++#include "uassert.h" ++#include "ucln_cmn.h" ++#include "uhash.h" ++#include "uinvchar.h" ++#include "umutex.h" ++#include "uresdata.h" ++#include "uresimp.h" ++ ++U_NAMESPACE_BEGIN ++ ++namespace { ++ ++constexpr char PSEUDO_ACCENTS_PREFIX = '\''; // -XA, -PSACCENT ++constexpr char PSEUDO_BIDI_PREFIX = '+'; // -XB, -PSBIDI ++constexpr char PSEUDO_CRACKED_PREFIX = ','; // -XC, -PSCRACK ++ ++/** ++ * Stores NUL-terminated strings with duplicate elimination. ++ * Checks for unique UTF-16 string pointers and converts to invariant characters. ++ */ ++class UniqueCharStrings { ++public: ++ UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) { ++ uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ strings = new CharString(); ++ if (strings == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ } ++ } ++ ~UniqueCharStrings() { ++ uhash_close(&map); ++ delete strings; ++ } ++ ++ /** Returns/orphans the CharString that contains all strings. */ ++ CharString *orphanCharStrings() { ++ CharString *result = strings; ++ strings = nullptr; ++ return result; ++ } ++ ++ /** Adds a string and returns a unique number for it. */ ++ int32_t add(const UnicodeString &s, UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode)) { return 0; } ++ if (isFrozen) { ++ errorCode = U_NO_WRITE_PERMISSION; ++ return 0; ++ } ++ // The string points into the resource bundle. ++ const char16_t *p = s.getBuffer(); ++ int32_t oldIndex = uhash_geti(&map, p); ++ if (oldIndex != 0) { // found duplicate ++ return oldIndex; ++ } ++ // Explicit NUL terminator for the previous string. 
++ // The strings object is also terminated with one implicit NUL. ++ strings->append(0, errorCode); ++ int32_t newIndex = strings->length(); ++ strings->appendInvariantChars(s, errorCode); ++ uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode); ++ return newIndex; ++ } ++ ++ void freeze() { isFrozen = true; } ++ ++ /** ++ * Returns a string pointer for its unique number, if this object is frozen. ++ * Otherwise nullptr. ++ */ ++ const char *get(int32_t i) const { ++ U_ASSERT(isFrozen); ++ return isFrozen && i > 0 ? strings->data() + i : nullptr; ++ } ++ ++private: ++ UHashtable map; ++ CharString *strings; ++ bool isFrozen = false; ++}; ++ ++} // namespace ++ ++LocaleDistanceData::LocaleDistanceData(LocaleDistanceData &&data) : ++ distanceTrieBytes(data.distanceTrieBytes), ++ regionToPartitions(data.regionToPartitions), ++ partitions(data.partitions), ++ paradigms(data.paradigms), paradigmsLength(data.paradigmsLength), ++ distances(data.distances) { ++ data.partitions = nullptr; ++ data.paradigms = nullptr; ++} ++ ++LocaleDistanceData::~LocaleDistanceData() { ++ uprv_free(partitions); ++ delete[] paradigms; ++} ++ ++// TODO(ICU-20777): Rename to just LikelySubtagsData. 
++struct XLikelySubtagsData { ++ UResourceBundle *langInfoBundle = nullptr; ++ UniqueCharStrings strings; ++ CharStringMap languageAliases; ++ CharStringMap regionAliases; ++ const uint8_t *trieBytes = nullptr; ++ LSR *lsrs = nullptr; ++ int32_t lsrsLength = 0; ++ ++ LocaleDistanceData distanceData; ++ ++ XLikelySubtagsData(UErrorCode &errorCode) : strings(errorCode) {} ++ ++ ~XLikelySubtagsData() { ++ ures_close(langInfoBundle); ++ delete[] lsrs; ++ } ++ ++ void load(UErrorCode &errorCode) { ++ langInfoBundle = ures_openDirect(nullptr, "langInfo", &errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ StackUResourceBundle stackTempBundle; ++ ResourceDataValue value; ++ ures_getValueWithFallback(langInfoBundle, "likely", stackTempBundle.getAlias(), ++ value, errorCode); ++ ResourceTable likelyTable = value.getTable(errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ ++ // Read all strings in the resource bundle and convert them to invariant char *. ++ LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes; ++ int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0; ++ if (!readStrings(likelyTable, "languageAliases", value, ++ languageIndexes, languagesLength, errorCode) || ++ !readStrings(likelyTable, "regionAliases", value, ++ regionIndexes, regionsLength, errorCode) || ++ !readStrings(likelyTable, "lsrs", value, ++ lsrSubtagIndexes,lsrSubtagsLength, errorCode)) { ++ return; ++ } ++ if ((languagesLength & 1) != 0 || ++ (regionsLength & 1) != 0 || ++ (lsrSubtagsLength % 3) != 0) { ++ errorCode = U_INVALID_FORMAT_ERROR; ++ return; ++ } ++ if (lsrSubtagsLength == 0) { ++ errorCode = U_MISSING_RESOURCE_ERROR; ++ return; ++ } ++ ++ if (!likelyTable.findValue("trie", value)) { ++ errorCode = U_MISSING_RESOURCE_ERROR; ++ return; ++ } ++ int32_t length; ++ trieBytes = value.getBinary(length, errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ ++ // Also read distance/matcher data if available, ++ // to open & keep only one resource 
bundle pointer ++ // and to use one single UniqueCharStrings. ++ UErrorCode matchErrorCode = U_ZERO_ERROR; ++ ures_getValueWithFallback(langInfoBundle, "match", stackTempBundle.getAlias(), ++ value, matchErrorCode); ++ LocalMemory<int32_t> partitionIndexes, paradigmSubtagIndexes; ++ int32_t partitionsLength = 0, paradigmSubtagsLength = 0; ++ if (U_SUCCESS(matchErrorCode)) { ++ ResourceTable matchTable = value.getTable(errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ ++ if (matchTable.findValue("trie", value)) { ++ distanceData.distanceTrieBytes = value.getBinary(length, errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ } ++ ++ if (matchTable.findValue("regionToPartitions", value)) { ++ distanceData.regionToPartitions = value.getBinary(length, errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ if (length < LSR::REGION_INDEX_LIMIT) { ++ errorCode = U_INVALID_FORMAT_ERROR; ++ return; ++ } ++ } ++ ++ if (!readStrings(matchTable, "partitions", value, ++ partitionIndexes, partitionsLength, errorCode) || ++ !readStrings(matchTable, "paradigms", value, ++ paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) { ++ return; ++ } ++ if ((paradigmSubtagsLength % 3) != 0) { ++ errorCode = U_INVALID_FORMAT_ERROR; ++ return; ++ } ++ ++ if (matchTable.findValue("distances", value)) { ++ distanceData.distances = value.getIntVector(length, errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ if (length < 4) { // LocaleDistance IX_LIMIT ++ errorCode = U_INVALID_FORMAT_ERROR; ++ return; ++ } ++ } ++ } else if (matchErrorCode == U_MISSING_RESOURCE_ERROR) { ++ // ok for likely subtags ++ } else { // error other than missing resource ++ errorCode = matchErrorCode; ++ return; ++ } ++ ++ // Fetch & store invariant-character versions of strings ++ // only after we have collected and de-duplicated all of them. 
++ strings.freeze(); ++ ++ languageAliases = CharStringMap(languagesLength / 2, errorCode); ++ for (int32_t i = 0; i < languagesLength; i += 2) { ++ languageAliases.put(strings.get(languageIndexes[i]), ++ strings.get(languageIndexes[i + 1]), errorCode); ++ } ++ ++ regionAliases = CharStringMap(regionsLength / 2, errorCode); ++ for (int32_t i = 0; i < regionsLength; i += 2) { ++ regionAliases.put(strings.get(regionIndexes[i]), ++ strings.get(regionIndexes[i + 1]), errorCode); ++ } ++ if (U_FAILURE(errorCode)) { return; } ++ ++ lsrsLength = lsrSubtagsLength / 3; ++ lsrs = new LSR[lsrsLength]; ++ if (lsrs == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) { ++ lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]), ++ strings.get(lsrSubtagIndexes[i + 1]), ++ strings.get(lsrSubtagIndexes[i + 2])); ++ } ++ ++ if (partitionsLength > 0) { ++ distanceData.partitions = static_cast<const char **>( ++ uprv_malloc(partitionsLength * sizeof(const char *))); ++ if (distanceData.partitions == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ for (int32_t i = 0; i < partitionsLength; ++i) { ++ distanceData.partitions[i] = strings.get(partitionIndexes[i]); ++ } ++ } ++ ++ if (paradigmSubtagsLength > 0) { ++ distanceData.paradigmsLength = paradigmSubtagsLength / 3; ++ LSR *paradigms = new LSR[distanceData.paradigmsLength]; ++ if (paradigms == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) { ++ paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]), ++ strings.get(paradigmSubtagIndexes[i + 1]), ++ strings.get(paradigmSubtagIndexes[i + 2])); ++ } ++ distanceData.paradigms = paradigms; ++ } ++ } ++ ++private: ++ bool readStrings(const ResourceTable &table, const char *key, ResourceValue &value, ++ LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) { ++ if 
(table.findValue(key, value)) { ++ ResourceArray stringArray = value.getArray(errorCode); ++ if (U_FAILURE(errorCode)) { return false; } ++ length = stringArray.getSize(); ++ if (length == 0) { return true; } ++ int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length); ++ if (rawIndexes == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return false; ++ } ++ for (int i = 0; i < length; ++i) { ++ stringArray.getValue(i, value); // returns TRUE because i < length ++ rawIndexes[i] = strings.add(value.getUnicodeString(errorCode), errorCode); ++ if (U_FAILURE(errorCode)) { return false; } ++ } ++ } ++ return true; ++ } ++}; ++ ++namespace { ++ ++XLikelySubtags *gLikelySubtags = nullptr; ++UInitOnce gInitOnce = U_INITONCE_INITIALIZER; ++ ++UBool U_CALLCONV cleanup() { ++ delete gLikelySubtags; ++ gLikelySubtags = nullptr; ++ gInitOnce.reset(); ++ return TRUE; ++} ++ ++} // namespace ++ ++void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) { ++ // This function is invoked only via umtx_initOnce(). 
++ U_ASSERT(gLikelySubtags == nullptr); ++ XLikelySubtagsData data(errorCode); ++ data.load(errorCode); ++ if (U_FAILURE(errorCode)) { return; } ++ gLikelySubtags = new XLikelySubtags(data); ++ if (gLikelySubtags == nullptr) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup); ++} ++ ++const XLikelySubtags *XLikelySubtags::getSingleton(UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode)) { return nullptr; } ++ umtx_initOnce(gInitOnce, &XLikelySubtags::initLikelySubtags, errorCode); ++ return gLikelySubtags; ++} ++ ++XLikelySubtags::XLikelySubtags(XLikelySubtagsData &data) : ++ langInfoBundle(data.langInfoBundle), ++ strings(data.strings.orphanCharStrings()), ++ languageAliases(std::move(data.languageAliases)), ++ regionAliases(std::move(data.regionAliases)), ++ trie(data.trieBytes), ++ lsrs(data.lsrs), ++#if U_DEBUG ++ lsrsLength(data.lsrsLength), ++#endif ++ distanceData(std::move(data.distanceData)) { ++ data.langInfoBundle = nullptr; ++ data.lsrs = nullptr; ++ ++ // Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**"). 
++ UStringTrieResult result = trie.next(u'*'); ++ U_ASSERT(USTRINGTRIE_HAS_NEXT(result)); ++ trieUndState = trie.getState64(); ++ result = trie.next(u'*'); ++ U_ASSERT(USTRINGTRIE_HAS_NEXT(result)); ++ trieUndZzzzState = trie.getState64(); ++ result = trie.next(u'*'); ++ U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); ++ defaultLsrIndex = trie.getValue(); ++ trie.reset(); ++ ++ for (char16_t c = u'a'; c <= u'z'; ++c) { ++ result = trie.next(c); ++ if (result == USTRINGTRIE_NO_VALUE) { ++ trieFirstLetterStates[c - u'a'] = trie.getState64(); ++ } ++ trie.reset(); ++ } ++} ++ ++XLikelySubtags::~XLikelySubtags() { ++ ures_close(langInfoBundle); ++ delete strings; ++ delete[] lsrs; ++} ++ ++LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const { ++ const char *name = locale.getName(); ++ if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=") ++ // Private use language tag x-subtag-subtag... ++ return LSR(name, "", ""); ++ } ++ return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(), ++ locale.getVariant(), errorCode); ++} ++ ++namespace { ++ ++const char *getCanonical(const CharStringMap &aliases, const char *alias) { ++ const char *canonical = aliases.get(alias); ++ return canonical == nullptr ? alias : canonical; ++} ++ ++} // namespace ++ ++LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region, ++ const char *variant, UErrorCode &errorCode) const { ++ // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK. ++ // They should match only themselves, ++ // not other locales with what looks like the same language and script subtags. 
/**
 * Adds likely subtags to a (language, script, region) triple by walking the
 * likely-subtags trie one subtag at a time.
 *
 * "und", "Zzzz" and "ZZ" are treated like empty subtags. If all three subtags
 * are non-empty the input is returned unchanged. Otherwise the trie is
 * consulted, falling back to the '*' entry whenever a subtag is not found.
 *
 * retainOldMask records which of the caller's subtags must be kept in the
 * result instead of being replaced from the looked-up LSR:
 *   4 = keep language, 2 = keep script, 1 = keep region.
 *
 * The returned LSR aliases either the caller's strings or strings of the
 * entry in lsrs; nothing is copied.
 *
 * @param language Language subtag; may be "" or "und".
 * @param script   Script subtag; may be "" or "Zzzz".
 * @param region   Region subtag; may be "" or "ZZ".
 * @return the maximized LSR.
 */
LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region) const {
    if (uprv_strcmp(language, "und") == 0) {
        language = "";
    }
    if (uprv_strcmp(script, "Zzzz") == 0) {
        script = "";
    }
    if (uprv_strcmp(region, "ZZ") == 0) {
        region = "";
    }
    if (*script != 0 && *region != 0 && *language != 0) {
        return LSR(language, script, region);  // already maximized
    }

    uint32_t retainOldMask = 0;
    BytesTrie iter(trie);
    // Trie state after the last successfully matched subtag;
    // 0 means the language was not found and lookup continues from "und".
    uint64_t state;
    int32_t value;
    // Small optimization: Array lookup for first language letter.
    int32_t c0;
    if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
            language[1] != 0 &&  // language.length() >= 2
            (state = trieFirstLetterStates[c0]) != 0) {
        value = trieNext(iter.resetToState64(state), language, 1);
    } else {
        value = trieNext(iter, language, 0);
    }
    if (value >= 0) {
        // The language (or '*' for an empty language) is in the trie.
        if (*language != 0) {
            retainOldMask |= 4;
        }
        state = iter.getState64();
    } else {
        // Unknown language: keep it and continue as if it were "und".
        retainOldMask |= 4;
        iter.resetToState64(trieUndState);  // "und" ("*")
        state = 0;
    }

    if (value > 0) {
        // Intermediate or final value from just language.
        if (value == SKIP_SCRIPT) {
            value = 0;
        }
        if (*script != 0) {
            retainOldMask |= 2;
        }
    } else {
        value = trieNext(iter, script, 0);
        if (value >= 0) {
            if (*script != 0) {
                retainOldMask |= 2;
            }
            state = iter.getState64();
        } else {
            // Script not found below this language: keep it and
            // continue with the '*' script entry.
            retainOldMask |= 2;
            if (state == 0) {
                iter.resetToState64(trieUndZzzzState);  // "und-Zzzz" ("**")
            } else {
                iter.resetToState64(state);
                value = trieNext(iter, "", 0);
                U_ASSERT(value >= 0);
                state = iter.getState64();
            }
        }
    }

    if (value > 0) {
        // Final value from just language or language+script.
        if (*region != 0) {
            retainOldMask |= 1;
        }
    } else {
        value = trieNext(iter, region, 0);
        if (value >= 0) {
            if (*region != 0) {
                retainOldMask |= 1;
            }
        } else {
            // Region not found: keep it and use the '*' region entry,
            // or the cached "und-Zzzz-ZZ" default when nothing matched at all.
            retainOldMask |= 1;
            if (state == 0) {
                value = defaultLsrIndex;
            } else {
                iter.resetToState64(state);
                value = trieNext(iter, "", 0);
                U_ASSERT(value > 0);
            }
        }
    }
    U_ASSERT(value < lsrsLength);
    const LSR &result = lsrs[value];

    if (*language == 0) {
        language = "und";
    }

    if (retainOldMask == 0) {
        // Quickly return a copy of the lookup-result LSR
        // without new allocation of the subtags.
        return LSR(result.language, result.script, result.region);
    }
    // Replace only the subtags that the caller did not supply.
    if ((retainOldMask & 4) == 0) {
        language = result.language;
    }
    if ((retainOldMask & 2) == 0) {
        script = result.script;
    }
    if ((retainOldMask & 1) == 0) {
        region = result.region;
    }
    return LSR(language, script, region);
}
// NOTE: The function below is excluded from compilation by "#if 0".
// Its body is still written with Java constructs (ULocale.Minimize,
// "new BytesTrie(...)", "boolean", ".equals()", "new LSR(...)") and would not
// compile as C++ if the guard were removed; it needs to be ported first.
#if 0
LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn,
                                    const char *regionIn, ULocale.Minimize fieldToFavor,
                                    UErrorCode &errorCode) const {
    // Start from the fully maximized form and then try to drop subtags.
    LSR result = maximize(languageIn, scriptIn, regionIn);

    // We could try just a series of checks, like:
    // LSR result2 = addLikelySubtags(languageIn, "", "");
    // if result.equals(result2) return result2;
    // However, we can optimize 2 of the cases:
    //   (languageIn, "", "")
    //   (languageIn, "", regionIn)

    // value00 = lookup(result.language, "", "")
    BytesTrie iter = new BytesTrie(trie);
    int value = trieNext(iter, result.language, 0);
    U_ASSERT(value >= 0);
    if (value == 0) {
        value = trieNext(iter, "", 0);
        U_ASSERT(value >= 0);
        if (value == 0) {
            value = trieNext(iter, "", 0);
        }
    }
    U_ASSERT(value > 0);
    LSR value00 = lsrs[value];
    boolean favorRegionOk = false;
    if (result.script.equals(value00.script)) { //script is default
        if (result.region.equals(value00.region)) {
            return new LSR(result.language, "", "");
        } else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
            return new LSR(result.language, "", result.region);
        } else {
            favorRegionOk = true;
        }
    }

    // The last case is not as easy to optimize.
    // Maybe do later, but for now use the straightforward code.
    LSR result2 = maximize(languageIn, scriptIn, "");
    if (result2.equals(result)) {
        return new LSR(result.language, result.script, "");
    } else if (favorRegionOk) {
        return new LSR(result.language, "", result.region);
    }
    return result;
}
#endif
++// License & terms of use: http://www.unicode.org/copyright.html#License ++ ++// loclikelysubtags.h ++// created: 2019may08 Markus W. Scherer ++ ++#ifndef __LOCLIKELYSUBTAGS_H__ ++#define __LOCLIKELYSUBTAGS_H__ ++ ++#include <utility> ++#include "unicode/utypes.h" ++#include "unicode/bytestrie.h" ++#include "unicode/locid.h" ++#include "unicode/uobject.h" ++#include "unicode/ures.h" ++#include "lsr.h" ++#include "uhash.h" ++ ++U_NAMESPACE_BEGIN ++ ++struct XLikelySubtagsData; ++ ++/** ++ * Map of const char * keys & values. ++ * Stores pointers as is: Does not own/copy/adopt/release strings. ++ */ ++class CharStringMap final : public UMemory { ++public: ++ /** Constructs an unusable non-map. */ ++ CharStringMap() : map(nullptr) {} ++ CharStringMap(int32_t size, UErrorCode &errorCode) { ++ map = uhash_openSize(uhash_hashChars, uhash_compareChars, uhash_compareChars, ++ size, &errorCode); ++ } ++ CharStringMap(CharStringMap &&other) U_NOEXCEPT : map(other.map) { ++ other.map = nullptr; ++ } ++ CharStringMap(const CharStringMap &other) = delete; ++ ~CharStringMap() { ++ uhash_close(map); ++ } ++ ++ CharStringMap &operator=(CharStringMap &&other) U_NOEXCEPT { ++ map = other.map; ++ other.map = nullptr; ++ return *this; ++ } ++ CharStringMap &operator=(const CharStringMap &other) = delete; ++ ++ const char *get(const char *key) const { return static_cast<const char *>(uhash_get(map, key)); } ++ void put(const char *key, const char *value, UErrorCode &errorCode) { ++ uhash_put(map, const_cast<char *>(key), const_cast<char *>(value), &errorCode); ++ } ++ ++private: ++ UHashtable *map; ++}; ++ ++struct LocaleDistanceData { ++ LocaleDistanceData() = default; ++ LocaleDistanceData(LocaleDistanceData &&data); ++ ~LocaleDistanceData(); ++ ++ const uint8_t *distanceTrieBytes = nullptr; ++ const uint8_t *regionToPartitions = nullptr; ++ const char **partitions = nullptr; ++ const LSR *paradigms = nullptr; ++ int32_t paradigmsLength = 0; ++ const int32_t *distances = nullptr; 
// TODO(ICU-20777): Rename to just LikelySubtags.
/**
 * Likely-subtags lookup over data that is loaded once and then shared.
 * Instances are created only by initLikelySubtags(); callers obtain the
 * shared instance through getSingleton(). Not copyable.
 */
class XLikelySubtags final : public UMemory {
public:
    /** Closes the resource bundle and deletes the string storage and the LSR array. */
    ~XLikelySubtags();

    /** Trie value that maximize() maps to 0 after the language lookup. */
    static constexpr int32_t SKIP_SCRIPT = 1;

    // VisibleForTesting
    /**
     * Returns the shared instance, creating it on first use via umtx_initOnce().
     * Returns nullptr if errorCode already indicates a failure on entry.
     */
    static const XLikelySubtags *getSingleton(UErrorCode &errorCode);

    // VisibleForTesting
    /** Builds the maximized LSR for a Locale; handles "@x=" private-use names specially. */
    LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;

    // TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
    // in loclikely.cpp to this new code, including activating this
    // minimizeSubtags() function. The LocaleMatcher does not minimize.
#if 0
    LSR minimizeSubtags(const char *languageIn, const char *scriptIn, const char *regionIn,
                        ULocale.Minimize fieldToFavor, UErrorCode &errorCode) const;
#endif

    // visible for LocaleDistance
    const LocaleDistanceData &getDistanceData() const { return distanceData; }

private:
    /** Takes over the contents of `data`; see the constructor definition. */
    XLikelySubtags(XLikelySubtagsData &data);
    XLikelySubtags(const XLikelySubtags &other) = delete;
    XLikelySubtags &operator=(const XLikelySubtags &other) = delete;

    /** One-time initializer invoked via umtx_initOnce() from getSingleton(). */
    static void initLikelySubtags(UErrorCode &errorCode);

    /**
     * Handles pseudolocales (regions XA/XB/XC, variants PSACCENT/PSBIDI/PSCRACK),
     * canonicalizes language and region aliases, then calls maximize().
     */
    LSR makeMaximizedLsr(const char *language, const char *script, const char *region,
                         const char *variant, UErrorCode &errorCode) const;

    /**
     * Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
     */
    LSR maximize(const char *language, const char *script, const char *region) const;

    /**
     * Feeds subtag s, starting at index i, into the trie iterator
     * (an empty subtag is looked up as '*').
     * Returns -1 for no match, 0 for a match without a value,
     * otherwise the trie value.
     */
    static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);

    // Closed by the destructor.
    UResourceBundle *langInfoBundle;
    // We could store the strings by value, except that if there were few enough strings,
    // moving the contents could copy it to a different array,
    // invalidating the pointers stored in the maps.
    CharString *strings;
    CharStringMap languageAliases;
    CharStringMap regionAliases;

    // The trie maps each lang+script+region (encoded in ASCII) to an index into lsrs.
    // There is also a trie value for each intermediate lang and lang+script.
    // '*' is used instead of "und", "Zzzz"/"" and "ZZ"/"".
    BytesTrie trie;
    // Trie state after "*" (language "und").
    uint64_t trieUndState;
    // Trie state after "**" ("und-Zzzz").
    uint64_t trieUndZzzzState;
    // Trie value for "***"; used by maximize() when nothing else matched.
    int32_t defaultLsrIndex;
    // Trie state after the first language letter, indexed by letter - 'a'.
    // maximize() treats a 0 entry as "no cached state".
    uint64_t trieFirstLetterStates[26];
    // Array of lookup results, indexed by trie value; deleted by the destructor.
    const LSR *lsrs;
#if U_DEBUG
    // Number of entries in lsrs; only used for assertions.
    int32_t lsrsLength;
#endif

    // distance/matcher data: see comment in XLikelySubtagsData::load()
    LocaleDistanceData distanceData;
};
Scherer ++ ++#include "unicode/utypes.h" ++#include "charstr.h" ++#include "cmemory.h" ++#include "cstring.h" ++#include "lsr.h" ++#include "uinvchar.h" ++#include "ustr_imp.h" ++ ++U_NAMESPACE_BEGIN ++ ++LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode) : ++ language(nullptr), script(nullptr), region(r), ++ regionIndex(indexForRegion(region)) { ++ if (U_SUCCESS(errorCode)) { ++ CharString langScript; ++ langScript.append(prefix, errorCode).append(lang, errorCode).append('\0', errorCode); ++ int32_t scriptOffset = langScript.length(); ++ langScript.append(prefix, errorCode).append(scr, errorCode); ++ owned = langScript.cloneData(errorCode); ++ if (U_SUCCESS(errorCode)) { ++ language = owned; ++ script = owned + scriptOffset; ++ } ++ } ++} ++ ++LSR::LSR(LSR &&other) U_NOEXCEPT : ++ language(other.language), script(other.script), region(other.region), owned(other.owned), ++ regionIndex(other.regionIndex), hashCode(other.hashCode) { ++ if (owned != nullptr) { ++ other.language = other.script = ""; ++ other.owned = nullptr; ++ other.hashCode = 0; ++ } ++} ++ ++void LSR::deleteOwned() { ++ uprv_free(owned); ++} ++ ++LSR &LSR::operator=(LSR &&other) U_NOEXCEPT { ++ this->~LSR(); ++ language = other.language; ++ script = other.script; ++ region = other.region; ++ regionIndex = other.regionIndex; ++ owned = other.owned; ++ hashCode = other.hashCode; ++ if (owned != nullptr) { ++ other.language = other.script = ""; ++ other.owned = nullptr; ++ other.hashCode = 0; ++ } ++ return *this; ++} ++ ++UBool LSR::operator==(const LSR &other) const { ++ return ++ uprv_strcmp(language, other.language) == 0 && ++ uprv_strcmp(script, other.script) == 0 && ++ regionIndex == other.regionIndex && ++ // Compare regions if both are ill-formed (and their indexes are 0). 
++ (regionIndex > 0 || uprv_strcmp(region, other.region) == 0); ++} ++ ++int32_t LSR::indexForRegion(const char *region) { ++ int32_t c = region[0]; ++ int32_t a = c - '0'; ++ if (0 <= a && a <= 9) { // digits: "419" ++ int32_t b = region[1] - '0'; ++ if (b < 0 || 9 < b) { return 0; } ++ c = region[2] - '0'; ++ if (c < 0 || 9 < c || region[3] != 0) { return 0; } ++ return (10 * a + b) * 10 + c + 1; ++ } else { // letters: "DE" ++ a = uprv_upperOrdinal(c); ++ if (a < 0 || 25 < a) { return 0; } ++ int32_t b = uprv_upperOrdinal(region[1]); ++ if (b < 0 || 25 < b || region[2] != 0) { return 0; } ++ return 26 * a + b + 1001; ++ } ++ return 0; ++} ++ ++LSR &LSR::setHashCode() { ++ if (hashCode == 0) { ++ hashCode = ++ (ustr_hashCharsN(language, static_cast<int32_t>(uprv_strlen(language))) * 37 + ++ ustr_hashCharsN(script, static_cast<int32_t>(uprv_strlen(script)))) * 37 + ++ regionIndex; ++ } ++ return *this; ++} ++ ++U_NAMESPACE_END +diff --git a/source/common/lsr.h b/source/common/lsr.h +new file mode 100644 +index 00000000..db6cf938 +--- /dev/null ++++ b/source/common/lsr.h +@@ -0,0 +1,72 @@ ++// © 2019 and later: Unicode, Inc. and others. ++// License & terms of use: http://www.unicode.org/copyright.html#License ++ ++// lsr.h ++// created: 2019may08 Markus W. Scherer ++ ++#ifndef __LSR_H__ ++#define __LSR_H__ ++ ++#include "unicode/utypes.h" ++#include "unicode/uobject.h" ++#include "cstring.h" ++ ++U_NAMESPACE_BEGIN ++ ++struct LSR final : public UMemory { ++ static constexpr int32_t REGION_INDEX_LIMIT = 1001 + 26 * 26; ++ ++ const char *language; ++ const char *script; ++ const char *region; ++ char *owned = nullptr; ++ /** Index for region, 0 if ill-formed. @see indexForRegion */ ++ int32_t regionIndex = 0; ++ /** Only set for LSRs that will be used in a hash table. */ ++ int32_t hashCode = 0; ++ ++ LSR() : language("und"), script(""), region("") {} ++ ++ /** Constructor which aliases all subtag pointers. 
*/ ++ LSR(const char *lang, const char *scr, const char *r) : ++ language(lang), script(scr), region(r), ++ regionIndex(indexForRegion(region)) {} ++ /** ++ * Constructor which prepends the prefix to the language and script, ++ * copies those into owned memory, and aliases the region. ++ */ ++ LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode); ++ LSR(LSR &&other) U_NOEXCEPT; ++ LSR(const LSR &other) = delete; ++ inline ~LSR() { ++ // Pure inline code for almost all instances. ++ if (owned != nullptr) { ++ deleteOwned(); ++ } ++ } ++ ++ LSR &operator=(LSR &&other) U_NOEXCEPT; ++ LSR &operator=(const LSR &other) = delete; ++ ++ /** ++ * Returns a positive index (>0) for a well-formed region code. ++ * Do not rely on a particular region->index mapping; it may change. ++ * Returns 0 for ill-formed strings. ++ */ ++ static int32_t indexForRegion(const char *region); ++ ++ UBool operator==(const LSR &other) const; ++ ++ inline UBool operator!=(const LSR &other) const { ++ return !operator==(other); ++ } ++ ++ LSR &setHashCode(); ++ ++private: ++ void deleteOwned(); ++}; ++ ++U_NAMESPACE_END ++ ++#endif // __LSR_H__ +diff --git a/source/common/resource.h b/source/common/resource.h +index ee93d41a..5199b858 100644 +--- a/source/common/resource.h ++++ b/source/common/resource.h +@@ -94,13 +94,20 @@ public: + */ + int32_t getSize() const { return length; } + /** +- * @param i Array item index. ++ * @param i Table item index. + * @param key Output-only, receives the key of the i'th item. + * @param value Output-only, receives the value of the i'th item. + * @return TRUE if i is non-negative and less than getSize(). + */ + UBool getKeyAndValue(int32_t i, const char *&key, ResourceValue &value) const; + ++ /** ++ * @param key Key string to find in the table. ++ * @param value Output-only, receives the value of the item with that key. ++ * @return TRUE if the table contains the key. 
++ */ ++ UBool findValue(const char *key, ResourceValue &value) const; ++ + private: + const uint16_t *keys16; + const int32_t *keys32; +diff --git a/source/common/ucln_cmn.h b/source/common/ucln_cmn.h +index 0ca911b4..c4b22ca4 100644 +--- a/source/common/ucln_cmn.h ++++ b/source/common/ucln_cmn.h +@@ -41,6 +41,8 @@ typedef enum ECleanupCommonType { + UCLN_COMMON_LOCALE_KEY_TYPE, + UCLN_COMMON_LOCALE, + UCLN_COMMON_LOCALE_AVAILABLE, ++ UCLN_COMMON_LIKELY_SUBTAGS, ++ UCLN_COMMON_LOCALE_DISTANCE, + UCLN_COMMON_ULOC, + UCLN_COMMON_CURRENCY, + UCLN_COMMON_LOADED_NORMALIZER2, +diff --git a/source/common/uinvchar.cpp b/source/common/uinvchar.cpp +index 8ce2350d..6e5fb48c 100644 +--- a/source/common/uinvchar.cpp ++++ b/source/common/uinvchar.cpp +@@ -445,6 +445,13 @@ uprv_copyEbcdic(const UDataSwapper *ds, + return length; + } + ++U_CFUNC UBool ++uprv_isEbcdicAtSign(char c) { ++ static const uint8_t ebcdicAtSigns[] = { ++ 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 }; ++ return c != 0 && uprv_strchr((const char *)ebcdicAtSigns, c) != nullptr; ++} ++ + /* compare invariant strings; variant characters compare less than others and unlike each other */ + U_CFUNC int32_t + uprv_compareInvAscii(const UDataSwapper *ds, +@@ -561,6 +568,11 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) { + } + } + ++U_CAPI char U_EXPORT2 ++uprv_ebcdicToAscii(char c) { ++ return (char)asciiFromEbcdic[(uint8_t)c]; ++} ++ + U_CAPI char U_EXPORT2 + uprv_ebcdicToLowercaseAscii(char c) { + return (char)lowercaseAsciiFromEbcdic[(uint8_t)c]; +diff --git a/source/common/uinvchar.h b/source/common/uinvchar.h +index 56dddfa8..a43cfcd9 100644 +--- a/source/common/uinvchar.h ++++ b/source/common/uinvchar.h +@@ -68,6 +68,75 @@ uprv_isInvariantUString(const UChar *s, int32_t length); + # error Unknown charset family! + #endif + ++#ifdef __cplusplus ++ ++U_NAMESPACE_BEGIN ++ ++/** ++ * Like U_UPPER_ORDINAL(x) but with validation. 
/**
 * Like U_UPPER_ORDINAL(x) but with validation.
 * Returns 0..25 for A..Z else a value outside 0..25.
 */
inline int32_t uprv_upperOrdinal(int32_t c) {
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
    return c - 'A';
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    // EBCDIC: A-Z (26 letters) is split into three ranges A-I (9 letters), J-R (9), S-Z (8).
    // https://en.wikipedia.org/wiki/EBCDIC_037#Codepage_layout
    if (c > 'R') {
        // gap between R and S, or S-Z --> 18..25
        return c < 'S' ? -1 : c - 'S' + 18;
    }
    if (c > 'I') {
        // gap between I and J, or J-R --> 9..17
        return c < 'J' ? -1 : c - 'J' + 9;
    }
    return c - 'A';  // A-I --> 0-8
#else
#   error Unknown charset family!
#endif
}

/**
 * Like U_UPPER_ORDINAL(x) but for lowercase and with validation.
 * Returns 0..25 for a..z else a value outside 0..25.
 */
inline int32_t uprv_lowerOrdinal(int32_t c) {
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
    return c - 'a';
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    // EBCDIC: a-z (26 letters) is split into three ranges a-i (9 letters), j-r (9), s-z (8).
    // https://en.wikipedia.org/wiki/EBCDIC_037#Codepage_layout
    if (c > 'r') {
        // gap between r and s, or s-z --> 18..25
        return c < 's' ? -1 : c - 's' + 18;
    }
    if (c > 'i') {
        // gap between i and j, or j-r --> 9..17
        return c < 'j' ? -1 : c - 'j' + 9;
    }
    return c - 'a';  // a-i --> 0-8
#else
#   error Unknown charset family!
#endif
}
++ * @internal ++ */ ++#if U_CHARSET_FAMILY==U_ASCII_FAMILY ++# define uprv_isAtSign(c) ((c)=='@') ++#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY ++# define uprv_isAtSign(c) uprv_isEbcdicAtSign(c) ++#else ++# error Unknown charset family! ++#endif ++ + /** + * Compare two EBCDIC invariant-character strings in ASCII order. + * @internal +@@ -88,6 +157,26 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2); + # error Unknown charset family! + #endif + ++/** ++ * Converts an EBCDIC invariant character to ASCII. ++ * @internal ++ */ ++U_INTERNAL char U_EXPORT2 ++uprv_ebcdicToAscii(char c); ++ ++/** ++ * \def uprv_invCharToAscii ++ * Converts an invariant character to ASCII. ++ * @internal ++ */ ++#if U_CHARSET_FAMILY==U_ASCII_FAMILY ++# define uprv_invCharToAscii(c) (c) ++#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY ++# define uprv_invCharToAscii(c) uprv_ebcdicToAscii(c) ++#else ++# error Unknown charset family! ++#endif ++ + /** + * Converts an EBCDIC invariant character to lowercase ASCII. + * @internal +diff --git a/source/common/unicode/localebuilder.h b/source/common/unicode/localebuilder.h +index 960e5980..a91a0b51 100644 +--- a/source/common/unicode/localebuilder.h ++++ b/source/common/unicode/localebuilder.h +@@ -4,6 +4,7 @@ + #define __LOCALEBUILDER_H__ + + #include "unicode/locid.h" ++#include "unicode/localematcher.h" + #include "unicode/stringpiece.h" + #include "unicode/uobject.h" + #include "unicode/utypes.h" +@@ -277,6 +278,10 @@ public: + Locale build(UErrorCode& status); + + private: ++ friend class LocaleMatcher::Result; ++ ++ void copyExtensionsFrom(const Locale& src, UErrorCode& errorCode); ++ + UErrorCode status_; + char language_[9]; + char script_[5]; +diff --git a/source/common/unicode/localematcher.h b/source/common/unicode/localematcher.h +new file mode 100644 +index 00000000..701123f7 +--- /dev/null ++++ b/source/common/unicode/localematcher.h +@@ -0,0 +1,605 @@ ++// © 2019 and later: Unicode, Inc. and others. 
/**
 * Builder option for whether the language subtag or the script subtag is most important.
 *
 * Enumerator values are implicit: ULOCMATCH_FAVOR_LANGUAGE is 0 and
 * ULOCMATCH_FAVOR_SCRIPT is 1.
 *
 * @see Builder#setFavorSubtag(FavorSubtag)
 * @draft ICU 65
 */
enum ULocMatchFavorSubtag {
    /**
     * Language differences are most important, then script differences, then region differences.
     * (This is the default behavior.)
     *
     * @draft ICU 65
     */
    ULOCMATCH_FAVOR_LANGUAGE,
    /**
     * Makes script differences matter relatively more than language differences.
     *
     * @draft ICU 65
     */
    ULOCMATCH_FAVOR_SCRIPT
};
// The typedef lets the enum be named without the "enum" keyword;
// it is hidden from Doxygen.
#ifndef U_IN_DOXYGEN
typedef enum ULocMatchFavorSubtag ULocMatchFavorSubtag;
#endif

/**
 * Builder option for whether all desired locales are treated equally or
 * earlier ones are preferred.
 *
 * Enumerator values are implicit: ULOCMATCH_DEMOTION_NONE is 0 and
 * ULOCMATCH_DEMOTION_REGION is 1.
 *
 * @see Builder#setDemotionPerDesiredLocale(Demotion)
 * @draft ICU 65
 */
enum ULocMatchDemotion {
    /**
     * All desired locales are treated equally.
     *
     * @draft ICU 65
     */
    ULOCMATCH_DEMOTION_NONE,
    /**
     * Earlier desired locales are preferred.
     *
     * <p>From each desired locale to the next,
     * the distance to any supported locale is increased by an additional amount
     * which is at least as large as most region mismatches.
     * A later desired locale has to have a better match with some supported locale
     * due to more than merely having the same region subtag.
     *
     * <p>For example: <code>Supported={en, sv}  desired=[en-GB, sv]</code>
     * yields <code>Result(en-GB, en)</code> because
     * with the demotion of sv its perfect match is no better than
     * the region distance between the earlier desired locale en-GB and en=en-US.
     *
     * <p>Notes:
     * <ul>
     *   <li>In some cases, language and/or script differences can be as small as
     *       the typical region difference. (Example: sr-Latn vs. sr-Cyrl)
     *   <li>It is possible for certain region differences to be larger than usual,
     *       and larger than the demotion.
     *       (As of CLDR 35 there is no such case, but
     *        this is possible in future versions of the data.)
     * </ul>
     *
     * @draft ICU 65
     */
    ULOCMATCH_DEMOTION_REGION
};
// The typedef lets the enum be named without the "enum" keyword;
// it is hidden from Doxygen.
#ifndef U_IN_DOXYGEN
typedef enum ULocMatchDemotion ULocMatchDemotion;
#endif
++ * ++ * <p>For example, the current implementation does not distinguish between ++ * de, de-DE, de-Latn, de-1901, de-u-co-phonebk. ++ * ++ * <p>If you prefer one equivalent locale over another, then provide only the preferred one, ++ * or place it earlier in the list of supported locales. ++ * ++ * <p>Otherwise, the order of supported locales may have no effect on the best-match results. ++ * The current implementation compares each desired locale with supported locales ++ * in the following order: ++ * 1. Default locale, if supported; ++ * 2. CLDR "paradigm locales" like en-GB and es-419; ++ * 3. other supported locales. ++ * This may change in future versions. ++ * ++ * <p>Often a product will just need one matcher instance, built with the languages ++ * that it supports. However, it may want multiple instances with different ++ * default languages based on additional information, such as the domain. ++ * ++ * <p>This class is not intended for public subclassing. ++ * ++ * @draft ICU 65 ++ */ ++class U_COMMON_API LocaleMatcher : public UMemory { ++public: ++ /** ++ * Data for the best-matching pair of a desired and a supported locale. ++ * Movable but not copyable. ++ * ++ * @draft ICU 65 ++ */ ++ class U_COMMON_API Result : public UMemory { ++ public: ++ /** ++ * Move constructor; might modify the source. ++ * This object will have the same contents that the source object had. ++ * ++ * @param src Result to move contents from. ++ * @draft ICU 65 ++ */ ++ Result(Result &&src) U_NOEXCEPT; ++ ++ /** ++ * Destructor. ++ * ++ * @draft ICU 65 ++ */ ++ ~Result(); ++ ++ /** ++ * Move assignment; might modify the source. ++ * This object will have the same contents that the source object had. ++ * ++ * @param src Result to move contents from. ++ * @draft ICU 65 ++ */ ++ Result &operator=(Result &&src) U_NOEXCEPT; ++ ++ /** ++ * Returns the best-matching desired locale. ++ * nullptr if the list of desired locales is empty or if none matched well enough. 
++ * ++ * @return the best-matching desired locale, or nullptr. ++ * @draft ICU 65 ++ */ ++ inline const Locale *getDesiredLocale() const { return desiredLocale; } ++ ++ /** ++ * Returns the best-matching supported locale. ++ * If none matched well enough, this is the default locale. ++ * The default locale is nullptr if the list of supported locales is empty and ++ * no explicit default locale is set. ++ * ++ * @return the best-matching supported locale, or nullptr. ++ * @draft ICU 65 ++ */ ++ inline const Locale *getSupportedLocale() const { return supportedLocale; } ++ ++ /** ++ * Returns the index of the best-matching desired locale in the input Iterable order. ++ * -1 if the list of desired locales is empty or if none matched well enough. ++ * ++ * @return the index of the best-matching desired locale, or -1. ++ * @draft ICU 65 ++ */ ++ inline int32_t getDesiredIndex() const { return desiredIndex; } ++ ++ /** ++ * Returns the index of the best-matching supported locale in the ++ * constructor’s or builder’s input order (“set” Collection plus “added” locales). ++ * If the matcher was built from a locale list string, then the iteration order is that ++ * of a LocalePriorityList built from the same string. ++ * -1 if the list of supported locales is empty or if none matched well enough. ++ * ++ * @return the index of the best-matching supported locale, or -1. ++ * @draft ICU 65 ++ */ ++ inline int32_t getSupportedIndex() const { return supportedIndex; } ++ ++ /** ++ * Takes the best-matching supported locale and adds relevant fields of the ++ * best-matching desired locale, such as the -t- and -u- extensions. ++ * May replace some fields of the supported locale. ++ * The result is the locale that should be used for date and number formatting, collation, etc. ++ * Returns the root locale if getSupportedLocale() returns nullptr. 
++ * ++ * <p>Example: desired=ar-SA-u-nu-latn, supported=ar-EG, resolved locale=ar-SA-u-nu-latn ++ * ++ * @return a locale combining the best-matching desired and supported locales. ++ * @draft ICU 65 ++ */ ++ Locale makeResolvedLocale(UErrorCode &errorCode) const; ++ ++ private: ++ Result(const Locale *desired, const Locale *supported, ++ int32_t desIndex, int32_t suppIndex, UBool owned) : ++ desiredLocale(desired), supportedLocale(supported), ++ desiredIndex(desIndex), supportedIndex(suppIndex), ++ desiredIsOwned(owned) {} ++ ++ Result(const Result &other) = delete; ++ Result &operator=(const Result &other) = delete; ++ ++ const Locale *desiredLocale; ++ const Locale *supportedLocale; ++ int32_t desiredIndex; ++ int32_t supportedIndex; ++ UBool desiredIsOwned; ++ ++ friend class LocaleMatcher; ++ }; ++ ++ /** ++ * LocaleMatcher builder. ++ * Movable but not copyable. ++ * ++ * @see LocaleMatcher#builder() ++ * @draft ICU 65 ++ */ ++ class U_COMMON_API Builder : public UMemory { ++ public: ++ /** ++ * Constructs a builder used in chaining parameters for building a LocaleMatcher. ++ * ++ * @return a new Builder object ++ * @draft ICU 65 ++ */ ++ Builder() {} ++ ++ /** ++ * Move constructor; might modify the source. ++ * This builder will have the same contents that the source builder had. ++ * ++ * @param src Builder to move contents from. ++ * @draft ICU 65 ++ */ ++ Builder(Builder &&src) U_NOEXCEPT; ++ ++ /** ++ * Destructor. ++ * ++ * @draft ICU 65 ++ */ ++ ~Builder(); ++ ++ /** ++ * Move assignment; might modify the source. ++ * This builder will have the same contents that the source builder had. ++ * ++ * @param src Builder to move contents from. ++ * @draft ICU 65 ++ */ ++ Builder &operator=(Builder &&src) U_NOEXCEPT; ++ ++ /** ++ * Parses an Accept-Language string ++ * (<a href="https://tools.ietf.org/html/rfc2616#section-14.4">RFC 2616 Section 14.4</a>), ++ * such as "af, en, fr;q=0.9", and sets the supported locales accordingly. 
++ * Allows whitespace in more places but does not allow "*". ++ * Clears any previously set/added supported locales first. ++ * ++ * @param locales the Accept-Language string of locales to set ++ * @return this Builder object ++ * @draft ICU 65 ++ */ ++ Builder &setSupportedLocalesFromListString(StringPiece locales); ++ ++ /** ++ * Copies the supported locales, preserving iteration order. ++ * Clears any previously set/added supported locales first. ++ * Duplicates are allowed, and are not removed. ++ * ++ * @param locales the list of locale ++ * @return this Builder object ++ * @draft ICU 65 ++ */ ++ Builder &setSupportedLocales(Locale::Iterator &locales); ++ ++ /** ++ * Copies the supported locales from the begin/end range, preserving iteration order. ++ * Clears any previously set/added supported locales first. ++ * Duplicates are allowed, and are not removed. ++ * ++ * Each of the iterator parameter values must be an ++ * input iterator whose value is convertible to const Locale &. ++ * ++ * @param begin Start of range. ++ * @param end Exclusive end of range. ++ * @return this Builder object ++ * @draft ICU 65 ++ */ ++ template<typename Iter> ++ Builder &setSupportedLocales(Iter begin, Iter end) { ++ if (U_FAILURE(errorCode_)) { return *this; } ++ clearSupportedLocales(); ++ while (begin != end) { ++ addSupportedLocale(*begin++); ++ } ++ return *this; ++ } ++ ++ /** ++ * Copies the supported locales from the begin/end range, preserving iteration order. ++ * Calls the converter to convert each *begin to a Locale or const Locale &. ++ * Clears any previously set/added supported locales first. ++ * Duplicates are allowed, and are not removed. ++ * ++ * Each of the iterator parameter values must be an ++ * input iterator whose value is convertible to const Locale &. ++ * ++ * @param begin Start of range. ++ * @param end Exclusive end of range. ++ * @param converter Converter from *begin to const Locale & or compatible. 
++ * @return this Builder object ++ * @draft ICU 65 ++ */ ++ template<typename Iter, typename Conv> ++ Builder &setSupportedLocalesViaConverter(Iter begin, Iter end, Conv converter) { ++ if (U_FAILURE(errorCode_)) { return *this; } ++ clearSupportedLocales(); ++ while (begin != end) { ++ addSupportedLocale(converter(*begin++)); ++ } ++ return *this; ++ } ++ ++ /** ++ * Adds another supported locale. ++ * Duplicates are allowed, and are not removed. ++ * ++ * @param locale another locale ++ * @return this Builder object ++ * @draft ICU 65 ++ */ ++ Builder &addSupportedLocale(const Locale &locale); ++ ++ /** ++ * Sets the default locale; if nullptr, or if it is not set explicitly, ++ * then the first supported locale is used as the default locale. ++ * ++ * @param defaultLocale the default locale (will be copied) ++ * @return this Builder object ++ * @draft ICU 65 ++ */ ++ Builder &setDefaultLocale(const Locale *defaultLocale); ++ ++ /** ++ * If ULOCMATCH_FAVOR_SCRIPT, then the language differences are smaller than script ++ * differences. ++ * This is used in situations (such as maps) where ++ * it is better to fall back to the same script than a similar language. ++ * ++ * @param subtag the subtag to favor ++ * @return this Builder object ++ * @draft ICU 65 ++ */ ++ Builder &setFavorSubtag(ULocMatchFavorSubtag subtag); ++ ++ /** ++ * Option for whether all desired locales are treated equally or ++ * earlier ones are preferred (this is the default). ++ * ++ * @param demotion the demotion per desired locale to set. ++ * @return this Builder object ++ * @draft ICU 65 ++ */ ++ Builder &setDemotionPerDesiredLocale(ULocMatchDemotion demotion); ++ ++ /** ++ * Sets the UErrorCode if an error occurred while setting parameters. ++ * Preserves older error codes in the outErrorCode. ++ * ++ * @param outErrorCode Set to an error code if it does not contain one already ++ * and an error occurred while setting parameters. ++ * Otherwise unchanged. 
++ * @return TRUE if U_FAILURE(outErrorCode) ++ * @draft ICU 65 ++ */ ++ UBool copyErrorTo(UErrorCode &outErrorCode) const; ++ ++ /** ++ * Builds and returns a new locale matcher. ++ * This builder can continue to be used. ++ * ++ * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, ++ * or else the function returns immediately. Check for U_FAILURE() ++ * on output or use with function chaining. (See User Guide for details.) ++ * @return new LocaleMatcher. ++ * @draft ICU 65 ++ */ ++ LocaleMatcher build(UErrorCode &errorCode) const; ++ ++ private: ++ friend class LocaleMatcher; ++ ++ Builder(const Builder &other) = delete; ++ Builder &operator=(const Builder &other) = delete; ++ ++ void clearSupportedLocales(); ++ bool ensureSupportedLocaleVector(); ++ ++ UErrorCode errorCode_ = U_ZERO_ERROR; ++ UVector *supportedLocales_ = nullptr; ++ int32_t thresholdDistance_ = -1; ++ ULocMatchDemotion demotion_ = ULOCMATCH_DEMOTION_REGION; ++ Locale *defaultLocale_ = nullptr; ++ ULocMatchFavorSubtag favor_ = ULOCMATCH_FAVOR_LANGUAGE; ++ }; ++ ++ // FYI No public LocaleMatcher constructors in C++; use the Builder. ++ ++ /** ++ * Move copy constructor; might modify the source. ++ * This matcher will have the same settings that the source matcher had. ++ * @param src source matcher ++ * @draft ICU 65 ++ */ ++ LocaleMatcher(LocaleMatcher &&src) U_NOEXCEPT; ++ ++ /** ++ * Destructor. ++ * @draft ICU 65 ++ */ ++ ~LocaleMatcher(); ++ ++ /** ++ * Move assignment operator; might modify the source. ++ * This matcher will have the same settings that the source matcher had. ++ * The behavior is undefined if *this and src are the same object. ++ * @param src source matcher ++ * @return *this ++ * @draft ICU 65 ++ */ ++ LocaleMatcher &operator=(LocaleMatcher &&src) U_NOEXCEPT; ++ ++ /** ++ * Returns the supported locale which best matches the desired locale. ++ * ++ * @param desiredLocale Typically a user's language. ++ * @param errorCode ICU error code. 
Its input value must pass the U_SUCCESS() test, ++ * or else the function returns immediately. Check for U_FAILURE() ++ * on output or use with function chaining. (See User Guide for details.) ++ * @return the best-matching supported locale. ++ * @draft ICU 65 ++ */ ++ const Locale *getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const; ++ ++ /** ++ * Returns the supported locale which best matches one of the desired locales. ++ * ++ * @param desiredLocales Typically a user's languages, in order of preference (descending). ++ * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, ++ * or else the function returns immediately. Check for U_FAILURE() ++ * on output or use with function chaining. (See User Guide for details.) ++ * @return the best-matching supported locale. ++ * @draft ICU 65 ++ */ ++ const Locale *getBestMatch(Locale::Iterator &desiredLocales, UErrorCode &errorCode) const; ++ ++ /** ++ * Parses an Accept-Language string ++ * (<a href="https://tools.ietf.org/html/rfc2616#section-14.4">RFC 2616 Section 14.4</a>), ++ * such as "af, en, fr;q=0.9", ++ * and returns the supported locale which best matches one of the desired locales. ++ * Allows whitespace in more places but does not allow "*". ++ * ++ * @param desiredLocaleList Typically a user's languages, as an Accept-Language string. ++ * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, ++ * or else the function returns immediately. Check for U_FAILURE() ++ * on output or use with function chaining. (See User Guide for details.) ++ * @return the best-matching supported locale. ++ * @draft ICU 65 ++ */ ++ const Locale *getBestMatchForListString(StringPiece desiredLocaleList, UErrorCode &errorCode) const; ++ ++ /** ++ * Returns the best match between the desired locale and the supported locales. ++ * If the result's desired locale is not nullptr, then it is the address of the input locale. ++ * It has not been cloned. 
++ * ++ * @param desiredLocale Typically a user's language. ++ * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, ++ * or else the function returns immediately. Check for U_FAILURE() ++ * on output or use with function chaining. (See User Guide for details.) ++ * @return the best-matching pair of the desired and a supported locale. ++ * @draft ICU 65 ++ */ ++ Result getBestMatchResult(const Locale &desiredLocale, UErrorCode &errorCode) const; ++ ++ /** ++ * Returns the best match between the desired and supported locales. ++ * If the result's desired locale is not nullptr, then it is a clone of ++ * the best-matching desired locale. The Result object owns the clone. ++ * ++ * @param desiredLocales Typically a user's languages, in order of preference (descending). ++ * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, ++ * or else the function returns immediately. Check for U_FAILURE() ++ * on output or use with function chaining. (See User Guide for details.) ++ * @return the best-matching pair of a desired and a supported locale. ++ * @draft ICU 65 ++ */ ++ Result getBestMatchResult(Locale::Iterator &desiredLocales, UErrorCode &errorCode) const; ++ ++#ifndef U_HIDE_INTERNAL_API ++ /** ++ * Returns a fraction between 0 and 1, where 1 means that the languages are a ++ * perfect match, and 0 means that they are completely different. ++ * ++ * <p>This is mostly an implementation detail, and the precise values may change over time. ++ * The implementation may use either the maximized forms or the other ones, or both. ++ * The implementation may or may not rely on the forms to be consistent with each other. ++ * ++ * <p>Callers should construct and use a matcher rather than match pairs of locales directly. ++ * ++ * @param desired Desired locale. ++ * @param supported Supported locale. ++ * @param errorCode ICU error code. 
Its input value must pass the U_SUCCESS() test, ++ * or else the function returns immediately. Check for U_FAILURE() ++ * on output or use with function chaining. (See User Guide for details.) ++ * @return value between 0 and 1, inclusive. ++ * @internal (has a known user) ++ */ ++ double internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const; ++#endif // U_HIDE_INTERNAL_API ++ ++private: ++ LocaleMatcher(const Builder &builder, UErrorCode &errorCode); ++ LocaleMatcher(const LocaleMatcher &other) = delete; ++ LocaleMatcher &operator=(const LocaleMatcher &other) = delete; ++ ++ int32_t getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remainingIter, UErrorCode &errorCode) const; ++ ++ const XLikelySubtags &likelySubtags; ++ const LocaleDistance &localeDistance; ++ int32_t thresholdDistance; ++ int32_t demotionPerDesiredLocale; ++ ULocMatchFavorSubtag favorSubtag; ++ ++ // These are in input order. ++ const Locale ** supportedLocales; ++ LSR *lsrs; ++ int32_t supportedLocalesLength; ++ // These are in preference order: 1. Default locale 2. paradigm locales 3. others. ++ UHashtable *supportedLsrToIndex; // Map<LSR, Integer> stores index+1 because 0 is "not found" ++ // Array versions of the supportedLsrToIndex keys and values. ++ // The distance lookup loops over the supportedLSRs and returns the index of the best match. 
++ const LSR **supportedLSRs; ++ int32_t *supportedIndexes; ++ int32_t supportedLSRsLength; ++ Locale *ownedDefaultLocale; ++ const Locale *defaultLocale; ++ int32_t defaultLocaleIndex; ++}; ++ ++U_NAMESPACE_END ++ ++#endif // U_HIDE_DRAFT_API ++#endif // U_SHOW_CPLUSPLUS_API ++#endif // __LOCALEMATCHER_H__ +diff --git a/source/common/unicode/locid.h b/source/common/unicode/locid.h +index 7e410e53..0c9aecb6 100644 +--- a/source/common/unicode/locid.h ++++ b/source/common/unicode/locid.h +@@ -1008,6 +1008,104 @@ public: + */ + virtual UClassID getDynamicClassID() const; + ++#ifndef U_HIDE_DRAFT_API ++ /** ++ * A Locale iterator interface similar to a Java Iterator<Locale>. ++ * @draft ICU 65 ++ */ ++ class U_COMMON_API Iterator /* not : public UObject because this is an interface/mixin class */ { ++ public: ++ /** @draft ICU 65 */ ++ virtual ~Iterator(); ++ ++ /** ++ * @return TRUE if next() can be called again. ++ * @draft ICU 65 ++ */ ++ virtual UBool hasNext() const = 0; ++ ++ /** ++ * @return the next locale. ++ * @draft ICU 65 ++ */ ++ virtual const Locale &next() = 0; ++ }; ++ ++ /** ++ * A generic Locale iterator implementation over Locale input iterators. ++ * @draft ICU 65 ++ */ ++ template<typename Iter> ++ class RangeIterator : public Iterator, public UMemory { ++ public: ++ /** ++ * Constructs an iterator from a begin/end range. ++ * Each of the iterator parameter values must be an ++ * input iterator whose value is convertible to const Locale &. ++ * ++ * @param begin Start of range. ++ * @param end Exclusive end of range. ++ * @draft ICU 65 ++ */ ++ RangeIterator(Iter begin, Iter end) : it_(begin), end_(end) {} ++ ++ /** ++ * @return TRUE if next() can be called again. ++ * @draft ICU 65 ++ */ ++ UBool hasNext() const override { return it_ != end_; } ++ ++ /** ++ * @return the next locale. 
++ * @draft ICU 65 ++ */ ++ const Locale &next() override { return *it_++; } ++ ++ private: ++ Iter it_; ++ const Iter end_; ++ }; ++ ++ /** ++ * A generic Locale iterator implementation over Locale input iterators. ++ * Calls the converter to convert each *begin to a const Locale &. ++ * @draft ICU 65 ++ */ ++ template<typename Iter, typename Conv> ++ class ConvertingIterator : public Iterator, public UMemory { ++ public: ++ /** ++ * Constructs an iterator from a begin/end range. ++ * Each of the iterator parameter values must be an ++ * input iterator whose value the converter converts to const Locale &. ++ * ++ * @param begin Start of range. ++ * @param end Exclusive end of range. ++ * @param converter Converter from *begin to const Locale & or compatible. ++ * @draft ICU 65 ++ */ ++ ConvertingIterator(Iter begin, Iter end, Conv converter) : ++ it_(begin), end_(end), converter_(converter) {} ++ ++ /** ++ * @return TRUE if next() can be called again. ++ * @draft ICU 65 ++ */ ++ UBool hasNext() const override { return it_ != end_; } ++ ++ /** ++ * @return the next locale. ++ * @draft ICU 65 ++ */ ++ const Locale &next() override { return converter_(*it_++); } ++ ++ private: ++ Iter it_; ++ const Iter end_; ++ Conv converter_; ++ }; ++#endif // U_HIDE_DRAFT_API ++ + protected: /* only protected for testing purposes. DO NOT USE. */ + #ifndef U_HIDE_INTERNAL_API + /** +diff --git a/source/common/uresbund.cpp b/source/common/uresbund.cpp +index 585c0e5f..3224fb37 100644 +--- a/source/common/uresbund.cpp ++++ b/source/common/uresbund.cpp +@@ -38,6 +38,7 @@ + #include "umutex.h" + #include "putilimp.h" + #include "uassert.h" ++#include "uresdata.h" + + using namespace icu; + +@@ -1963,7 +1964,7 @@ void getAllItemsWithFallback( + // When the sink sees the no-fallback/no-inheritance marker, + // then it would remove the parent's item. + // We would deserialize parent values even though they are overridden in a child bundle. 
+- value.pResData = &bundle->fResData; ++ value.setData(&bundle->fResData); + UResourceDataEntry *parentEntry = bundle->fData->fParent; + UBool hasParent = parentEntry != NULL && U_SUCCESS(parentEntry->fBogus); + value.setResource(bundle->fRes, ResourceTracer(bundle)); +@@ -2011,31 +2012,60 @@ void getAllItemsWithFallback( + + } // namespace + ++// Requires a ResourceDataValue fill-in, so that we need not cast from a ResourceValue. ++// Unfortunately, the caller must know which subclass to make and pass in. ++// Alternatively, we could make it as polymorphic as in Java by ++// returning a ResourceValue pointer (possibly wrapped into a LocalPointer) ++// that the caller then owns. ++// ++// Also requires a UResourceBundle fill-in, so that the value's ResourceTracer ++// can point to a non-local bundle. ++// Without tracing, the child bundle could be a function-local object. ++U_CAPI void U_EXPORT2 ++ures_getValueWithFallback(const UResourceBundle *bundle, const char *path, ++ UResourceBundle *tempFillIn, ++ ResourceDataValue &value, UErrorCode &errorCode) { ++ if (U_FAILURE(errorCode)) { return; } ++ if (path == nullptr) { ++ errorCode = U_ILLEGAL_ARGUMENT_ERROR; ++ return; ++ } ++ const UResourceBundle *rb; ++ if (*path == 0) { ++ // empty path ++ rb = bundle; ++ } else { ++ rb = ures_getByKeyWithFallback(bundle, path, tempFillIn, &errorCode); ++ if (U_FAILURE(errorCode)) { ++ return; ++ } ++ } ++ value.setData(&rb->fResData); ++ value.setResource(rb->fRes, ResourceTracer(rb)); ++} ++ + U_CAPI void U_EXPORT2 + ures_getAllItemsWithFallback(const UResourceBundle *bundle, const char *path, + icu::ResourceSink &sink, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return; } +- if (path == NULL) { ++ if (path == nullptr) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } +- UResourceBundle stackBundle; +- ures_initStackObject(&stackBundle); ++ StackUResourceBundle stackBundle; + const UResourceBundle *rb; + if (*path == 0) { + // empty path + rb = bundle; + 
} else { +- rb = ures_getByKeyWithFallback(bundle, path, &stackBundle, &errorCode); ++ rb = ures_getByKeyWithFallback(bundle, path, stackBundle.getAlias(), &errorCode); + if (U_FAILURE(errorCode)) { +- ures_close(&stackBundle); + return; + } + } + // Get all table items with fallback. + ResourceDataValue value; + getAllItemsWithFallback(rb, value, sink, errorCode); +- ures_close(&stackBundle); + } + + U_CAPI UResourceBundle* U_EXPORT2 ures_getByKey(const UResourceBundle *resB, const char* inKey, UResourceBundle *fillIn, UErrorCode *status) { +diff --git a/source/common/uresdata.cpp b/source/common/uresdata.cpp +index ce04142f..b3c2e2e2 100644 +--- a/source/common/uresdata.cpp ++++ b/source/common/uresdata.cpp +@@ -509,7 +509,7 @@ const UChar *ResourceDataValue::getString(int32_t &length, UErrorCode &errorCode + if(U_FAILURE(errorCode)) { + return NULL; + } +- const UChar *s = res_getString(fTraceInfo, pResData, res, &length); ++ const UChar *s = res_getString(fTraceInfo, &getData(), res, &length); + if(s == NULL) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +@@ -520,7 +520,7 @@ const UChar *ResourceDataValue::getAliasString(int32_t &length, UErrorCode &erro + if(U_FAILURE(errorCode)) { + return NULL; + } +- const UChar *s = res_getAlias(pResData, res, &length); ++ const UChar *s = res_getAlias(&getData(), res, &length); + if(s == NULL) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +@@ -551,7 +551,7 @@ const int32_t *ResourceDataValue::getIntVector(int32_t &length, UErrorCode &erro + if(U_FAILURE(errorCode)) { + return NULL; + } +- const int32_t *iv = res_getIntVector(fTraceInfo, pResData, res, &length); ++ const int32_t *iv = res_getIntVector(fTraceInfo, &getData(), res, &length); + if(iv == NULL) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +@@ -562,7 +562,7 @@ const uint8_t *ResourceDataValue::getBinary(int32_t &length, UErrorCode &errorCo + if(U_FAILURE(errorCode)) { + return NULL; + } +- const uint8_t *b = res_getBinary(fTraceInfo, pResData, res, &length); ++ 
const uint8_t *b = res_getBinary(fTraceInfo, &getData(), res, &length); + if(b == NULL) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +@@ -580,12 +580,12 @@ ResourceArray ResourceDataValue::getArray(UErrorCode &errorCode) const { + switch(RES_GET_TYPE(res)) { + case URES_ARRAY: + if (offset!=0) { // empty if offset==0 +- items32 = (const Resource *)pResData->pRoot+offset; ++ items32 = (const Resource *)getData().pRoot+offset; + length = *items32++; + } + break; + case URES_ARRAY16: +- items16 = pResData->p16BitUnits+offset; ++ items16 = getData().p16BitUnits+offset; + length = *items16++; + break; + default: +@@ -608,19 +608,19 @@ ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { + switch(RES_GET_TYPE(res)) { + case URES_TABLE: + if (offset != 0) { // empty if offset==0 +- keys16 = (const uint16_t *)(pResData->pRoot+offset); ++ keys16 = (const uint16_t *)(getData().pRoot+offset); + length = *keys16++; + items32 = (const Resource *)(keys16+length+(~length&1)); + } + break; + case URES_TABLE16: +- keys16 = pResData->p16BitUnits+offset; ++ keys16 = getData().p16BitUnits+offset; + length = *keys16++; + items16 = keys16 + length; + break; + case URES_TABLE32: + if (offset != 0) { // empty if offset==0 +- keys32 = pResData->pRoot+offset; ++ keys32 = getData().pRoot+offset; + length = *keys32++; + items32 = (const Resource *)keys32 + length; + } +@@ -633,18 +633,18 @@ ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { + } + + UBool ResourceDataValue::isNoInheritanceMarker() const { +- return ::isNoInheritanceMarker(pResData, res); ++ return ::isNoInheritanceMarker(&getData(), res); + } + + int32_t ResourceDataValue::getStringArray(UnicodeString *dest, int32_t capacity, + UErrorCode &errorCode) const { +- return ::getStringArray(pResData, getArray(errorCode), dest, capacity, errorCode); ++ return ::getStringArray(&getData(), getArray(errorCode), dest, capacity, errorCode); + } + + int32_t 
ResourceDataValue::getStringArrayOrStringAsArray(UnicodeString *dest, int32_t capacity, + UErrorCode &errorCode) const { + if(URES_IS_ARRAY(res)) { +- return ::getStringArray(pResData, getArray(errorCode), dest, capacity, errorCode); ++ return ::getStringArray(&getData(), getArray(errorCode), dest, capacity, errorCode); + } + if(U_FAILURE(errorCode)) { + return 0; +@@ -658,7 +658,7 @@ int32_t ResourceDataValue::getStringArrayOrStringAsArray(UnicodeString *dest, in + return 1; + } + int32_t sLength; +- const UChar *s = res_getString(fTraceInfo, pResData, res, &sLength); ++ const UChar *s = res_getString(fTraceInfo, &getData(), res, &sLength); + if(s != NULL) { + dest[0].setTo(TRUE, s, sLength); + return 1; +@@ -673,7 +673,7 @@ UnicodeString ResourceDataValue::getStringOrFirstOfArray(UErrorCode &errorCode) + return us; + } + int32_t sLength; +- const UChar *s = res_getString(fTraceInfo, pResData, res, &sLength); ++ const UChar *s = res_getString(fTraceInfo, &getData(), res, &sLength); + if(s != NULL) { + us.setTo(TRUE, s, sLength); + return us; +@@ -684,7 +684,7 @@ UnicodeString ResourceDataValue::getStringOrFirstOfArray(UErrorCode &errorCode) + } + if(array.getSize() > 0) { + // Tracing is already performed above (unimportant for trace that this is an array) +- s = res_getStringNoTrace(pResData, array.internalGetResource(pResData, 0), &sLength); ++ s = res_getStringNoTrace(&getData(), array.internalGetResource(&getData(), 0), &sLength); + if(s != NULL) { + us.setTo(TRUE, s, sLength); + return us; +@@ -821,14 +821,14 @@ UBool icu::ResourceTable::getKeyAndValue(int32_t i, + const char *&key, icu::ResourceValue &value) const { + if(0 <= i && i < length) { + icu::ResourceDataValue &rdValue = static_cast<icu::ResourceDataValue &>(value); +- if (keys16 != NULL) { +- key = RES_GET_KEY16(rdValue.pResData, keys16[i]); ++ if (keys16 != nullptr) { ++ key = RES_GET_KEY16(&rdValue.getData(), keys16[i]); + } else { +- key = RES_GET_KEY32(rdValue.pResData, keys32[i]); ++ key = 
RES_GET_KEY32(&rdValue.getData(), keys32[i]); + } + Resource res; +- if (items16 != NULL) { +- res = makeResourceFrom16(rdValue.pResData, items16[i]); ++ if (items16 != nullptr) { ++ res = makeResourceFrom16(&rdValue.getData(), items16[i]); + } else { + res = items32[i]; + } +@@ -842,6 +842,29 @@ UBool icu::ResourceTable::getKeyAndValue(int32_t i, + return FALSE; + } + ++UBool icu::ResourceTable::findValue(const char *key, ResourceValue &value) const { ++ icu::ResourceDataValue &rdValue = static_cast<icu::ResourceDataValue &>(value); ++ const char *realKey = nullptr; ++ int32_t i; ++ if (keys16 != nullptr) { ++ i = _res_findTableItem(&rdValue.getData(), keys16, length, key, &realKey); ++ } else { ++ i = _res_findTable32Item(&rdValue.getData(), keys32, length, key, &realKey); ++ } ++ if (i >= 0) { ++ Resource res; ++ if (items16 != nullptr) { ++ res = makeResourceFrom16(&rdValue.getData(), items16[i]); ++ } else { ++ res = items32[i]; ++ } ++ // Same note about lifetime as in getKeyAndValue(). ++ rdValue.setResource(res, ResourceTracer(fTraceInfo, key)); ++ return TRUE; ++ } ++ return FALSE; ++} ++ + U_CAPI Resource U_EXPORT2 + res_getArrayItem(const ResourceData *pResData, Resource array, int32_t indexR) { + uint32_t offset=RES_GET_OFFSET(array); +@@ -887,7 +910,7 @@ UBool icu::ResourceArray::getValue(int32_t i, icu::ResourceValue &value) const { + // alive for the duration that fields are being read from it + // (including nested fields). 
+ rdValue.setResource( +- internalGetResource(rdValue.pResData, i), ++ internalGetResource(&rdValue.getData(), i), + ResourceTracer(fTraceInfo, i)); + return TRUE; + } +diff --git a/source/common/uresdata.h b/source/common/uresdata.h +index 51647409..d1b67bab 100644 +--- a/source/common/uresdata.h ++++ b/source/common/uresdata.h +@@ -511,13 +511,12 @@ inline uint32_t res_getUInt(const ResourceTracer& traceInfo, Resource res) { + class ResourceDataValue : public ResourceValue { + public: + ResourceDataValue() : +- pResData(NULL), + res(static_cast<Resource>(URES_NONE)), + fTraceInfo() {} + virtual ~ResourceDataValue(); + + void setData(const ResourceData *data) { +- pResData = data; ++ resData = *data; + } + + void setResource(Resource r, ResourceTracer&& traceInfo) { +@@ -525,6 +524,7 @@ public: + fTraceInfo = traceInfo; + } + ++ const ResourceData &getData() const { return resData; } + virtual UResType getType() const; + virtual const UChar *getString(int32_t &length, UErrorCode &errorCode) const; + virtual const UChar *getAliasString(int32_t &length, UErrorCode &errorCode) const; +@@ -541,9 +541,10 @@ public: + UErrorCode &errorCode) const; + virtual UnicodeString getStringOrFirstOfArray(UErrorCode &errorCode) const; + +- const ResourceData *pResData; +- + private: ++ // TODO(ICU-20769): If UResourceBundle.fResData becomes a pointer, ++ // then remove this value field again and just store a pResData pointer. 
++ ResourceData resData; + Resource res; + ResourceTracer fTraceInfo; + }; +diff --git a/source/common/uresimp.h b/source/common/uresimp.h +index 51db6c52..f453ddc0 100644 +--- a/source/common/uresimp.h ++++ b/source/common/uresimp.h +@@ -67,6 +67,9 @@ struct UResourceBundle { + char *fVersion; + UResourceDataEntry *fTopLevelData; /* for getting the valid locale */ + char *fResPath; /* full path to the resource: "zh_TW/CollationElements/Sequence" */ ++ // TODO(ICU-20769): Try to change the by-value fResData into a pointer, ++ // with the struct in only one place for each bundle. ++ // Also replace class ResourceDataValue.resData with a pResData pointer again. + ResourceData fResData; + char fResBuf[RES_BUFSIZE]; + int32_t fResPathLen; +@@ -281,6 +284,11 @@ ures_getStringByKeyWithFallback(const UResourceBundle *resB, + + #ifdef __cplusplus + ++U_CAPI void U_EXPORT2 ++ures_getValueWithFallback(const UResourceBundle *bundle, const char *path, ++ UResourceBundle *tempFillIn, ++ icu::ResourceDataValue &value, UErrorCode &errorCode); ++ + U_CAPI void U_EXPORT2 + ures_getAllItemsWithFallback(const UResourceBundle *bundle, const char *path, + icu::ResourceSink &sink, UErrorCode &errorCode); diff --git a/patches/tracing.patch b/patches/tracing.patch new file mode 100644 index 0000000000000000000000000000000000000000..f8a7f705d7cdf303abcde2156e59b41cc3b660ce --- /dev/null +++ b/patches/tracing.patch @@ -0,0 +1,652 @@ +diff --git a/source/common/Makefile.in b/source/common/Makefile.in +index e663cb8e..79e371b0 100644 +--- a/source/common/Makefile.in ++++ b/source/common/Makefile.in +@@ -115,7 +115,8 @@ ulist.o uloc_tag.o icudataver.o icuplug.o \ + sharedobject.o simpleformatter.o unifiedcache.o uloc_keytype.o \ + ubiditransform.o \ + pluralmap.o \ +-static_unicode_sets.o ++static_unicode_sets.o \ ++restrace.o + + ## Header files to install + HEADERS = $(srcdir)/unicode/*.h +diff --git a/source/common/resource.h b/source/common/resource.h +index 3dbff785..ee93d41a 100644 +--- 
a/source/common/resource.h ++++ b/source/common/resource.h +@@ -28,6 +28,7 @@ + #include "unicode/utypes.h" + #include "unicode/unistr.h" + #include "unicode/ures.h" ++#include "restrace.h" + + struct ResourceData; + +@@ -47,8 +48,10 @@ public: + ResourceArray() : items16(NULL), items32(NULL), length(0) {} + + /** Only for implementation use. @internal */ +- ResourceArray(const uint16_t *i16, const uint32_t *i32, int32_t len) : +- items16(i16), items32(i32), length(len) {} ++ ResourceArray(const uint16_t *i16, const uint32_t *i32, int32_t len, ++ const ResourceTracer& traceInfo) : ++ items16(i16), items32(i32), length(len), ++ fTraceInfo(traceInfo) {} + + /** + * @return The number of items in the array resource. +@@ -68,6 +71,7 @@ private: + const uint16_t *items16; + const uint32_t *items32; + int32_t length; ++ ResourceTracer fTraceInfo; + }; + + /** +@@ -80,8 +84,10 @@ public: + + /** Only for implementation use. @internal */ + ResourceTable(const uint16_t *k16, const int32_t *k32, +- const uint16_t *i16, const uint32_t *i32, int32_t len) : +- keys16(k16), keys32(k32), items16(i16), items32(i32), length(len) {} ++ const uint16_t *i16, const uint32_t *i32, int32_t len, ++ const ResourceTracer& traceInfo) : ++ keys16(k16), keys32(k32), items16(i16), items32(i32), length(len), ++ fTraceInfo(traceInfo) {} + + /** + * @return The number of items in the array resource. 
+@@ -101,6 +107,7 @@ private: + const uint16_t *items16; + const uint32_t *items32; + int32_t length; ++ ResourceTracer fTraceInfo; + }; + + /** +diff --git a/source/common/udata.cpp b/source/common/udata.cpp +index b62095cd..1051f18f 100644 +--- a/source/common/udata.cpp ++++ b/source/common/udata.cpp +@@ -33,6 +33,7 @@ might have to #include some other header + #include "cstring.h" + #include "mutex.h" + #include "putilimp.h" ++#include "restrace.h" + #include "uassert.h" + #include "ucln_cmn.h" + #include "ucmndata.h" +@@ -1168,6 +1169,9 @@ doOpenChoice(const char *path, const char *type, const char *name, + UBool isICUData = FALSE; + + ++ FileTracer::traceOpen(path, type, name); ++ ++ + /* Is this path ICU data? */ + if(path == NULL || + !strcmp(path, U_ICUDATA_ALIAS) || /* "ICUDATA" */ +diff --git a/source/common/unicode/utrace.h b/source/common/unicode/utrace.h +index 66269784..412e11ad 100644 +--- a/source/common/unicode/utrace.h ++++ b/source/common/unicode/utrace.h +@@ -66,6 +66,7 @@ typedef enum UTraceFunctionNumber { + UTRACE_FUNCTION_START=0, + UTRACE_U_INIT=UTRACE_FUNCTION_START, + UTRACE_U_CLEANUP, ++ + #ifndef U_HIDE_DEPRECATED_API + /** + * One more than the highest normal collation trace location. +@@ -83,6 +84,7 @@ typedef enum UTraceFunctionNumber { + UTRACE_UCNV_FLUSH_CACHE, + UTRACE_UCNV_LOAD, + UTRACE_UCNV_UNLOAD, ++ + #ifndef U_HIDE_DEPRECATED_API + /** + * One more than the highest normal collation trace location. +@@ -101,13 +103,55 @@ typedef enum UTraceFunctionNumber { + UTRACE_UCOL_STRCOLLITER, + UTRACE_UCOL_OPEN_FROM_SHORT_STRING, + UTRACE_UCOL_STRCOLLUTF8, /**< @stable ICU 50 */ ++ + #ifndef U_HIDE_DEPRECATED_API + /** + * One more than the highest normal collation trace location. + * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. + */ +- UTRACE_COLLATION_LIMIT ++ UTRACE_COLLATION_LIMIT, + #endif // U_HIDE_DEPRECATED_API ++ ++#ifndef U_HIDE_DRAFT_API ++ ++ /** ++ * The lowest resource/data location. 
++ * @draft ICU 65 ++ */ ++ UTRACE_RES_DATA_START=0x3000, ++ ++ /** ++ * Indicates that a value was read from a resource bundle. Provides three ++ * C-style strings to UTraceData: type, file name, and resource path. The ++ * type is "string", "binary", "intvector", "int", or "uint". ++ * @draft ICU 65 ++ */ ++ UTRACE_UDATA_RESOURCE=UTRACE_RES_DATA_START, ++ ++ /** ++ * Indicates that a data file was opened. Provides one ++ * C-style string to UTraceData: file name. ++ * @draft ICU 65 ++ */ ++ UTRACE_UDATA_DATA_FILE, ++ ++ /** ++ * Indicates that a resource bundle (.res) file was opened. Provides one ++ * C-style string to UTraceData: file name. ++ * @draft ICU 65 ++ */ ++ UTRACE_UDATA_RES_FILE, ++ ++#endif // U_HIDE_DRAFT_API ++ ++#ifndef U_HIDE_INTERNAL_API ++ /** ++ * One more than the highest normal resource/data trace location. ++ * @internal The numeric value may change over time, see ICU ticket #12420. ++ */ ++ UTRACE_RES_DATA_LIMIT, ++#endif // U_HIDE_INTERNAL_API ++ + } UTraceFunctionNumber; + + /** +diff --git a/source/common/uresbund.cpp b/source/common/uresbund.cpp +index b20e3095..585c0e5f 100644 +--- a/source/common/uresbund.cpp ++++ b/source/common/uresbund.cpp +@@ -401,7 +401,8 @@ static UResourceDataEntry *init_entry(const char *localeID, const char *path, UE + /* We'll try to get alias string from the bundle */ + aliasres = res_getResource(&(r->fData), "%%ALIAS"); + if (aliasres != RES_BOGUS) { +- const UChar *alias = res_getString(&(r->fData), aliasres, &aliasLen); ++ // No tracing: called during initial data loading ++ const UChar *alias = res_getStringNoTrace(&(r->fData), aliasres, &aliasLen); + if(alias != NULL && aliasLen > 0) { /* if there is actual alias - unload and load new data */ + u_UCharsToChars(alias, aliasName, aliasLen+1); + r->fAlias = init_entry(aliasName, path, status); +@@ -542,7 +543,8 @@ loadParentsExceptRoot(UResourceDataEntry *&t1, + Resource parentRes = res_getResource(&t1->fData, "%%Parent"); + if
(parentRes != RES_BOGUS) { // An explicit parent was found. + int32_t parentLocaleLen = 0; +- const UChar *parentLocaleName = res_getString(&(t1->fData), parentRes, &parentLocaleLen); ++ // No tracing: called during initial data loading ++ const UChar *parentLocaleName = res_getStringNoTrace(&(t1->fData), parentRes, &parentLocaleLen); + if(parentLocaleName != NULL && 0 < parentLocaleLen && parentLocaleLen < nameCapacity) { + u_UCharsToChars(parentLocaleName, name, parentLocaleLen + 1); + if (uprv_strcmp(name, kRootLocaleName) == 0) { +@@ -1304,7 +1306,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getString(const UResourceBundle* resB, int32_ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } +- s = res_getString(&(resB->fResData), resB->fRes, len); ++ s = res_getString({resB}, &(resB->fResData), resB->fRes, len); + if (s == NULL) { + *status = U_RESOURCE_TYPE_MISMATCH; + } +@@ -1393,7 +1395,7 @@ U_CAPI const uint8_t* U_EXPORT2 ures_getBinary(const UResourceBundle* resB, int3 + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } +- p = res_getBinary(&(resB->fResData), resB->fRes, len); ++ p = res_getBinary({resB}, &(resB->fResData), resB->fRes, len); + if (p == NULL) { + *status = U_RESOURCE_TYPE_MISMATCH; + } +@@ -1410,7 +1412,7 @@ U_CAPI const int32_t* U_EXPORT2 ures_getIntVector(const UResourceBundle* resB, i + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } +- p = res_getIntVector(&(resB->fResData), resB->fRes, len); ++ p = res_getIntVector({resB}, &(resB->fResData), resB->fRes, len); + if (p == NULL) { + *status = U_RESOURCE_TYPE_MISMATCH; + } +@@ -1431,7 +1433,7 @@ U_CAPI int32_t U_EXPORT2 ures_getInt(const UResourceBundle* resB, UErrorCode *st + *status = U_RESOURCE_TYPE_MISMATCH; + return 0xffffffff; + } +- return RES_GET_INT(resB->fRes); ++ return res_getInt({resB}, resB->fRes); + } + + U_CAPI uint32_t U_EXPORT2 ures_getUInt(const UResourceBundle* resB, UErrorCode *status) { +@@ -1446,7 +1448,7 @@ U_CAPI uint32_t U_EXPORT2 ures_getUInt(const 
UResourceBundle* resB, UErrorCode * + *status = U_RESOURCE_TYPE_MISMATCH; + return 0xffffffff; + } +- return RES_GET_UINT(resB->fRes); ++ return res_getUInt({resB}, resB->fRes); + } + + U_CAPI UResType U_EXPORT2 ures_getType(const UResourceBundle *resB) { +@@ -1457,10 +1459,18 @@ U_CAPI UResType U_EXPORT2 ures_getType(const UResourceBundle *resB) { + } + + U_CAPI const char * U_EXPORT2 ures_getKey(const UResourceBundle *resB) { ++ // ++ // TODO: Trace ures_getKey? I guess not usually. ++ // ++ // We usually get the key string to decide whether we want the value, or to ++ // make a key-value pair. Tracing the value should suffice. ++ // ++ // However, I believe we have some data (e.g., in res_index) where the key ++ // strings are the data. Tracing the enclosing table should suffice. ++ // + if(resB == NULL) { + return NULL; + } +- + return(resB->fKey); + } + +@@ -1480,7 +1490,7 @@ static const UChar* ures_getStringWithAlias(const UResourceBundle *resB, Resourc + ures_close(tempRes); + return result; + } else { +- return res_getString(&(resB->fResData), r, len); ++ return res_getString({resB, sIndex}, &(resB->fResData), r, len); + } + } + +@@ -1516,7 +1526,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getNextString(UResourceBundle *resB, int32_t* + switch(RES_GET_TYPE(resB->fRes)) { + case URES_STRING: + case URES_STRING_V2: +- return res_getString(&(resB->fResData), resB->fRes, len); ++ return res_getString({resB}, &(resB->fResData), resB->fRes, len); + case URES_TABLE: + case URES_TABLE16: + case URES_TABLE32: +@@ -1661,7 +1671,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getStringByIndex(const UResourceBundle *resB, + switch(RES_GET_TYPE(resB->fRes)) { + case URES_STRING: + case URES_STRING_V2: +- return res_getString(&(resB->fResData), resB->fRes, len); ++ return res_getString({resB}, &(resB->fResData), resB->fRes, len); + case URES_TABLE: + case URES_TABLE16: + case URES_TABLE32: +@@ -1956,7 +1966,7 @@ void getAllItemsWithFallback( + value.pResData = &bundle->fResData; + 
UResourceDataEntry *parentEntry = bundle->fData->fParent; + UBool hasParent = parentEntry != NULL && U_SUCCESS(parentEntry->fBogus); +- value.setResource(bundle->fRes); ++ value.setResource(bundle->fRes, ResourceTracer(bundle)); + sink.put(bundle->fKey, value, !hasParent, errorCode); + if (hasParent) { + // We might try to query the sink whether +@@ -2108,7 +2118,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getStringByKey(const UResourceBundle *resB, c + switch (RES_GET_TYPE(res)) { + case URES_STRING: + case URES_STRING_V2: +- return res_getString(rd, res, len); ++ return res_getString({resB, key}, rd, res, len); + case URES_ALIAS: + { + const UChar* result = 0; +@@ -2130,7 +2140,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getStringByKey(const UResourceBundle *resB, c + switch (RES_GET_TYPE(res)) { + case URES_STRING: + case URES_STRING_V2: +- return res_getString(&(resB->fResData), res, len); ++ return res_getString({resB, key}, &(resB->fResData), res, len); + case URES_ALIAS: + { + const UChar* result = 0; +@@ -2151,6 +2161,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getStringByKey(const UResourceBundle *resB, c + /* here should go a first attempt to locate the key using index table */ + const ResourceData *rd = getFallbackData(resB, &key, &realData, &res, status); + if(U_SUCCESS(*status)) { ++ // TODO: Tracing + return res_getString(rd, res, len); + } else { + *status = U_MISSING_RESOURCE_ERROR; +diff --git a/source/common/uresdata.cpp b/source/common/uresdata.cpp +index 8bcb9ab8..ce04142f 100644 +--- a/source/common/uresdata.cpp ++++ b/source/common/uresdata.cpp +@@ -33,6 +33,7 @@ + #include "uinvchar.h" + #include "uresdata.h" + #include "uresimp.h" ++#include "utracimp.h" + + /* + * Resource access helpers +@@ -307,7 +308,7 @@ res_getPublicType(Resource res) { + } + + U_CAPI const UChar * U_EXPORT2 +-res_getString(const ResourceData *pResData, Resource res, int32_t *pLength) { ++res_getStringNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength) { + const 
UChar *p; + uint32_t offset=RES_GET_OFFSET(res); + int32_t length; +@@ -402,7 +403,8 @@ int32_t getStringArray(const ResourceData *pResData, const icu::ResourceArray &a + } + for(int32_t i = 0; i < length; ++i) { + int32_t sLength; +- const UChar *s = res_getString(pResData, array.internalGetResource(pResData, i), &sLength); ++ // No tracing: handled by the caller ++ const UChar *s = res_getStringNoTrace(pResData, array.internalGetResource(pResData, i), &sLength); + if(s == NULL) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + return 0; +@@ -434,7 +436,7 @@ res_getAlias(const ResourceData *pResData, Resource res, int32_t *pLength) { + } + + U_CAPI const uint8_t * U_EXPORT2 +-res_getBinary(const ResourceData *pResData, Resource res, int32_t *pLength) { ++res_getBinaryNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength) { + const uint8_t *p; + uint32_t offset=RES_GET_OFFSET(res); + int32_t length; +@@ -454,7 +456,7 @@ res_getBinary(const ResourceData *pResData, Resource res, int32_t *pLength) { + + + U_CAPI const int32_t * U_EXPORT2 +-res_getIntVector(const ResourceData *pResData, Resource res, int32_t *pLength) { ++res_getIntVectorNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength) { + const int32_t *p; + uint32_t offset=RES_GET_OFFSET(res); + int32_t length; +@@ -507,7 +509,7 @@ const UChar *ResourceDataValue::getString(int32_t &length, UErrorCode &errorCode + if(U_FAILURE(errorCode)) { + return NULL; + } +- const UChar *s = res_getString(pResData, res, &length); ++ const UChar *s = res_getString(fTraceInfo, pResData, res, &length); + if(s == NULL) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +@@ -532,7 +534,7 @@ int32_t ResourceDataValue::getInt(UErrorCode &errorCode) const { + if(RES_GET_TYPE(res) != URES_INT) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +- return RES_GET_INT(res); ++ return res_getInt(fTraceInfo, res); + } + + uint32_t ResourceDataValue::getUInt(UErrorCode &errorCode) const { +@@ -542,14 +544,14 @@ uint32_t 
ResourceDataValue::getUInt(UErrorCode &errorCode) const { + if(RES_GET_TYPE(res) != URES_INT) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +- return RES_GET_UINT(res); ++ return res_getUInt(fTraceInfo, res); + } + + const int32_t *ResourceDataValue::getIntVector(int32_t &length, UErrorCode &errorCode) const { + if(U_FAILURE(errorCode)) { + return NULL; + } +- const int32_t *iv = res_getIntVector(pResData, res, &length); ++ const int32_t *iv = res_getIntVector(fTraceInfo, pResData, res, &length); + if(iv == NULL) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +@@ -560,7 +562,7 @@ const uint8_t *ResourceDataValue::getBinary(int32_t &length, UErrorCode &errorCo + if(U_FAILURE(errorCode)) { + return NULL; + } +- const uint8_t *b = res_getBinary(pResData, res, &length); ++ const uint8_t *b = res_getBinary(fTraceInfo, pResData, res, &length); + if(b == NULL) { + errorCode = U_RESOURCE_TYPE_MISMATCH; + } +@@ -590,7 +592,7 @@ ResourceArray ResourceDataValue::getArray(UErrorCode &errorCode) const { + errorCode = U_RESOURCE_TYPE_MISMATCH; + return ResourceArray(); + } +- return ResourceArray(items16, items32, length); ++ return ResourceArray(items16, items32, length, fTraceInfo); + } + + ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { +@@ -627,7 +629,7 @@ ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { + errorCode = U_RESOURCE_TYPE_MISMATCH; + return ResourceTable(); + } +- return ResourceTable(keys16, keys32, items16, items32, length); ++ return ResourceTable(keys16, keys32, items16, items32, length, fTraceInfo); + } + + UBool ResourceDataValue::isNoInheritanceMarker() const { +@@ -656,7 +658,7 @@ int32_t ResourceDataValue::getStringArrayOrStringAsArray(UnicodeString *dest, in + return 1; + } + int32_t sLength; +- const UChar *s = res_getString(pResData, res, &sLength); ++ const UChar *s = res_getString(fTraceInfo, pResData, res, &sLength); + if(s != NULL) { + dest[0].setTo(TRUE, s, sLength); + return 1; +@@ -671,7 +673,7 @@ 
UnicodeString ResourceDataValue::getStringOrFirstOfArray(UErrorCode &errorCode) + return us; + } + int32_t sLength; +- const UChar *s = res_getString(pResData, res, &sLength); ++ const UChar *s = res_getString(fTraceInfo, pResData, res, &sLength); + if(s != NULL) { + us.setTo(TRUE, s, sLength); + return us; +@@ -681,7 +683,8 @@ UnicodeString ResourceDataValue::getStringOrFirstOfArray(UErrorCode &errorCode) + return us; + } + if(array.getSize() > 0) { +- s = res_getString(pResData, array.internalGetResource(pResData, 0), &sLength); ++ // Tracing is already performed above (unimportant for trace that this is an array) ++ s = res_getStringNoTrace(pResData, array.internalGetResource(pResData, 0), &sLength); + if(s != NULL) { + us.setTo(TRUE, s, sLength); + return us; +@@ -829,7 +832,11 @@ UBool icu::ResourceTable::getKeyAndValue(int32_t i, + } else { + res = items32[i]; + } +- rdValue.setResource(res); ++ // Note: the ResourceTracer keeps a reference to the field of this ++ // ResourceTable. This is OK because the ResourceTable should remain ++ // alive for the duration that fields are being read from it ++ // (including nested fields). ++ rdValue.setResource(res, ResourceTracer(fTraceInfo, key)); + return TRUE; + } + return FALSE; +@@ -875,7 +882,13 @@ uint32_t icu::ResourceArray::internalGetResource(const ResourceData *pResData, i + UBool icu::ResourceArray::getValue(int32_t i, icu::ResourceValue &value) const { + if(0 <= i && i < length) { + icu::ResourceDataValue &rdValue = static_cast<icu::ResourceDataValue &>(value); +- rdValue.setResource(internalGetResource(rdValue.pResData, i)); ++ // Note: the ResourceTracer keeps a reference to the field of this ++ // ResourceArray. This is OK because the ResourceArray should remain ++ // alive for the duration that fields are being read from it ++ // (including nested fields). 
++ rdValue.setResource( ++ internalGetResource(rdValue.pResData, i), ++ ResourceTracer(fTraceInfo, i)); + return TRUE; + } + return FALSE; +diff --git a/source/common/uresdata.h b/source/common/uresdata.h +index 4e28ddcc..51647409 100644 +--- a/source/common/uresdata.h ++++ b/source/common/uresdata.h +@@ -69,14 +69,16 @@ typedef uint32_t Resource; + #define RES_GET_OFFSET(res) ((res)&0x0fffffff) + #define RES_GET_POINTER(pRoot, res) ((pRoot)+RES_GET_OFFSET(res)) + +-/* get signed and unsigned integer values directly from the Resource handle */ ++/* get signed and unsigned integer values directly from the Resource handle ++ * NOTE: For proper logging, please use the inline res_getInt() and res_getUInt() functions ++ */ + #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC +-# define RES_GET_INT(res) (((int32_t)((res)<<4L))>>4L) ++# define RES_GET_INT_NO_TRACE(res) (((int32_t)((res)<<4L))>>4L) + #else +-# define RES_GET_INT(res) (int32_t)(((res)&0x08000000) ? (res)|0xf0000000 : (res)&0x07ffffff) ++# define RES_GET_INT_NO_TRACE(res) (int32_t)(((res)&0x08000000) ? (res)|0xf0000000 : (res)&0x07ffffff) + #endif + +-#define RES_GET_UINT(res) ((res)&0x0fffffff) ++#define RES_GET_UINT_NO_TRACE(res) ((res)&0x0fffffff) + + #define URES_IS_ARRAY(type) ((int32_t)(type)==URES_ARRAY || (int32_t)(type)==URES_ARRAY16) + #define URES_IS_TABLE(type) ((int32_t)(type)==URES_TABLE || (int32_t)(type)==URES_TABLE16 || (int32_t)(type)==URES_TABLE32) +@@ -423,22 +425,26 @@ res_unload(ResourceData *pResData); + U_INTERNAL UResType U_EXPORT2 + res_getPublicType(Resource res); + ++/////////////////////////////////////////////////////////////////////////// ++// To enable tracing, use the inline versions of the res_get* functions. // ++/////////////////////////////////////////////////////////////////////////// ++ + /* + * Return a pointer to a zero-terminated, const UChar* string + * and set its length in *pLength. + * Returns NULL if not found.
+ */ + U_INTERNAL const UChar * U_EXPORT2 +-res_getString(const ResourceData *pResData, Resource res, int32_t *pLength); +- +-U_INTERNAL const UChar * U_EXPORT2 +-res_getAlias(const ResourceData *pResData, Resource res, int32_t *pLength); ++res_getStringNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength); + + U_INTERNAL const uint8_t * U_EXPORT2 +-res_getBinary(const ResourceData *pResData, Resource res, int32_t *pLength); ++res_getBinaryNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength); + + U_INTERNAL const int32_t * U_EXPORT2 +-res_getIntVector(const ResourceData *pResData, Resource res, int32_t *pLength); ++res_getIntVectorNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength); ++ ++U_INTERNAL const UChar * U_EXPORT2 ++res_getAlias(const ResourceData *pResData, Resource res, int32_t *pLength); + + U_INTERNAL Resource U_EXPORT2 + res_getResource(const ResourceData *pResData, const char *key); +@@ -470,16 +476,54 @@ U_CFUNC Resource res_findResource(const ResourceData *pResData, Resource r, + #ifdef __cplusplus + + #include "resource.h" ++#include "restrace.h" + + U_NAMESPACE_BEGIN + ++inline const UChar* res_getString(const ResourceTracer& traceInfo, ++ const ResourceData *pResData, Resource res, int32_t *pLength) { ++ traceInfo.trace("string"); ++ return res_getStringNoTrace(pResData, res, pLength); ++} ++ ++inline const uint8_t* res_getBinary(const ResourceTracer& traceInfo, ++ const ResourceData *pResData, Resource res, int32_t *pLength) { ++ traceInfo.trace("binary"); ++ return res_getBinaryNoTrace(pResData, res, pLength); ++} ++ ++inline const int32_t* res_getIntVector(const ResourceTracer& traceInfo, ++ const ResourceData *pResData, Resource res, int32_t *pLength) { ++ traceInfo.trace("intvector"); ++ return res_getIntVectorNoTrace(pResData, res, pLength); ++} ++ ++inline int32_t res_getInt(const ResourceTracer& traceInfo, Resource res) { ++ traceInfo.trace("int"); ++ return RES_GET_INT_NO_TRACE(res); ++} 
++ ++inline uint32_t res_getUInt(const ResourceTracer& traceInfo, Resource res) { ++ traceInfo.trace("uint"); ++ return RES_GET_UINT_NO_TRACE(res); ++} ++ + class ResourceDataValue : public ResourceValue { + public: +- ResourceDataValue() : pResData(NULL), res(static_cast<Resource>(URES_NONE)) {} ++ ResourceDataValue() : ++ pResData(NULL), ++ res(static_cast<Resource>(URES_NONE)), ++ fTraceInfo() {} + virtual ~ResourceDataValue(); + +- void setData(const ResourceData *data) { pResData = data; } +- void setResource(Resource r) { res = r; } ++ void setData(const ResourceData *data) { ++ pResData = data; ++ } ++ ++ void setResource(Resource r, ResourceTracer&& traceInfo) { ++ res = r; ++ fTraceInfo = traceInfo; ++ } + + virtual UResType getType() const; + virtual const UChar *getString(int32_t &length, UErrorCode &errorCode) const; +@@ -501,6 +545,7 @@ public: + + private: + Resource res; ++ ResourceTracer fTraceInfo; + }; + + U_NAMESPACE_END +diff --git a/source/common/utrace.cpp b/source/common/utrace.cpp +index 2ac3d77c..eced03b8 100644 +--- a/source/common/utrace.cpp ++++ b/source/common/utrace.cpp +@@ -476,6 +476,15 @@ trCollNames[] = { + NULL + }; + ++ ++static const char* const ++trResDataNames[] = { ++ "ResourceTracer::trace", ++ "FileTracer::traceOpenDataFile", ++ "FileTracer::traceOpenResFile", ++ NULL ++}; ++ + + U_CAPI const char * U_EXPORT2 + utrace_functionName(int32_t fnNumber) { +@@ -485,6 +494,8 @@ utrace_functionName(int32_t fnNumber) { + return trConvNames[fnNumber - UTRACE_CONVERSION_START]; + } else if(UTRACE_COLLATION_START <= fnNumber && fnNumber < UTRACE_COLLATION_LIMIT){ + return trCollNames[fnNumber - UTRACE_COLLATION_START]; ++ } else if(UTRACE_RES_DATA_START <= fnNumber && fnNumber < UTRACE_RES_DATA_LIMIT){ ++ return trResDataNames[fnNumber - UTRACE_RES_DATA_START]; + } else { + return "[BOGUS Trace Function Number]"; + } +diff --git a/source/tools/toolutil/pkgitems.cpp b/source/tools/toolutil/pkgitems.cpp +index cb23b45e..7b86c55f 100644 
+--- a/source/tools/toolutil/pkgitems.cpp ++++ b/source/tools/toolutil/pkgitems.cpp +@@ -305,7 +305,8 @@ ures_enumDependencies(const char *itemName, + break; + } + int32_t length; +- const UChar *alias=res_getString(pResData, res, &length); ++ // No tracing: build tool ++ const UChar *alias=res_getStringNoTrace(pResData, res, &length); + checkAlias(itemName, res, alias, length, useResSuffix, check, context, pErrorCode); + } + break; diff --git a/patches/trie.patch b/patches/trie.patch new file mode 100644 index 0000000000000000000000000000000000000000..7fefb6b1853f080836bdc230e35c468ea0d4d606 --- /dev/null +++ b/patches/trie.patch @@ -0,0 +1,116 @@ +diff --git a/source/common/unicode/bytestrie.h b/source/common/unicode/bytestrie.h +index c57b8ccf..1a35f604 100644 +--- a/source/common/unicode/bytestrie.h ++++ b/source/common/unicode/bytestrie.h +@@ -94,6 +94,39 @@ public: + return *this; + } + ++ /** ++ * Returns the state of this trie as a 64-bit integer. ++ * The state value is never 0. ++ * ++ * @return opaque state value ++ * @see resetToState64 ++ * @draft ICU 65 ++ */ ++ uint64_t getState64() const { ++ return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) | ++ (uint64_t)(pos_ - bytes_); ++ } ++ ++ /** ++ * Resets this trie to the saved state. ++ * Unlike resetToState(State), the 64-bit state value ++ * must be from getState64() from the same trie object or ++ * from one initialized the exact same way. ++ * Because of no validation, this method is faster. ++ * ++ * @param state The opaque trie state value from getState64(). 
++ * @return *this ++ * @see getState64 ++ * @see resetToState ++ * @see reset ++ * @draft ICU 65 ++ */ ++ BytesTrie &resetToState64(uint64_t state) { ++ remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2; ++ pos_ = bytes_ + (state & kState64PosMask); ++ return *this; ++ } ++ + /** + * BytesTrie state object, for saving a trie's current state + * and resetting the trie back to this state later. +@@ -502,6 +535,13 @@ private: + static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff + static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff + ++ // For getState64(): ++ // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 ++ // so we need at least 5 bits for that. ++ // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. ++ static constexpr int32_t kState64RemainingShift = 59; ++ static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1; ++ + uint8_t *ownedArray_; + + // Fixed value referencing the BytesTrie bytes. +diff --git a/source/common/unicode/ucharstrie.h b/source/common/unicode/ucharstrie.h +index dfc93f6d..b8c83a6e 100644 +--- a/source/common/unicode/ucharstrie.h ++++ b/source/common/unicode/ucharstrie.h +@@ -94,6 +94,39 @@ public: + return *this; + } + ++ /** ++ * Returns the state of this trie as a 64-bit integer. ++ * The state value is never 0. ++ * ++ * @return opaque state value ++ * @see resetToState64 ++ * @draft ICU 65 ++ */ ++ uint64_t getState64() const { ++ return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) | ++ (uint64_t)(pos_ - uchars_); ++ } ++ ++ /** ++ * Resets this trie to the saved state. ++ * Unlike resetToState(State), the 64-bit state value ++ * must be from getState64() from the same trie object or ++ * from one initialized the exact same way. ++ * Because of no validation, this method is faster. 
++ * ++ * @param state The opaque trie state value from getState64(). ++ * @return *this ++ * @see getState64 ++ * @see resetToState ++ * @see reset ++ * @draft ICU 65 ++ */ ++ UCharsTrie &resetToState64(uint64_t state) { ++ remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2; ++ pos_ = uchars_ + (state & kState64PosMask); ++ return *this; ++ } ++ + /** + * UCharsTrie state object, for saving a trie's current state + * and resetting the trie back to this state later. +@@ -560,6 +593,13 @@ private: + + static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff + ++ // For getState64(): ++ // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 ++ // so we need at least 5 bits for that. ++ // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. ++ static constexpr int32_t kState64RemainingShift = 59; ++ static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1; ++ + char16_t *ownedArray_; + + // Fixed value referencing the UCharsTrie words. 
diff --git a/patches/usePool.patch b/patches/usePool.patch new file mode 100644 index 0000000000000000000000000000000000000000..e8b7ad33555e7717be14b8660de10c917f46d30f --- /dev/null +++ b/patches/usePool.patch @@ -0,0 +1,106 @@ +diff --git a/source/data/BUILDRULES.py b/source/data/BUILDRULES.py +index 2442f4e3..d9e8ac19 100644 +--- a/source/data/BUILDRULES.py ++++ b/source/data/BUILDRULES.py +@@ -43,48 +43,49 @@ def generate(config, glob, common_vars): + "locales", + None, + "icu-locale-deprecates.xml", +- True, ++ config.use_pool_bundle, + []) + + requests += generate_tree(config, glob, common_vars, + "curr", + "curr", + "icu-locale-deprecates.xml", +- True, ++ config.use_pool_bundle, + []) + + requests += generate_tree(config, glob, common_vars, + "lang", + "lang", + "icu-locale-deprecates.xml", +- True, ++ config.use_pool_bundle, + []) + + requests += generate_tree(config, glob, common_vars, + "region", + "region", + "icu-locale-deprecates.xml", +- True, ++ config.use_pool_bundle, + []) + + requests += generate_tree(config, glob, common_vars, + "zone", + "zone", + "icu-locale-deprecates.xml", +- True, ++ config.use_pool_bundle, + []) + + requests += generate_tree(config, glob, common_vars, + "unit", + "unit", + "icu-locale-deprecates.xml", +- True, ++ config.use_pool_bundle, + []) + + requests += generate_tree(config, glob, common_vars, + "coll", + "coll", + "icu-coll-deprecates.xml", ++ # Never use pool bundle for coll, brkitr, or rbnf + False, + # Depends on timezoneTypes.res and keyTypeData.res. + # TODO: We should not need this dependency to build collation. 
+@@ -95,6 +96,7 @@ def generate(config, glob, common_vars): + "brkitr", + "brkitr", + "icu-locale-deprecates.xml", ++ # Never use pool bundle for coll, brkitr, or rbnf + False, + [DepTarget("brkitr_brk"), DepTarget("dictionaries")]) + +@@ -102,6 +104,7 @@ def generate(config, glob, common_vars): + "rbnf", + "rbnf", + "icu-rbnf-deprecates.xml", ++ # Never use pool bundle for coll, brkitr, or rbnf + False, + []) + +diff --git a/source/data/buildtool/__main__.py b/source/data/buildtool/__main__.py +index 52d869c8..30cfcdc2 100644 +--- a/source/data/buildtool/__main__.py ++++ b/source/data/buildtool/__main__.py +@@ -141,6 +141,11 @@ class Config(object): + if "collationUCAData" in self.filters_json_data: + self.coll_han_type = self.filters_json_data["collationUCAData"] + ++ # True or False (could be extended later to support enum/list) ++ self.use_pool_bundle = True ++ if "usePoolBundle" in self.filters_json_data: ++ self.use_pool_bundle = self.filters_json_data["usePoolBundle"] ++ + def _parse_filter_file(self, f): + # Use the Hjson parser if it is available; otherwise, use vanilla JSON. 
+ try: +diff --git a/source/data/buildtool/filtration_schema.json b/source/data/buildtool/filtration_schema.json +index 479c65af..c9f9b8cd 100644 +--- a/source/data/buildtool/filtration_schema.json ++++ b/source/data/buildtool/filtration_schema.json +@@ -57,6 +57,9 @@ + "collationUCAData": { + "type": "string", + "enum": ["unihan", "implicithan"] ++ }, ++ "usePoolBundle": { ++ "type": "boolean" + } + }, + "additionalProperties": false, diff --git a/scripts/copy_data.sh b/scripts/copy_data.sh index 234a94979af3ecdcd32d45f7057e9c6f686c9397..8e1a4e916779ed5803782bcaf936dcf974931e6c 100755 --- a/scripts/copy_data.sh +++ b/scripts/copy_data.sh @@ -9,7 +9,7 @@ if [ $# -lt 1 ]; then - echo "Usage: "$0" (android|android_small|cast|chromeos|common|flutter|ios)" >&2 + echo "Usage: "$0" (android|android_extra|android_small|cast|chromeos|common|flutter|ios)" >&2 exit 1 fi @@ -47,6 +47,22 @@ function copy_data { echo "Done with copying pre-built ICU data file for $1." } +function copy_android_extra { + echo "Copying icudtl_extra.dat for AndroidExtra" + + LD_LIBRARY_PATH=lib/ bin/icupkg -r \ + "${TOPSRC}/filters/android-extra-removed-resources.txt" \ + "data/out/tmp/icudt${VERSION}l.dat" + + echo "AFTER strip out the content is" + LD_LIBRARY_PATH=lib/ bin/icupkg -l \ + "data/out/tmp/icudt${VERSION}l.dat" + + cp "data/out/tmp/icudt${VERSION}l.dat" "${TOPSRC}/android_small/icudtl_extra.dat" + + echo "Done with copying pre-built ICU data file for AndroidExtra." +} + BACKUP_DIR="dataout/$1" function backup_outdir { @@ -72,6 +88,10 @@ case "$1" in copy_data AndroidSmall $1 backup_outdir $1 ;; + "android_extra") + copy_android_extra + backup_outdir $1 + ;; "ios") copy_data iOS $1 backup_outdir $1 diff --git a/scripts/diff_data.sh b/scripts/diff_data.sh index 3ab81a436626662c5d3bfa3fc1c9c39574c09787..5c0d0f3eee30354f3891103262251f3e10026df5 100755 --- a/scripts/diff_data.sh +++ b/scripts/diff_data.sh @@ -6,7 +6,7 @@ ICUROOT="$(dirname "$0")/.." 
if [ $# -lt 3 ]; then - echo "Usage: "$0" (android|cast|chromeos|common|flutter|ios) icubuilddir1 icubuilddir2" >&2 + echo "Usage: "$0" (android|android_small|android_extra|cast|chromeos|common|flutter|ios) icubuilddir1 icubuilddir2" >&2 echo "$0 compare data files of a particular build inside two icu build directory." >&2 echo "These files were previously archived by backup_outdir in scripts/copy_data.sh." >&2 echo "The first parameter indicate which build to be compared." >&2 diff --git a/scripts/diff_data_all.sh b/scripts/diff_data_all.sh index 29e52ba9a24cfd00edc9d05a8323ddb8f0684556..0a3853ea82d22c6a2bb58c9dcc256718259a46d4 100755 --- a/scripts/diff_data_all.sh +++ b/scripts/diff_data_all.sh @@ -18,7 +18,7 @@ DIR2=$2 echo "#######################################################" echo " ICUDT*L.DAT FILE SIZE REPORT" echo "#######################################################" -for build in "chromeos" "common" "cast" "android" "android_small" "ios" "flutter" +for build in "chromeos" "common" "cast" "android" "android_small" "android_extra" "ios" "flutter" do ICUDT_L_DAT1=`ls ${DIR1}/dataout/${build}/data/out/tmp/icudt*l.dat` ICUDT_L_DAT2=`ls ${DIR2}/dataout/${build}/data/out/tmp/icudt*l.dat` diff --git a/scripts/make_data_all.sh b/scripts/make_data_all.sh index 46c58790cbb6ee119c7f4f7b6ff23376be1eed1a..cfeb49e804af033cb176f8c42e179114b84127b8 100755 --- a/scripts/make_data_all.sh +++ b/scripts/make_data_all.sh @@ -8,7 +8,7 @@ function config_data { if [ $# -lt 1 ]; then echo "config target missing." 
>&2 - echo "Should be (android|android_small|cast|chromeos|common|flutter|ios)" >&2 + echo "Should be (android|android_extra|android_small|cast|chromeos|common|flutter|ios)" >&2 exit 1 fi @@ -53,6 +53,12 @@ config_data android_small make -j 120 $ICUROOT/scripts/copy_data.sh android_small +echo "Build the filtered data for AndroidExtra" +(cd data && make clean) +config_data android_extra +make -j 120 +$ICUROOT/scripts/copy_data.sh android_extra + echo "Build the filtered data for iOS" (cd data && make clean) config_data ios diff --git a/source/common/Makefile.in b/source/common/Makefile.in index e663cb8e04b58a052670b6dedb713b4de1af67e2..d21f5d06b123c686b02fe9a6112f9a14c911deca 100644 --- a/source/common/Makefile.in +++ b/source/common/Makefile.in @@ -88,8 +88,9 @@ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \ ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \ resource.o uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \ ucurr.o \ -localebuilder.o \ +localebuilder.o localeprioritylist.o \ messagepattern.o ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o locdspnm.o loclikely.o locresdata.o \ +lsr.o loclikelysubtags.o locdistance.o localematcher.o \ bytestream.o stringpiece.o bytesinkutil.o \ stringtriebuilder.o bytestriebuilder.o \ bytestrie.o bytestrieiterator.o \ @@ -115,7 +116,8 @@ ulist.o uloc_tag.o icudataver.o icuplug.o \ sharedobject.o simpleformatter.o unifiedcache.o uloc_keytype.o \ ubiditransform.o \ pluralmap.o \ -static_unicode_sets.o +static_unicode_sets.o \ +restrace.o ## Header files to install HEADERS = $(srcdir)/unicode/*.h diff --git a/source/common/charstr.cpp b/source/common/charstr.cpp index 852cc539457760546b4dfaf122edff0187e1cc88..dda29dac63273c9bd904f45e84fb7182a4842669 100644 --- a/source/common/charstr.cpp +++ b/source/common/charstr.cpp @@ -35,6 +35,17 @@ CharString& CharString::operator=(CharString&& src) U_NOEXCEPT { return *this; } +char 
*CharString::cloneData(UErrorCode &errorCode) const { + if (U_FAILURE(errorCode)) { return nullptr; } + char *p = static_cast<char *>(uprv_malloc(len + 1)); + if (p == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + uprv_memcpy(p, buffer.getAlias(), len + 1); + return p; +} + CharString &CharString::copyFrom(const CharString &s, UErrorCode &errorCode) { if(U_SUCCESS(errorCode) && this!=&s && ensureCapacity(s.len+1, 0, errorCode)) { len=s.len; @@ -52,6 +63,18 @@ int32_t CharString::lastIndexOf(char c) const { return -1; } +bool CharString::contains(StringPiece s) const { + if (s.empty()) { return false; } + const char *p = buffer.getAlias(); + int32_t lastStart = len - s.length(); + for (int32_t i = 0; i <= lastStart; ++i) { + if (uprv_memcmp(p + i, s.data(), s.length()) == 0) { + return true; + } + } + return false; +} + CharString &CharString::truncate(int32_t newLength) { if(newLength<0) { newLength=0; diff --git a/source/common/charstr.h b/source/common/charstr.h index 1a97e01988f991b869773488a5f9c5cd0734b880..23b950ed6ecc769f275e3018406a2ace87936eb6 100644 --- a/source/common/charstr.h +++ b/source/common/charstr.h @@ -82,10 +82,24 @@ public: const char *data() const { return buffer.getAlias(); } char *data() { return buffer.getAlias(); } + /** + * Allocates length()+1 chars and copies the NUL-terminated data(). + * The caller must uprv_free() the result. 
+ */ + char *cloneData(UErrorCode &errorCode) const; + + bool operator==(StringPiece other) const { + return len == other.length() && (len == 0 || uprv_memcmp(data(), other.data(), len) == 0); + } + bool operator!=(StringPiece other) const { + return !operator==(other); + } /** @return last index of c, or -1 if c is not in this string */ int32_t lastIndexOf(char c) const; + bool contains(StringPiece s) const; + CharString &clear() { len=0; buffer[0]=0; return *this; } CharString &truncate(int32_t newLength); diff --git a/source/common/localebuilder.cpp b/source/common/localebuilder.cpp index fe931fcf759dfdd20f5638989b2cb1394f5ebfe3..837b92f1837dc517b8a89bb6891070bd56d2a785 100644 --- a/source/common/localebuilder.cpp +++ b/source/common/localebuilder.cpp @@ -157,13 +157,18 @@ _isKeywordValue(const char* key, const char* value, int32_t value_len) } static void -_copyExtensions(const Locale& from, Locale* to, bool validate, UErrorCode& errorCode) +_copyExtensions(const Locale& from, icu::StringEnumeration *keywords, + Locale& to, bool validate, UErrorCode& errorCode) { if (U_FAILURE(errorCode)) { return; } - LocalPointer<icu::StringEnumeration> iter(from.createKeywords(errorCode)); - if (U_FAILURE(errorCode) || iter.isNull()) { return; } + LocalPointer<icu::StringEnumeration> ownedKeywords; + if (keywords == nullptr) { + ownedKeywords.adoptInstead(from.createKeywords(errorCode)); + if (U_FAILURE(errorCode) || ownedKeywords.isNull()) { return; } + keywords = ownedKeywords.getAlias(); + } const char* key; - while ((key = iter->next(nullptr, errorCode)) != nullptr) { + while ((key = keywords->next(nullptr, errorCode)) != nullptr) { CharString value; CharStringByteSink sink(&value); from.getKeywordValue(key, sink, errorCode); @@ -176,34 +181,34 @@ _copyExtensions(const Locale& from, Locale* to, bool validate, UErrorCode& error errorCode = U_ILLEGAL_ARGUMENT_ERROR; return; } - to->setKeywordValue(key, value.data(), errorCode); + to.setKeywordValue(key, value.data(), 
errorCode); if (U_FAILURE(errorCode)) { return; } } } void static -_clearUAttributesAndKeyType(Locale* locale, UErrorCode& errorCode) +_clearUAttributesAndKeyType(Locale& locale, UErrorCode& errorCode) { // Clear Unicode attributes - locale->setKeywordValue(kAttributeKey, "", errorCode); + locale.setKeywordValue(kAttributeKey, "", errorCode); // Clear all Unicode keyword values - LocalPointer<icu::StringEnumeration> iter(locale->createUnicodeKeywords(errorCode)); + LocalPointer<icu::StringEnumeration> iter(locale.createUnicodeKeywords(errorCode)); if (U_FAILURE(errorCode) || iter.isNull()) { return; } const char* key; while ((key = iter->next(nullptr, errorCode)) != nullptr) { - locale->setUnicodeKeywordValue(key, nullptr, errorCode); + locale.setUnicodeKeywordValue(key, nullptr, errorCode); } } static void -_setUnicodeExtensions(Locale* locale, const CharString& value, UErrorCode& errorCode) +_setUnicodeExtensions(Locale& locale, const CharString& value, UErrorCode& errorCode) { // Add the unicode extensions to extensions_ CharString locale_str("und-u-", errorCode); locale_str.append(value, errorCode); _copyExtensions( - Locale::forLanguageTag(locale_str.data(), errorCode), + Locale::forLanguageTag(locale_str.data(), errorCode), nullptr, locale, false, errorCode); } @@ -235,10 +240,10 @@ LocaleBuilder& LocaleBuilder::setExtension(char key, StringPiece value) status_); return *this; } - _clearUAttributesAndKeyType(extensions_, status_); + _clearUAttributesAndKeyType(*extensions_, status_); if (U_FAILURE(status_)) { return *this; } if (!value.empty()) { - _setUnicodeExtensions(extensions_, value_str, status_); + _setUnicodeExtensions(*extensions_, value_str, status_); } return *this; } @@ -401,6 +406,24 @@ Locale makeBogusLocale() { return bogus; } +void LocaleBuilder::copyExtensionsFrom(const Locale& src, UErrorCode& errorCode) +{ + if (U_FAILURE(errorCode)) { return; } + LocalPointer<icu::StringEnumeration> keywords(src.createKeywords(errorCode)); + if 
(U_FAILURE(errorCode) || keywords.isNull() || keywords->count(errorCode) == 0) { + // Error, or no extensions to copy. + return; + } + if (extensions_ == nullptr) { + extensions_ = new Locale(); + if (extensions_ == nullptr) { + status_ = U_MEMORY_ALLOCATION_ERROR; + return; + } + } + _copyExtensions(src, keywords.getAlias(), *extensions_, false, errorCode); +} + Locale LocaleBuilder::build(UErrorCode& errorCode) { if (U_FAILURE(errorCode)) { @@ -425,7 +448,7 @@ Locale LocaleBuilder::build(UErrorCode& errorCode) } Locale product(locale_str.data()); if (extensions_ != nullptr) { - _copyExtensions(*extensions_, &product, true, errorCode); + _copyExtensions(*extensions_, nullptr, product, true, errorCode); } if (U_FAILURE(errorCode)) { return makeBogusLocale(); diff --git a/source/common/localematcher.cpp b/source/common/localematcher.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d975fe759b4ce1911e2793567b1f0e7267aecdda --- /dev/null +++ b/source/common/localematcher.cpp @@ -0,0 +1,720 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// localematcher.cpp +// created: 2019may08 Markus W. Scherer + +#ifndef __LOCMATCHER_H__ +#define __LOCMATCHER_H__ + +#include "unicode/utypes.h" +#include "unicode/localebuilder.h" +#include "unicode/localematcher.h" +#include "unicode/locid.h" +#include "unicode/stringpiece.h" +#include "unicode/uobject.h" +#include "cstring.h" +#include "localeprioritylist.h" +#include "loclikelysubtags.h" +#include "locdistance.h" +#include "lsr.h" +#include "uassert.h" +#include "uhash.h" +#include "uvector.h" + +#define UND_LSR LSR("und", "", "") + +/** + * Indicator for the lifetime of desired-locale objects passed into the LocaleMatcher. + * + * @draft ICU 65 + */ +enum ULocMatchLifetime { + /** + * Locale objects are temporary. + * The matcher will make a copy of a locale that will be used beyond one function call. 
+ * + * @draft ICU 65 + */ + ULOCMATCH_TEMPORARY_LOCALES, + /** + * Locale objects are stored at least as long as the matcher is used. + * The matcher will keep only a pointer to a locale that will be used beyond one function call, + * avoiding a copy. + * + * @draft ICU 65 + */ + ULOCMATCH_STORED_LOCALES // TODO: permanent? cached? clone? +}; +#ifndef U_IN_DOXYGEN +typedef enum ULocMatchLifetime ULocMatchLifetime; +#endif + +U_NAMESPACE_BEGIN + +LocaleMatcher::Result::Result(LocaleMatcher::Result &&src) U_NOEXCEPT : + desiredLocale(src.desiredLocale), + supportedLocale(src.supportedLocale), + desiredIndex(src.desiredIndex), + supportedIndex(src.supportedIndex), + desiredIsOwned(src.desiredIsOwned) { + if (desiredIsOwned) { + src.desiredLocale = nullptr; + src.desiredIndex = -1; + src.desiredIsOwned = FALSE; + } +} + +LocaleMatcher::Result::~Result() { + if (desiredIsOwned) { + delete desiredLocale; + } +} + +LocaleMatcher::Result &LocaleMatcher::Result::operator=(LocaleMatcher::Result &&src) U_NOEXCEPT { + this->~Result(); + + desiredLocale = src.desiredLocale; + supportedLocale = src.supportedLocale; + desiredIndex = src.desiredIndex; + supportedIndex = src.supportedIndex; + desiredIsOwned = src.desiredIsOwned; + + if (desiredIsOwned) { + src.desiredLocale = nullptr; + src.desiredIndex = -1; + src.desiredIsOwned = FALSE; + } + return *this; +} + +Locale LocaleMatcher::Result::makeResolvedLocale(UErrorCode &errorCode) const { + if (U_FAILURE(errorCode) || supportedLocale == nullptr) { + return Locale::getRoot(); + } + const Locale *bestDesired = getDesiredLocale(); + if (bestDesired == nullptr || *supportedLocale == *bestDesired) { + return *supportedLocale; + } + LocaleBuilder b; + b.setLocale(*supportedLocale); + + // Copy the region from bestDesired, if there is one. + const char *region = bestDesired->getCountry(); + if (*region != 0) { + b.setRegion(region); + } + + // Copy the variants from bestDesired, if there are any. 
+ // Note that this will override any supportedLocale variants. + // For example, "sco-ulster-fonipa" + "...-fonupa" => "sco-fonupa" (replacing ulster). + const char *variants = bestDesired->getVariant(); + if (*variants != 0) { + b.setVariant(variants); + } + + // Copy the extensions from bestDesired, if there are any. + // C++ note: The following note, copied from Java, may not be true, + // as long as C++ copies by legacy ICU keyword, not by extension singleton. + // Note that this will override any supportedLocale extensions. + // For example, "th-u-nu-latn-ca-buddhist" + "...-u-nu-native" => "th-u-nu-native" + // (replacing calendar). + b.copyExtensionsFrom(*bestDesired, errorCode); + return b.build(errorCode); +} + +LocaleMatcher::Builder::Builder(LocaleMatcher::Builder &&src) U_NOEXCEPT : + errorCode_(src.errorCode_), + supportedLocales_(src.supportedLocales_), + thresholdDistance_(src.thresholdDistance_), + demotion_(src.demotion_), + defaultLocale_(src.defaultLocale_), + favor_(src.favor_) { + src.supportedLocales_ = nullptr; + src.defaultLocale_ = nullptr; +} + +LocaleMatcher::Builder::~Builder() { + delete supportedLocales_; + delete defaultLocale_; +} + +LocaleMatcher::Builder &LocaleMatcher::Builder::operator=(LocaleMatcher::Builder &&src) U_NOEXCEPT { + this->~Builder(); + + errorCode_ = src.errorCode_; + supportedLocales_ = src.supportedLocales_; + thresholdDistance_ = src.thresholdDistance_; + demotion_ = src.demotion_; + defaultLocale_ = src.defaultLocale_; + favor_ = src.favor_; + + src.supportedLocales_ = nullptr; + src.defaultLocale_ = nullptr; + return *this; +} + +void LocaleMatcher::Builder::clearSupportedLocales() { + if (supportedLocales_ != nullptr) { + supportedLocales_->removeAllElements(); + } +} + +bool LocaleMatcher::Builder::ensureSupportedLocaleVector() { + if (U_FAILURE(errorCode_)) { return false; } + if (supportedLocales_ != nullptr) { return true; } + supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_); + if 
(U_FAILURE(errorCode_)) { return false; } + if (supportedLocales_ == nullptr) { + errorCode_ = U_MEMORY_ALLOCATION_ERROR; + return false; + } + return true; +} + +LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListString( + StringPiece locales) { + LocalePriorityList list(locales, errorCode_); + if (U_FAILURE(errorCode_)) { return *this; } + clearSupportedLocales(); + if (!ensureSupportedLocaleVector()) { return *this; } + int32_t length = list.getLengthIncludingRemoved(); + for (int32_t i = 0; i < length; ++i) { + Locale *locale = list.orphanLocaleAt(i); + if (locale == nullptr) { continue; } + supportedLocales_->addElement(locale, errorCode_); + if (U_FAILURE(errorCode_)) { + delete locale; + break; + } + } + return *this; +} + +LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) { + if (U_FAILURE(errorCode_)) { return *this; } + clearSupportedLocales(); + if (!ensureSupportedLocaleVector()) { return *this; } + while (locales.hasNext()) { + const Locale &locale = locales.next(); + Locale *clone = locale.clone(); + if (clone == nullptr) { + errorCode_ = U_MEMORY_ALLOCATION_ERROR; + break; + } + supportedLocales_->addElement(clone, errorCode_); + if (U_FAILURE(errorCode_)) { + delete clone; + break; + } + } + return *this; +} + +LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) { + if (!ensureSupportedLocaleVector()) { return *this; } + Locale *clone = locale.clone(); + if (clone == nullptr) { + errorCode_ = U_MEMORY_ALLOCATION_ERROR; + return *this; + } + supportedLocales_->addElement(clone, errorCode_); + if (U_FAILURE(errorCode_)) { + delete clone; + } + return *this; +} + +LocaleMatcher::Builder &LocaleMatcher::Builder::setDefaultLocale(const Locale *defaultLocale) { + if (U_FAILURE(errorCode_)) { return *this; } + Locale *clone = nullptr; + if (defaultLocale != nullptr) { + clone = defaultLocale->clone(); + if (clone == nullptr) { + errorCode_ = 
U_MEMORY_ALLOCATION_ERROR; + return *this; + } + } + delete defaultLocale_; + defaultLocale_ = clone; + return *this; +} + +LocaleMatcher::Builder &LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag subtag) { + if (U_FAILURE(errorCode_)) { return *this; } + favor_ = subtag; + return *this; +} + +LocaleMatcher::Builder &LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion demotion) { + if (U_FAILURE(errorCode_)) { return *this; } + demotion_ = demotion; + return *this; +} + +#if 0 +/** + * <i>Internal only!</i> + * + * @param thresholdDistance the thresholdDistance to set, with -1 = default + * @return this Builder object + * @internal + * @deprecated This API is ICU internal only. + */ +@Deprecated +LocaleMatcher::Builder &LocaleMatcher::Builder::internalSetThresholdDistance(int32_t thresholdDistance) { + if (U_FAILURE(errorCode_)) { return *this; } + if (thresholdDistance > 100) { + thresholdDistance = 100; + } + thresholdDistance_ = thresholdDistance; + return *this; +} +#endif + +UBool LocaleMatcher::Builder::copyErrorTo(UErrorCode &outErrorCode) const { + if (U_FAILURE(outErrorCode)) { return TRUE; } + if (U_SUCCESS(errorCode_)) { return FALSE; } + outErrorCode = errorCode_; + return TRUE; +} + +LocaleMatcher LocaleMatcher::Builder::build(UErrorCode &errorCode) const { + if (U_SUCCESS(errorCode) && U_FAILURE(errorCode_)) { + errorCode = errorCode_; + } + return LocaleMatcher(*this, errorCode); +} + +namespace { + +LSR getMaximalLsrOrUnd(const XLikelySubtags &likelySubtags, const Locale &locale, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode) || locale.isBogus() || *locale.getName() == 0 /* "und" */) { + return UND_LSR; + } else { + return likelySubtags.makeMaximizedLsrFrom(locale, errorCode); + } +} + +int32_t hashLSR(const UHashTok token) { + const LSR *lsr = static_cast<const LSR *>(token.pointer); + return lsr->hashCode; +} + +UBool compareLSRs(const UHashTok t1, const UHashTok t2) { + const LSR *lsr1 = static_cast<const LSR 
*>(t1.pointer); + const LSR *lsr2 = static_cast<const LSR *>(t2.pointer); + return *lsr1 == *lsr2; +} + +bool putIfAbsent(UHashtable *lsrToIndex, const LSR &lsr, int32_t i, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return false; } + U_ASSERT(i > 0); + int32_t index = uhash_geti(lsrToIndex, &lsr); + if (index != 0) { + return false; + } else { + uhash_puti(lsrToIndex, const_cast<LSR *>(&lsr), i, &errorCode); + return U_SUCCESS(errorCode); + } +} + +} // namespace + +LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) : + likelySubtags(*XLikelySubtags::getSingleton(errorCode)), + localeDistance(*LocaleDistance::getSingleton(errorCode)), + thresholdDistance(builder.thresholdDistance_), + demotionPerDesiredLocale(0), + favorSubtag(builder.favor_), + supportedLocales(nullptr), lsrs(nullptr), supportedLocalesLength(0), + supportedLsrToIndex(nullptr), + supportedLSRs(nullptr), supportedIndexes(nullptr), supportedLSRsLength(0), + ownedDefaultLocale(nullptr), defaultLocale(nullptr), defaultLocaleIndex(-1) { + if (U_FAILURE(errorCode)) { return; } + if (thresholdDistance < 0) { + thresholdDistance = localeDistance.getDefaultScriptDistance(); + } + supportedLocalesLength = builder.supportedLocales_ != nullptr ? + builder.supportedLocales_->size() : 0; + const Locale *def = builder.defaultLocale_; + int32_t idef = -1; + if (supportedLocalesLength > 0) { + // Store the supported locales in input order, + // so that when different types are used (e.g., language tag strings) + // we can return those by parallel index. + supportedLocales = static_cast<const Locale **>( + uprv_malloc(supportedLocalesLength * sizeof(const Locale *))); + // Supported LRSs in input order. + // In C++, we store these permanently to simplify ownership management + // in the hash tables. Duplicate LSRs (if any) are unused overhead. 
+ lsrs = new LSR[supportedLocalesLength]; + if (supportedLocales == nullptr || lsrs == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + // If the constructor fails partway, we need null pointers for destructibility. + uprv_memset(supportedLocales, 0, supportedLocalesLength * sizeof(const Locale *)); + // Also find the first supported locale whose LSR is + // the same as that for the default locale. + LSR builderDefaultLSR; + const LSR *defLSR = nullptr; + if (def != nullptr) { + builderDefaultLSR = getMaximalLsrOrUnd(likelySubtags, *def, errorCode); + if (U_FAILURE(errorCode)) { return; } + defLSR = &builderDefaultLSR; + } + for (int32_t i = 0; i < supportedLocalesLength; ++i) { + const Locale &locale = *static_cast<Locale *>(builder.supportedLocales_->elementAt(i)); + supportedLocales[i] = locale.clone(); + if (supportedLocales[i] == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + const Locale &supportedLocale = *supportedLocales[i]; + LSR &lsr = lsrs[i] = getMaximalLsrOrUnd(likelySubtags, supportedLocale, errorCode); + lsr.setHashCode(); + if (U_FAILURE(errorCode)) { return; } + if (idef < 0 && defLSR != nullptr && lsr == *defLSR) { + idef = i; + defLSR = &lsr; // owned pointer to put into supportedLsrToIndex + if (*def == supportedLocale) { + def = &supportedLocale; // owned pointer to keep + } + } + } + + // We need an unordered map from LSR to first supported locale with that LSR, + // and an ordered list of (LSR, supported index). + // We insert the supported locales in the following order: + // 1. Default locale, if it is supported. + // 2. Priority locales (aka "paradigm locales") in builder order. + // 3. Remaining locales in builder order. + // In Java, we use a LinkedHashMap for both map & ordered lists. + // In C++, we use separate structures. + // We over-allocate arrays of LSRs and indexes for simplicity. 
+ // We reserve slots at the array starts for the default and paradigm locales, + // plus enough for all supported locales. + // If there are few paradigm locales and few duplicate supported LSRs, + // then the amount of wasted space is small. + supportedLsrToIndex = uhash_openSize(hashLSR, compareLSRs, uhash_compareLong, + supportedLocalesLength, &errorCode); + if (U_FAILURE(errorCode)) { return; } + int32_t paradigmLimit = 1 + localeDistance.getParadigmLSRsLength(); + int32_t suppLSRsCapacity = paradigmLimit + supportedLocalesLength; + supportedLSRs = static_cast<const LSR **>( + uprv_malloc(suppLSRsCapacity * sizeof(const LSR *))); + supportedIndexes = static_cast<int32_t *>( + uprv_malloc(suppLSRsCapacity * sizeof(int32_t))); + if (supportedLSRs == nullptr || supportedIndexes == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + int32_t paradigmIndex = 0; + int32_t otherIndex = paradigmLimit; + if (idef >= 0) { + uhash_puti(supportedLsrToIndex, const_cast<LSR *>(defLSR), idef + 1, &errorCode); + supportedLSRs[0] = defLSR; + supportedIndexes[0] = idef; + paradigmIndex = 1; + } + for (int32_t i = 0; i < supportedLocalesLength; ++i) { + if (i == idef) { continue; } + const Locale &locale = *supportedLocales[i]; + const LSR &lsr = lsrs[i]; + if (defLSR == nullptr) { + U_ASSERT(i == 0); + def = &locale; + defLSR = &lsr; + idef = 0; + uhash_puti(supportedLsrToIndex, const_cast<LSR *>(&lsr), 0 + 1, &errorCode); + supportedLSRs[0] = &lsr; + supportedIndexes[0] = 0; + paradigmIndex = 1; + } else if (idef >= 0 && lsr == *defLSR) { + // lsr == *defLSR means that this supported locale is + // a duplicate of the default locale. + // Either an explicit default locale is supported, and we added it before the loop, + // or there is no explicit default locale, and this is + // a duplicate of the first supported locale. + // In both cases, idef >= 0 now, so otherwise we can skip the comparison. + // For a duplicate, putIfAbsent() is a no-op, so nothing to do. 
+ } else { + if (putIfAbsent(supportedLsrToIndex, lsr, i + 1, errorCode)) { + if (localeDistance.isParadigmLSR(lsr)) { + supportedLSRs[paradigmIndex] = &lsr; + supportedIndexes[paradigmIndex++] = i; + } else { + supportedLSRs[otherIndex] = &lsr; + supportedIndexes[otherIndex++] = i; + } + } + } + if (U_FAILURE(errorCode)) { return; } + } + // Squeeze out unused array slots. + if (paradigmIndex < paradigmLimit && paradigmLimit < otherIndex) { + uprv_memmove(supportedLSRs + paradigmIndex, supportedLSRs + paradigmLimit, + (otherIndex - paradigmLimit) * sizeof(const LSR *)); + uprv_memmove(supportedIndexes + paradigmIndex, supportedIndexes + paradigmLimit, + (otherIndex - paradigmLimit) * sizeof(int32_t)); + } + supportedLSRsLength = otherIndex - (paradigmLimit - paradigmIndex); + } + + if (def != nullptr && (idef < 0 || def != supportedLocales[idef])) { + ownedDefaultLocale = def->clone(); + if (ownedDefaultLocale == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + def = ownedDefaultLocale; + } + defaultLocale = def; + defaultLocaleIndex = idef; + + if (builder.demotion_ == ULOCMATCH_DEMOTION_REGION) { + demotionPerDesiredLocale = localeDistance.getDefaultDemotionPerDesiredLocale(); + } +} + +LocaleMatcher::LocaleMatcher(LocaleMatcher &&src) U_NOEXCEPT : + likelySubtags(src.likelySubtags), + localeDistance(src.localeDistance), + thresholdDistance(src.thresholdDistance), + demotionPerDesiredLocale(src.demotionPerDesiredLocale), + favorSubtag(src.favorSubtag), + supportedLocales(src.supportedLocales), lsrs(src.lsrs), + supportedLocalesLength(src.supportedLocalesLength), + supportedLsrToIndex(src.supportedLsrToIndex), + supportedLSRs(src.supportedLSRs), + supportedIndexes(src.supportedIndexes), + supportedLSRsLength(src.supportedLSRsLength), + ownedDefaultLocale(src.ownedDefaultLocale), defaultLocale(src.defaultLocale), + defaultLocaleIndex(src.defaultLocaleIndex) { + src.supportedLocales = nullptr; + src.lsrs = nullptr; + src.supportedLocalesLength = 
0; + src.supportedLsrToIndex = nullptr; + src.supportedLSRs = nullptr; + src.supportedIndexes = nullptr; + src.supportedLSRsLength = 0; + src.ownedDefaultLocale = nullptr; + src.defaultLocale = nullptr; + src.defaultLocaleIndex = -1; +} + +LocaleMatcher::~LocaleMatcher() { + for (int32_t i = 0; i < supportedLocalesLength; ++i) { + delete supportedLocales[i]; + } + uprv_free(supportedLocales); + delete[] lsrs; + uhash_close(supportedLsrToIndex); + uprv_free(supportedLSRs); + uprv_free(supportedIndexes); + delete ownedDefaultLocale; +} + +LocaleMatcher &LocaleMatcher::operator=(LocaleMatcher &&src) U_NOEXCEPT { + this->~LocaleMatcher(); + + thresholdDistance = src.thresholdDistance; + demotionPerDesiredLocale = src.demotionPerDesiredLocale; + favorSubtag = src.favorSubtag; + supportedLocales = src.supportedLocales; + lsrs = src.lsrs; + supportedLocalesLength = src.supportedLocalesLength; + supportedLsrToIndex = src.supportedLsrToIndex; + supportedLSRs = src.supportedLSRs; + supportedIndexes = src.supportedIndexes; + supportedLSRsLength = src.supportedLSRsLength; + ownedDefaultLocale = src.ownedDefaultLocale; + defaultLocale = src.defaultLocale; + defaultLocaleIndex = src.defaultLocaleIndex; + + src.supportedLocales = nullptr; + src.lsrs = nullptr; + src.supportedLocalesLength = 0; + src.supportedLsrToIndex = nullptr; + src.supportedLSRs = nullptr; + src.supportedIndexes = nullptr; + src.supportedLSRsLength = 0; + src.ownedDefaultLocale = nullptr; + src.defaultLocale = nullptr; + src.defaultLocaleIndex = -1; + return *this; +} + +class LocaleLsrIterator { +public: + LocaleLsrIterator(const XLikelySubtags &likelySubtags, Locale::Iterator &locales, + ULocMatchLifetime lifetime) : + likelySubtags(likelySubtags), locales(locales), lifetime(lifetime) {} + + ~LocaleLsrIterator() { + if (lifetime == ULOCMATCH_TEMPORARY_LOCALES) { + delete remembered; + } + } + + bool hasNext() const { + return locales.hasNext(); + } + + LSR next(UErrorCode &errorCode) { + current = 
&locales.next(); + return getMaximalLsrOrUnd(likelySubtags, *current, errorCode); + } + + void rememberCurrent(int32_t desiredIndex, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return; } + bestDesiredIndex = desiredIndex; + if (lifetime == ULOCMATCH_STORED_LOCALES) { + remembered = current; + } else { + // ULOCMATCH_TEMPORARY_LOCALES + delete remembered; + remembered = new Locale(*current); + if (remembered == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } + } + } + + const Locale *orphanRemembered() { + const Locale *rem = remembered; + remembered = nullptr; + return rem; + } + + int32_t getBestDesiredIndex() const { + return bestDesiredIndex; + } + +private: + const XLikelySubtags &likelySubtags; + Locale::Iterator &locales; + ULocMatchLifetime lifetime; + const Locale *current = nullptr, *remembered = nullptr; + int32_t bestDesiredIndex = -1; +}; + +const Locale *LocaleMatcher::getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const { + if (U_FAILURE(errorCode)) { return nullptr; } + int32_t suppIndex = getBestSuppIndex( + getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode), + nullptr, errorCode); + return U_SUCCESS(errorCode) && suppIndex >= 0 ? supportedLocales[suppIndex] : defaultLocale; +} + +const Locale *LocaleMatcher::getBestMatch(Locale::Iterator &desiredLocales, + UErrorCode &errorCode) const { + if (U_FAILURE(errorCode)) { return nullptr; } + if (!desiredLocales.hasNext()) { + return defaultLocale; + } + LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES); + int32_t suppIndex = getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode); + return U_SUCCESS(errorCode) && suppIndex >= 0 ? 
supportedLocales[suppIndex] : defaultLocale; +} + +const Locale *LocaleMatcher::getBestMatchForListString( + StringPiece desiredLocaleList, UErrorCode &errorCode) const { + LocalePriorityList list(desiredLocaleList, errorCode); + LocalePriorityList::Iterator iter = list.iterator(); + return getBestMatch(iter, errorCode); +} + +LocaleMatcher::Result LocaleMatcher::getBestMatchResult( + const Locale &desiredLocale, UErrorCode &errorCode) const { + if (U_FAILURE(errorCode)) { + return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); + } + int32_t suppIndex = getBestSuppIndex( + getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode), + nullptr, errorCode); + if (U_FAILURE(errorCode) || suppIndex < 0) { + return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); + } else { + return Result(&desiredLocale, supportedLocales[suppIndex], 0, suppIndex, FALSE); + } +} + +LocaleMatcher::Result LocaleMatcher::getBestMatchResult( + Locale::Iterator &desiredLocales, UErrorCode &errorCode) const { + if (U_FAILURE(errorCode) || !desiredLocales.hasNext()) { + return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); + } + LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES); + int32_t suppIndex = getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode); + if (U_FAILURE(errorCode) || suppIndex < 0) { + return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); + } else { + return Result(lsrIter.orphanRemembered(), supportedLocales[suppIndex], + lsrIter.getBestDesiredIndex(), suppIndex, TRUE); + } +} + +int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remainingIter, + UErrorCode &errorCode) const { + if (U_FAILURE(errorCode)) { return -1; } + int32_t desiredIndex = 0; + int32_t bestSupportedLsrIndex = -1; + for (int32_t bestDistance = thresholdDistance;;) { + // Quick check for exact maximized LSR. + // Returns suppIndex+1 where 0 means not found. 
+ if (supportedLsrToIndex != nullptr) { + desiredLSR.setHashCode(); + int32_t index = uhash_geti(supportedLsrToIndex, &desiredLSR); + if (index != 0) { + int32_t suppIndex = index - 1; + if (remainingIter != nullptr) { + remainingIter->rememberCurrent(desiredIndex, errorCode); + } + return suppIndex; + } + } + int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance( + desiredLSR, supportedLSRs, supportedLSRsLength, bestDistance, favorSubtag); + if (bestIndexAndDistance >= 0) { + bestDistance = bestIndexAndDistance & 0xff; + if (remainingIter != nullptr) { + remainingIter->rememberCurrent(desiredIndex, errorCode); + if (U_FAILURE(errorCode)) { return -1; } + } + bestSupportedLsrIndex = bestIndexAndDistance >= 0 ? bestIndexAndDistance >> 8 : -1; + } + if ((bestDistance -= demotionPerDesiredLocale) <= 0) { + break; + } + if (remainingIter == nullptr || !remainingIter->hasNext()) { + break; + } + desiredLSR = remainingIter->next(errorCode); + if (U_FAILURE(errorCode)) { return -1; } + ++desiredIndex; + } + if (bestSupportedLsrIndex < 0) { + // no good match + return -1; + } + return supportedIndexes[bestSupportedLsrIndex]; +} + +double LocaleMatcher::internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const { + // Returns the inverse of the distance: That is, 1-distance(desired, supported). 
+ LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, supported, errorCode); + if (U_FAILURE(errorCode)) { return 0; } + const LSR *pSuppLSR = &suppLSR; + int32_t distance = localeDistance.getBestIndexAndDistance( + getMaximalLsrOrUnd(likelySubtags, desired, errorCode), + &pSuppLSR, 1, + thresholdDistance, favorSubtag) & 0xff; + return (100 - distance) / 100.0; +} + +U_NAMESPACE_END + +#endif // __LOCMATCHER_H__ diff --git a/source/common/localeprioritylist.cpp b/source/common/localeprioritylist.cpp new file mode 100644 index 0000000000000000000000000000000000000000..06442fb46a83ad9d250425dc87c993146be9a6db --- /dev/null +++ b/source/common/localeprioritylist.cpp @@ -0,0 +1,239 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// localeprioritylist.cpp +// created: 2019jul11 Markus W. Scherer + +#include "unicode/utypes.h" +#include "unicode/localpointer.h" +#include "unicode/locid.h" +#include "unicode/stringpiece.h" +#include "unicode/uobject.h" +#include "charstr.h" +#include "cmemory.h" +#include "localeprioritylist.h" +#include "uarrsort.h" +#include "uassert.h" +#include "uhash.h" + +U_NAMESPACE_BEGIN + +namespace { + +int32_t hashLocale(const UHashTok token) { + auto *locale = static_cast<const Locale *>(token.pointer); + return locale->hashCode(); +} + +UBool compareLocales(const UHashTok t1, const UHashTok t2) { + auto *l1 = static_cast<const Locale *>(t1.pointer); + auto *l2 = static_cast<const Locale *>(t2.pointer); + return *l1 == *l2; +} + +constexpr int32_t WEIGHT_ONE = 1000; + +struct LocaleAndWeight { + Locale *locale; + int32_t weight; // 0..1000 = 0.0..1.0 + int32_t index; // force stable sort + + int32_t compare(const LocaleAndWeight &other) const { + int32_t diff = other.weight - weight; // descending: other-this + if (diff != 0) { return diff; } + return index - other.index; + } +}; + +int32_t U_CALLCONV +compareLocaleAndWeight(const void * /*context*/, const void *left, 
const void *right) { + return static_cast<const LocaleAndWeight *>(left)-> + compare(*static_cast<const LocaleAndWeight *>(right)); +} + +const char *skipSpaces(const char *p, const char *limit) { + while (p < limit && *p == ' ') { ++p; } + return p; +} + +int32_t findTagLength(const char *p, const char *limit) { + // Look for accept-language delimiters. + // Leave other validation up to the Locale constructor. + const char *q; + for (q = p; q < limit; ++q) { + char c = *q; + if (c == ' ' || c == ',' || c == ';') { break; } + } + return static_cast<int32_t>(q - p); +} + +/** + * Parses and returns a qvalue weight in millis. + * Advances p to after the parsed substring. + * Returns a negative value if parsing fails. + */ +int32_t parseWeight(const char *&p, const char *limit) { + p = skipSpaces(p, limit); + char c; + if (p == limit || ((c = *p) != '0' && c != '1')) { return -1; } + int32_t weight = (c - '0') * 1000; + if (++p == limit || *p != '.') { return weight; } + int32_t multiplier = 100; + while (++p != limit && '0' <= (c = *p) && c <= '9') { + c -= '0'; + if (multiplier > 0) { + weight += c * multiplier; + multiplier /= 10; + } else if (multiplier == 0) { + // round up + if (c >= 5) { ++weight; } + multiplier = -1; + } // else ignore further fraction digits + } + return weight <= WEIGHT_ONE ? weight : -1; // bad if > 1.0 +} + +} // namespace + +/** + * Nothing but a wrapper over a MaybeStackArray of LocaleAndWeight. + * + * This wrapper exists (and is not in an anonymous namespace) + * so that we can forward-declare it in the header file and + * don't have to expose the MaybeStackArray specialization and + * the LocaleAndWeight to code (like the test) that #includes localeprioritylist.h. + * Also, otherwise we would have to do a platform-specific + * template export declaration of some kind for the MaybeStackArray specialization + * to be properly exported from the common DLL. 
+ */ +struct LocaleAndWeightArray : public UMemory { + MaybeStackArray<LocaleAndWeight, 20> array; +}; + +LocalePriorityList::LocalePriorityList(StringPiece s, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return; } + list = new LocaleAndWeightArray(); + if (list == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + const char *p = s.data(); + const char *limit = p + s.length(); + while ((p = skipSpaces(p, limit)) != limit) { + if (*p == ',') { // empty range field + ++p; + continue; + } + int32_t tagLength = findTagLength(p, limit); + if (tagLength == 0) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + CharString tag(p, tagLength, errorCode); + if (U_FAILURE(errorCode)) { return; } + Locale locale = Locale(tag.data()); + if (locale.isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + int32_t weight = WEIGHT_ONE; + if ((p = skipSpaces(p + tagLength, limit)) != limit && *p == ';') { + if ((p = skipSpaces(p + 1, limit)) == limit || *p != 'q' || + (p = skipSpaces(p + 1, limit)) == limit || *p != '=' || + (++p, (weight = parseWeight(p, limit)) < 0)) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + p = skipSpaces(p, limit); + } + if (p != limit && *p != ',') { // trailing junk + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + add(locale, weight, errorCode); + if (p == limit) { break; } + ++p; + } + sort(errorCode); +} + +LocalePriorityList::~LocalePriorityList() { + if (list != nullptr) { + for (int32_t i = 0; i < listLength; ++i) { + delete list->array[i].locale; + } + delete list; + } + uhash_close(map); +} + +const Locale *LocalePriorityList::localeAt(int32_t i) const { + return list->array[i].locale; +} + +Locale *LocalePriorityList::orphanLocaleAt(int32_t i) { + if (list == nullptr) { return nullptr; } + LocaleAndWeight &lw = list->array[i]; + Locale *l = lw.locale; + lw.locale = nullptr; + return l; +} + +bool LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &errorCode) { + 
if (U_FAILURE(errorCode)) { return false; } + if (map == nullptr) { + if (weight <= 0) { return true; } // do not add q=0 + map = uhash_open(hashLocale, compareLocales, uhash_compareLong, &errorCode); + if (U_FAILURE(errorCode)) { return false; } + } + LocalPointer<Locale> clone; + int32_t index = uhash_geti(map, &locale); + if (index != 0) { + // Duplicate: Remove the old item and append it anew. + LocaleAndWeight &lw = list->array[index - 1]; + clone.adoptInstead(lw.locale); + lw.locale = nullptr; + lw.weight = 0; + ++numRemoved; + } + if (weight <= 0) { // do not add q=0 + if (index != 0) { + // Not strictly necessary but cleaner. + uhash_removei(map, &locale); + } + return true; + } + if (clone.isNull()) { + clone.adoptInstead(locale.clone()); + if (clone.isNull() || (clone->isBogus() && !locale.isBogus())) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return false; + } + } + if (listLength == list->array.getCapacity()) { + int32_t newCapacity = listLength < 50 ? 100 : 4 * listLength; + if (list->array.resize(newCapacity, listLength) == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return false; + } + } + uhash_puti(map, clone.getAlias(), listLength + 1, &errorCode); + if (U_FAILURE(errorCode)) { return false; } + LocaleAndWeight &lw = list->array[listLength]; + lw.locale = clone.orphan(); + lw.weight = weight; + lw.index = listLength++; + if (weight < WEIGHT_ONE) { hasWeights = true; } + U_ASSERT(uhash_count(map) == getLength()); + return true; +} + +void LocalePriorityList::sort(UErrorCode &errorCode) { + // Sort by descending weights if there is a mix of weights. + // The comparator forces a stable sort via the item index. 
+ if (U_FAILURE(errorCode) || getLength() <= 1 || !hasWeights) { return; } + uprv_sortArray(list->array.getAlias(), listLength, sizeof(LocaleAndWeight), + compareLocaleAndWeight, nullptr, FALSE, &errorCode); +} + +U_NAMESPACE_END diff --git a/source/common/localeprioritylist.h b/source/common/localeprioritylist.h new file mode 100644 index 0000000000000000000000000000000000000000..80ca38a7b528926c4e1c9ce872d05fd8cc4eeed2 --- /dev/null +++ b/source/common/localeprioritylist.h @@ -0,0 +1,115 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// localeprioritylist.h +// created: 2019jul11 Markus W. Scherer + +#ifndef __LOCALEPRIORITYLIST_H__ +#define __LOCALEPRIORITYLIST_H__ + +#include "unicode/utypes.h" +#include "unicode/locid.h" +#include "unicode/stringpiece.h" +#include "unicode/uobject.h" + +struct UHashtable; + +U_NAMESPACE_BEGIN + +struct LocaleAndWeightArray; + +/** + * Parses a list of locales from an accept-language string. + * We are a bit more lenient than the spec: + * We accept extra whitespace in more places, empty range fields, + * and any number of qvalue fraction digits. + * + * https://tools.ietf.org/html/rfc2616#section-14.4 + * 14.4 Accept-Language + * + * Accept-Language = "Accept-Language" ":" + * 1#( language-range [ ";" "q" "=" qvalue ] ) + * language-range = ( ( 1*8ALPHA *( "-" 1*8ALPHA ) ) | "*" ) + * + * Each language-range MAY be given an associated quality value which + * represents an estimate of the user's preference for the languages + * specified by that range. The quality value defaults to "q=1". For + * example, + * + * Accept-Language: da, en-gb;q=0.8, en;q=0.7 + * + * https://tools.ietf.org/html/rfc2616#section-3.9 + * 3.9 Quality Values + * + * HTTP content negotiation (section 12) uses short "floating point" + * numbers to indicate the relative importance ("weight") of various + * negotiable parameters. 
A weight is normalized to a real number in + * the range 0 through 1, where 0 is the minimum and 1 the maximum + * value. If a parameter has a quality value of 0, then content with + * this parameter is `not acceptable' for the client. HTTP/1.1 + * applications MUST NOT generate more than three digits after the + * decimal point. User configuration of these values SHOULD also be + * limited in this fashion. + * + * qvalue = ( "0" [ "." 0*3DIGIT ] ) + * | ( "1" [ "." 0*3("0") ] ) + */ +class U_COMMON_API LocalePriorityList : public UMemory { +public: + class Iterator : public Locale::Iterator { + public: + UBool hasNext() const override { return count < length; } + + const Locale &next() override { + for(;;) { + const Locale *locale = list.localeAt(index++); + if (locale != nullptr) { + ++count; + return *locale; + } + } + } + + private: + friend class LocalePriorityList; + + Iterator(const LocalePriorityList &list) : list(list), length(list.getLength()) {} + + const LocalePriorityList &list; + int32_t index = 0; + int32_t count = 0; + const int32_t length; + }; + + LocalePriorityList(StringPiece s, UErrorCode &errorCode); + + ~LocalePriorityList(); + + int32_t getLength() const { return listLength - numRemoved; } + + int32_t getLengthIncludingRemoved() const { return listLength; } + + Iterator iterator() const { return Iterator(*this); } + + const Locale *localeAt(int32_t i) const; + + Locale *orphanLocaleAt(int32_t i); + +private: + LocalePriorityList(const LocalePriorityList &) = delete; + LocalePriorityList &operator=(const LocalePriorityList &) = delete; + + bool add(const Locale &locale, int32_t weight, UErrorCode &errorCode); + + void sort(UErrorCode &errorCode); + + LocaleAndWeightArray *list = nullptr; + int32_t listLength = 0; + int32_t numRemoved = 0; + bool hasWeights = false; // other than 1.0 + UHashtable *map = nullptr; +}; + +U_NAMESPACE_END + +#endif // __LOCALEPRIORITYLIST_H__ diff --git a/source/common/locdistance.cpp 
b/source/common/locdistance.cpp new file mode 100644 index 0000000000000000000000000000000000000000..800d0eacf2b605f59f2c6455fb38867ac0706b14 --- /dev/null +++ b/source/common/locdistance.cpp @@ -0,0 +1,364 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// locdistance.cpp +// created: 2019may08 Markus W. Scherer + +#include "unicode/utypes.h" +#include "unicode/bytestrie.h" +#include "unicode/localematcher.h" +#include "unicode/locid.h" +#include "unicode/uobject.h" +#include "unicode/ures.h" +#include "cstring.h" +#include "locdistance.h" +#include "loclikelysubtags.h" +#include "uassert.h" +#include "ucln_cmn.h" +#include "uinvchar.h" +#include "umutex.h" + +U_NAMESPACE_BEGIN + +namespace { + +/** + * Bit flag used on the last character of a subtag in the trie. + * Must be set consistently by the builder and the lookup code. + */ +constexpr int32_t END_OF_SUBTAG = 0x80; +/** Distance value bit flag, set by the builder. */ +constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; +/** Distance value bit flag, set by trieNext(). */ +constexpr int32_t DISTANCE_IS_FINAL = 0x100; +constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; + +constexpr int32_t ABOVE_THRESHOLD = 100; + +// Indexes into array of distances. +enum { + IX_DEF_LANG_DISTANCE, + IX_DEF_SCRIPT_DISTANCE, + IX_DEF_REGION_DISTANCE, + IX_MIN_REGION_DISTANCE, + IX_LIMIT +}; + +LocaleDistance *gLocaleDistance = nullptr; +UInitOnce gInitOnce = U_INITONCE_INITIALIZER; + +UBool U_CALLCONV cleanup() { + delete gLocaleDistance; + gLocaleDistance = nullptr; + gInitOnce.reset(); + return TRUE; +} + +} // namespace + +void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { + // This function is invoked only via umtx_initOnce(). 
+ U_ASSERT(gLocaleDistance == nullptr); + const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode); + if (U_FAILURE(errorCode)) { return; } + const LocaleDistanceData &data = likely.getDistanceData(); + if (data.distanceTrieBytes == nullptr || + data.regionToPartitions == nullptr || data.partitions == nullptr || + // ok if no paradigms + data.distances == nullptr) { + errorCode = U_MISSING_RESOURCE_ERROR; + return; + } + gLocaleDistance = new LocaleDistance(data); + if (gLocaleDistance == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); +} + +const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } + umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); + return gLocaleDistance; +} + +LocaleDistance::LocaleDistance(const LocaleDistanceData &data) : + trie(data.distanceTrieBytes), + regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), + paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), + defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), + defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), + defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), + minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { + // For the default demotion value, use the + // default region distance between unrelated Englishes. + // Thus, unless demotion is turned off, + // a mere region difference for one desired locale + // is as good as a perfect match for the next following desired locale. + // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. 
+ LSR en("en", "Latn", "US"); + LSR enGB("en", "Latn", "GB"); + const LSR *p_enGB = &enGB; + defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1, + 50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff; +} + +int32_t LocaleDistance::getBestIndexAndDistance( + const LSR &desired, + const LSR **supportedLSRs, int32_t supportedLSRsLength, + int32_t threshold, ULocMatchFavorSubtag favorSubtag) const { + BytesTrie iter(trie); + // Look up the desired language only once for all supported LSRs. + // Its "distance" is either a match point value of 0, or a non-match negative value. + // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. + int32_t desLangDistance = trieNext(iter, desired.language, false); + uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; + // Index of the supported LSR with the lowest distance. + int32_t bestIndex = -1; + for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { + const LSR &supported = *supportedLSRs[slIndex]; + bool star = false; + int32_t distance = desLangDistance; + if (distance >= 0) { + U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); + if (slIndex != 0) { + iter.resetToState64(desLangState); + } + distance = trieNext(iter, supported.language, true); + } + // Note: The data builder verifies that there are no rules with "any" (*) language and + // real (non *) script or region subtags. + // This means that if the lookup for either language fails we can use + // the default distances without further lookups. 
+ int32_t flags; + if (distance >= 0) { + flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; + distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; + } else { // <*, *> + if (uprv_strcmp(desired.language, supported.language) == 0) { + distance = 0; + } else { + distance = defaultLanguageDistance; + } + flags = 0; + star = true; + } + U_ASSERT(0 <= distance && distance <= 100); + // We implement "favor subtag" by reducing the language subtag distance + // (unscientifically reducing it to a quarter of the normal value), + // so that the script distance is relatively more important. + // For example, given a default language distance of 80, we reduce it to 20, + // which is below the default threshold of 50, which is the default script distance. + if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { + distance >>= 2; + } + if (distance >= threshold) { + continue; + } + + int32_t scriptDistance; + if (star || flags != 0) { + if (uprv_strcmp(desired.script, supported.script) == 0) { + scriptDistance = 0; + } else { + scriptDistance = defaultScriptDistance; + } + } else { + scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), + desired.script, supported.script); + flags = scriptDistance & DISTANCE_IS_FINAL; + scriptDistance &= ~DISTANCE_IS_FINAL; + } + distance += scriptDistance; + if (distance >= threshold) { + continue; + } + + if (uprv_strcmp(desired.region, supported.region) == 0) { + // regionDistance = 0 + } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { + distance += defaultRegionDistance; + } else { + int32_t remainingThreshold = threshold - distance; + if (minRegionDistance >= remainingThreshold) { + continue; + } + + // From here on we know the regions are not equal. + // Map each region to zero or more partitions. (zero = one non-matching string) + // (Each array of single-character partition strings is encoded as one string.) + // If either side has more than one, then we find the maximum distance. 
+ // This could be optimized by adding some more structure, but probably not worth it. + distance += getRegionPartitionsDistance( + iter, iter.getState64(), + partitionsForRegion(desired), + partitionsForRegion(supported), + remainingThreshold); + } + if (distance < threshold) { + if (distance == 0) { + return slIndex << 8; + } + bestIndex = slIndex; + threshold = distance; + } + } + return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD; +} + +int32_t LocaleDistance::getDesSuppScriptDistance( + BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { + // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. + int32_t distance = trieNext(iter, desired, false); + if (distance >= 0) { + distance = trieNext(iter, supported, true); + } + if (distance < 0) { + UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> + U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); + if (uprv_strcmp(desired, supported) == 0) { + distance = 0; // same script + } else { + distance = iter.getValue(); + U_ASSERT(distance >= 0); + } + if (result == USTRINGTRIE_FINAL_VALUE) { + distance |= DISTANCE_IS_FINAL; + } + } + return distance; +} + +int32_t LocaleDistance::getRegionPartitionsDistance( + BytesTrie &iter, uint64_t startState, + const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { + char desired = *desiredPartitions++; + char supported = *supportedPartitions++; + U_ASSERT(desired != 0 && supported != 0); + // See if we have single desired/supported partitions, from NUL-terminated + // partition strings without explicit length. + bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character + // equivalent to: if (desLength == 1 && suppLength == 1) + if (*desiredPartitions == 0 && !suppLengthGt1) { + // Fastpath for single desired/supported partitions. 
+ UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); + if (USTRINGTRIE_HAS_NEXT(result)) { + result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); + if (USTRINGTRIE_HAS_VALUE(result)) { + return iter.getValue(); + } + } + return getFallbackRegionDistance(iter, startState); + } + + const char *supportedStart = supportedPartitions - 1; // for restart of inner loop + int32_t regionDistance = 0; + // Fall back to * only once, not for each pair of partition strings. + bool star = false; + for (;;) { + // Look up each desired-partition string only once, + // not for each (desired, supported) pair. + UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); + if (USTRINGTRIE_HAS_NEXT(result)) { + uint64_t desState = suppLengthGt1 ? iter.getState64() : 0; + for (;;) { + result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); + int32_t d; + if (USTRINGTRIE_HAS_VALUE(result)) { + d = iter.getValue(); + } else if (star) { + d = 0; + } else { + d = getFallbackRegionDistance(iter, startState); + star = true; + } + if (d >= threshold) { + return d; + } else if (regionDistance < d) { + regionDistance = d; + } + if ((supported = *supportedPartitions++) != 0) { + iter.resetToState64(desState); + } else { + break; + } + } + } else if (!star) { + int32_t d = getFallbackRegionDistance(iter, startState); + if (d >= threshold) { + return d; + } else if (regionDistance < d) { + regionDistance = d; + } + star = true; + } + if ((desired = *desiredPartitions++) != 0) { + iter.resetToState64(startState); + supportedPartitions = supportedStart; + supported = *supportedPartitions++; + } else { + break; + } + } + return regionDistance; +} + +int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { +#if U_DEBUG + UStringTrieResult result = +#endif + iter.resetToState64(startState).next(u'*'); // <*, *> + U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); + int32_t distance = 
iter.getValue(); + U_ASSERT(distance >= 0); + return distance; +} + +int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { + uint8_t c; + if ((c = *s) == 0) { + return -1; // no empty subtags in the distance data + } + for (;;) { + c = uprv_invCharToAscii(c); + // EBCDIC: If *s is not an invariant character, + // then c is now 0 and will simply not match anything, which is harmless. + uint8_t next = *++s; + if (next != 0) { + if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { + return -1; + } + } else { + // last character of this subtag + UStringTrieResult result = iter.next(c | END_OF_SUBTAG); + if (wantValue) { + if (USTRINGTRIE_HAS_VALUE(result)) { + int32_t value = iter.getValue(); + if (result == USTRINGTRIE_FINAL_VALUE) { + value |= DISTANCE_IS_FINAL; + } + return value; + } + } else { + if (USTRINGTRIE_HAS_NEXT(result)) { + return 0; + } + } + return -1; + } + c = next; + } +} + +UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { + // Linear search for a very short list (length 6 as of 2019). + // If there are many paradigm LSRs we should use a hash set. + U_ASSERT(paradigmLSRsLength <= 15); + for (int32_t i = 0; i < paradigmLSRsLength; ++i) { + if (lsr == paradigmLSRs[i]) { return true; } + } + return false; +} + +U_NAMESPACE_END diff --git a/source/common/locdistance.h b/source/common/locdistance.h new file mode 100644 index 0000000000000000000000000000000000000000..7439f51c56bf8c3e328c3b99eb17839d50d1d884 --- /dev/null +++ b/source/common/locdistance.h @@ -0,0 +1,109 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// locdistance.h +// created: 2019may08 Markus W. 
Scherer + +#ifndef __LOCDISTANCE_H__ +#define __LOCDISTANCE_H__ + +#include "unicode/utypes.h" +#include "unicode/bytestrie.h" +#include "unicode/localematcher.h" +#include "unicode/locid.h" +#include "unicode/uobject.h" +#include "lsr.h" + +U_NAMESPACE_BEGIN + +struct LocaleDistanceData; + +/** + * Offline-built data for LocaleMatcher. + * Mostly but not only the data for mapping locales to their maximized forms. + */ +class LocaleDistance final : public UMemory { +public: + static const LocaleDistance *getSingleton(UErrorCode &errorCode); + + /** + * Finds the supported LSR with the smallest distance from the desired one. + * Equivalent LSR subtags must be normalized into a canonical form. + * + * <p>Returns the index of the lowest-distance supported LSR in bits 31..8 + * (negative if none has a distance below the threshold), + * and its distance (0..ABOVE_THRESHOLD) in bits 7..0. + */ + int32_t getBestIndexAndDistance(const LSR &desired, + const LSR **supportedLSRs, int32_t supportedLSRsLength, + int32_t threshold, ULocMatchFavorSubtag favorSubtag) const; + + int32_t getParadigmLSRsLength() const { return paradigmLSRsLength; } + + UBool isParadigmLSR(const LSR &lsr) const; + + int32_t getDefaultScriptDistance() const { + return defaultScriptDistance; + } + + int32_t getDefaultDemotionPerDesiredLocale() const { + return defaultDemotionPerDesiredLocale; + } + +private: + LocaleDistance(const LocaleDistanceData &data); + LocaleDistance(const LocaleDistance &other) = delete; + LocaleDistance &operator=(const LocaleDistance &other) = delete; + + static void initLocaleDistance(UErrorCode &errorCode); + + static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState, + const char *desired, const char *supported); + + static int32_t getRegionPartitionsDistance( + BytesTrie &iter, uint64_t startState, + const char *desiredPartitions, const char *supportedPartitions, + int32_t threshold); + + static int32_t getFallbackRegionDistance(BytesTrie &iter, 
uint64_t startState); + + static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue); + + const char *partitionsForRegion(const LSR &lsr) const { + // ill-formed region -> one non-matching string + int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex]; + return partitionArrays[pIndex]; + } + + int32_t getDefaultRegionDistance() const { + return defaultRegionDistance; + } + + // The trie maps each dlang+slang+dscript+sscript+dregion+sregion + // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance. + // There is also a trie value for each subsequence of whole subtags. + // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"". + BytesTrie trie; + + /** + * Maps each region to zero or more single-character partitions. + */ + const uint8_t *regionToPartitionsIndex; + const char **partitionArrays; + + /** + * Used to get the paradigm region for a cluster, if there is one. + */ + const LSR *paradigmLSRs; + int32_t paradigmLSRsLength; + + int32_t defaultLanguageDistance; + int32_t defaultScriptDistance; + int32_t defaultRegionDistance; + int32_t minRegionDistance; + int32_t defaultDemotionPerDesiredLocale; +}; + +U_NAMESPACE_END + +#endif // __LOCDISTANCE_H__ diff --git a/source/common/locid.cpp b/source/common/locid.cpp index caffdb8b0976d4b99f046c4d1c1640359d5cab64..93f3d3cb9fec14323c3ba96fd98f4b78956063f3 100644 --- a/source/common/locid.cpp +++ b/source/common/locid.cpp @@ -1399,5 +1399,7 @@ Locale::getBaseName() const { return baseName; } +Locale::Iterator::~Iterator() = default; + //eof U_NAMESPACE_END diff --git a/source/common/loclikelysubtags.cpp b/source/common/loclikelysubtags.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7f5e124c2c790d2b4f0d0b94e71c0961a34b59d --- /dev/null +++ b/source/common/loclikelysubtags.cpp @@ -0,0 +1,638 @@ +// © 2019 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html#License + +// loclikelysubtags.cpp +// created: 2019may08 Markus W. Scherer + +#include <utility> +#include "unicode/utypes.h" +#include "unicode/bytestrie.h" +#include "unicode/localpointer.h" +#include "unicode/locid.h" +#include "unicode/uobject.h" +#include "unicode/ures.h" +#include "charstr.h" +#include "cstring.h" +#include "loclikelysubtags.h" +#include "lsr.h" +#include "uassert.h" +#include "ucln_cmn.h" +#include "uhash.h" +#include "uinvchar.h" +#include "umutex.h" +#include "uresdata.h" +#include "uresimp.h" + +U_NAMESPACE_BEGIN + +namespace { + +constexpr char PSEUDO_ACCENTS_PREFIX = '\''; // -XA, -PSACCENT +constexpr char PSEUDO_BIDI_PREFIX = '+'; // -XB, -PSBIDI +constexpr char PSEUDO_CRACKED_PREFIX = ','; // -XC, -PSCRACK + +/** + * Stores NUL-terminated strings with duplicate elimination. + * Checks for unique UTF-16 string pointers and converts to invariant characters. + */ +class UniqueCharStrings { +public: + UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) { + uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode); + if (U_FAILURE(errorCode)) { return; } + strings = new CharString(); + if (strings == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } + } + ~UniqueCharStrings() { + uhash_close(&map); + delete strings; + } + + /** Returns/orphans the CharString that contains all strings. */ + CharString *orphanCharStrings() { + CharString *result = strings; + strings = nullptr; + return result; + } + + /** Adds a string and returns a unique number for it. */ + int32_t add(const UnicodeString &s, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return 0; } + if (isFrozen) { + errorCode = U_NO_WRITE_PERMISSION; + return 0; + } + // The string points into the resource bundle. 
+ const char16_t *p = s.getBuffer(); + int32_t oldIndex = uhash_geti(&map, p); + if (oldIndex != 0) { // found duplicate + return oldIndex; + } + // Explicit NUL terminator for the previous string. + // The strings object is also terminated with one implicit NUL. + strings->append(0, errorCode); + int32_t newIndex = strings->length(); + strings->appendInvariantChars(s, errorCode); + uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode); + return newIndex; + } + + void freeze() { isFrozen = true; } + + /** + * Returns a string pointer for its unique number, if this object is frozen. + * Otherwise nullptr. + */ + const char *get(int32_t i) const { + U_ASSERT(isFrozen); + return isFrozen && i > 0 ? strings->data() + i : nullptr; + } + +private: + UHashtable map; + CharString *strings; + bool isFrozen = false; +}; + +} // namespace + +LocaleDistanceData::LocaleDistanceData(LocaleDistanceData &&data) : + distanceTrieBytes(data.distanceTrieBytes), + regionToPartitions(data.regionToPartitions), + partitions(data.partitions), + paradigms(data.paradigms), paradigmsLength(data.paradigmsLength), + distances(data.distances) { + data.partitions = nullptr; + data.paradigms = nullptr; +} + +LocaleDistanceData::~LocaleDistanceData() { + uprv_free(partitions); + delete[] paradigms; +} + +// TODO(ICU-20777): Rename to just LikelySubtagsData. 
+/**
+ * Loader and temporary holder for the contents of the "langInfo" resource bundle.
+ * load() fills the members; the XLikelySubtags constructor then takes over
+ * langInfoBundle, the strings, both alias maps, lsrs and distanceData.
+ * The destructor closes the bundle and deletes lsrs if they are still held here.
+ */
+struct XLikelySubtagsData {
+    UResourceBundle *langInfoBundle = nullptr;
+    UniqueCharStrings strings;
+    CharStringMap languageAliases;
+    CharStringMap regionAliases;
+    const uint8_t *trieBytes = nullptr;
+    LSR *lsrs = nullptr;
+    int32_t lsrsLength = 0;
+
+    LocaleDistanceData distanceData;
+
+    XLikelySubtagsData(UErrorCode &errorCode) : strings(errorCode) {}
+
+    ~XLikelySubtagsData() {
+        ures_close(langInfoBundle);
+        delete[] lsrs;
+    }
+
+    /**
+     * Opens "langInfo" and reads the "likely" table (aliases, lsrs, trie) and,
+     * if it exists, the "match" table (distance trie, regionToPartitions,
+     * partitions, paradigms, distances).
+     * On failure sets errorCode and returns early; members may then be partially filled.
+     */
+    void load(UErrorCode &errorCode) {
+        langInfoBundle = ures_openDirect(nullptr, "langInfo", &errorCode);
+        if (U_FAILURE(errorCode)) { return; }
+        StackUResourceBundle stackTempBundle;
+        ResourceDataValue value;
+        ures_getValueWithFallback(langInfoBundle, "likely", stackTempBundle.getAlias(),
+                                  value, errorCode);
+        ResourceTable likelyTable = value.getTable(errorCode);
+        if (U_FAILURE(errorCode)) { return; }
+
+        // Read all strings in the resource bundle and convert them to invariant char *.
+        LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes;
+        int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0;
+        if (!readStrings(likelyTable, "languageAliases", value,
+                         languageIndexes, languagesLength, errorCode) ||
+                !readStrings(likelyTable, "regionAliases", value,
+                             regionIndexes, regionsLength, errorCode) ||
+                !readStrings(likelyTable, "lsrs", value,
+                             lsrSubtagIndexes,lsrSubtagsLength, errorCode)) {
+            return;
+        }
+        // Aliases come in (alias, canonical) pairs, LSRs in (language, script, region) triples.
+        if ((languagesLength & 1) != 0 ||
+                (regionsLength & 1) != 0 ||
+                (lsrSubtagsLength % 3) != 0) {
+            errorCode = U_INVALID_FORMAT_ERROR;
+            return;
+        }
+        if (lsrSubtagsLength == 0) {
+            errorCode = U_MISSING_RESOURCE_ERROR;
+            return;
+        }
+
+        if (!likelyTable.findValue("trie", value)) {
+            errorCode = U_MISSING_RESOURCE_ERROR;
+            return;
+        }
+        int32_t length;
+        trieBytes = value.getBinary(length, errorCode);
+        if (U_FAILURE(errorCode)) { return; }
+
+        // Also read distance/matcher data if available,
+        // to open & keep only one resource bundle pointer
+        // and to use one single UniqueCharStrings.
+        UErrorCode matchErrorCode = U_ZERO_ERROR;
+        ures_getValueWithFallback(langInfoBundle, "match", stackTempBundle.getAlias(),
+                                  value, matchErrorCode);
+        LocalMemory<int32_t> partitionIndexes, paradigmSubtagIndexes;
+        int32_t partitionsLength = 0, paradigmSubtagsLength = 0;
+        if (U_SUCCESS(matchErrorCode)) {
+            ResourceTable matchTable = value.getTable(errorCode);
+            if (U_FAILURE(errorCode)) { return; }
+
+            if (matchTable.findValue("trie", value)) {
+                distanceData.distanceTrieBytes = value.getBinary(length, errorCode);
+                if (U_FAILURE(errorCode)) { return; }
+            }
+
+            if (matchTable.findValue("regionToPartitions", value)) {
+                distanceData.regionToPartitions = value.getBinary(length, errorCode);
+                if (U_FAILURE(errorCode)) { return; }
+                if (length < LSR::REGION_INDEX_LIMIT) {
+                    errorCode = U_INVALID_FORMAT_ERROR;
+                    return;
+                }
+            }
+
+            if (!readStrings(matchTable, "partitions", value,
+                             partitionIndexes, partitionsLength, errorCode) ||
+                    !readStrings(matchTable, "paradigms", value,
+                                 paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) {
+                return;
+            }
+            if ((paradigmSubtagsLength % 3) != 0) {
+                errorCode = U_INVALID_FORMAT_ERROR;
+                return;
+            }
+
+            if (matchTable.findValue("distances", value)) {
+                distanceData.distances = value.getIntVector(length, errorCode);
+                if (U_FAILURE(errorCode)) { return; }
+                if (length < 4) {  // LocaleDistance IX_LIMIT
+                    errorCode = U_INVALID_FORMAT_ERROR;
+                    return;
+                }
+            }
+        } else if (matchErrorCode == U_MISSING_RESOURCE_ERROR) {
+            // ok for likely subtags
+        } else {  // error other than missing resource
+            errorCode = matchErrorCode;
+            return;
+        }
+
+        // Fetch & store invariant-character versions of strings
+        // only after we have collected and de-duplicated all of them.
+        strings.freeze();
+
+        languageAliases = CharStringMap(languagesLength / 2, errorCode);
+        for (int32_t i = 0; i < languagesLength; i += 2) {
+            languageAliases.put(strings.get(languageIndexes[i]),
+                                strings.get(languageIndexes[i + 1]), errorCode);
+        }
+
+        regionAliases = CharStringMap(regionsLength / 2, errorCode);
+        for (int32_t i = 0; i < regionsLength; i += 2) {
+            regionAliases.put(strings.get(regionIndexes[i]),
+                              strings.get(regionIndexes[i + 1]), errorCode);
+        }
+        if (U_FAILURE(errorCode)) { return; }
+
+        lsrsLength = lsrSubtagsLength / 3;
+        lsrs = new LSR[lsrsLength];
+        if (lsrs == nullptr) {
+            errorCode = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) {
+            lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]),
+                          strings.get(lsrSubtagIndexes[i + 1]),
+                          strings.get(lsrSubtagIndexes[i + 2]));
+        }
+
+        if (partitionsLength > 0) {
+            distanceData.partitions = static_cast<const char **>(
+                uprv_malloc(partitionsLength * sizeof(const char *)));
+            if (distanceData.partitions == nullptr) {
+                errorCode = U_MEMORY_ALLOCATION_ERROR;
+                return;
+            }
+            for (int32_t i = 0; i < partitionsLength; ++i) {
+                distanceData.partitions[i] = strings.get(partitionIndexes[i]);
+            }
+        }
+
+        if (paradigmSubtagsLength > 0) {
+            distanceData.paradigmsLength = paradigmSubtagsLength / 3;
+            LSR *paradigms = new LSR[distanceData.paradigmsLength];
+            if (paradigms == nullptr) {
+                errorCode = U_MEMORY_ALLOCATION_ERROR;
+                return;
+            }
+            for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) {
+                paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]),
+                                   strings.get(paradigmSubtagIndexes[i + 1]),
+                                   strings.get(paradigmSubtagIndexes[i + 2]));
+            }
+            distanceData.paradigms = paradigms;
+        }
+    }
+
+private:
+    /**
+     * Reads the string array table[key], if present, adds each string to `strings`
+     * and stores the indexes returned by strings.add() in `indexes`.
+     *
+     * @param value   scratch value object; overwritten
+     * @param indexes receives a newly allocated index array
+     *                (left alone if the key is missing or the array is empty)
+     * @param length  receives the array size (left alone if the key is missing)
+     * @return false if an error occurred (errorCode is set), otherwise true
+     */
+    bool readStrings(const ResourceTable &table, const char *key, ResourceValue &value,
+                     LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) {
+        if (table.findValue(key, value)) {
+            ResourceArray stringArray = value.getArray(errorCode);
+            if (U_FAILURE(errorCode)) { return false; }
+            length = stringArray.getSize();
+            if (length == 0) { return true; }
+            int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length);
+            if (rawIndexes == nullptr) {
+                errorCode = U_MEMORY_ALLOCATION_ERROR;
+                return false;
+            }
+            for (int32_t i = 0; i < length; ++i) {
+                stringArray.getValue(i, value);  // returns TRUE because i < length
+                rawIndexes[i] = strings.add(value.getUnicodeString(errorCode), errorCode);
+                if (U_FAILURE(errorCode)) { return false; }
+            }
+        }
+        return true;
+    }
+};
+
+namespace {
+
+XLikelySubtags *gLikelySubtags = nullptr;
+UInitOnce gInitOnce = U_INITONCE_INITIALIZER;
+
+/** Library cleanup callback: deletes the singleton and re-arms the init-once flag. */
+UBool U_CALLCONV cleanup() {
+    delete gLikelySubtags;
+    gLikelySubtags = nullptr;
+    gInitOnce.reset();
+    return TRUE;
+}
+
+}  // namespace
+
+/** Loads the data, creates the singleton and registers cleanup(). */
+void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
+    // This function is invoked only via umtx_initOnce().
+    U_ASSERT(gLikelySubtags == nullptr);
+    XLikelySubtagsData data(errorCode);
+    data.load(errorCode);
+    if (U_FAILURE(errorCode)) { return; }
+    gLikelySubtags = new XLikelySubtags(data);
+    if (gLikelySubtags == nullptr) {
+        errorCode = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup);
+}
+
+/** Returns the singleton, initializing it on first use; nullptr if errorCode is (or becomes) a failure. */
+const XLikelySubtags *XLikelySubtags::getSingleton(UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return nullptr; }
+    umtx_initOnce(gInitOnce, &XLikelySubtags::initLikelySubtags, errorCode);
+    return gLikelySubtags;
+}
+
+/**
+ * Takes over the loaded data. The bundle and the lsrs array are detached from
+ * `data` (its pointers are set to nullptr) so that its destructor does not release them.
+ */
+XLikelySubtags::XLikelySubtags(XLikelySubtagsData &data) :
+        langInfoBundle(data.langInfoBundle),
+        strings(data.strings.orphanCharStrings()),
+        languageAliases(std::move(data.languageAliases)),
+        regionAliases(std::move(data.regionAliases)),
+        trie(data.trieBytes),
+        lsrs(data.lsrs),
+#if U_DEBUG
+        lsrsLength(data.lsrsLength),
+#endif
+        distanceData(std::move(data.distanceData)) {
+    data.langInfoBundle = nullptr;
+    data.lsrs = nullptr;
+
+    // Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**").
+    UStringTrieResult result = trie.next(u'*');
+    U_ASSERT(USTRINGTRIE_HAS_NEXT(result));
+    trieUndState = trie.getState64();
+    result = trie.next(u'*');
+    U_ASSERT(USTRINGTRIE_HAS_NEXT(result));
+    trieUndZzzzState = trie.getState64();
+    result = trie.next(u'*');
+    U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
+    defaultLsrIndex = trie.getValue();
+    trie.reset();
+
+    // Cache the trie state reached after each possible first language letter.
+    // Every letter that does not yield USTRINGTRIE_NO_VALUE gets an explicit 0,
+    // which maximize() treats as "no cached state". Without it those entries
+    // would be read uninitialized: the array is not set anywhere else here.
+    for (char16_t c = u'a'; c <= u'z'; ++c) {
+        result = trie.next(c);
+        if (result == USTRINGTRIE_NO_VALUE) {
+            trieFirstLetterStates[c - u'a'] = trie.getState64();
+        } else {
+            trieFirstLetterStates[c - u'a'] = 0;
+        }
+        trie.reset();
+    }
+}
+
+XLikelySubtags::~XLikelySubtags() {
+    ures_close(langInfoBundle);
+    delete strings;
+    delete[] lsrs;
+}
+
+/**
+ * Builds the maximized LSR for a Locale.
+ * A name starting with "@x=" is returned as is in the language field,
+ * with empty script and region.
+ */
+LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const {
+    const char *name = locale.getName();
+    if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') {  // name.startsWith("@x=")
+        // Private use language tag x-subtag-subtag...
+        return LSR(name, "", "");
+    }
+    return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
+                            locale.getVariant(), errorCode);
+}
+
+namespace {
+
+/** Returns the canonical form of alias from the map, or alias itself if it has no mapping. */
+const char *getCanonical(const CharStringMap &aliases, const char *alias) {
+    const char *canonical = aliases.get(alias);
+    return canonical == nullptr ? alias : canonical;
+}
+
+}  // namespace
+
+/**
+ * Handles pseudolocales (regions XA/XB/XC, variants PSACCENT/PSBIDI/PSCRACK)
+ * by returning a prefixed LSR; otherwise canonicalizes language and region
+ * via the alias maps and calls maximize().
+ */
+LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region,
+                                     const char *variant, UErrorCode &errorCode) const {
+    // Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
+    // They should match only themselves,
+    // not other locales with what looks like the same language and script subtags.
+    char c1;
+    if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
+        switch (c1) {
+        case 'A':
+            return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, errorCode);
+        case 'B':
+            return LSR(PSEUDO_BIDI_PREFIX, language, script, region, errorCode);
+        case 'C':
+            return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, errorCode);
+        default:  // normal locale
+            break;
+        }
+    }
+
+    if (variant[0] == 'P' && variant[1] == 'S') {
+        if (uprv_strcmp(variant, "PSACCENT") == 0) {
+            return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
+                       *region == 0 ? "XA" : region, errorCode);
+        } else if (uprv_strcmp(variant, "PSBIDI") == 0) {
+            return LSR(PSEUDO_BIDI_PREFIX, language, script,
+                       *region == 0 ? "XB" : region, errorCode);
+        } else if (uprv_strcmp(variant, "PSCRACK") == 0) {
+            return LSR(PSEUDO_CRACKED_PREFIX, language, script,
+                       *region == 0 ? "XC" : region, errorCode);
+        }
+        // else normal locale
+    }
+
+    language = getCanonical(languageAliases, language);
+    // (We have no script mappings.)
+    region = getCanonical(regionAliases, region);
+    return maximize(language, script, region);
+}
+
+/**
+ * Looks up language, then script, then region in the trie and fills in the
+ * missing subtags from the resulting lsrs[] entry.
+ * retainOldMask records which input subtags are kept in the result:
+ * bit 4 = language, bit 2 = script, bit 1 = region.
+ * The returned LSR aliases its subtag strings (no allocation).
+ */
+LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region) const {
+    if (uprv_strcmp(language, "und") == 0) {
+        language = "";
+    }
+    if (uprv_strcmp(script, "Zzzz") == 0) {
+        script = "";
+    }
+    if (uprv_strcmp(region, "ZZ") == 0) {
+        region = "";
+    }
+    if (*script != 0 && *region != 0 && *language != 0) {
+        return LSR(language, script, region);  // already maximized
+    }
+
+    uint32_t retainOldMask = 0;
+    BytesTrie iter(trie);
+    uint64_t state;
+    int32_t value;
+    // Small optimization: Array lookup for first language letter.
+    int32_t c0;
+    if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
+            language[1] != 0 &&  // language.length() >= 2
+            (state = trieFirstLetterStates[c0]) != 0) {
+        value = trieNext(iter.resetToState64(state), language, 1);
+    } else {
+        value = trieNext(iter, language, 0);
+    }
+    if (value >= 0) {
+        if (*language != 0) {
+            retainOldMask |= 4;
+        }
+        state = iter.getState64();
+    } else {
+        retainOldMask |= 4;
+        iter.resetToState64(trieUndState);  // "und" ("*")
+        state = 0;
+    }
+
+    if (value > 0) {
+        // Intermediate or final value from just language.
+        if (value == SKIP_SCRIPT) {
+            value = 0;
+        }
+        if (*script != 0) {
+            retainOldMask |= 2;
+        }
+    } else {
+        value = trieNext(iter, script, 0);
+        if (value >= 0) {
+            if (*script != 0) {
+                retainOldMask |= 2;
+            }
+            state = iter.getState64();
+        } else {
+            retainOldMask |= 2;
+            if (state == 0) {
+                iter.resetToState64(trieUndZzzzState);  // "und-Zzzz" ("**")
+            } else {
+                iter.resetToState64(state);
+                value = trieNext(iter, "", 0);
+                U_ASSERT(value >= 0);
+                state = iter.getState64();
+            }
+        }
+    }
+
+    if (value > 0) {
+        // Final value from just language or language+script.
+        if (*region != 0) {
+            retainOldMask |= 1;
+        }
+    } else {
+        value = trieNext(iter, region, 0);
+        if (value >= 0) {
+            if (*region != 0) {
+                retainOldMask |= 1;
+            }
+        } else {
+            retainOldMask |= 1;
+            if (state == 0) {
+                value = defaultLsrIndex;
+            } else {
+                iter.resetToState64(state);
+                value = trieNext(iter, "", 0);
+                U_ASSERT(value > 0);
+            }
+        }
+    }
+    U_ASSERT(value < lsrsLength);
+    const LSR &result = lsrs[value];
+
+    if (*language == 0) {
+        language = "und";
+    }
+
+    if (retainOldMask == 0) {
+        // Quickly return a copy of the lookup-result LSR
+        // without new allocation of the subtags.
+        return LSR(result.language, result.script, result.region);
+    }
+    if ((retainOldMask & 4) == 0) {
+        language = result.language;
+    }
+    if ((retainOldMask & 2) == 0) {
+        script = result.script;
+    }
+    if ((retainOldMask & 1) == 0) {
+        region = result.region;
+    }
+    return LSR(language, script, region);
+}
+
+/**
+ * Feeds the subtag s, starting at index i, into the trie iterator.
+ * An empty subtag is looked up as '*'; the last character of a non-empty
+ * subtag is fed with its 0x80 bit set.
+ *
+ * @return -1 if there is no match, 0 if the subtag matches without a value,
+ *         SKIP_SCRIPT for an intermediate value, otherwise the final trie value
+ */
+int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
+    UStringTrieResult result;
+    uint8_t c;
+    if ((c = s[i]) == 0) {
+        result = iter.next(u'*');
+    } else {
+        for (;;) {
+            c = uprv_invCharToAscii(c);
+            // EBCDIC: If s[i] is not an invariant character,
+            // then c is now 0 and will simply not match anything, which is harmless.
+            uint8_t next = s[++i];
+            if (next != 0) {
+                if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
+                    return -1;
+                }
+            } else {
+                // last character of this subtag
+                result = iter.next(c | 0x80);
+                break;
+            }
+            c = next;
+        }
+    }
+    switch (result) {
+    case USTRINGTRIE_NO_MATCH: return -1;
+    case USTRINGTRIE_NO_VALUE: return 0;
+    case USTRINGTRIE_INTERMEDIATE_VALUE:
+        U_ASSERT(iter.getValue() == SKIP_SCRIPT);
+        return SKIP_SCRIPT;
+    case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
+    default: return -1;
+    }
+}
+
+// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
+// in loclikely.cpp to this new code, including activating this
+// minimizeSubtags() function. The LocaleMatcher does not minimize.
+#if 0
+// NOTE: Disabled draft. The body below is not valid C++ yet: it still uses
+// Java-style constructs (ULocale.Minimize, assigning `new BytesTrie(...)` to a
+// value, `boolean`, `.equals()`, `return new LSR(...)`). It has to be ported
+// before the #if 0 can be removed.
+//
+// Intended logic as written: maximize the input, look up the likely subtags for
+// the language alone (value00), and drop script and/or region from the result
+// when they equal those defaults; fieldToFavor decides whether the region is
+// kept when only the script is the default.
+LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn,
+                                    const char *regionIn, ULocale.Minimize fieldToFavor,
+                                    UErrorCode &errorCode) const {
+    LSR result = maximize(languageIn, scriptIn, regionIn);
+
+    // We could try just a series of checks, like:
+    // LSR result2 = addLikelySubtags(languageIn, "", "");
+    // if result.equals(result2) return result2;
+    // However, we can optimize 2 of the cases:
+    //   (languageIn, "", "")
+    //   (languageIn, "", regionIn)
+
+    // value00 = lookup(result.language, "", "")
+    BytesTrie iter = new BytesTrie(trie);
+    int value = trieNext(iter, result.language, 0);
+    U_ASSERT(value >= 0);
+    if (value == 0) {
+        value = trieNext(iter, "", 0);
+        U_ASSERT(value >= 0);
+        if (value == 0) {
+            value = trieNext(iter, "", 0);
+        }
+    }
+    U_ASSERT(value > 0);
+    LSR value00 = lsrs[value];
+    boolean favorRegionOk = false;
+    if (result.script.equals(value00.script)) { //script is default
+        if (result.region.equals(value00.region)) {
+            return new LSR(result.language, "", "");
+        } else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
+            return new LSR(result.language, "", result.region);
+        } else {
+            favorRegionOk = true;
+        }
+    }
+
+    // The last case is not as easy to optimize.
+    // Maybe do later, but for now use the straightforward code.
+    LSR result2 = maximize(languageIn, scriptIn, "");
+    if (result2.equals(result)) {
+        return new LSR(result.language, result.script, "");
+    } else if (favorRegionOk) {
+        return new LSR(result.language, "", result.region);
+    }
+    return result;
+}
+#endif
+
+U_NAMESPACE_END
diff --git a/source/common/loclikelysubtags.h b/source/common/loclikelysubtags.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c8a08ac5e314e52171873a76f80c5f11e9e8128
--- /dev/null
+++ b/source/common/loclikelysubtags.h
@@ -0,0 +1,143 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+
+// loclikelysubtags.h
+// created: 2019may08 Markus W. Scherer
+
+#ifndef __LOCLIKELYSUBTAGS_H__
+#define __LOCLIKELYSUBTAGS_H__
+
+#include <utility>
+#include "unicode/utypes.h"
+#include "unicode/bytestrie.h"
+#include "unicode/locid.h"
+#include "unicode/uobject.h"
+#include "unicode/ures.h"
+#include "lsr.h"
+#include "uhash.h"
+
+U_NAMESPACE_BEGIN
+
+struct XLikelySubtagsData;
+
+/**
+ * Map of const char * keys & values.
+ * Stores pointers as is: Does not own/copy/adopt/release strings.
+ * Owns the underlying UHashtable; movable but not copyable.
+ */
+class CharStringMap final : public UMemory {
+public:
+    /** Constructs an unusable non-map. */
+    CharStringMap() : map(nullptr) {}
+    CharStringMap(int32_t size, UErrorCode &errorCode) {
+        map = uhash_openSize(uhash_hashChars, uhash_compareChars, uhash_compareChars,
+                             size, &errorCode);
+    }
+    CharStringMap(CharStringMap &&other) U_NOEXCEPT : map(other.map) {
+        other.map = nullptr;
+    }
+    CharStringMap(const CharStringMap &other) = delete;
+    ~CharStringMap() {
+        uhash_close(map);
+    }
+
+    /**
+     * Move assignment. Closes the table this map currently owns before taking
+     * over other's table, so that assigning to a non-empty map does not leak.
+     * Self-assignment is a no-op.
+     */
+    CharStringMap &operator=(CharStringMap &&other) U_NOEXCEPT {
+        if (this != &other) {
+            uhash_close(map);  // same call the destructor makes; map may be nullptr
+            map = other.map;
+            other.map = nullptr;
+        }
+        return *this;
+    }
+    CharStringMap &operator=(const CharStringMap &other) = delete;
+
+    const char *get(const char *key) const { return static_cast<const char *>(uhash_get(map, key)); }
+    void put(const char *key, const char *value, UErrorCode &errorCode) {
+        uhash_put(map, const_cast<char *>(key), const_cast<char *>(value), &errorCode);
+    }
+
+private:
+    UHashtable *map;
+};
+
+/**
+ * Locale distance/matcher data that is loaded together with the likely-subtags data.
+ * Move-constructible only; constructor and destructor are defined out of line.
+ */
+struct LocaleDistanceData {
+    LocaleDistanceData() = default;
+    LocaleDistanceData(LocaleDistanceData &&data);
+    ~LocaleDistanceData();
+
+    const uint8_t *distanceTrieBytes = nullptr;
+    const uint8_t *regionToPartitions = nullptr;
+    const char **partitions = nullptr;
+    const LSR *paradigms = nullptr;
+    int32_t paradigmsLength = 0;
+    const int32_t *distances = nullptr;
+
+private:
+    LocaleDistanceData &operator=(const LocaleDistanceData &) = delete;
+};
+
+// TODO(ICU-20777): Rename to just LikelySubtags.
+class XLikelySubtags final : public UMemory {
+public:
+    ~XLikelySubtags();
+
+    static constexpr int32_t SKIP_SCRIPT = 1;
+
+    // VisibleForTesting
+    static const XLikelySubtags *getSingleton(UErrorCode &errorCode);
+
+    // VisibleForTesting
+    LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;
+
+    // TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
+    // in loclikely.cpp to this new code, including activating this
+    // minimizeSubtags() function. The LocaleMatcher does not minimize.
+#if 0
+    LSR minimizeSubtags(const char *languageIn, const char *scriptIn, const char *regionIn,
+                        ULocale.Minimize fieldToFavor, UErrorCode &errorCode) const;
+#endif
+
+    // visible for LocaleDistance
+    const LocaleDistanceData &getDistanceData() const { return distanceData; }
+
+private:
+    XLikelySubtags(XLikelySubtagsData &data);
+    XLikelySubtags(const XLikelySubtags &other) = delete;
+    XLikelySubtags &operator=(const XLikelySubtags &other) = delete;
+
+    static void initLikelySubtags(UErrorCode &errorCode);
+
+    LSR makeMaximizedLsr(const char *language, const char *script, const char *region,
+                         const char *variant, UErrorCode &errorCode) const;
+
+    /**
+     * Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
+     */
+    LSR maximize(const char *language, const char *script, const char *region) const;
+
+    static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
+
+    UResourceBundle *langInfoBundle;
+    // We could store the strings by value, except that if there were few enough strings,
+    // moving the contents could copy it to a different array,
+    // invalidating the pointers stored in the maps.
+    CharString *strings;
+    CharStringMap languageAliases;
+    CharStringMap regionAliases;
+
+    // The trie maps each lang+script+region (encoded in ASCII) to an index into lsrs.
+    // There is also a trie value for each intermediate lang and lang+script.
+    // '*' is used instead of "und", "Zzzz"/"" and "ZZ"/"".
+    BytesTrie trie;
+    uint64_t trieUndState;
+    uint64_t trieUndZzzzState;
+    int32_t defaultLsrIndex;
+    // Trie state after the first language letter a..z; 0 means "no cached state".
+    // Zero-initialized so that entries the constructor does not set are well defined.
+    uint64_t trieFirstLetterStates[26] = {};
+    const LSR *lsrs;
+#if U_DEBUG
+    int32_t lsrsLength;
+#endif
+
+    // distance/matcher data: see comment in XLikelySubtagsData::load()
+    LocaleDistanceData distanceData;
+};
+
+U_NAMESPACE_END
+
+#endif  // __LOCLIKELYSUBTAGS_H__
diff --git a/source/common/lsr.cpp b/source/common/lsr.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c28eeda1bc7b6642458c4eda84fce5fcaedb991
--- /dev/null
+++ b/source/common/lsr.cpp
@@ -0,0 +1,101 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+
+// lsr.cpp
+// created: 2019may08 Markus W. Scherer
+
+#include "unicode/utypes.h"
+#include "charstr.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "lsr.h"
+#include "uinvchar.h"
+#include "ustr_imp.h"
+
+U_NAMESPACE_BEGIN
+
+/**
+ * Builds one owned buffer laid out as: prefix + lang + NUL + prefix + scr.
+ * `language` points at its start and `script` at the second prefix; `region` aliases r.
+ * If errorCode is or becomes a failure, language and script stay nullptr.
+ */
+LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode) :
+        language(nullptr), script(nullptr), region(r),
+        regionIndex(indexForRegion(region)) {
+    if (U_SUCCESS(errorCode)) {
+        CharString langScript;
+        langScript.append(prefix, errorCode).append(lang, errorCode).append('\0', errorCode);
+        int32_t scriptOffset = langScript.length();
+        langScript.append(prefix, errorCode).append(scr, errorCode);
+        owned = langScript.cloneData(errorCode);
+        if (U_SUCCESS(errorCode)) {
+            language = owned;
+            script = owned + scriptOffset;
+        }
+    }
+}
+
+/** Move constructor. If other owned its buffer, other is reset to empty language/script. */
+LSR::LSR(LSR &&other) U_NOEXCEPT :
+        language(other.language), script(other.script), region(other.region), owned(other.owned),
+        regionIndex(other.regionIndex), hashCode(other.hashCode) {
+    if (owned != nullptr) {
+        other.language = other.script = "";
+        other.owned = nullptr;
+        other.hashCode = 0;
+    }
+}
+
+void LSR::deleteOwned() {
+    uprv_free(owned);
+}
+
+/**
+ * Move assignment. Frees this object's owned buffer, then takes over other's
+ * fields. Self-assignment is a no-op; without that guard the buffer would be
+ * freed and then kept as a dangling pointer.
+ */
+LSR &LSR::operator=(LSR &&other) U_NOEXCEPT {
+    if (this == &other) { return *this; }
+    if (owned != nullptr) {
+        deleteOwned();
+    }
+    language = other.language;
+    script = other.script;
+    region = other.region;
+    regionIndex = other.regionIndex;
+    owned = other.owned;
+    hashCode = other.hashCode;
+    if (owned != nullptr) {
+        other.language = other.script = "";
+        other.owned = nullptr;
+        other.hashCode = 0;
+    }
+    return *this;
+}
+
+UBool LSR::operator==(const LSR &other) const {
+    return
+        uprv_strcmp(language, other.language) == 0 &&
+        uprv_strcmp(script, other.script) == 0 &&
+        regionIndex == other.regionIndex &&
+        // Compare regions if both are ill-formed (and their indexes are 0).
+        (regionIndex > 0 || uprv_strcmp(region, other.region) == 0);
+}
+
+/**
+ * Maps a region code to an index:
+ * three digits "ddd" map to 1..1000, two uppercase letters map to 1001..1676.
+ * Anything else (including the empty string) maps to 0.
+ */
+int32_t LSR::indexForRegion(const char *region) {
+    int32_t c = region[0];
+    int32_t a = c - '0';
+    if (0 <= a && a <= 9) {  // digits: "419"
+        int32_t b = region[1] - '0';
+        if (b < 0 || 9 < b) { return 0; }
+        c = region[2] - '0';
+        if (c < 0 || 9 < c || region[3] != 0) { return 0; }
+        return (10 * a + b) * 10 + c + 1;
+    } else {  // letters: "DE"
+        a = uprv_upperOrdinal(c);
+        if (a < 0 || 25 < a) { return 0; }
+        int32_t b = uprv_upperOrdinal(region[1]);
+        if (b < 0 || 25 < b || region[2] != 0) { return 0; }
+        return 26 * a + b + 1001;
+    }
+}
+
+/** Computes the hash code from language, script and regionIndex if it is still 0. */
+LSR &LSR::setHashCode() {
+    if (hashCode == 0) {
+        hashCode =
+            (ustr_hashCharsN(language, static_cast<int32_t>(uprv_strlen(language))) * 37 +
+            ustr_hashCharsN(script, static_cast<int32_t>(uprv_strlen(script)))) * 37 +
+            regionIndex;
+    }
+    return *this;
+}
+
+U_NAMESPACE_END
diff --git a/source/common/lsr.h b/source/common/lsr.h
new file mode 100644
index 0000000000000000000000000000000000000000..db6cf938f47d021dbc06c9ded8c86d0781ff6e5a
--- /dev/null
+++ b/source/common/lsr.h
@@ -0,0 +1,72 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html#License
+
+// lsr.h
+// created: 2019may08 Markus W.
Scherer
+
+#ifndef __LSR_H__
+#define __LSR_H__
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+#include "cstring.h"
+
+U_NAMESPACE_BEGIN
+
+/**
+ * Language, script and region subtags as C strings, plus a numeric region index.
+ * The strings are aliased (not copied), except for the prefix constructor,
+ * which stores language and script in the `owned` buffer.
+ * Movable but not copyable.
+ */
+struct LSR final : public UMemory {
+    /** One more than the largest value that indexForRegion() returns. */
+    static constexpr int32_t REGION_INDEX_LIMIT = 1001 + 26 * 26;
+
+    const char *language;
+    const char *script;
+    const char *region;
+    /** Buffer released by the destructor via deleteOwned(); nullptr if nothing is owned. */
+    char *owned = nullptr;
+    /** Index for region, 0 if ill-formed. @see indexForRegion */
+    int32_t regionIndex = 0;
+    /** Only set for LSRs that will be used in a hash table. */
+    int32_t hashCode = 0;
+
+    /** Default: language "und" with empty script and region; regionIndex stays 0. */
+    LSR() : language("und"), script(""), region("") {}
+
+    /** Constructor which aliases all subtag pointers. */
+    LSR(const char *lang, const char *scr, const char *r) :
+            language(lang), script(scr), region(r),
+            regionIndex(indexForRegion(region)) {}
+    /**
+     * Constructor which prepends the prefix to the language and script,
+     * copies those into owned memory, and aliases the region.
+     */
+    LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode);
+    LSR(LSR &&other) U_NOEXCEPT;
+    LSR(const LSR &other) = delete;
+    inline ~LSR() {
+        // Pure inline code for almost all instances.
+        if (owned != nullptr) {
+            deleteOwned();
+        }
+    }
+
+    LSR &operator=(LSR &&other) U_NOEXCEPT;
+    LSR &operator=(const LSR &other) = delete;
+
+    /**
+     * Returns a positive index (>0) for a well-formed region code.
+     * Do not rely on a particular region->index mapping; it may change.
+     * Returns 0 for ill-formed strings.
+     */
+    static int32_t indexForRegion(const char *region);
+
+    UBool operator==(const LSR &other) const;
+
+    inline UBool operator!=(const LSR &other) const {
+        return !operator==(other);
+    }
+
+    /** Computes hashCode if it is still 0; returns *this. */
+    LSR &setHashCode();
+
+private:
+    void deleteOwned();
+};
+
+U_NAMESPACE_END
+
+#endif  // __LSR_H__
diff --git a/source/common/resource.h b/source/common/resource.h
index 3dbff785ef12a85b422c053a79ff62d48886d633..5199b858880770dfab43fde174bfe636651c5a5f 100644
--- a/source/common/resource.h
+++ b/source/common/resource.h
@@ -28,6 +28,7 @@
 #include "unicode/utypes.h"
 #include "unicode/unistr.h"
 #include "unicode/ures.h"
+#include "restrace.h"
 
 struct ResourceData;
 
@@ -47,8 +48,10 @@ public:
     ResourceArray() : items16(NULL), items32(NULL), length(0) {}
 
     /** Only for implementation use. @internal */
-    ResourceArray(const uint16_t *i16, const uint32_t *i32, int32_t len) :
-            items16(i16), items32(i32), length(len) {}
+    ResourceArray(const uint16_t *i16, const uint32_t *i32, int32_t len,
+                  const ResourceTracer& traceInfo) :
+            items16(i16), items32(i32), length(len),
+            fTraceInfo(traceInfo) {}
 
     /**
      * @return The number of items in the array resource.
@@ -68,6 +71,7 @@ private:
     const uint16_t *items16;
     const uint32_t *items32;
     int32_t length;
+    ResourceTracer fTraceInfo;
 };
 
 /**
@@ -80,27 +84,37 @@ public:
 
     /** Only for implementation use. @internal */
     ResourceTable(const uint16_t *k16, const int32_t *k32,
-                  const uint16_t *i16, const uint32_t *i32, int32_t len) :
-            keys16(k16), keys32(k32), items16(i16), items32(i32), length(len) {}
+                  const uint16_t *i16, const uint32_t *i32, int32_t len,
+                  const ResourceTracer& traceInfo) :
+            keys16(k16), keys32(k32), items16(i16), items32(i32), length(len),
+            fTraceInfo(traceInfo) {}
 
     /**
      * @return The number of items in the array resource.
      */
     int32_t getSize() const { return length; }
     /**
-     * @param i Array item index.
+     * @param i Table item index.
      * @param key Output-only, receives the key of the i'th item.
      * @param value Output-only, receives the value of the i'th item.
      * @return TRUE if i is non-negative and less than getSize().
      */
     UBool getKeyAndValue(int32_t i, const char *&key, ResourceValue &value) const;
 
+    /**
+     * @param key Key string to find in the table.
+     * @param value Output-only, receives the value of the item with that key.
+     * @return TRUE if the table contains the key.
+     */
+    UBool findValue(const char *key, ResourceValue &value) const;
+
 private:
     const uint16_t *keys16;
     const int32_t *keys32;
     const uint16_t *items16;
     const uint32_t *items32;
     int32_t length;
+    ResourceTracer fTraceInfo;
 };
 
 /**
diff --git a/source/common/restrace.cpp b/source/common/restrace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2ab82415f4becdc2f9a992416415ab102a7fac35
--- /dev/null
+++ b/source/common/restrace.cpp
@@ -0,0 +1,111 @@
+// © 2019 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if U_ENABLE_TRACING
+
+#include "restrace.h"
+#include "charstr.h"
+#include "cstring.h"
+#include "utracimp.h"
+#include "uresimp.h"
+#include "uassert.h"
+#include "util.h"
+
+U_NAMESPACE_BEGIN
+
+ResourceTracer::~ResourceTracer() = default;
+
+/**
+ * Emits one UTRACE_UDATA_RESOURCE trace record of the form
+ * "(resType) filePath @ resPath" for this tracer.
+ * Requires that either fResB or fParent is set.
+ */
+void ResourceTracer::trace(const char* resType) const {
+    U_ASSERT(fResB || fParent);
+    UTRACE_ENTRY(UTRACE_UDATA_RESOURCE);
+    UErrorCode status = U_ZERO_ERROR;
+
+    icu::CharString filePath;
+    getFilePath(filePath, status);
+
+    icu::CharString resPath;
+    getResPath(resPath, status);
+
+    UTRACE_DATA3(UTRACE_VERBOSE, "(%s) %s @ %s",
+        resType,
+        filePath.data(),
+        resPath.data());
+    UTRACE_EXIT_STATUS(status);
+}
+
+/**
+ * Appends "<path>/<name>.res" of the bundle to output.
+ * A tracer without its own bundle delegates to its parent.
+ */
+void ResourceTracer::getFilePath(CharString& output, UErrorCode& status) const {
+    if (fResB) {
+        output.append(fResB->fData->fPath, status);
+        output.append('/', status);
+        output.append(fResB->fData->fName, status);
+        output.append(".res", status);
+    } else {
+        fParent->getFilePath(output, status);
+    }
+}
+
+/**
+ * Appends the resource path to output: the bundle's own path (or the parent's),
+ * then "/key" if a key is set and "[index]" if an index is set (fIndex != -1).
+ */
+void ResourceTracer::getResPath(CharString& output, UErrorCode& status) const {
+    if (fResB) {
+        output.append('/', status);
+        output.append(fResB->fResPath, status);
+        // removing the trailing /
+        U_ASSERT(output[output.length()-1] == '/');
+        output.truncate(output.length()-1);
+    } else {
+        fParent->getResPath(output, status);
+    }
+    if (fKey) {
+        output.append('/', status);
+        output.append(fKey, status);
+    }
+    if (fIndex != -1) {
+        output.append('[', status);
+        UnicodeString indexString;
+        ICU_Utility::appendNumber(indexString, fIndex);
+        output.appendInvariantChars(indexString, status);
+        output.append(']', status);
+    }
+}
+
+/**
+ * Traces the opening of a data file; type "res" is traced as a resource file,
+ * everything else as a generic data file.
+ * NOTE(review): the caller in udata.cpp tests `path == NULL` only after calling
+ * this function, so path may be NULL here. Confirm that uprv_strcmp() and
+ * CharString::append() tolerate null pointers for type and path.
+ */
+void FileTracer::traceOpen(const char* path, const char* type, const char* name) {
+    if (uprv_strcmp(type, "res") == 0) {
+        traceOpenResFile(path, name);
+    } else {
+        traceOpenDataFile(path, type, name);
+    }
+}
+
+/** Emits a UTRACE_UDATA_DATA_FILE record with "<path>/<name>.<type>". */
+void FileTracer::traceOpenDataFile(const char* path, const char* type, const char* name) {
+    UTRACE_ENTRY(UTRACE_UDATA_DATA_FILE);
+    UErrorCode status = U_ZERO_ERROR;
+
+    icu::CharString filePath;
+    filePath.append(path, status);
+    filePath.append('/', status);
+    filePath.append(name, status);
+    filePath.append('.', status);
+    filePath.append(type, status);
+
+    UTRACE_DATA1(UTRACE_VERBOSE, "%s", filePath.data());
+    UTRACE_EXIT_STATUS(status);
+}
+
+/** Emits a UTRACE_UDATA_RES_FILE record with "<path>/<name>.res". */
+void FileTracer::traceOpenResFile(const char* path, const char* name) {
+    UTRACE_ENTRY(UTRACE_UDATA_RES_FILE);
+    UErrorCode status = U_ZERO_ERROR;
+
+    icu::CharString filePath;
+    filePath.append(path, status);
+    filePath.append('/', status);
+    filePath.append(name, status);
+    filePath.append(".res", status);
+
+    UTRACE_DATA1(UTRACE_VERBOSE, "%s", filePath.data());
+    UTRACE_EXIT_STATUS(status);
+}
+
+U_NAMESPACE_END
+
+#endif  // U_ENABLE_TRACING
diff --git a/source/common/restrace.h b/source/common/restrace.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8af9190cd5c28b8df050957249fa11d4cba8cf3
--- /dev/null
+++ b/source/common/restrace.h
@@ -0,0 +1,132 @@
+// © 2019 and
later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef __RESTRACE_H__
+#define __RESTRACE_H__
+
+#include "unicode/utypes.h"
+
+#if U_ENABLE_TRACING
+
+struct UResourceBundle;
+
+U_NAMESPACE_BEGIN
+
+class CharString;
+
+/**
+ * Instances of this class store information used to trace reads from resource
+ * bundles when ICU is built with --enable-tracing.
+ *
+ * All arguments of type const UResourceBundle*, const char*, and
+ * const ResourceTracer& are stored as pointers. The caller must retain
+ * ownership for the lifetime of this ResourceTracer.
+ *
+ * Exported as U_COMMON_API for Windows because it is a value field
+ * in other exported types.
+ */
+class U_COMMON_API ResourceTracer {
+public:
+    /** Empty tracer: no bundle, no parent, no key, no index. */
+    ResourceTracer() :
+        fResB(nullptr),
+        fParent(nullptr),
+        fKey(nullptr),
+        fIndex(-1) {}
+
+    /** Tracer for a bundle itself. */
+    ResourceTracer(const UResourceBundle* resB) :
+        fResB(resB),
+        fParent(nullptr),
+        fKey(nullptr),
+        fIndex(-1) {}
+
+    /** Tracer for the item with the given key in a bundle. */
+    ResourceTracer(const UResourceBundle* resB, const char* key) :
+        fResB(resB),
+        fParent(nullptr),
+        fKey(key),
+        fIndex(-1) {}
+
+    /** Tracer for the item with the given index in a bundle. */
+    ResourceTracer(const UResourceBundle* resB, int32_t index) :
+        fResB(resB),
+        fParent(nullptr),
+        fKey(nullptr),
+        fIndex(index) {}
+
+    /** Tracer for the item with the given key below a parent tracer; stores &parent. */
+    ResourceTracer(const ResourceTracer& parent, const char* key) :
+        fResB(nullptr),
+        fParent(&parent),
+        fKey(key),
+        fIndex(-1) {}
+
+    /** Tracer for the item with the given index below a parent tracer; stores &parent. */
+    ResourceTracer(const ResourceTracer& parent, int32_t index) :
+        fResB(nullptr),
+        fParent(&parent),
+        fKey(nullptr),
+        fIndex(index) {}
+
+    ~ResourceTracer();
+
+    void trace(const char* type) const;
+
+private:
+    const UResourceBundle* fResB;   // bundle being traced, or nullptr
+    const ResourceTracer* fParent;  // parent tracer, or nullptr
+    const char* fKey;               // key of the traced item, or nullptr
+    int32_t fIndex;                 // index of the traced item, or -1
+
+    void getFilePath(CharString& output, UErrorCode& status) const;
+
+    void getResPath(CharString& output, UErrorCode& status) const;
+};
+
+/**
+ * This class provides methods to trace data file reads when ICU is built
+ * with --enable-tracing.
+ */
+class FileTracer {
+public:
+    static void traceOpen(const char* path, const char* type, const char* name);
+
+private:
+    static void traceOpenDataFile(const char* path, const char* type, const char* name);
+    static void traceOpenResFile(const char* path, const char* name);
+};
+
+U_NAMESPACE_END
+
+#else // U_ENABLE_TRACING
+
+U_NAMESPACE_BEGIN
+
+/**
+ * Default trivial implementation when --enable-tracing is not used.
+ * Accepts the same constructor argument shapes as the tracing version
+ * (bundle pointers are taken as const void*) and does nothing.
+ */
+class U_COMMON_API ResourceTracer {
+public:
+    ResourceTracer() {}
+
+    ResourceTracer(const void*) {}
+
+    ResourceTracer(const void*, const char*) {}
+
+    ResourceTracer(const void*, int32_t) {}
+
+    ResourceTracer(const ResourceTracer&, const char*) {}
+
+    ResourceTracer(const ResourceTracer&, int32_t) {}
+
+    void trace(const char*) const {}
+};
+
+/**
+ * Default trivial implementation when --enable-tracing is not used.
+ */
+class FileTracer {
+public:
+    static void traceOpen(const char*, const char*, const char*) {}
+};
+
+U_NAMESPACE_END
+
+#endif // U_ENABLE_TRACING
+
+#endif //__RESTRACE_H__
diff --git a/source/common/ucln_cmn.h b/source/common/ucln_cmn.h
index 0ca911b47d9875a7dcabadd0dad6b6d8ab611fcd..c4b22ca47ea530c1f62429991171196dd1f7c3f0 100644
--- a/source/common/ucln_cmn.h
+++ b/source/common/ucln_cmn.h
@@ -41,6 +41,8 @@ typedef enum ECleanupCommonType {
     UCLN_COMMON_LOCALE_KEY_TYPE,
     UCLN_COMMON_LOCALE,
     UCLN_COMMON_LOCALE_AVAILABLE,
+    UCLN_COMMON_LIKELY_SUBTAGS,
+    UCLN_COMMON_LOCALE_DISTANCE,
     UCLN_COMMON_ULOC,
     UCLN_COMMON_CURRENCY,
     UCLN_COMMON_LOADED_NORMALIZER2,
diff --git a/source/common/udata.cpp b/source/common/udata.cpp
index b62095cd025c8b0c2961e18027a44a9d3b286554..1051f18f28b71ba3070e9f7d0eb699258e8fc442 100644
--- a/source/common/udata.cpp
+++ b/source/common/udata.cpp
@@ -33,6 +33,7 @@ might have to #include some other header
 #include "cstring.h"
 #include "mutex.h"
 #include "putilimp.h"
+#include "restrace.h"
 #include "uassert.h"
 #include "ucln_cmn.h"
 #include "ucmndata.h"
@@ -1168,6 +1169,9 @@ doOpenChoice(const char *path, const char *type, const char *name,
     UBool isICUData = FALSE;
 
 
+    FileTracer::traceOpen(path, type, name);
+
+
     /* Is this path ICU data? */
     if(path == NULL ||
        !strcmp(path, U_ICUDATA_ALIAS) ||  /* "ICUDATA" */
diff --git a/source/common/uinvchar.cpp b/source/common/uinvchar.cpp
index 8ce2350dfd7f7c3fb696898eb393bf81071ee688..6e5fb48cf43a6ee2febd4e33667cd20b5e6f03c3 100644
--- a/source/common/uinvchar.cpp
+++ b/source/common/uinvchar.cpp
@@ -445,6 +445,13 @@ uprv_copyEbcdic(const UDataSwapper *ds,
     return length;
 }
 
+U_CFUNC UBool
+uprv_isEbcdicAtSign(char c) {
+    static const uint8_t ebcdicAtSigns[] = {
+        0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
+    return c != 0 && uprv_strchr((const char *)ebcdicAtSigns, c) != nullptr;
+}
+
 /* compare invariant strings; variant characters compare less than others and unlike each other */
 U_CFUNC int32_t
 uprv_compareInvAscii(const UDataSwapper *ds,
@@ -561,6 +568,11 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) {
     }
 }
 
+U_CAPI char U_EXPORT2
+uprv_ebcdicToAscii(char c) {
+    return (char)asciiFromEbcdic[(uint8_t)c];
+}
+
 U_CAPI char U_EXPORT2
 uprv_ebcdicToLowercaseAscii(char c) {
     return (char)lowercaseAsciiFromEbcdic[(uint8_t)c];
diff --git a/source/common/uinvchar.h b/source/common/uinvchar.h
index 56dddfa8fde9bbb84e2bd9bbb94b7304b72dbbc9..a43cfcd98286fe093a993750de541227101919ed 100644
--- a/source/common/uinvchar.h
+++ b/source/common/uinvchar.h
@@ -68,6 +68,75 @@ uprv_isInvariantUString(const UChar *s, int32_t length);
 #   error Unknown charset family!
 #endif
 
+#ifdef __cplusplus
+
+U_NAMESPACE_BEGIN
+
+/**
+ * Like U_UPPER_ORDINAL(x) but with validation.
+ * Returns 0..25 for A..Z else a value outside 0..25.
+ */
+inline int32_t uprv_upperOrdinal(int32_t c) {
+#if U_CHARSET_FAMILY==U_ASCII_FAMILY
+    return c - 'A';
+#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
+    // EBCDIC: A-Z (26 letters) is split into three ranges A-I (9 letters), J-R (9), S-Z (8).
+    // https://en.wikipedia.org/wiki/EBCDIC_037#Codepage_layout
+    if (c <= 'I') { return c - 'A'; }  // A-I --> 0-8
+    if (c < 'J') { return -1; }
+    if (c <= 'R') { return c - 'J' + 9; }  // J-R --> 9..17
+    if (c < 'S') { return -1; }
+    return c - 'S' + 18;  // S-Z --> 18..25
+#else
+#   error Unknown charset family!
+#endif
+}
+
+/**
+ * Like U_UPPER_ORDINAL(x) but for lowercase and with validation.
+ * Returns 0..25 for a..z else a value outside 0..25.
+ */
+inline int32_t uprv_lowerOrdinal(int32_t c) {
+#if U_CHARSET_FAMILY==U_ASCII_FAMILY
+    return c - 'a';
+#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
+    // EBCDIC: a-z (26 letters) is split into three ranges a-i (9 letters), j-r (9), s-z (8).
+    // https://en.wikipedia.org/wiki/EBCDIC_037#Codepage_layout
+    if (c <= 'i') { return c - 'a'; }  // a-i --> 0-8
+    if (c < 'j') { return -1; }
+    if (c <= 'r') { return c - 'j' + 9; }  // j-r --> 9..17
+    if (c < 's') { return -1; }
+    return c - 's' + 18;  // s-z --> 18..25
+#else
+#   error Unknown charset family!
+#endif
+}
+
+U_NAMESPACE_END
+
+#endif
+
+/**
+ * Returns true if c == '@' is possible.
+ * The @ sign is variant, and the @ sign used on one
+ * EBCDIC machine won't be compiled the same way on other EBCDIC based machines.
+ * @internal
+ */
+U_CFUNC UBool
+uprv_isEbcdicAtSign(char c);
+
+/**
+ * \def uprv_isAtSign
+ * Returns true if c == '@' is possible.
+ * For ASCII, checks for exactly '@'. For EBCDIC, calls uprv_isEbcdicAtSign().
+ * @internal
+ */
+#if U_CHARSET_FAMILY==U_ASCII_FAMILY
+#   define uprv_isAtSign(c) ((c)=='@')
+#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
+#   define uprv_isAtSign(c) uprv_isEbcdicAtSign(c)
+#else
+#   error Unknown charset family!
+#endif
+
 /**
  * Compare two EBCDIC invariant-character strings in ASCII order.
  * @internal
@@ -88,6 +157,26 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2);
 #   error Unknown charset family!
 #endif
 
+/**
+ * Converts an EBCDIC invariant character to ASCII.
+ * @internal + */ +U_INTERNAL char U_EXPORT2 +uprv_ebcdicToAscii(char c); + +/** + * \def uprv_invCharToAscii + * Converts an invariant character to ASCII. + * @internal + */ +#if U_CHARSET_FAMILY==U_ASCII_FAMILY +# define uprv_invCharToAscii(c) (c) +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY +# define uprv_invCharToAscii(c) uprv_ebcdicToAscii(c) +#else +# error Unknown charset family! +#endif + /** * Converts an EBCDIC invariant character to lowercase ASCII. * @internal diff --git a/source/common/unicode/bytestrie.h b/source/common/unicode/bytestrie.h index c57b8ccfeb59600d9e8fe8ae690bddff9cafb7a8..1a35f604d6d9161c191d7f39e89bdec0c84cea54 100644 --- a/source/common/unicode/bytestrie.h +++ b/source/common/unicode/bytestrie.h @@ -94,6 +94,39 @@ public: return *this; } + /** + * Returns the state of this trie as a 64-bit integer. + * The state value is never 0. + * + * @return opaque state value + * @see resetToState64 + * @draft ICU 65 + */ + uint64_t getState64() const { + return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) | + (uint64_t)(pos_ - bytes_); + } + + /** + * Resets this trie to the saved state. + * Unlike resetToState(State), the 64-bit state value + * must be from getState64() from the same trie object or + * from one initialized the exact same way. + * Because of no validation, this method is faster. + * + * @param state The opaque trie state value from getState64(). + * @return *this + * @see getState64 + * @see resetToState + * @see reset + * @draft ICU 65 + */ + BytesTrie &resetToState64(uint64_t state) { + remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2; + pos_ = bytes_ + (state & kState64PosMask); + return *this; + } + /** * BytesTrie state object, for saving a trie's current state * and resetting the trie back to this state later. 
@@ -502,6 +535,13 @@ private: static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff + // For getState64(): + // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 + // so we need at least 5 bits for that. + // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. + static constexpr int32_t kState64RemainingShift = 59; + static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1; + uint8_t *ownedArray_; // Fixed value referencing the BytesTrie bytes. diff --git a/source/common/unicode/localebuilder.h b/source/common/unicode/localebuilder.h index 960e5980c03c2e1e0bc505379c3390d09866bc71..a91a0b51471bfec6e4be44e98b041dc96fd36629 100644 --- a/source/common/unicode/localebuilder.h +++ b/source/common/unicode/localebuilder.h @@ -4,6 +4,7 @@ #define __LOCALEBUILDER_H__ #include "unicode/locid.h" +#include "unicode/localematcher.h" #include "unicode/stringpiece.h" #include "unicode/uobject.h" #include "unicode/utypes.h" @@ -277,6 +278,10 @@ public: Locale build(UErrorCode& status); private: + friend class LocaleMatcher::Result; + + void copyExtensionsFrom(const Locale& src, UErrorCode& errorCode); + UErrorCode status_; char language_[9]; char script_[5]; diff --git a/source/common/unicode/localematcher.h b/source/common/unicode/localematcher.h new file mode 100644 index 0000000000000000000000000000000000000000..701123f750b222d59a6254652d3a8be0a3c76405 --- /dev/null +++ b/source/common/unicode/localematcher.h @@ -0,0 +1,605 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// localematcher.h +// created: 2019may08 Markus W. 
Scherer + +#ifndef __LOCALEMATCHER_H__ +#define __LOCALEMATCHER_H__ + +#include "unicode/utypes.h" + +#if U_SHOW_CPLUSPLUS_API + +#include "unicode/locid.h" +#include "unicode/stringpiece.h" +#include "unicode/uobject.h" + +/** + * \file + * \brief C++ API: Locale matcher: User's desired locales vs. application's supported locales. + */ + +#ifndef U_HIDE_DRAFT_API + +/** + * Builder option for whether the language subtag or the script subtag is most important. + * + * @see Builder#setFavorSubtag(FavorSubtag) + * @draft ICU 65 + */ +enum ULocMatchFavorSubtag { + /** + * Language differences are most important, then script differences, then region differences. + * (This is the default behavior.) + * + * @draft ICU 65 + */ + ULOCMATCH_FAVOR_LANGUAGE, + /** + * Makes script differences matter relatively more than language differences. + * + * @draft ICU 65 + */ + ULOCMATCH_FAVOR_SCRIPT +}; +#ifndef U_IN_DOXYGEN +typedef enum ULocMatchFavorSubtag ULocMatchFavorSubtag; +#endif + +/** + * Builder option for whether all desired locales are treated equally or + * earlier ones are preferred. + * + * @see Builder#setDemotionPerDesiredLocale(Demotion) + * @draft ICU 65 + */ +enum ULocMatchDemotion { + /** + * All desired locales are treated equally. + * + * @draft ICU 65 + */ + ULOCMATCH_DEMOTION_NONE, + /** + * Earlier desired locales are preferred. + * + * <p>From each desired locale to the next, + * the distance to any supported locale is increased by an additional amount + * which is at least as large as most region mismatches. + * A later desired locale has to have a better match with some supported locale + * due to more than merely having the same region subtag. + * + * <p>For example: <code>Supported={en, sv} desired=[en-GB, sv]</code> + * yields <code>Result(en-GB, en)</code> because + * with the demotion of sv its perfect match is no better than + * the region distance between the earlier desired locale en-GB and en=en-US. 
 + * + * <p>Notes: + * <ul> + * <li>In some cases, language and/or script differences can be as small as + * the typical region difference. (Example: sr-Latn vs. sr-Cyrl) + * <li>It is possible for certain region differences to be larger than usual, + * and larger than the demotion. + * (As of CLDR 35 there is no such case, but + * this is possible in future versions of the data.) + * </ul> + * + * @draft ICU 65 + */ + ULOCMATCH_DEMOTION_REGION +}; +#ifndef U_IN_DOXYGEN +typedef enum ULocMatchDemotion ULocMatchDemotion; +#endif + +struct UHashtable; + +U_NAMESPACE_BEGIN + +struct LSR; + +class LocaleDistance; +class LocaleLsrIterator; +class UVector; +class XLikelySubtags; + +/** + * Immutable class that picks the best match between a user's desired locales and + * an application's supported locales. + * Movable but not copyable. + * + * <p>Example: + * <pre> + * UErrorCode errorCode = U_ZERO_ERROR; + * LocaleMatcher matcher = LocaleMatcher::Builder().setSupportedLocalesFromListString("fr, en-GB, en").build(errorCode); + * const Locale *bestSupported = matcher.getBestMatch(Locale::getUS(), errorCode); // "en" + * </pre> + * + * <p>A matcher takes into account when languages are close to one another, + * such as Danish and Norwegian, + * and when regional variants are close, like en-GB and en-AU as opposed to en-US. + * + * <p>If there are multiple supported locales with the same (language, script, region) + * likely subtags, then the current implementation returns the first of those locales. + * It ignores variant subtags (except for pseudolocale variants) and extensions. + * This may change in future versions. + * + * <p>For example, the current implementation does not distinguish between + * de, de-DE, de-Latn, de-1901, de-u-co-phonebk. + * + * <p>If you prefer one equivalent locale over another, then provide only the preferred one, + * or place it earlier in the list of supported locales. + * + * <p>Otherwise, the order of supported locales may have no effect on the best-match results.
+ * The current implementation compares each desired locale with supported locales + * in the following order: + * 1. Default locale, if supported; + * 2. CLDR "paradigm locales" like en-GB and es-419; + * 3. other supported locales. + * This may change in future versions. + * + * <p>Often a product will just need one matcher instance, built with the languages + * that it supports. However, it may want multiple instances with different + * default languages based on additional information, such as the domain. + * + * <p>This class is not intended for public subclassing. + * + * @draft ICU 65 + */ +class U_COMMON_API LocaleMatcher : public UMemory { +public: + /** + * Data for the best-matching pair of a desired and a supported locale. + * Movable but not copyable. + * + * @draft ICU 65 + */ + class U_COMMON_API Result : public UMemory { + public: + /** + * Move constructor; might modify the source. + * This object will have the same contents that the source object had. + * + * @param src Result to move contents from. + * @draft ICU 65 + */ + Result(Result &&src) U_NOEXCEPT; + + /** + * Destructor. + * + * @draft ICU 65 + */ + ~Result(); + + /** + * Move assignment; might modify the source. + * This object will have the same contents that the source object had. + * + * @param src Result to move contents from. + * @draft ICU 65 + */ + Result &operator=(Result &&src) U_NOEXCEPT; + + /** + * Returns the best-matching desired locale. + * nullptr if the list of desired locales is empty or if none matched well enough. + * + * @return the best-matching desired locale, or nullptr. + * @draft ICU 65 + */ + inline const Locale *getDesiredLocale() const { return desiredLocale; } + + /** + * Returns the best-matching supported locale. + * If none matched well enough, this is the default locale. + * The default locale is nullptr if the list of supported locales is empty and + * no explicit default locale is set. + * + * @return the best-matching supported locale, or nullptr. 
 + * @draft ICU 65 + */ + inline const Locale *getSupportedLocale() const { return supportedLocale; } + + /** + * Returns the index of the best-matching desired locale in the input Iterable order. + * -1 if the list of desired locales is empty or if none matched well enough. + * + * @return the index of the best-matching desired locale, or -1. + * @draft ICU 65 + */ + inline int32_t getDesiredIndex() const { return desiredIndex; } + + /** + * Returns the index of the best-matching supported locale in the + * constructor’s or builder’s input order (“set” Collection plus “added” locales). + * If the matcher was built from a locale list string, then the iteration order is that + * of a LocalePriorityList built from the same string. + * -1 if the list of supported locales is empty or if none matched well enough. + * + * @return the index of the best-matching supported locale, or -1. + * @draft ICU 65 + */ + inline int32_t getSupportedIndex() const { return supportedIndex; } + + /** + * Takes the best-matching supported locale and adds relevant fields of the + * best-matching desired locale, such as the -t- and -u- extensions. + * May replace some fields of the supported locale. + * The result is the locale that should be used for date and number formatting, collation, etc. + * Returns the root locale if getSupportedLocale() returns nullptr. + * + * <p>Example: desired=ar-SA-u-nu-latn, supported=ar-EG, resolved locale=ar-SA-u-nu-latn + * + * @return a locale combining the best-matching desired and supported locales.
+ * @draft ICU 65 + */ + Locale makeResolvedLocale(UErrorCode &errorCode) const; + + private: + Result(const Locale *desired, const Locale *supported, + int32_t desIndex, int32_t suppIndex, UBool owned) : + desiredLocale(desired), supportedLocale(supported), + desiredIndex(desIndex), supportedIndex(suppIndex), + desiredIsOwned(owned) {} + + Result(const Result &other) = delete; + Result &operator=(const Result &other) = delete; + + const Locale *desiredLocale; + const Locale *supportedLocale; + int32_t desiredIndex; + int32_t supportedIndex; + UBool desiredIsOwned; + + friend class LocaleMatcher; + }; + + /** + * LocaleMatcher builder. + * Movable but not copyable. + * + * @see LocaleMatcher#builder() + * @draft ICU 65 + */ + class U_COMMON_API Builder : public UMemory { + public: + /** + * Constructs a builder used in chaining parameters for building a LocaleMatcher. + * + * @return a new Builder object + * @draft ICU 65 + */ + Builder() {} + + /** + * Move constructor; might modify the source. + * This builder will have the same contents that the source builder had. + * + * @param src Builder to move contents from. + * @draft ICU 65 + */ + Builder(Builder &&src) U_NOEXCEPT; + + /** + * Destructor. + * + * @draft ICU 65 + */ + ~Builder(); + + /** + * Move assignment; might modify the source. + * This builder will have the same contents that the source builder had. + * + * @param src Builder to move contents from. + * @draft ICU 65 + */ + Builder &operator=(Builder &&src) U_NOEXCEPT; + + /** + * Parses an Accept-Language string + * (<a href="https://tools.ietf.org/html/rfc2616#section-14.4">RFC 2616 Section 14.4</a>), + * such as "af, en, fr;q=0.9", and sets the supported locales accordingly. + * Allows whitespace in more places but does not allow "*". + * Clears any previously set/added supported locales first. 
+ * + * @param locales the Accept-Language string of locales to set + * @return this Builder object + * @draft ICU 65 + */ + Builder &setSupportedLocalesFromListString(StringPiece locales); + + /** + * Copies the supported locales, preserving iteration order. + * Clears any previously set/added supported locales first. + * Duplicates are allowed, and are not removed. + * + * @param locales the list of locale + * @return this Builder object + * @draft ICU 65 + */ + Builder &setSupportedLocales(Locale::Iterator &locales); + + /** + * Copies the supported locales from the begin/end range, preserving iteration order. + * Clears any previously set/added supported locales first. + * Duplicates are allowed, and are not removed. + * + * Each of the iterator parameter values must be an + * input iterator whose value is convertible to const Locale &. + * + * @param begin Start of range. + * @param end Exclusive end of range. + * @return this Builder object + * @draft ICU 65 + */ + template<typename Iter> + Builder &setSupportedLocales(Iter begin, Iter end) { + if (U_FAILURE(errorCode_)) { return *this; } + clearSupportedLocales(); + while (begin != end) { + addSupportedLocale(*begin++); + } + return *this; + } + + /** + * Copies the supported locales from the begin/end range, preserving iteration order. + * Calls the converter to convert each *begin to a Locale or const Locale &. + * Clears any previously set/added supported locales first. + * Duplicates are allowed, and are not removed. + * + * Each of the iterator parameter values must be an + * input iterator whose value is convertible to const Locale &. + * + * @param begin Start of range. + * @param end Exclusive end of range. + * @param converter Converter from *begin to const Locale & or compatible. 
+ * @return this Builder object + * @draft ICU 65 + */ + template<typename Iter, typename Conv> + Builder &setSupportedLocalesViaConverter(Iter begin, Iter end, Conv converter) { + if (U_FAILURE(errorCode_)) { return *this; } + clearSupportedLocales(); + while (begin != end) { + addSupportedLocale(converter(*begin++)); + } + return *this; + } + + /** + * Adds another supported locale. + * Duplicates are allowed, and are not removed. + * + * @param locale another locale + * @return this Builder object + * @draft ICU 65 + */ + Builder &addSupportedLocale(const Locale &locale); + + /** + * Sets the default locale; if nullptr, or if it is not set explicitly, + * then the first supported locale is used as the default locale. + * + * @param defaultLocale the default locale (will be copied) + * @return this Builder object + * @draft ICU 65 + */ + Builder &setDefaultLocale(const Locale *defaultLocale); + + /** + * If ULOCMATCH_FAVOR_SCRIPT, then the language differences are smaller than script + * differences. + * This is used in situations (such as maps) where + * it is better to fall back to the same script than a similar language. + * + * @param subtag the subtag to favor + * @return this Builder object + * @draft ICU 65 + */ + Builder &setFavorSubtag(ULocMatchFavorSubtag subtag); + + /** + * Option for whether all desired locales are treated equally or + * earlier ones are preferred (this is the default). + * + * @param demotion the demotion per desired locale to set. + * @return this Builder object + * @draft ICU 65 + */ + Builder &setDemotionPerDesiredLocale(ULocMatchDemotion demotion); + + /** + * Sets the UErrorCode if an error occurred while setting parameters. + * Preserves older error codes in the outErrorCode. + * + * @param outErrorCode Set to an error code if it does not contain one already + * and an error occurred while setting parameters. + * Otherwise unchanged. 
+ * @return TRUE if U_FAILURE(outErrorCode) + * @draft ICU 65 + */ + UBool copyErrorTo(UErrorCode &outErrorCode) const; + + /** + * Builds and returns a new locale matcher. + * This builder can continue to be used. + * + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, + * or else the function returns immediately. Check for U_FAILURE() + * on output or use with function chaining. (See User Guide for details.) + * @return new LocaleMatcher. + * @draft ICU 65 + */ + LocaleMatcher build(UErrorCode &errorCode) const; + + private: + friend class LocaleMatcher; + + Builder(const Builder &other) = delete; + Builder &operator=(const Builder &other) = delete; + + void clearSupportedLocales(); + bool ensureSupportedLocaleVector(); + + UErrorCode errorCode_ = U_ZERO_ERROR; + UVector *supportedLocales_ = nullptr; + int32_t thresholdDistance_ = -1; + ULocMatchDemotion demotion_ = ULOCMATCH_DEMOTION_REGION; + Locale *defaultLocale_ = nullptr; + ULocMatchFavorSubtag favor_ = ULOCMATCH_FAVOR_LANGUAGE; + }; + + // FYI No public LocaleMatcher constructors in C++; use the Builder. + + /** + * Move copy constructor; might modify the source. + * This matcher will have the same settings that the source matcher had. + * @param src source matcher + * @draft ICU 65 + */ + LocaleMatcher(LocaleMatcher &&src) U_NOEXCEPT; + + /** + * Destructor. + * @draft ICU 65 + */ + ~LocaleMatcher(); + + /** + * Move assignment operator; might modify the source. + * This matcher will have the same settings that the source matcher had. + * The behavior is undefined if *this and src are the same object. + * @param src source matcher + * @return *this + * @draft ICU 65 + */ + LocaleMatcher &operator=(LocaleMatcher &&src) U_NOEXCEPT; + + /** + * Returns the supported locale which best matches the desired locale. + * + * @param desiredLocale Typically a user's language. + * @param errorCode ICU error code. 
Its input value must pass the U_SUCCESS() test, + * or else the function returns immediately. Check for U_FAILURE() + * on output or use with function chaining. (See User Guide for details.) + * @return the best-matching supported locale. + * @draft ICU 65 + */ + const Locale *getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const; + + /** + * Returns the supported locale which best matches one of the desired locales. + * + * @param desiredLocales Typically a user's languages, in order of preference (descending). + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, + * or else the function returns immediately. Check for U_FAILURE() + * on output or use with function chaining. (See User Guide for details.) + * @return the best-matching supported locale. + * @draft ICU 65 + */ + const Locale *getBestMatch(Locale::Iterator &desiredLocales, UErrorCode &errorCode) const; + + /** + * Parses an Accept-Language string + * (<a href="https://tools.ietf.org/html/rfc2616#section-14.4">RFC 2616 Section 14.4</a>), + * such as "af, en, fr;q=0.9", + * and returns the supported locale which best matches one of the desired locales. + * Allows whitespace in more places but does not allow "*". + * + * @param desiredLocaleList Typically a user's languages, as an Accept-Language string. + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, + * or else the function returns immediately. Check for U_FAILURE() + * on output or use with function chaining. (See User Guide for details.) + * @return the best-matching supported locale. + * @draft ICU 65 + */ + const Locale *getBestMatchForListString(StringPiece desiredLocaleList, UErrorCode &errorCode) const; + + /** + * Returns the best match between the desired locale and the supported locales. + * If the result's desired locale is not nullptr, then it is the address of the input locale. + * It has not been cloned. 
 + * + * @param desiredLocale Typically a user's language. + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, + * or else the function returns immediately. Check for U_FAILURE() + * on output or use with function chaining. (See User Guide for details.) + * @return the best-matching pair of the desired and a supported locale. + * @draft ICU 65 + */ + Result getBestMatchResult(const Locale &desiredLocale, UErrorCode &errorCode) const; + + /** + * Returns the best match between the desired and supported locales. + * If the result's desired locale is not nullptr, then it is a clone of + * the best-matching desired locale. The Result object owns the clone. + * + * @param desiredLocales Typically a user's languages, in order of preference (descending). + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, + * or else the function returns immediately. Check for U_FAILURE() + * on output or use with function chaining. (See User Guide for details.) + * @return the best-matching pair of a desired and a supported locale. + * @draft ICU 65 + */ + Result getBestMatchResult(Locale::Iterator &desiredLocales, UErrorCode &errorCode) const; + +#ifndef U_HIDE_INTERNAL_API + /** + * Returns a fraction between 0 and 1, where 1 means that the languages are a + * perfect match, and 0 means that they are completely different. + * + * <p>This is mostly an implementation detail, and the precise values may change over time. + * The implementation may use either the maximized forms or the other ones, or both. + * The implementation may or may not rely on the forms to be consistent with each other. + * + * <p>Callers should construct and use a matcher rather than match pairs of locales directly. + * + * @param desired Desired locale. + * @param supported Supported locale. + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, + * or else the function returns immediately.
Check for U_FAILURE() + * on output or use with function chaining. (See User Guide for details.) + * @return value between 0 and 1, inclusive. + * @internal (has a known user) + */ + double internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const; +#endif // U_HIDE_INTERNAL_API + +private: + LocaleMatcher(const Builder &builder, UErrorCode &errorCode); + LocaleMatcher(const LocaleMatcher &other) = delete; + LocaleMatcher &operator=(const LocaleMatcher &other) = delete; + + int32_t getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remainingIter, UErrorCode &errorCode) const; + + const XLikelySubtags &likelySubtags; + const LocaleDistance &localeDistance; + int32_t thresholdDistance; + int32_t demotionPerDesiredLocale; + ULocMatchFavorSubtag favorSubtag; + + // These are in input order. + const Locale ** supportedLocales; + LSR *lsrs; + int32_t supportedLocalesLength; + // These are in preference order: 1. Default locale 2. paradigm locales 3. others. + UHashtable *supportedLsrToIndex; // Map<LSR, Integer> stores index+1 because 0 is "not found" + // Array versions of the supportedLsrToIndex keys and values. + // The distance lookup loops over the supportedLSRs and returns the index of the best match. + const LSR **supportedLSRs; + int32_t *supportedIndexes; + int32_t supportedLSRsLength; + Locale *ownedDefaultLocale; + const Locale *defaultLocale; + int32_t defaultLocaleIndex; +}; + +U_NAMESPACE_END + +#endif // U_HIDE_DRAFT_API +#endif // U_SHOW_CPLUSPLUS_API +#endif // __LOCALEMATCHER_H__ diff --git a/source/common/unicode/locid.h b/source/common/unicode/locid.h index 7e410e53c7414d4905a1566ea419cbea3cd9287e..0c9aecb6ec1c7c51c826446e72c0d8ffbaabf47f 100644 --- a/source/common/unicode/locid.h +++ b/source/common/unicode/locid.h @@ -1008,6 +1008,104 @@ public: */ virtual UClassID getDynamicClassID() const; +#ifndef U_HIDE_DRAFT_API + /** + * A Locale iterator interface similar to a Java Iterator<Locale>. 
+ * @draft ICU 65 + */ + class U_COMMON_API Iterator /* not : public UObject because this is an interface/mixin class */ { + public: + /** @draft ICU 65 */ + virtual ~Iterator(); + + /** + * @return TRUE if next() can be called again. + * @draft ICU 65 + */ + virtual UBool hasNext() const = 0; + + /** + * @return the next locale. + * @draft ICU 65 + */ + virtual const Locale &next() = 0; + }; + + /** + * A generic Locale iterator implementation over Locale input iterators. + * @draft ICU 65 + */ + template<typename Iter> + class RangeIterator : public Iterator, public UMemory { + public: + /** + * Constructs an iterator from a begin/end range. + * Each of the iterator parameter values must be an + * input iterator whose value is convertible to const Locale &. + * + * @param begin Start of range. + * @param end Exclusive end of range. + * @draft ICU 65 + */ + RangeIterator(Iter begin, Iter end) : it_(begin), end_(end) {} + + /** + * @return TRUE if next() can be called again. + * @draft ICU 65 + */ + UBool hasNext() const override { return it_ != end_; } + + /** + * @return the next locale. + * @draft ICU 65 + */ + const Locale &next() override { return *it_++; } + + private: + Iter it_; + const Iter end_; + }; + + /** + * A generic Locale iterator implementation over Locale input iterators. + * Calls the converter to convert each *begin to a const Locale &. + * @draft ICU 65 + */ + template<typename Iter, typename Conv> + class ConvertingIterator : public Iterator, public UMemory { + public: + /** + * Constructs an iterator from a begin/end range. + * Each of the iterator parameter values must be an + * input iterator whose value the converter converts to const Locale &. + * + * @param begin Start of range. + * @param end Exclusive end of range. + * @param converter Converter from *begin to const Locale & or compatible. 
+ * @draft ICU 65 + */ + ConvertingIterator(Iter begin, Iter end, Conv converter) : + it_(begin), end_(end), converter_(converter) {} + + /** + * @return TRUE if next() can be called again. + * @draft ICU 65 + */ + UBool hasNext() const override { return it_ != end_; } + + /** + * @return the next locale. + * @draft ICU 65 + */ + const Locale &next() override { return converter_(*it_++); } + + private: + Iter it_; + const Iter end_; + Conv converter_; + }; +#endif // U_HIDE_DRAFT_API + protected: /* only protected for testing purposes. DO NOT USE. */ #ifndef U_HIDE_INTERNAL_API /** diff --git a/source/common/unicode/ucharstrie.h b/source/common/unicode/ucharstrie.h index dfc93f6d0bae8acf02463b99044e8dea72e939bb..b8c83a6e95e8da23d6aa0e201d744ac69ad1399b 100644 --- a/source/common/unicode/ucharstrie.h +++ b/source/common/unicode/ucharstrie.h @@ -94,6 +94,39 @@ public: return *this; } + /** + * Returns the state of this trie as a 64-bit integer. + * The state value is never 0. + * + * @return opaque state value + * @see resetToState64 + * @draft ICU 65 + */ + uint64_t getState64() const { + return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) | + (uint64_t)(pos_ - uchars_); + } + + /** + * Resets this trie to the saved state. + * Unlike resetToState(State), the 64-bit state value + * must be from getState64() from the same trie object or + * from one initialized the exact same way. + * Because of no validation, this method is faster. + * + * @param state The opaque trie state value from getState64(). + * @return *this + * @see getState64 + * @see resetToState + * @see reset + * @draft ICU 65 + */ + UCharsTrie &resetToState64(uint64_t state) { + remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2; + pos_ = uchars_ + (state & kState64PosMask); + return *this; + } + /** * UCharsTrie state object, for saving a trie's current state * and resetting the trie back to this state later. 
@@ -560,6 +593,13 @@ private: static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff + // For getState64(): + // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 + // so we need at least 5 bits for that. + // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. + static constexpr int32_t kState64RemainingShift = 59; + static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1; + char16_t *ownedArray_; // Fixed value referencing the UCharsTrie words. diff --git a/source/common/unicode/utrace.h b/source/common/unicode/utrace.h index 66269784dbc71f51d0d1a7d2278b39fc8e2cb44f..412e11ad2bc43bc7114726d71c6704235670fd51 100644 --- a/source/common/unicode/utrace.h +++ b/source/common/unicode/utrace.h @@ -66,6 +66,7 @@ typedef enum UTraceFunctionNumber { UTRACE_FUNCTION_START=0, UTRACE_U_INIT=UTRACE_FUNCTION_START, UTRACE_U_CLEANUP, + #ifndef U_HIDE_DEPRECATED_API /** * One more than the highest normal collation trace location. @@ -83,6 +84,7 @@ typedef enum UTraceFunctionNumber { UTRACE_UCNV_FLUSH_CACHE, UTRACE_UCNV_LOAD, UTRACE_UCNV_UNLOAD, + #ifndef U_HIDE_DEPRECATED_API /** * One more than the highest normal collation trace location. @@ -101,13 +103,55 @@ typedef enum UTraceFunctionNumber { UTRACE_UCOL_STRCOLLITER, UTRACE_UCOL_OPEN_FROM_SHORT_STRING, UTRACE_UCOL_STRCOLLUTF8, /**< @stable ICU 50 */ + #ifndef U_HIDE_DEPRECATED_API /** * One more than the highest normal collation trace location. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. */ - UTRACE_COLLATION_LIMIT + UTRACE_COLLATION_LIMIT, #endif // U_HIDE_DEPRECATED_API + +#ifndef U_HIDE_DRAFT_API + + /** + * The lowest resource/data location. + * @draft ICU 65 + */ + UTRACE_RES_DATA_START=0x3000, + + /** + * Indicates that a value was read from a resource bundle. Provides three + * C-style strings to UTraceData: type, file name, and resource path. 
The + * type is "string", "binary", "intvector", "int", or "uint". + * @draft ICU 65 + */ + UTRACE_UDATA_RESOURCE=UTRACE_RES_DATA_START, + + /** + * Indicates that a data file was opened. Provides one + * C-style string to UTraceData: file name. + * @draft ICU 65 + */ + UTRACE_UDATA_DATA_FILE, + + /** + * Indicates that a resource bundle (.res) file was opened. Provides one + * C-style string to UTraceData: file name. + * @draft ICU 65 + */ + UTRACE_UDATA_RES_FILE, + +#endif // U_HIDE_DRAFT_API + +#ifndef U_HIDE_INTERNAL_API + /** + * One more than the highest normal resource/data trace location. + * @internal The numeric value may change over time, see ICU ticket #12420. + */ + UTRACE_RES_DATA_LIMIT, +#endif // U_HIDE_INTERNAL_API + } UTraceFunctionNumber; /** diff --git a/source/common/uresbund.cpp b/source/common/uresbund.cpp index b20e3095054b43435a5e04a230ccdbf98674ecbf..3224fb37187d8d7a047a2a685a34c4752e67d7c5 100644 --- a/source/common/uresbund.cpp +++ b/source/common/uresbund.cpp @@ -38,6 +38,7 @@ #include "umutex.h" #include "putilimp.h" #include "uassert.h" +#include "uresdata.h" using namespace icu; @@ -401,7 +402,8 @@ static UResourceDataEntry *init_entry(const char *localeID, const char *path, UE /* We'll try to get alias string from the bundle */ aliasres = res_getResource(&(r->fData), "%%ALIAS"); if (aliasres != RES_BOGUS) { - const UChar *alias = res_getString(&(r->fData), aliasres, &aliasLen); + // No tracing: called during initial data loading + const UChar *alias = res_getStringNoTrace(&(r->fData), aliasres, &aliasLen); if(alias != NULL && aliasLen > 0) { /* if there is actual alias - unload and load new data */ u_UCharsToChars(alias, aliasName, aliasLen+1); r->fAlias = init_entry(aliasName, path, status); @@ -542,7 +544,8 @@ loadParentsExceptRoot(UResourceDataEntry *&t1, Resource parentRes = res_getResource(&t1->fData, "%%Parent"); if (parentRes != RES_BOGUS) { // An explicit parent was found.
int32_t parentLocaleLen = 0; - const UChar *parentLocaleName = res_getString(&(t1->fData), parentRes, &parentLocaleLen); + // No tracing: called during initial data loading + const UChar *parentLocaleName = res_getStringNoTrace(&(t1->fData), parentRes, &parentLocaleLen); if(parentLocaleName != NULL && 0 < parentLocaleLen && parentLocaleLen < nameCapacity) { u_UCharsToChars(parentLocaleName, name, parentLocaleLen + 1); if (uprv_strcmp(name, kRootLocaleName) == 0) { @@ -1304,7 +1307,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getString(const UResourceBundle* resB, int32_ *status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } - s = res_getString(&(resB->fResData), resB->fRes, len); + s = res_getString({resB}, &(resB->fResData), resB->fRes, len); if (s == NULL) { *status = U_RESOURCE_TYPE_MISMATCH; } @@ -1393,7 +1396,7 @@ U_CAPI const uint8_t* U_EXPORT2 ures_getBinary(const UResourceBundle* resB, int3 *status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } - p = res_getBinary(&(resB->fResData), resB->fRes, len); + p = res_getBinary({resB}, &(resB->fResData), resB->fRes, len); if (p == NULL) { *status = U_RESOURCE_TYPE_MISMATCH; } @@ -1410,7 +1413,7 @@ U_CAPI const int32_t* U_EXPORT2 ures_getIntVector(const UResourceBundle* resB, i *status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } - p = res_getIntVector(&(resB->fResData), resB->fRes, len); + p = res_getIntVector({resB}, &(resB->fResData), resB->fRes, len); if (p == NULL) { *status = U_RESOURCE_TYPE_MISMATCH; } @@ -1431,7 +1434,7 @@ U_CAPI int32_t U_EXPORT2 ures_getInt(const UResourceBundle* resB, UErrorCode *st *status = U_RESOURCE_TYPE_MISMATCH; return 0xffffffff; } - return RES_GET_INT(resB->fRes); + return res_getInt({resB}, resB->fRes); } U_CAPI uint32_t U_EXPORT2 ures_getUInt(const UResourceBundle* resB, UErrorCode *status) { @@ -1446,7 +1449,7 @@ U_CAPI uint32_t U_EXPORT2 ures_getUInt(const UResourceBundle* resB, UErrorCode * *status = U_RESOURCE_TYPE_MISMATCH; return 0xffffffff; } - return RES_GET_UINT(resB->fRes); + return 
res_getUInt({resB}, resB->fRes); } U_CAPI UResType U_EXPORT2 ures_getType(const UResourceBundle *resB) { @@ -1457,10 +1460,18 @@ U_CAPI UResType U_EXPORT2 ures_getType(const UResourceBundle *resB) { } U_CAPI const char * U_EXPORT2 ures_getKey(const UResourceBundle *resB) { + // + // TODO: Trace ures_getKey? I guess not usually. + // + // We usually get the key string to decide whether we want the value, or to + // make a key-value pair. Tracing the value should suffice. + // + // However, I believe we have some data (e.g., in res_index) where the key + // strings are the data. Tracing the enclosing table should suffice. + // if(resB == NULL) { return NULL; } - return(resB->fKey); } @@ -1480,7 +1491,7 @@ static const UChar* ures_getStringWithAlias(const UResourceBundle *resB, Resourc ures_close(tempRes); return result; } else { - return res_getString(&(resB->fResData), r, len); + return res_getString({resB, sIndex}, &(resB->fResData), r, len); } } @@ -1516,7 +1527,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getNextString(UResourceBundle *resB, int32_t* switch(RES_GET_TYPE(resB->fRes)) { case URES_STRING: case URES_STRING_V2: - return res_getString(&(resB->fResData), resB->fRes, len); + return res_getString({resB}, &(resB->fResData), resB->fRes, len); case URES_TABLE: case URES_TABLE16: case URES_TABLE32: @@ -1661,7 +1672,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getStringByIndex(const UResourceBundle *resB, switch(RES_GET_TYPE(resB->fRes)) { case URES_STRING: case URES_STRING_V2: - return res_getString(&(resB->fResData), resB->fRes, len); + return res_getString({resB}, &(resB->fResData), resB->fRes, len); case URES_TABLE: case URES_TABLE16: case URES_TABLE32: @@ -1953,10 +1964,10 @@ void getAllItemsWithFallback( // When the sink sees the no-fallback/no-inheritance marker, // then it would remove the parent's item. // We would deserialize parent values even though they are overridden in a child bundle. 
- value.pResData = &bundle->fResData; + value.setData(&bundle->fResData); UResourceDataEntry *parentEntry = bundle->fData->fParent; UBool hasParent = parentEntry != NULL && U_SUCCESS(parentEntry->fBogus); - value.setResource(bundle->fRes); + value.setResource(bundle->fRes, ResourceTracer(bundle)); sink.put(bundle->fKey, value, !hasParent, errorCode); if (hasParent) { // We might try to query the sink whether @@ -2001,31 +2012,60 @@ void getAllItemsWithFallback( } // namespace +// Requires a ResourceDataValue fill-in, so that we need not cast from a ResourceValue. +// Unfortunately, the caller must know which subclass to make and pass in. +// Alternatively, we could make it as polymorphic as in Java by +// returning a ResourceValue pointer (possibly wrapped into a LocalPointer) +// that the caller then owns. +// +// Also requires a UResourceBundle fill-in, so that the value's ResourceTracer +// can point to a non-local bundle. +// Without tracing, the child bundle could be a function-local object. 
+U_CAPI void U_EXPORT2 +ures_getValueWithFallback(const UResourceBundle *bundle, const char *path, + UResourceBundle *tempFillIn, + ResourceDataValue &value, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return; } + if (path == nullptr) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + const UResourceBundle *rb; + if (*path == 0) { + // empty path + rb = bundle; + } else { + rb = ures_getByKeyWithFallback(bundle, path, tempFillIn, &errorCode); + if (U_FAILURE(errorCode)) { + return; + } + } + value.setData(&rb->fResData); + value.setResource(rb->fRes, ResourceTracer(rb)); +} + U_CAPI void U_EXPORT2 ures_getAllItemsWithFallback(const UResourceBundle *bundle, const char *path, icu::ResourceSink &sink, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return; } - if (path == NULL) { + if (path == nullptr) { errorCode = U_ILLEGAL_ARGUMENT_ERROR; return; } - UResourceBundle stackBundle; - ures_initStackObject(&stackBundle); + StackUResourceBundle stackBundle; const UResourceBundle *rb; if (*path == 0) { // empty path rb = bundle; } else { - rb = ures_getByKeyWithFallback(bundle, path, &stackBundle, &errorCode); + rb = ures_getByKeyWithFallback(bundle, path, stackBundle.getAlias(), &errorCode); if (U_FAILURE(errorCode)) { - ures_close(&stackBundle); return; } } // Get all table items with fallback. 
ResourceDataValue value; getAllItemsWithFallback(rb, value, sink, errorCode); - ures_close(&stackBundle); } U_CAPI UResourceBundle* U_EXPORT2 ures_getByKey(const UResourceBundle *resB, const char* inKey, UResourceBundle *fillIn, UErrorCode *status) { @@ -2108,7 +2148,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getStringByKey(const UResourceBundle *resB, c switch (RES_GET_TYPE(res)) { case URES_STRING: case URES_STRING_V2: - return res_getString(rd, res, len); + return res_getString({resB, key}, rd, res, len); case URES_ALIAS: { const UChar* result = 0; @@ -2130,7 +2170,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getStringByKey(const UResourceBundle *resB, c switch (RES_GET_TYPE(res)) { case URES_STRING: case URES_STRING_V2: - return res_getString(&(resB->fResData), res, len); + return res_getString({resB, key}, &(resB->fResData), res, len); case URES_ALIAS: { const UChar* result = 0; @@ -2151,6 +2191,7 @@ U_CAPI const UChar* U_EXPORT2 ures_getStringByKey(const UResourceBundle *resB, c /* here should go a first attempt to locate the key using index table */ const ResourceData *rd = getFallbackData(resB, &key, &realData, &res, status); if(U_SUCCESS(*status)) { + // TODO: Tracing return res_getString(rd, res, len); } else { *status = U_MISSING_RESOURCE_ERROR; diff --git a/source/common/uresdata.cpp b/source/common/uresdata.cpp index 8bcb9ab8b88aa2bc27b2590b2aeb187b8fc3ca72..b3c2e2e27ccf9a12875fa2f028c933b3ceff06ab 100644 --- a/source/common/uresdata.cpp +++ b/source/common/uresdata.cpp @@ -33,6 +33,7 @@ #include "uinvchar.h" #include "uresdata.h" #include "uresimp.h" +#include "utracimp.h" /* * Resource access helpers @@ -307,7 +308,7 @@ res_getPublicType(Resource res) { } U_CAPI const UChar * U_EXPORT2 -res_getString(const ResourceData *pResData, Resource res, int32_t *pLength) { +res_getStringNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength) { const UChar *p; uint32_t offset=RES_GET_OFFSET(res); int32_t length; @@ -402,7 +403,8 @@ int32_t 
getStringArray(const ResourceData *pResData, const icu::ResourceArray &a } for(int32_t i = 0; i < length; ++i) { int32_t sLength; - const UChar *s = res_getString(pResData, array.internalGetResource(pResData, i), &sLength); + // No tracing: handled by the caller + const UChar *s = res_getStringNoTrace(pResData, array.internalGetResource(pResData, i), &sLength); if(s == NULL) { errorCode = U_RESOURCE_TYPE_MISMATCH; return 0; @@ -434,7 +436,7 @@ res_getAlias(const ResourceData *pResData, Resource res, int32_t *pLength) { } U_CAPI const uint8_t * U_EXPORT2 -res_getBinary(const ResourceData *pResData, Resource res, int32_t *pLength) { +res_getBinaryNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength) { const uint8_t *p; uint32_t offset=RES_GET_OFFSET(res); int32_t length; @@ -454,7 +456,7 @@ res_getBinary(const ResourceData *pResData, Resource res, int32_t *pLength) { U_CAPI const int32_t * U_EXPORT2 -res_getIntVector(const ResourceData *pResData, Resource res, int32_t *pLength) { +res_getIntVectorNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength) { const int32_t *p; uint32_t offset=RES_GET_OFFSET(res); int32_t length; @@ -507,7 +509,7 @@ const UChar *ResourceDataValue::getString(int32_t &length, UErrorCode &errorCode if(U_FAILURE(errorCode)) { return NULL; } - const UChar *s = res_getString(pResData, res, &length); + const UChar *s = res_getString(fTraceInfo, &getData(), res, &length); if(s == NULL) { errorCode = U_RESOURCE_TYPE_MISMATCH; } @@ -518,7 +520,7 @@ const UChar *ResourceDataValue::getAliasString(int32_t &length, UErrorCode &erro if(U_FAILURE(errorCode)) { return NULL; } - const UChar *s = res_getAlias(pResData, res, &length); + const UChar *s = res_getAlias(&getData(), res, &length); if(s == NULL) { errorCode = U_RESOURCE_TYPE_MISMATCH; } @@ -532,7 +534,7 @@ int32_t ResourceDataValue::getInt(UErrorCode &errorCode) const { if(RES_GET_TYPE(res) != URES_INT) { errorCode = U_RESOURCE_TYPE_MISMATCH; } - return 
RES_GET_INT(res); + return res_getInt(fTraceInfo, res); } uint32_t ResourceDataValue::getUInt(UErrorCode &errorCode) const { @@ -542,14 +544,14 @@ uint32_t ResourceDataValue::getUInt(UErrorCode &errorCode) const { if(RES_GET_TYPE(res) != URES_INT) { errorCode = U_RESOURCE_TYPE_MISMATCH; } - return RES_GET_UINT(res); + return res_getUInt(fTraceInfo, res); } const int32_t *ResourceDataValue::getIntVector(int32_t &length, UErrorCode &errorCode) const { if(U_FAILURE(errorCode)) { return NULL; } - const int32_t *iv = res_getIntVector(pResData, res, &length); + const int32_t *iv = res_getIntVector(fTraceInfo, &getData(), res, &length); if(iv == NULL) { errorCode = U_RESOURCE_TYPE_MISMATCH; } @@ -560,7 +562,7 @@ const uint8_t *ResourceDataValue::getBinary(int32_t &length, UErrorCode &errorCo if(U_FAILURE(errorCode)) { return NULL; } - const uint8_t *b = res_getBinary(pResData, res, &length); + const uint8_t *b = res_getBinary(fTraceInfo, &getData(), res, &length); if(b == NULL) { errorCode = U_RESOURCE_TYPE_MISMATCH; } @@ -578,19 +580,19 @@ ResourceArray ResourceDataValue::getArray(UErrorCode &errorCode) const { switch(RES_GET_TYPE(res)) { case URES_ARRAY: if (offset!=0) { // empty if offset==0 - items32 = (const Resource *)pResData->pRoot+offset; + items32 = (const Resource *)getData().pRoot+offset; length = *items32++; } break; case URES_ARRAY16: - items16 = pResData->p16BitUnits+offset; + items16 = getData().p16BitUnits+offset; length = *items16++; break; default: errorCode = U_RESOURCE_TYPE_MISMATCH; return ResourceArray(); } - return ResourceArray(items16, items32, length); + return ResourceArray(items16, items32, length, fTraceInfo); } ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { @@ -606,19 +608,19 @@ ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { switch(RES_GET_TYPE(res)) { case URES_TABLE: if (offset != 0) { // empty if offset==0 - keys16 = (const uint16_t *)(pResData->pRoot+offset); + keys16 = (const uint16_t 
*)(getData().pRoot+offset); length = *keys16++; items32 = (const Resource *)(keys16+length+(~length&1)); } break; case URES_TABLE16: - keys16 = pResData->p16BitUnits+offset; + keys16 = getData().p16BitUnits+offset; length = *keys16++; items16 = keys16 + length; break; case URES_TABLE32: if (offset != 0) { // empty if offset==0 - keys32 = pResData->pRoot+offset; + keys32 = getData().pRoot+offset; length = *keys32++; items32 = (const Resource *)keys32 + length; } @@ -627,22 +629,22 @@ ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { errorCode = U_RESOURCE_TYPE_MISMATCH; return ResourceTable(); } - return ResourceTable(keys16, keys32, items16, items32, length); + return ResourceTable(keys16, keys32, items16, items32, length, fTraceInfo); } UBool ResourceDataValue::isNoInheritanceMarker() const { - return ::isNoInheritanceMarker(pResData, res); + return ::isNoInheritanceMarker(&getData(), res); } int32_t ResourceDataValue::getStringArray(UnicodeString *dest, int32_t capacity, UErrorCode &errorCode) const { - return ::getStringArray(pResData, getArray(errorCode), dest, capacity, errorCode); + return ::getStringArray(&getData(), getArray(errorCode), dest, capacity, errorCode); } int32_t ResourceDataValue::getStringArrayOrStringAsArray(UnicodeString *dest, int32_t capacity, UErrorCode &errorCode) const { if(URES_IS_ARRAY(res)) { - return ::getStringArray(pResData, getArray(errorCode), dest, capacity, errorCode); + return ::getStringArray(&getData(), getArray(errorCode), dest, capacity, errorCode); } if(U_FAILURE(errorCode)) { return 0; @@ -656,7 +658,7 @@ int32_t ResourceDataValue::getStringArrayOrStringAsArray(UnicodeString *dest, in return 1; } int32_t sLength; - const UChar *s = res_getString(pResData, res, &sLength); + const UChar *s = res_getString(fTraceInfo, &getData(), res, &sLength); if(s != NULL) { dest[0].setTo(TRUE, s, sLength); return 1; @@ -671,7 +673,7 @@ UnicodeString ResourceDataValue::getStringOrFirstOfArray(UErrorCode &errorCode) 
return us; } int32_t sLength; - const UChar *s = res_getString(pResData, res, &sLength); + const UChar *s = res_getString(fTraceInfo, &getData(), res, &sLength); if(s != NULL) { us.setTo(TRUE, s, sLength); return us; @@ -681,7 +683,8 @@ UnicodeString ResourceDataValue::getStringOrFirstOfArray(UErrorCode &errorCode) return us; } if(array.getSize() > 0) { - s = res_getString(pResData, array.internalGetResource(pResData, 0), &sLength); + // Tracing is already performed above (unimportant for trace that this is an array) + s = res_getStringNoTrace(&getData(), array.internalGetResource(&getData(), 0), &sLength); if(s != NULL) { us.setTo(TRUE, s, sLength); return us; @@ -818,18 +821,45 @@ UBool icu::ResourceTable::getKeyAndValue(int32_t i, const char *&key, icu::ResourceValue &value) const { if(0 <= i && i < length) { icu::ResourceDataValue &rdValue = static_cast<icu::ResourceDataValue &>(value); - if (keys16 != NULL) { - key = RES_GET_KEY16(rdValue.pResData, keys16[i]); + if (keys16 != nullptr) { + key = RES_GET_KEY16(&rdValue.getData(), keys16[i]); } else { - key = RES_GET_KEY32(rdValue.pResData, keys32[i]); + key = RES_GET_KEY32(&rdValue.getData(), keys32[i]); } Resource res; - if (items16 != NULL) { - res = makeResourceFrom16(rdValue.pResData, items16[i]); + if (items16 != nullptr) { + res = makeResourceFrom16(&rdValue.getData(), items16[i]); } else { res = items32[i]; } - rdValue.setResource(res); + // Note: the ResourceTracer keeps a reference to the field of this + // ResourceTable. This is OK because the ResourceTable should remain + // alive for the duration that fields are being read from it + // (including nested fields). 
+ rdValue.setResource(res, ResourceTracer(fTraceInfo, key)); + return TRUE; + } + return FALSE; +} + +UBool icu::ResourceTable::findValue(const char *key, ResourceValue &value) const { + icu::ResourceDataValue &rdValue = static_cast<icu::ResourceDataValue &>(value); + const char *realKey = nullptr; + int32_t i; + if (keys16 != nullptr) { + i = _res_findTableItem(&rdValue.getData(), keys16, length, key, &realKey); + } else { + i = _res_findTable32Item(&rdValue.getData(), keys32, length, key, &realKey); + } + if (i >= 0) { + Resource res; + if (items16 != nullptr) { + res = makeResourceFrom16(&rdValue.getData(), items16[i]); + } else { + res = items32[i]; + } + // Same note about lifetime as in getKeyAndValue(). + rdValue.setResource(res, ResourceTracer(fTraceInfo, key)); return TRUE; } return FALSE; @@ -875,7 +905,13 @@ uint32_t icu::ResourceArray::internalGetResource(const ResourceData *pResData, i UBool icu::ResourceArray::getValue(int32_t i, icu::ResourceValue &value) const { if(0 <= i && i < length) { icu::ResourceDataValue &rdValue = static_cast<icu::ResourceDataValue &>(value); - rdValue.setResource(internalGetResource(rdValue.pResData, i)); + // Note: the ResourceTracer keeps a reference to the field of this + // ResourceArray. This is OK because the ResourceArray should remain + // alive for the duration that fields are being read from it + // (including nested fields). 
+ rdValue.setResource( + internalGetResource(&rdValue.getData(), i), + ResourceTracer(fTraceInfo, i)); return TRUE; } return FALSE; diff --git a/source/common/uresdata.h b/source/common/uresdata.h index 4e28ddccf63199a4dac23fb14dc73aba509b7f90..d1b67babf29049c523f77f8d2bec00ef24799a4e 100644 --- a/source/common/uresdata.h +++ b/source/common/uresdata.h @@ -69,14 +69,16 @@ typedef uint32_t Resource; #define RES_GET_OFFSET(res) ((res)&0x0fffffff) #define RES_GET_POINTER(pRoot, res) ((pRoot)+RES_GET_OFFSET(res)) -/* get signed and unsigned integer values directly from the Resource handle */ +/* get signed and unsigned integer values directly from the Resource handle + * NOTE: For proper logging, please use the res_getInt() and res_getUInt() inline functions + */ #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC -# define RES_GET_INT(res) (((int32_t)((res)<<4L))>>4L) +# define RES_GET_INT_NO_TRACE(res) (((int32_t)((res)<<4L))>>4L) #else -# define RES_GET_INT(res) (int32_t)(((res)&0x08000000) ? (res)|0xf0000000 : (res)&0x07ffffff) +# define RES_GET_INT_NO_TRACE(res) (int32_t)(((res)&0x08000000) ? (res)|0xf0000000 : (res)&0x07ffffff) #endif -#define RES_GET_UINT(res) ((res)&0x0fffffff) +#define RES_GET_UINT_NO_TRACE(res) ((res)&0x0fffffff) #define URES_IS_ARRAY(type) ((int32_t)(type)==URES_ARRAY || (int32_t)(type)==URES_ARRAY16) #define URES_IS_TABLE(type) ((int32_t)(type)==URES_TABLE || (int32_t)(type)==URES_TABLE16 || (int32_t)(type)==URES_TABLE32) @@ -423,22 +425,26 @@ res_unload(ResourceData *pResData); U_INTERNAL UResType U_EXPORT2 res_getPublicType(Resource res); +/////////////////////////////////////////////////////////////////////////// +// To enable tracing, use the inline versions of the res_get* functions. // +/////////////////////////////////////////////////////////////////////////// + /* * Return a pointer to a zero-terminated, const UChar* string * and set its length in *pLength. * Returns NULL if not found.
*/ U_INTERNAL const UChar * U_EXPORT2 -res_getString(const ResourceData *pResData, Resource res, int32_t *pLength); - -U_INTERNAL const UChar * U_EXPORT2 -res_getAlias(const ResourceData *pResData, Resource res, int32_t *pLength); +res_getStringNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength); U_INTERNAL const uint8_t * U_EXPORT2 -res_getBinary(const ResourceData *pResData, Resource res, int32_t *pLength); +res_getBinaryNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength); U_INTERNAL const int32_t * U_EXPORT2 -res_getIntVector(const ResourceData *pResData, Resource res, int32_t *pLength); +res_getIntVectorNoTrace(const ResourceData *pResData, Resource res, int32_t *pLength); + +U_INTERNAL const UChar * U_EXPORT2 +res_getAlias(const ResourceData *pResData, Resource res, int32_t *pLength); U_INTERNAL Resource U_EXPORT2 res_getResource(const ResourceData *pResData, const char *key); @@ -470,17 +476,55 @@ U_CFUNC Resource res_findResource(const ResourceData *pResData, Resource r, #ifdef __cplusplus #include "resource.h" +#include "restrace.h" U_NAMESPACE_BEGIN +inline const UChar* res_getString(const ResourceTracer& traceInfo, + const ResourceData *pResData, Resource res, int32_t *pLength) { + traceInfo.trace("string"); + return res_getStringNoTrace(pResData, res, pLength); +} + +inline const uint8_t* res_getBinary(const ResourceTracer& traceInfo, + const ResourceData *pResData, Resource res, int32_t *pLength) { + traceInfo.trace("binary"); + return res_getBinaryNoTrace(pResData, res, pLength); +} + +inline const int32_t* res_getIntVector(const ResourceTracer& traceInfo, + const ResourceData *pResData, Resource res, int32_t *pLength) { + traceInfo.trace("intvector"); + return res_getIntVectorNoTrace(pResData, res, pLength); +} + +inline int32_t res_getInt(const ResourceTracer& traceInfo, Resource res) { + traceInfo.trace("int"); + return RES_GET_INT_NO_TRACE(res); +} + +inline uint32_t res_getUInt(const ResourceTracer& traceInfo, 
Resource res) { + traceInfo.trace("uint"); + return RES_GET_UINT_NO_TRACE(res); +} + class ResourceDataValue : public ResourceValue { public: - ResourceDataValue() : pResData(NULL), res(static_cast<Resource>(URES_NONE)) {} + ResourceDataValue() : + res(static_cast<Resource>(URES_NONE)), + fTraceInfo() {} virtual ~ResourceDataValue(); - void setData(const ResourceData *data) { pResData = data; } - void setResource(Resource r) { res = r; } + void setData(const ResourceData *data) { + resData = *data; + } + void setResource(Resource r, ResourceTracer&& traceInfo) { + res = r; + fTraceInfo = traceInfo; + } + + const ResourceData &getData() const { return resData; } virtual UResType getType() const; virtual const UChar *getString(int32_t &length, UErrorCode &errorCode) const; virtual const UChar *getAliasString(int32_t &length, UErrorCode &errorCode) const; @@ -497,10 +541,12 @@ public: UErrorCode &errorCode) const; virtual UnicodeString getStringOrFirstOfArray(UErrorCode &errorCode) const; - const ResourceData *pResData; - private: + // TODO(ICU-20769): If UResourceBundle.fResData becomes a pointer, + // then remove this value field again and just store a pResData pointer. + ResourceData resData; Resource res; + ResourceTracer fTraceInfo; }; U_NAMESPACE_END diff --git a/source/common/uresimp.h b/source/common/uresimp.h index 51db6c52634848ac2edf541eadb889d59d45d79c..f453ddc004a9fd309c17b140b307f95bb090a36a 100644 --- a/source/common/uresimp.h +++ b/source/common/uresimp.h @@ -67,6 +67,9 @@ struct UResourceBundle { char *fVersion; UResourceDataEntry *fTopLevelData; /* for getting the valid locale */ char *fResPath; /* full path to the resource: "zh_TW/CollationElements/Sequence" */ + // TODO(ICU-20769): Try to change the by-value fResData into a pointer, + // with the struct in only one place for each bundle. + // Also replace class ResourceDataValue.resData with a pResData pointer again. 
ResourceData fResData; char fResBuf[RES_BUFSIZE]; int32_t fResPathLen; @@ -281,6 +284,11 @@ ures_getStringByKeyWithFallback(const UResourceBundle *resB, #ifdef __cplusplus +U_CAPI void U_EXPORT2 +ures_getValueWithFallback(const UResourceBundle *bundle, const char *path, + UResourceBundle *tempFillIn, + icu::ResourceDataValue &value, UErrorCode &errorCode); + U_CAPI void U_EXPORT2 ures_getAllItemsWithFallback(const UResourceBundle *bundle, const char *path, icu::ResourceSink &sink, UErrorCode &errorCode); diff --git a/source/common/utrace.cpp b/source/common/utrace.cpp index 2ac3d77c43a036b4a3be565b11b6dbaa089a4996..eced03b8bac6373470bc7c94e5ae78b15052f79e 100644 --- a/source/common/utrace.cpp +++ b/source/common/utrace.cpp @@ -476,6 +476,15 @@ trCollNames[] = { NULL }; + +static const char* const +trResDataNames[] = { + "ResourceTracer::trace", + "FileTracer::traceOpenDataFile", + "FileTracer::traceOpenResFile", + NULL +}; + U_CAPI const char * U_EXPORT2 utrace_functionName(int32_t fnNumber) { @@ -485,6 +494,8 @@ utrace_functionName(int32_t fnNumber) { return trConvNames[fnNumber - UTRACE_CONVERSION_START]; } else if(UTRACE_COLLATION_START <= fnNumber && fnNumber < UTRACE_COLLATION_LIMIT){ return trCollNames[fnNumber - UTRACE_COLLATION_START]; + } else if(UTRACE_RES_DATA_START <= fnNumber && fnNumber < UTRACE_RES_DATA_LIMIT){ + return trResDataNames[fnNumber - UTRACE_RES_DATA_START]; } else { return "[BOGUS Trace Function Number]"; } diff --git a/source/data/BUILDRULES.py b/source/data/BUILDRULES.py index 2442f4e3ff61084b352ddd0a778a23141de25128..d9e8ac19b651afc86d3c6fecb6bb8f88cce6d1e6 100644 --- a/source/data/BUILDRULES.py +++ b/source/data/BUILDRULES.py @@ -43,48 +43,49 @@ def generate(config, glob, common_vars): "locales", None, "icu-locale-deprecates.xml", - True, + config.use_pool_bundle, []) requests += generate_tree(config, glob, common_vars, "curr", "curr", "icu-locale-deprecates.xml", - True, + config.use_pool_bundle, []) requests += generate_tree(config, 
glob, common_vars, "lang", "lang", "icu-locale-deprecates.xml", - True, + config.use_pool_bundle, []) requests += generate_tree(config, glob, common_vars, "region", "region", "icu-locale-deprecates.xml", - True, + config.use_pool_bundle, []) requests += generate_tree(config, glob, common_vars, "zone", "zone", "icu-locale-deprecates.xml", - True, + config.use_pool_bundle, []) requests += generate_tree(config, glob, common_vars, "unit", "unit", "icu-locale-deprecates.xml", - True, + config.use_pool_bundle, []) requests += generate_tree(config, glob, common_vars, "coll", "coll", "icu-coll-deprecates.xml", + # Never use pool bundle for coll, brkitr, or rbnf False, # Depends on timezoneTypes.res and keyTypeData.res. # TODO: We should not need this dependency to build collation. @@ -95,6 +96,7 @@ def generate(config, glob, common_vars): "brkitr", "brkitr", "icu-locale-deprecates.xml", + # Never use pool bundle for coll, brkitr, or rbnf False, [DepTarget("brkitr_brk"), DepTarget("dictionaries")]) @@ -102,6 +104,7 @@ def generate(config, glob, common_vars): "rbnf", "rbnf", "icu-rbnf-deprecates.xml", + # Never use pool bundle for coll, brkitr, or rbnf False, []) diff --git a/source/data/buildtool/__main__.py b/source/data/buildtool/__main__.py index 52d869c89514ef2fdc06ab2a93e20645486a881f..30cfcdc2d7791716e33b08f7a220ebab526cd6eb 100644 --- a/source/data/buildtool/__main__.py +++ b/source/data/buildtool/__main__.py @@ -141,6 +141,11 @@ class Config(object): if "collationUCAData" in self.filters_json_data: self.coll_han_type = self.filters_json_data["collationUCAData"] + # True or False (could be extended later to support enum/list) + self.use_pool_bundle = True + if "usePoolBundle" in self.filters_json_data: + self.use_pool_bundle = self.filters_json_data["usePoolBundle"] + def _parse_filter_file(self, f): # Use the Hjson parser if it is available; otherwise, use vanilla JSON. 
try: diff --git a/source/data/buildtool/filtration_schema.json b/source/data/buildtool/filtration_schema.json index 479c65affe60089b9120808b2b330ac654768145..c9f9b8cd84ecf3e2b4c314735e46ad698e7c95a9 100644 --- a/source/data/buildtool/filtration_schema.json +++ b/source/data/buildtool/filtration_schema.json @@ -57,6 +57,9 @@ "collationUCAData": { "type": "string", "enum": ["unihan", "implicithan"] + }, + "usePoolBundle": { + "type": "boolean" } }, "additionalProperties": false, diff --git a/source/data/locales/hu.txt b/source/data/locales/hu.txt index ab73ac6bf84e69379aceafd82fe31eaab6ae7078..b95228c20c96396f59aa4d23e3f1e19307876a4e 100644 --- a/source/data/locales/hu.txt +++ b/source/data/locales/hu.txt @@ -216,7 +216,7 @@ hu{ other{"A kosár tartalma: {0} X. Megveszi őket?"} } } - minimumGroupingDigits{"4"} + minimumGroupingDigits{"1"} native{"latn"} } Version{"2.1.48.42"} diff --git a/source/i18n/smpdtfmt.cpp b/source/i18n/smpdtfmt.cpp index 98f36b3e7e2eefe6add8338f197dbcbc24f83890..4fd1675e52113e0604249df56a214c2826455161 100644 --- a/source/i18n/smpdtfmt.cpp +++ b/source/i18n/smpdtfmt.cpp @@ -3998,6 +3998,7 @@ void SimpleDateFormat::adoptCalendar(Calendar* calendarToAdopt) DateFormatSymbols *newSymbols = DateFormatSymbols::createForLocale(calLocale, status); if (U_FAILURE(status)) { + delete calendarToAdopt; return; } DateFormat::adoptCalendar(calendarToAdopt); diff --git a/source/tools/icupkg/icupkg.cpp b/source/tools/icupkg/icupkg.cpp index ea7be4a90923fd7adda726a46c63928892522b7c..51a66397232acb1242b2b856c57a1f11b9c0fbb9 100644 --- a/source/tools/icupkg/icupkg.cpp +++ b/source/tools/icupkg/icupkg.cpp @@ -501,10 +501,8 @@ main(int argc, char *argv[]) { } /* check dependencies between items */ - if(!pkg->checkDependencies()) { - /* some dependencies are not fulfilled */ - return U_MISSING_RESOURCE_ERROR; - } + // Still call checkDependencies() so that it can output warnings, but do not treat missing dependencies as an error + pkg->checkDependencies(); /* write the output .dat package if there are
any modifications */ if(isModified) { diff --git a/source/tools/toolutil/pkgitems.cpp b/source/tools/toolutil/pkgitems.cpp index cb23b45e633bcfefd77987afec7bbfebdb697479..7b86c55fa423bf656b6363bb3416f57b4944caaf 100644 --- a/source/tools/toolutil/pkgitems.cpp +++ b/source/tools/toolutil/pkgitems.cpp @@ -305,7 +305,8 @@ ures_enumDependencies(const char *itemName, break; } int32_t length; - const UChar *alias=res_getString(pResData, res, &length); + // No tracing: build tool + const UChar *alias=res_getStringNoTrace(pResData, res, &length); checkAlias(itemName, res, alias, length, useResSuffix, check, context, pErrorCode); } break;