#!/bin/bash
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

set -e

# Remove entries currently not used in Chromium/V8.
#
# Globals:
#   localedatapath (read) - directory whose *.txt files are edited in place.
# Outputs:
#   Progress messages on stdout.
#
# NOTE(review): the literal whitespace after '^' in every sed address below is
# part of the pattern. Confirm the number of spaces against the indentation
# actually used in the data files.
function filter_locale_data {
  local langpath

  echo "Removing unncessary categories in ${localedatapath}"
  for langpath in "${localedatapath}"/*.txt
  do
    echo "Overwriting ${langpath} ..."
    # One sed command per line: single-line entries are deleted directly,
    # multi-line entries are deleted as an address range up to the closing
    # brace.
    sed -r -i \
      '/^ characterLabel\{$/,/^ \}$/d
       /^ AuxExemplarCharacters\{.*\}$/d
       /^ AuxExemplarCharacters\{$/, /^ \}$/d
       /^ ExemplarCharacters\{.*\}$/d
       /^ ExemplarCharacters\{$/, /^ \}$/d
       /^ (mon|tue|wed|thu|fri|sat|sun|quarter)(|-short|-narrow)\{$/, /^ \}$/d' \
      "${langpath}"
  done
}

# Remove display names for languages that are not listed in the accept-language
# list of Chromium.
#
# Globals:
#   scriptdir (read)            - directory containing accept_lang.list.
#   langdatapath (read)         - directory whose *.txt files are edited in
#                                 place.
#   ACCEPT_LANG_PATTERN (write) - regex alternation built from the list.
# Outputs:
#   Progress messages on stdout.
#
# NOTE(review): the literal whitespace after '^' in every sed address below is
# part of the pattern. Confirm the number of spaces against the indentation
# actually used in the data files.
function filter_display_language_names {
  local lang langpath target op

  # Start from a clean slate so that a value inherited from the environment
  # (or left over from a previous call) cannot leak into the pattern.
  unset ACCEPT_LANG_PATTERN

  # Word splitting of the grep output is intentional: every
  # whitespace-separated token of a non-comment line is one language code.
  for lang in $(grep -v '^#' "${scriptdir}/accept_lang.list")
  do
    # Set $op to '|' only if $ACCEPT_LANG_PATTERN is not empty.
    op=${ACCEPT_LANG_PATTERN:+|}
    ACCEPT_LANG_PATTERN="${ACCEPT_LANG_PATTERN}${op}${lang}"
  done
  # The trailing [^a-z] stops a code from matching a longer code that merely
  # starts with it.
  ACCEPT_LANG_PATTERN="(${ACCEPT_LANG_PATTERN})[^a-z]"

  echo "Filtering out display names for non-A-L languages in ${langdatapath}"
  for langpath in "${langdatapath}"/*.txt
  do
    target=${langpath}
    echo "Overwriting ${target} ..."
    # Inside the Languages block only the opening line, the accepted
    # languages and the closing line are re-printed; everything else in the
    # block is dropped by the final 'd'.
    sed -r -i \
      '/^ Keys\{$/,/^ \}$/d
       /^ Languages\{$/, /^ \}$/ {
         /^ Languages\{$/p
         /^ '"${ACCEPT_LANG_PATTERN}"'/p
         /^ \}$/p
         d
       }
       /^ Types\{$/,/^ \}$/d
       /^ Types%short\{$/,/^ \}$/d
       /^ characterLabelPattern\{$/,/^ \}$/d
       /^ Variants\{$/,/^ \}$/d' \
      "${target}"

    # Delete an empty "Languages" block. Otherwise, getting the display
    # name for all the language in a given locale (e.g. en_GB) would fail
    # when the above filtering sed command results in an empty "Languages"
    # block.
    sed -r -i \
      '/^ Languages\{$/ {
         N
         /^ Languages\{\n \}/ d
       }' \
      "${target}"
  done
}

# Keep only the minimum locale data for non-UI languages.
# Reduces the locale and language data of every accept-language that is not a
# Chrome UI language to a small skeleton.
#
# Globals:
#   scriptdir (read)        - directory containing chrome_ui_languages.list
#                             and accept_lang.list.
#   localedatapath (read)   - directory with per-locale <lang>.txt files.
#   langdatapath (read)     - directory with per-language <lang>.txt files.
#   UI_LANGUAGES (write)    - '|'-joined list of UI language codes.
#   EXTRA_LANGUAGES (write) - accept-languages not matched by UI_LANGUAGES.
# Outputs:
#   Progress messages on stdout, including "missing <lang>" for every extra
#   language that has no data file.
#
# NOTE(review): the literal whitespace after '^' in every sed address below is
# part of the pattern. Confirm the number of spaces against the indentation
# actually used in the data files.
function abridge_locale_data_for_non_ui_languages {
  local lang target op

  # Reset first so that an inherited or stale value cannot widen the pattern.
  unset UI_LANGUAGES

  # Word splitting of the grep output is intentional: one code per token.
  for lang in $(grep -v '^#' "${scriptdir}/chrome_ui_languages.list")
  do
    # Set $op to '|' only if $UI_LANGUAGES is not empty.
    op=${UI_LANGUAGES:+|}
    UI_LANGUAGES="${UI_LANGUAGES}${op}${lang}"
  done

  # grep exits with 1 when it selects no line, which simply means there are
  # no extra languages; only a real grep error (status > 1) may abort the
  # script under 'set -e'.
  EXTRA_LANGUAGES=$(grep -E -v -e '^#' -e "(${UI_LANGUAGES})" \
    "${scriptdir}/accept_lang.list") || [ $? -eq 1 ]

  echo "Creating minimum locale data in ${localedatapath}"
  for lang in ${EXTRA_LANGUAGES}
  do
    target=${localedatapath}/${lang}.txt
    [ -e "${target}" ] || { echo "missing ${lang}"; continue; }
    echo "Overwriting ${target} ..."
    # Do not include '%%Parent' line on purpose.
    # With -n only the explicitly printed parts survive: the header up to
    # the '<lang>{' line, an alias line, the LocaleScript/layout blocks, the
    # Version line and the top-level closing brace.
    sed -n -r -i \
      '1, /^'"${lang}"'\{$/p
       /^ "%%ALIAS"\{/p
       /^ (LocaleScript|layout)\{$/, /^ \}$/p
       /^ Version\{.*$/p
       /^\}$/p' \
      "${target}"
  done

  echo "Creating minimum locale data in ${langdatapath}"
  for lang in ${EXTRA_LANGUAGES}
  do
    target=${langdatapath}/${lang}.txt
    [ -e "${target}" ] || { echo "missing ${lang}"; continue; }
    echo "Overwriting ${target} ..."
    # Do not include '%%Parent' line on purpose.
    # Within the Languages block only the language's own display name is
    # kept.
    sed -n -r -i \
      '1, /^'"${lang}"'\{$/p
       /^ "%%ALIAS"\{/p
       /^ Languages\{$/, /^ \}$/ {
         /^ Languages\{$/p
         /^ '"${lang}"'\{.*\}$/p
         /^ \}$/p
       }
       /^\}$/p' \
      "${target}"
  done
}

# Keep only the currencies used by the largest 150 economies in terms of GDP.
# TODO(jshin): Use ucurr_isAvailable in ICU to drop more currencies.
# See also http://en.wikipedia.org/wiki/List_of_circulating_currencies
#
# Globals:
#   scriptdir (read) - directory containing currencies.list.
#   dataroot (read)  - data root; every curr/*.txt except
#                      supplementalData.txt is edited in place.
#   KEEPLIST (write) - regex alternation of the currency codes to keep.
# Outputs:
#   Progress messages on stdout.
#
# NOTE(review): the literal whitespace after '^' in every sed address below is
# part of the pattern. As written, the per-currency ranges nested inside
# Currencies / CurrencyPlurals end on the same '/^ \}$/' pattern as the
# enclosing block; confirm the number of spaces at each nesting level against
# the indentation actually used in the data files.
function filter_currency_data {
  local currency curr_file locale op

  unset KEEPLIST
  # Word splitting of the grep output is intentional: one code per token.
  for currency in $(grep -v '^#' "${scriptdir}/currencies.list")
  do
    # '|' separator only between entries, not before the first one.
    op=${KEEPLIST:+|}
    KEEPLIST=${KEEPLIST}${op}${currency}
  done
  KEEPLIST="(${KEEPLIST})"

  for curr_file in "${dataroot}"/curr/*.txt
  do
    locale=$(basename "${curr_file}" .txt)
    if [[ "${locale}" == 'supplementalData' ]]; then
      continue
    fi
    echo "Overwriting ${curr_file} for ${locale}"
    # With -n only explicitly printed lines survive: the header up to the
    # '<locale>{' line, alias/parent lines, the kept currencies inside each
    # currency table, the currency metadata blocks, the Version line and the
    # top-level closing brace.
    sed -n -r -i \
      '1, /^'"${locale}"'\{$/ p
       /^ "%%ALIAS"\{/p
       /^ %%Parent\{/p
       /^ Currencies\{$/, /^ \}$/ {
         /^ Currencies\{$/ p
         /^ '"${KEEPLIST}"'\{$/, /^ \}$/ p
         /^ \}$/ p
       }
       /^ Currencies%narrow\{$/, /^ \}$/ {
         /^ Currencies%narrow\{$/ p
         /^ '"${KEEPLIST}"'\{".*\}$/ p
         /^ \}$/ p
       }
       /^ CurrencyPlurals\{$/, /^ \}$/ {
         /^ CurrencyPlurals\{$/ p
         /^ '"${KEEPLIST}"'\{$/, /^ \}$/ p
         /^ \}$/ p
       }
       /^ [cC]urrency(Map|Meta|Spacing|UnitPatterns)\{$/, /^ \}$/ p
       /^ Version\{.*\}$/p
       /^\}$/p' \
      "${curr_file}"
  done
}

# Remove the display names for numeric region codes other than
# 419 (Latin America) because we don't use them.
#
# Globals:
#   dataroot (read) - data root; every region/*.txt is edited in place.
function filter_region_data {
  # The bracket expression skips a leading '4', so three-digit codes that
  # start with 4 are left alone.
  sed -i '/[0-35-9][0-9][0-9]{/ d' "${dataroot}"/region/*.txt
}

# Removes everything between the 'zoneStrings' line and the first '"meta:'
# line of each zone file, keeping those two lines themselves. root.txt is
# left untouched.
#
# Globals:
#   dataroot (read) - data root; zone/*.txt (except root.txt) is edited in
#                     place.
function remove_exemplar_cities {
  local zone_file

  for zone_file in "${dataroot}"/zone/*.txt
  do
    # Compare the file name only: the loop variable holds a full path, so
    # comparing it directly against 'root.txt' would never match.
    if [[ "${zone_file##*/}" == 'root.txt' ]]; then
      continue
    fi
    sed -i \
      '/^ zoneStrings/, /^ "meta:/ {
         /^ zoneStrings/ p
         /^ "meta:/ p
         d
       }' \
      "${zone_file}"
  done
}

# Keep only duration and compound in units* sections.
#
# Globals:
#   dataroot (read) - data root; every unit/*.txt is edited in place.
# Outputs:
#   Progress messages on stdout.
#
# NOTE(review): the literal whitespace after '^' in every sed address below is
# part of the pattern. As written, the nested duration/compound range ends on
# the same '/^ \}$/' pattern as the enclosing units block; confirm the number
# of spaces at each nesting level against the data files.
function filter_unit_data {
  local unit_file

  for unit_file in "${dataroot}"/unit/*.txt
  do
    echo "Overwriting ${unit_file} ..."
    sed -r -i \
      '/^ units(|Narrow|Short)\{$/, /^ \}$/ {
         /^ units(|Narrow|Short)\{$/ p
         /^ (duration|compound)\{$/, /^ \}$/ p
         /^ \}$/ p
         d
       }' \
      "${unit_file}"

    # Delete empty units,units{Narrow|Short} block. Otherwise, locale fallback
    # fails. See crbug.com/707515.
    sed -r -i \
      '/^ units(|Narrow|Short)\{$/ {
         N
         /^ units(|Narrow|Short)\{\n \}/ d
       }' \
      "${unit_file}"
  done
}

# big5han and gb2312han collation do not make any sense and nobody uses them.
# Deletes the unihan, big5han and gb2312han blocks from the Chinese collation
# data.
#
# Globals:
#   dataroot (read) - data root; coll/zh.txt is edited in place.
# Outputs:
#   Progress messages on stdout.
#
# NOTE(review): the literal whitespace after '^' in the sed addresses is part
# of the pattern. Confirm the number of spaces against the indentation
# actually used in coll/zh.txt.
function remove_legacy_chinese_codepoint_collation {
  local target="${dataroot}/coll/zh.txt"

  echo "Removing Big5 / GB2312 / UniHan collation data from Chinese locale"
  echo "Overwriting ${target}"
  # Quoted so that a checkout path containing whitespace still works.
  sed -r -i '/^ (uni|big5|gb2312)han\{$/,/^ \}$/ d' "${target}"
}

# All paths are derived from the location of this script.
treeroot="$(dirname "$0")/.."
dataroot="${treeroot}/source/data"
scriptdir="${treeroot}/scripts"
localedatapath="${dataroot}/locales"
langdatapath="${dataroot}/lang"

filter_locale_data
filter_display_language_names
abridge_locale_data_for_non_ui_languages
filter_currency_data
filter_region_data
remove_legacy_chinese_codepoint_collation
filter_unit_data

# Chromium OS needs exemplar cities for timezones, but not Chromium.
# It'll save 400kB (uncompressed), but the size difference in
# 7z compressed installer is <= 100kB.
# TODO(jshin): Make separate data files for CrOS and Chromium.
#remove_exemplar_cities