#!/bin/bash
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Remove entries currently not used in Chromium/V8.
#
# Rewrites every *.txt file directly under ${localedatapath} in place and
# deletes these blocks from it:
#   - characterLabel{ ... }
#   - AuxExemplarCharacters / ExemplarCharacters (one-line and multi-line)
#   - the 8-space-indented mon/tue/wed/thu/fri/sat/sun/quarter blocks,
#     including their "-short" and "-narrow" variants
#
# Globals: localedatapath (read)
# Outputs: progress messages on stdout
function filter_locale_data {
  local langpath
  echo "Removing unncessary categories in ${localedatapath}"
  # The directory is quoted so that a path containing whitespace still globs.
  for langpath in "${localedatapath}"/*.txt
  do
    # An unmatched glob is left as the literal pattern; skip it rather than
    # letting sed fail on a file that does not exist.
    [ -e "${langpath}" ] || continue
    echo "Overwriting ${langpath} ..."
    sed -r -i \
      '/^    characterLabel\{$/,/^    \}$/d
       /^    AuxExemplarCharacters\{.*\}$/d
       /^    AuxExemplarCharacters\{$/, /^    \}$/d
       /^    ExemplarCharacters\{.*\}$/d
       /^    ExemplarCharacters\{$/, /^    \}$/d
       /^        (mon|tue|wed|thu|fri|sat|sun|quarter)(|-short|-narrow)\{$/, /^        \}$/d' "${langpath}"
  done
}

# Remove display names for languages that are not listed in the accept-language
# list of Chromium.
#
# Builds an alternation of every non-comment word in
# ${scriptdir}/accept_lang.list, then rewrites each *.txt file under
# ${langdatapath} in place:
#   - inside the Languages{ ... } block only entries whose key matches the
#     alternation are kept (the block header and closing brace are kept too);
#   - the Keys, Types%short, characterLabelPattern and Variants blocks are
#     deleted;
#   - a Languages block that ends up empty is deleted.
#
# Globals: scriptdir, langdatapath (read); ACCEPT_LANG_PATTERN (written)
# Outputs: progress messages on stdout, errors on stderr
# Returns: 1 if accept_lang.list cannot be read
function filter_display_language_names {
  local lang langpath target OP
  local list="${scriptdir}/accept_lang.list"

  # Without the list the pattern would be "()[^a-z]" and nearly every
  # display name would be dropped, so refuse to continue.
  if [ ! -r "${list}" ]; then
    echo "Cannot read ${list}" >&2
    return 1
  fi

  # Start empty so a value inherited from the environment or left over from
  # an earlier call cannot leak into the alternation.
  ACCEPT_LANG_PATTERN=
  # Word splitting of the grep output is intentional: one language per word.
  for lang in $(grep -v '^#' "${list}")
  do
    # Set $OP to '|' only if $ACCEPT_LANG_PATTERN is not empty.
    OP=${ACCEPT_LANG_PATTERN:+|}
    ACCEPT_LANG_PATTERN="${ACCEPT_LANG_PATTERN}${OP}${lang}"
  done
  # The trailing [^a-z] stops a code from matching a longer code that merely
  # starts with it.
  ACCEPT_LANG_PATTERN="(${ACCEPT_LANG_PATTERN})[^a-z]"

  echo "Filtering out display names for non-A-L languages in ${langdatapath}"
  for langpath in "${langdatapath}"/*.txt
  do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "${langpath}" ] || continue
    target=${langpath}
    echo "Overwriting ${target} ..."
    # Inside the Languages group the wanted lines are printed explicitly and
    # the final "d" drops everything, so each kept line appears exactly once.
    # The pattern is double-quoted because its [^a-z] is otherwise subject to
    # pathname expansion.
    # NOTE(review): the group's closing-brace print, "d" and "}" follow the
    # same shape as filter_unit_data; confirm no further delete rules belong
    # between the Languages group and Types%short.
    sed -r -i \
    '/^    Keys\{$/,/^    \}$/d
     /^    Languages\{$/, /^    \}$/ {
       /^    Languages\{$/p
       /^        '"${ACCEPT_LANG_PATTERN}"'/p
       /^    \}$/p
       d
     }
     /^    Types%short\{$/,/^    \}$/d
     /^    characterLabelPattern\{$/,/^    \}$/d
     /^    Variants\{$/,/^    \}$/d' "${target}"

    # Delete an empty "Languages" block. Otherwise, getting the display
    # name for all the language in a given locale (e.g. en_GB) would fail
    # when the above filtering sed command results in an empty "Languages"
    # block.
    sed -r -i \
    '/^    Languages\{$/ {
       N
       /^    Languages\{\n    \}/ d
    }' "${target}"
  done
}


# Keep only the minimum locale data for non-UI languages.
#
# A language is processed when its line in ${scriptdir}/accept_lang.list is
# not a comment and does not match any word of
# ${scriptdir}/chrome_ui_languages.list. For each such language:
#   - ${localedatapath}/<lang>.txt is reduced to everything up to the
#     "<lang>{" line, the LocaleScript and layout blocks, the Version line and
#     the closing top-level brace;
#   - ${langdatapath}/<lang>.txt is reduced to everything up to the "<lang>{"
#     line, a Languages block holding only the language's own name, and the
#     closing top-level brace.
# A language without a data file is reported as "missing" and skipped.
#
# Globals: scriptdir, localedatapath, langdatapath (read);
#          UI_LANGUAGES, EXTRA_LANGUAGES (written)
# Outputs: progress messages on stdout, errors on stderr
# Returns: 1 if either list file cannot be read
function abridge_locale_data_for_non_ui_languages {
  local lang target list OP

  for list in "${scriptdir}/chrome_ui_languages.list" \
              "${scriptdir}/accept_lang.list"
  do
    if [ ! -r "${list}" ]; then
      echo "Cannot read ${list}" >&2
      return 1
    fi
  done

  # Start empty so a value inherited from the environment or left over from
  # an earlier call cannot leak into the alternation.
  UI_LANGUAGES=
  # Word splitting of the grep output is intentional: one language per word.
  for lang in $(grep -v '^#' "${scriptdir}/chrome_ui_languages.list")
  do
    # Set $OP to '|' only if $UI_LANGUAGES is not empty.
    OP=${UI_LANGUAGES:+|}
    UI_LANGUAGES="${UI_LANGUAGES}${OP}${lang}"
  done

  # NOTE(review): the UI-language alternation is unanchored, so an accept
  # language whose code merely contains a UI code is excluded as well;
  # confirm that this is intended.
  EXTRA_LANGUAGES=$(grep -E -v -e '^#' -e "(${UI_LANGUAGES})" \
                    "${scriptdir}/accept_lang.list")

  echo "Creating minimum locale data in ${localedatapath}"
  for lang in ${EXTRA_LANGUAGES}
  do
    target=${localedatapath}/${lang}.txt
    if [ ! -e "${target}" ]; then
      echo "missing ${lang}"
      continue
    fi
    echo "Overwriting ${target} ..."

    # Do not include '%%Parent' line on purpose.
    sed -n -r -i \
      '1, /^'"${lang}"'\{$/p
       /^    (LocaleScript|layout)\{$/, /^    \}$/p
       /^    Version\{.*$/p
       /^\}$/p' "${target}"
  done

  echo "Creating minimum locale data in ${langdatapath}"
  for lang in ${EXTRA_LANGUAGES}
  do
    target=${langdatapath}/${lang}.txt
    if [ ! -e "${target}" ]; then
      echo "missing ${lang}"
      continue
    fi
    echo "Overwriting ${target} ..."

    # Do not include '%%Parent' line on purpose.
    sed -n -r -i \
      '1, /^'"${lang}"'\{$/p
       /^    Languages\{$/, /^    \}$/ {
         /^    Languages\{$/p
         /^        '"${lang}"'\{.*\}$/p
         /^    \}$/p
       }
       /^\}$/p' "${target}"
  done
}

# Keep only the currencies used by the largest 150 economies in terms of GDP.
# TODO(jshin): Use ucurr_isAvailable in ICU to drop more currencies.
# See also http://en.wikipedia.org/wiki/List_of_circulating_currencies
#
# Builds an alternation of every non-comment word in
# ${scriptdir}/currencies.list and rewrites each ${dataroot}/curr/*.txt file
# (except supplementalData.txt) in place so that it contains only:
#   - everything up to the "<locale>{" line;
#   - the "%%ALIAS" and %%Parent lines;
#   - the Currencies, Currencies%narrow and CurrencyPlurals blocks, reduced
#     to the entries whose key is in the list;
#   - the [cC]urrency{Map,Meta,Spacing,UnitPatterns} blocks, unchanged;
#   - the Version line and the closing top-level brace.
#
# Globals: scriptdir, dataroot (read); KEEPLIST (written)
# Outputs: progress messages on stdout, errors on stderr
# Returns: 1 if currencies.list cannot be read
function filter_currency_data {
  local currency i locale OP
  local list="${scriptdir}/currencies.list"

  # Without the list KEEPLIST would be "()" and every currency would be
  # dropped, so refuse to continue.
  if [ ! -r "${list}" ]; then
    echo "Cannot read ${list}" >&2
    return 1
  fi

  # Start empty so a value inherited from the environment or left over from
  # an earlier call cannot leak into the alternation.
  KEEPLIST=
  # Word splitting of the grep output is intentional: one currency per word.
  for currency in $(grep -v '^#' "${list}")
  do
    # Set $OP to '|' only if $KEEPLIST is not empty.
    OP=${KEEPLIST:+|}
    KEEPLIST=${KEEPLIST}${OP}${currency}
  done
  KEEPLIST="(${KEEPLIST})"

  for i in "${dataroot}"/curr/*.txt
  do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "${i}" ] || continue
    locale=$(basename "${i}" .txt)
    if [ "${locale}" == 'supplementalData' ]; then
      continue
    fi
    echo "Overwriting $i for $locale"
    # sed runs with -n, so only explicitly printed lines survive.
    sed -n -r -i \
      '1, /^'"${locale}"'\{$/ p
       /^    "%%ALIAS"\{/p
       /^    %%Parent\{/p
       /^    Currencies\{$/, /^    \}$/ {
         /^    Currencies\{$/ p
         /^        '"${KEEPLIST}"'\{$/, /^        \}$/ p
         /^    \}$/ p
       }
       /^    Currencies%narrow\{$/, /^    \}$/ {
         /^    Currencies%narrow\{$/ p
         /^        '"${KEEPLIST}"'\{".*\}$/ p
         /^    \}$/ p
       }
       /^    CurrencyPlurals\{$/, /^    \}$/ {
         /^    CurrencyPlurals\{$/ p
         /^        '"${KEEPLIST}"'\{$/, /^        \}$/ p
         /^    \}$/ p
       }
       /^    [cC]urrency(Map|Meta|Spacing|UnitPatterns)\{$/, /^    \}$/ p
       /^    Version\{.*\}$/p
       /^\}$/p' "${i}"
  done
}

# Remove the display names for numeric region codes other than
# 419 (Latin America) because we don't use them.
#
# Deletes, in place, every line of ${dataroot}/region/*.txt in which "{" is
# immediately preceded by three digits whose first digit is not 4.
#
# Globals: dataroot (read)
function filter_region_data {
  sed -i  '/[0-35-9][0-9][0-9]{/ d' "${dataroot}"/region/*.txt
}

# For every ${dataroot}/zone/*.txt file except root.txt, delete the lines
# between the "    zoneStrings" line and the first '        "meta:' line,
# keeping those two boundary lines themselves.
#
# NOTE(review): this zone pass is kept out of filter_region_data because the
# end of this script leaves remove_exemplar_cities disabled on purpose
# (Chromium OS needs the exemplar cities); confirm that this loop is indeed
# that function's body.
#
# Globals: dataroot (read)
function remove_exemplar_cities {
  local i
  for i in "${dataroot}"/zone/*.txt
  do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "${i}" ] || continue
    # ${i} is a full path, so compare only its last component; comparing the
    # whole path against 'root.txt' would never skip anything.
    [ "${i##*/}" != 'root.txt' ] || continue
    sed -i '/^    zoneStrings/, /^        "meta:/ {
      /^    zoneStrings/ p
      /^        "meta:/ p
      d
    }' "${i}"
  done
}

# Keep only duration and compound in units* sections.
#
# Rewrites every ${dataroot}/unit/*.txt file in place: inside each units,
# unitsNarrow and unitsShort block only the duration and compound sub-blocks
# are kept, and a units* block left with no content is removed entirely.
#
# Globals: dataroot (read)
# Outputs: progress messages on stdout
function filter_unit_data {
  local i
  for i in "${dataroot}"/unit/*.txt
  do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "${i}" ] || continue
    echo "Overwriting $i ..."
    # Wanted lines are printed explicitly and the final "d" drops everything
    # in the block, so each kept line appears exactly once.
    sed -r -i \
      '/^    units(|Narrow|Short)\{$/, /^    \}$/ {
         /^    units(|Narrow|Short)\{$/ p
         /^        (duration|compound)\{$/, /^        \}$/ p
         /^    \}$/ p
         d
       }' "${i}"

    # Delete empty units,units{Narrow|Short} block. Otherwise, locale fallback
    # fails. See crbug.com/707515.
    sed -r -i \
      '/^    units(|Narrow|Short)\{$/ {
         N
         /^    units(|Narrow|Short)\{\n    \}/ d
      }' "${i}"
  done
}
# big5han and gb2312han collation do not make any sense and nobody uses them.
#
# Deletes, in place, the 8-space-indented unihan, big5han and gb2312han
# blocks from ${dataroot}/coll/zh.txt.
#
# Globals: dataroot (read)
# Outputs: progress messages on stdout
function remove_legacy_chinese_codepoint_collation {
  local target="${dataroot}/coll/zh.txt"
  echo "Removing Big5 / GB2312 / UniHan collation data from Chinese locale"
  echo "Overwriting ${target}"
  sed -r -i '/^        (uni|big5|gb2312)han\{$/,/^        \}$/ d' "${target}"
}
# All data and list locations are derived from the parent of the directory
# that contains this script.
treeroot="$(dirname "$0")/.."
dataroot="${treeroot}/source/data"
scriptdir="${treeroot}/scripts"
localedatapath="${dataroot}/locales"
langdatapath="${dataroot}/lang"

# Run the trimming passes; each one rewrites its data files in place.
# NOTE(review): filter_unit_data is defined in this file but is not invoked
# here; confirm whether that is intentional.
filter_locale_data
filter_display_language_names
abridge_locale_data_for_non_ui_languages
filter_currency_data
filter_region_data
remove_legacy_chinese_codepoint_collation

# Chromium OS needs exemplar cities for timezones, but not Chromium.
# It'll save 400kB (uncompressed), but the size difference in
# 7z compressed installer is <= 100kB.
# TODO(jshin): Make separate data files for CrOS and Chromium.
#remove_exemplar_cities