Switch language normalization to 2-letter language codes.

2-letter codes (ISO 639-1) are the standard Android normalization and thus we
should prefer them to 3-letter codes (although both are technically allowed
according to BCP 47).

This helps in three ways:
 1. It simplifies app interaction with our normalized language codes as the
    Locale class makes it easy to convert a 2-letter to a 3-letter code but
    not the other way round.
 2. It better normalizes codes on API<21 where we previously had issues with
    language+country codes (see tests).
 3. It allows us to normalize both ISO 639-2/T and ISO 639-2/B codes to the same
    language.

PiperOrigin-RevId: 258729728
This commit is contained in:
tonihei 2019-07-18 10:08:19 +01:00 committed by Oliver Woodman
parent e181d4bd35
commit f82920926d
5 changed files with 114 additions and 26 deletions

View file

@ -8,6 +8,8 @@
* Fix issue where initial seek positions get ignored when playing a preroll ad.
* Fix `DataSchemeDataSource` re-opening and range requests
([#6192](https://github.com/google/ExoPlayer/issues/6192)).
* Switch normalized BCP-47 language codes to use 2-letter ISO 639-1 language
tags instead of 3-letter ISO 639-2 language tags.
### 2.10.3 ###

View file

@ -2318,14 +2318,14 @@ public class DefaultTrackSelector extends MappingTrackSelector {
if (TextUtils.equals(format.language, language)) {
return 3;
}
// Partial match where one language is a subset of the other (e.g. "zho-hans" and "zho-hans-hk")
// Partial match where one language is a subset of the other (e.g. "zh-hans" and "zh-hans-hk")
if (format.language.startsWith(language) || language.startsWith(format.language)) {
return 2;
}
// Partial match where only the main language tag is the same (e.g. "fra-fr" and "fra-ca")
if (format.language.length() >= 3
&& language.length() >= 3
&& format.language.substring(0, 3).equals(language.substring(0, 3))) {
// Partial match where only the main language tag is the same (e.g. "fr-fr" and "fr-ca")
String formatMainLanguage = Util.splitAtFirst(format.language, "-")[0];
String queryMainLanguage = Util.splitAtFirst(language, "-")[0];
if (formatMainLanguage.equals(queryMainLanguage)) {
return 1;
}
return 0;

View file

@ -71,6 +71,7 @@ import java.util.Calendar;
import java.util.Collections;
import java.util.Formatter;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.MissingResourceException;
@ -135,6 +136,10 @@ public final class Util {
+ "(T(([0-9]*)H)?(([0-9]*)M)?(([0-9.]*)S)?)?$");
private static final Pattern ESCAPED_CHARACTER_PATTERN = Pattern.compile("%([A-Fa-f0-9]{2})");
// Android standardizes to ISO 639-1 2-letter codes and provides no way to map a 3-letter
// ISO 639-2 code back to the corresponding 2-letter code.
// This field caches that reverse mapping (3-letter code -> 2-letter code). It stays null until a
// 3-letter language is normalized for the first time and is then filled by
// createIso3ToIso2Map().
// NOTE(review): the lazy assignment is a plain null check; confirm whether callers on multiple
// threads need synchronization.
@Nullable private static HashMap<String, String> languageTagIso3ToIso2;
// Private constructor: prevents instantiation from outside this class.
private Util() {}
/**
@ -450,18 +455,25 @@ public final class Util {
if (language == null) {
return null;
}
try {
Locale locale = getLocaleForLanguageTag(language);
int localeLanguageLength = locale.getLanguage().length();
String normLanguage = locale.getISO3Language();
if (normLanguage.isEmpty()) {
return toLowerInvariant(language);
}
String normTag = getLocaleLanguageTag(locale);
return toLowerInvariant(normLanguage + normTag.substring(localeLanguageLength));
} catch (MissingResourceException e) {
Locale locale = getLocaleForLanguageTag(language);
String localeLanguage = locale.getLanguage();
int localeLanguageLength = localeLanguage.length();
if (localeLanguageLength == 0) {
// Return original language for invalid language tags.
return toLowerInvariant(language);
} else if (localeLanguageLength == 3) {
// Locale.toLanguageTag will ensure a normalized well-formed output. However, 3-letter
// ISO 639-2 language codes will not be converted to 2-letter ISO 639-1 codes automatically.
if (languageTagIso3ToIso2 == null) {
languageTagIso3ToIso2 = createIso3ToIso2Map();
}
String iso2Language = languageTagIso3ToIso2.get(localeLanguage);
if (iso2Language != null) {
localeLanguage = iso2Language;
}
}
String normTag = getLocaleLanguageTag(locale);
return toLowerInvariant(localeLanguage + normTag.substring(localeLanguageLength));
}
/**
@ -2013,6 +2025,54 @@ public final class Util {
}
}
/**
 * Builds the table used to translate 3-letter ISO 639-2 language codes into their 2-letter
 * ISO 639-1 equivalents.
 *
 * <p>The table is seeded from every 2-letter code reported by {@link Locale#getISOLanguages()},
 * keyed by the 3-letter code that {@link Locale#getISO3Language()} reports for it (the
 * ISO 639-2/T code). The pairs listed in {@code iso3BibliographicalToIso2} are added afterwards.
 *
 * @return A newly created map from 3-letter code to 2-letter code.
 */
private static HashMap<String, String> createIso3ToIso2Map() {
  String[] twoLetterCodes = Locale.getISOLanguages();
  int expectedSize = twoLetterCodes.length + iso3BibliographicalToIso2.length;
  HashMap<String, String> threeToTwoLetter = new HashMap<>(/* initialCapacity= */ expectedSize);

  for (int i = 0; i < twoLetterCodes.length; i++) {
    String twoLetterCode = twoLetterCodes[i];
    String threeLetterCode;
    try {
      // Yields the ISO 639-2/T code for the language.
      threeLetterCode = new Locale(twoLetterCode).getISO3Language();
    } catch (MissingResourceException e) {
      // Not expected for the list of known languages, but skip the entry rather than throw.
      continue;
    }
    if (TextUtils.isEmpty(threeLetterCode)) {
      continue;
    }
    threeToTwoLetter.put(threeLetterCode, twoLetterCode);
  }

  // Register the extra ISO 639-2/B codes; the array holds (3-letter, 2-letter) pairs.
  int pairStart = 0;
  while (pairStart < iso3BibliographicalToIso2.length) {
    String bibliographicCode = iso3BibliographicalToIso2[pairStart];
    String twoLetterCode = iso3BibliographicalToIso2[pairStart + 1];
    threeToTwoLetter.put(bibliographicCode, twoLetterCode);
    pairStart += 2;
  }
  return threeToTwoLetter;
}
// Flattened list of (ISO 639-2/B 3-letter code, ISO 639-1 2-letter code) pairs: each even index
// holds the 3-letter code and the following odd index holds the 2-letter code it maps to.
// See https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes.
private static final String[] iso3BibliographicalToIso2 = {
  "alb", "sq",
  "arm", "hy",
  "baq", "eu",
  "bur", "my",
  "tib", "bo",
  "chi", "zh",
  "cze", "cs",
  "dut", "nl",
  "ger", "de",
  "gre", "el",
  "fre", "fr",
  "geo", "ka",
  "ice", "is",
  "mac", "mk",
  "mao", "mi",
  "may", "ms",
  "per", "fa",
  "rum", "ro",
  "slo", "sk",
  "wel", "cy"
};
/**
* Allows the CRC calculation to be done byte by byte instead of bit per bit being the order
* "most significant bit first".

View file

@ -268,14 +268,15 @@ public class UtilTest {
@Test
@Config(sdk = 21)
public void testNormalizeLanguageCodeV21() {
  // Normalized tags use the 2-letter ISO 639-1 language code, lower-cased, whether the input
  // used the 2-letter or the 3-letter form of the language.
  assertThat(Util.normalizeLanguageCode("es")).isEqualTo("es");
  assertThat(Util.normalizeLanguageCode("spa")).isEqualTo("es");
  assertThat(Util.normalizeLanguageCode("es-AR")).isEqualTo("es-ar");
  assertThat(Util.normalizeLanguageCode("SpA-ar")).isEqualTo("es-ar");
  assertThat(Util.normalizeLanguageCode("es-AR-dialect")).isEqualTo("es-ar-dialect");
  assertThat(Util.normalizeLanguageCode("ES-419")).isEqualTo("es-419");
  assertThat(Util.normalizeLanguageCode("zh-hans-tw")).isEqualTo("zh-hans-tw");
  assertThat(Util.normalizeLanguageCode("zh-tw-hans")).isEqualTo("zh-tw");
  assertThat(Util.normalizeLanguageCode("zho-hans-tw")).isEqualTo("zh-hans-tw");
  // Inputs without a known 2-letter equivalent are only lower-cased.
  assertThat(Util.normalizeLanguageCode("und")).isEqualTo("und");
  assertThat(Util.normalizeLanguageCode("DoesNotExist")).isEqualTo("doesnotexist");
}
@ -283,13 +284,38 @@ public class UtilTest {
@Test
@Config(sdk = 16)
public void testNormalizeLanguageCode() {
  // Same expectations as on newer API levels: the language part is normalized to the 2-letter
  // ISO 639-1 code, including for language+country tags.
  assertThat(Util.normalizeLanguageCode("es")).isEqualTo("es");
  assertThat(Util.normalizeLanguageCode("spa")).isEqualTo("es");
  assertThat(Util.normalizeLanguageCode("es-AR")).isEqualTo("es-ar");
  // Inputs without a known 2-letter equivalent are only lower-cased.
  assertThat(Util.normalizeLanguageCode("und")).isEqualTo("und");
  assertThat(Util.normalizeLanguageCode("DoesNotExist")).isEqualTo("doesnotexist");
}
@Test
public void testNormalizeIso6392BibliographicalAndTextualCodes() {
  // Each row pairs two ISO 639-2 codes for the same language; both must normalize to the same
  // value. See https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes.
  String[][] equivalentCodes = {
    {"alb", "sqi"},
    {"arm", "hye"},
    {"baq", "eus"},
    {"bur", "mya"},
    {"chi", "zho"},
    {"cze", "ces"},
    {"dut", "nld"},
    {"fre", "fra"},
    {"geo", "kat"},
    {"ger", "deu"},
    {"gre", "ell"},
    {"ice", "isl"},
    {"mac", "mkd"},
    {"mao", "mri"},
    {"may", "msa"},
    {"per", "fas"},
    {"rum", "ron"},
    {"slo", "slk"},
    {"tib", "bod"},
    {"wel", "cym"}
  };
  for (String[] codes : equivalentCodes) {
    String normalizedFirst = Util.normalizeLanguageCode(codes[0]);
    String normalizedSecond = Util.normalizeLanguageCode(codes[1]);
    assertThat(normalizedFirst).isEqualTo(normalizedSecond);
  }
}
private static void assertEscapeUnescapeFileName(String fileName, String escapedFileName) {
assertThat(escapeFileName(fileName)).isEqualTo(escapedFileName);
assertThat(unescapeFileName(escapedFileName)).isEqualTo(fileName);

View file

@ -263,7 +263,7 @@ public class HlsMasterPlaylistParserTest {
Format closedCaptionFormat = playlist.muxedCaptionFormats.get(0);
assertThat(closedCaptionFormat.sampleMimeType).isEqualTo(MimeTypes.APPLICATION_CEA708);
assertThat(closedCaptionFormat.accessibilityChannel).isEqualTo(4);
assertThat(closedCaptionFormat.language).isEqualTo("spa");
assertThat(closedCaptionFormat.language).isEqualTo("es");
}
@Test