mirror of
https://github.com/samsonjs/media.git
synced 2026-04-27 15:07:40 +00:00
Added UTF-16 (LE) and UTF-16 (BE) support for subrip subtitles.
This commit is contained in:
parent
ab4d37f499
commit
5609efd0e0
6 changed files with 177 additions and 4 deletions
|
|
@ -531,6 +531,54 @@ public final class ParsableByteArray {
|
||||||
return line;
|
return line;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a line of text.
|
||||||
|
*
|
||||||
|
* <p>A line is considered to be terminated by any one of a carriage return ('\r'), a line feed
|
||||||
|
* ('\n'), or a carriage return followed immediately by a line feed ('\r\n'). The UTF-16 charset
|
||||||
|
* is used. This method discards leading UTF-16 byte order marks (BOM), if present.
|
||||||
|
*
|
||||||
|
* @param isLittleEndian UTF-16 (LE) or UTF-16 (BE) encoding should be used
|
||||||
|
* @return The line not including any line-termination characters, or null if the end of the data
|
||||||
|
* has already been reached.
|
||||||
|
*/
|
||||||
|
@Nullable
|
||||||
|
public String readLineUtf16(boolean isLittleEndian) {
|
||||||
|
if (bytesLeft() == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
int lineLimit = calculateLineLimitForUtf16(isLittleEndian);
|
||||||
|
|
||||||
|
if (lineLimit - position >= 2 && isUtf16BOM(data[position], data[position + 1])) {
|
||||||
|
// There's a UTF-16 byte order mark at the start of the line. Discard it.
|
||||||
|
position += 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
String line;
|
||||||
|
if (isLittleEndian) {
|
||||||
|
line = Util.fromUtf16LEBytes(data, position, lineLimit - position);
|
||||||
|
} else {
|
||||||
|
line = Util.fromUtf16BEBytes(data, position, lineLimit - position);
|
||||||
|
}
|
||||||
|
|
||||||
|
position = lineLimit;
|
||||||
|
if (position == limit) {
|
||||||
|
return line;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isEqualsInUtf16(data[position], data[position + 1], '\r', isLittleEndian)) {
|
||||||
|
position += 2;
|
||||||
|
if (position == limit) {
|
||||||
|
return line;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (isEqualsInUtf16(data[position], data[position + 1], '\n', isLittleEndian)) {
|
||||||
|
position += 2;
|
||||||
|
}
|
||||||
|
return line;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a long value encoded by UTF-8 encoding
|
* Reads a long value encoded by UTF-8 encoding
|
||||||
*
|
*
|
||||||
|
|
@ -565,4 +613,29 @@ public final class ParsableByteArray {
|
||||||
position += length;
|
position += length;
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean isEqualsInUtf16(byte first, byte second, char value, boolean isLittleEndian) {
|
||||||
|
return (isLittleEndian && (first | second << 8) == value)
|
||||||
|
|| (!isLittleEndian && (first << 8 | second) == value);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isUtf16BOM(byte first, byte second) {
|
||||||
|
return (first == (byte) 0xFF && second == (byte) 0xFE)
|
||||||
|
|| (first == (byte) 0xFE && second == (byte) 0xFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int calculateLineLimitForUtf16(boolean isLittleEndian) {
|
||||||
|
int lineLimit = position;
|
||||||
|
while (lineLimit < limit - 1) {
|
||||||
|
if (isLittleEndian && Util.isLinebreak(data[lineLimit] | data[lineLimit + 1] << 8)) {
|
||||||
|
break;
|
||||||
|
} else if (!isLittleEndian && Util.isLinebreak(data[lineLimit] << 8 | data[lineLimit + 1])) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
lineLimit += 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
return lineLimit;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -682,6 +682,30 @@ public final class Util {
|
||||||
return new String(bytes, offset, length, Charsets.UTF_8);
|
return new String(bytes, offset, length, Charsets.UTF_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a new {@link String} constructed by decoding UTF-16 (LE) encoded bytes in a subarray.
|
||||||
|
*
|
||||||
|
* @param bytes The UTF-16 encoded bytes to decode.
|
||||||
|
* @param offset The index of the first byte to decode.
|
||||||
|
* @param length The number of bytes to decode.
|
||||||
|
* @return The string.
|
||||||
|
*/
|
||||||
|
public static String fromUtf16LEBytes(byte[] bytes, int offset, int length) {
|
||||||
|
return new String(bytes, offset, length, Charsets.UTF_16LE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a new {@link String} constructed by decoding UTF-16 (BE) encoded bytes in a subarray.
|
||||||
|
*
|
||||||
|
* @param bytes The UTF-16 encoded bytes to decode.
|
||||||
|
* @param offset The index of the first byte to decode.
|
||||||
|
* @param length The number of bytes to decode.
|
||||||
|
* @return The string.
|
||||||
|
*/
|
||||||
|
public static String fromUtf16BEBytes(byte[] bytes, int offset, int length) {
|
||||||
|
return new String(bytes, offset, length, Charsets.UTF_16BE);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a new byte array containing the code points of a {@link String} encoded using UTF-8.
|
* Returns a new byte array containing the code points of a {@link String} encoded using UTF-8.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,8 @@ import com.google.android.exoplayer2.util.Assertions;
|
||||||
import com.google.android.exoplayer2.util.Log;
|
import com.google.android.exoplayer2.util.Log;
|
||||||
import com.google.android.exoplayer2.util.LongArray;
|
import com.google.android.exoplayer2.util.LongArray;
|
||||||
import com.google.android.exoplayer2.util.ParsableByteArray;
|
import com.google.android.exoplayer2.util.ParsableByteArray;
|
||||||
|
import com.google.common.base.Charsets;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
@ -75,8 +77,25 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
|
||||||
LongArray cueTimesUs = new LongArray();
|
LongArray cueTimesUs = new LongArray();
|
||||||
ParsableByteArray subripData = new ParsableByteArray(bytes, length);
|
ParsableByteArray subripData = new ParsableByteArray(bytes, length);
|
||||||
|
|
||||||
|
@Nullable Charset utf16Charset;
|
||||||
|
if (bytes.length >= 2) {
|
||||||
|
utf16Charset = getUtf16Charset(bytes[0], bytes[1]);
|
||||||
|
} else {
|
||||||
|
utf16Charset = null;
|
||||||
|
}
|
||||||
|
|
||||||
@Nullable String currentLine;
|
@Nullable String currentLine;
|
||||||
while ((currentLine = subripData.readLine()) != null) {
|
while (true) {
|
||||||
|
if (utf16Charset != null) {
|
||||||
|
currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
|
||||||
|
} else {
|
||||||
|
currentLine = subripData.readLine();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (currentLine == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (currentLine.length() == 0) {
|
if (currentLine.length() == 0) {
|
||||||
// Skip blank lines.
|
// Skip blank lines.
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -91,7 +110,11 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read and parse the timing line.
|
// Read and parse the timing line.
|
||||||
|
if (utf16Charset != null) {
|
||||||
|
currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
|
||||||
|
} else {
|
||||||
currentLine = subripData.readLine();
|
currentLine = subripData.readLine();
|
||||||
|
}
|
||||||
if (currentLine == null) {
|
if (currentLine == null) {
|
||||||
Log.w(TAG, "Unexpected end");
|
Log.w(TAG, "Unexpected end");
|
||||||
break;
|
break;
|
||||||
|
|
@ -109,14 +132,22 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
|
||||||
// Read and parse the text and tags.
|
// Read and parse the text and tags.
|
||||||
textBuilder.setLength(0);
|
textBuilder.setLength(0);
|
||||||
tags.clear();
|
tags.clear();
|
||||||
|
if (utf16Charset != null) {
|
||||||
|
currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
|
||||||
|
} else {
|
||||||
currentLine = subripData.readLine();
|
currentLine = subripData.readLine();
|
||||||
|
}
|
||||||
while (!TextUtils.isEmpty(currentLine)) {
|
while (!TextUtils.isEmpty(currentLine)) {
|
||||||
if (textBuilder.length() > 0) {
|
if (textBuilder.length() > 0) {
|
||||||
textBuilder.append("<br>");
|
textBuilder.append("<br>");
|
||||||
}
|
}
|
||||||
textBuilder.append(processLine(currentLine, tags));
|
textBuilder.append(processLine(currentLine, tags));
|
||||||
|
if (utf16Charset != null) {
|
||||||
|
currentLine = subripData.readLineUtf16(utf16Charset.equals(Charsets.UTF_16LE));
|
||||||
|
} else {
|
||||||
currentLine = subripData.readLine();
|
currentLine = subripData.readLine();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Spanned text = Html.fromHtml(textBuilder.toString());
|
Spanned text = Html.fromHtml(textBuilder.toString());
|
||||||
|
|
||||||
|
|
@ -138,6 +169,21 @@ public final class SubripDecoder extends SimpleSubtitleDecoder {
|
||||||
return new SubripSubtitle(cuesArray, cueTimesUsArray);
|
return new SubripSubtitle(cuesArray, cueTimesUsArray);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
private Charset getUtf16Charset(byte first, byte second) {
|
||||||
|
if (first == (byte) 0xFE && second == (byte) 0xFF) {
|
||||||
|
// UTF-16 (BE)
|
||||||
|
return Charsets.UTF_16BE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (first == (byte) 0xFF && second == (byte) 0xFE) {
|
||||||
|
// UTF-16 (LE)
|
||||||
|
return Charsets.UTF_16LE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Trims and removes tags from the given line. The removed tags are added to {@code tags}.
|
* Trims and removes tags from the given line. The removed tags are added to {@code tags}.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,8 @@ public final class SubripDecoderTest {
|
||||||
private static final String TYPICAL_NEGATIVE_TIMESTAMPS =
|
private static final String TYPICAL_NEGATIVE_TIMESTAMPS =
|
||||||
"media/subrip/typical_negative_timestamps";
|
"media/subrip/typical_negative_timestamps";
|
||||||
private static final String TYPICAL_UNEXPECTED_END = "media/subrip/typical_unexpected_end";
|
private static final String TYPICAL_UNEXPECTED_END = "media/subrip/typical_unexpected_end";
|
||||||
|
private static final String TYPICAL_UTF16BE = "media/subrip/typical_utf16be";
|
||||||
|
private static final String TYPICAL_UTF16LE = "media/subrip/typical_utf16le";
|
||||||
private static final String TYPICAL_WITH_TAGS = "media/subrip/typical_with_tags";
|
private static final String TYPICAL_WITH_TAGS = "media/subrip/typical_with_tags";
|
||||||
private static final String TYPICAL_NO_HOURS_AND_MILLIS =
|
private static final String TYPICAL_NO_HOURS_AND_MILLIS =
|
||||||
"media/subrip/typical_no_hours_and_millis";
|
"media/subrip/typical_no_hours_and_millis";
|
||||||
|
|
@ -80,6 +82,34 @@ public final class SubripDecoderTest {
|
||||||
assertTypicalCue3(subtitle, 4);
|
assertTypicalCue3(subtitle, 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void decodeTypicalUtf16LE() throws IOException {
|
||||||
|
SubripDecoder decoder = new SubripDecoder();
|
||||||
|
byte[] bytes =
|
||||||
|
TestUtil.getByteArray(
|
||||||
|
ApplicationProvider.getApplicationContext(), TYPICAL_UTF16LE);
|
||||||
|
Subtitle subtitle = decoder.decode(bytes, bytes.length, false);
|
||||||
|
|
||||||
|
assertThat(subtitle.getEventTimeCount()).isEqualTo(6);
|
||||||
|
assertTypicalCue1(subtitle, 0);
|
||||||
|
assertTypicalCue2(subtitle, 2);
|
||||||
|
assertTypicalCue3(subtitle, 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void decodeTypicalUtf16BE() throws IOException {
|
||||||
|
SubripDecoder decoder = new SubripDecoder();
|
||||||
|
byte[] bytes =
|
||||||
|
TestUtil.getByteArray(
|
||||||
|
ApplicationProvider.getApplicationContext(), TYPICAL_UTF16BE);
|
||||||
|
Subtitle subtitle = decoder.decode(bytes, bytes.length, false);
|
||||||
|
|
||||||
|
assertThat(subtitle.getEventTimeCount()).isEqualTo(6);
|
||||||
|
assertTypicalCue1(subtitle, 0);
|
||||||
|
assertTypicalCue2(subtitle, 2);
|
||||||
|
assertTypicalCue3(subtitle, 4);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void decodeTypicalExtraBlankLine() throws IOException {
|
public void decodeTypicalExtraBlankLine() throws IOException {
|
||||||
SubripDecoder decoder = new SubripDecoder();
|
SubripDecoder decoder = new SubripDecoder();
|
||||||
|
|
|
||||||
BIN
testdata/src/test/assets/media/subrip/typical_utf16be
vendored
Normal file
BIN
testdata/src/test/assets/media/subrip/typical_utf16be
vendored
Normal file
Binary file not shown.
BIN
testdata/src/test/assets/media/subrip/typical_utf16le
vendored
Normal file
BIN
testdata/src/test/assets/media/subrip/typical_utf16le
vendored
Normal file
Binary file not shown.
Loading…
Reference in a new issue