diff --git a/RELEASENOTES.md b/RELEASENOTES.md index 3534ff283b..0dd62f79c3 100644 --- a/RELEASENOTES.md +++ b/RELEASENOTES.md @@ -78,6 +78,8 @@ timescale, `media_time` is now properly scaled using the track timescale, as specified by the MP4 format standard ([#1792](https://github.com/androidx/media/issues/1792)). + * Handle out-of-order frames in `endIndices` calculation for MP4 with edit + list ([#1797](https://github.com/androidx/media/issues/1797)). * DataSource: * Audio: * Fix pop sounds that may occur during seeks. diff --git a/libraries/extractor/src/main/java/androidx/media3/extractor/mp4/BoxParser.java b/libraries/extractor/src/main/java/androidx/media3/extractor/mp4/BoxParser.java index 88cfa3a8b5..9b51d089d5 100644 --- a/libraries/extractor/src/main/java/androidx/media3/extractor/mp4/BoxParser.java +++ b/libraries/extractor/src/main/java/androidx/media3/extractor/mp4/BoxParser.java @@ -717,22 +717,39 @@ public final class BoxParser { Util.scaleLargeTimestamp( track.editListDurations[i], track.timescale, track.movieTimescale); // The timestamps array is in the order read from the media, which might not be strictly - // sorted, but will ensure that a) all sync frames are in-order and b) any out-of-order - // frames are after their respective sync frames. This means that although the result of - // this binary search might be slightly incorrect (due to out-of-order timestamps), the loop - // below that walks backward to find the previous sync frame will result in a correct start - // index. + // sorted. However, all sync frames are guaranteed to be in order, and any out-of-order + // frames appear after their respective sync frames. This ensures that although the result + // of the binary search might not be entirely accurate (due to the out-of-order timestamps), + // the following logic ensures correctness for both start and end indices. + // + // The startIndices calculation finds the largest timestamp that is less than or equal to + // editMediaTime. It then walks backward to ensure the index points to a sync frame, since + // decoding must start from a keyframe. startIndices[i] = Util.binarySearchFloor( timestamps, editMediaTime, /* inclusive= */ true, /* stayInBounds= */ true); + while (startIndices[i] >= 0 && (flags[startIndices[i]] & C.BUFFER_FLAG_KEY_FRAME) == 0) { + startIndices[i]--; + } + // The endIndices calculation finds the smallest timestamp that is greater than + // editMediaTime + editDuration, except when omitZeroDurationClippedSample is true, in which + // case it finds the smallest timestamp that is greater than or equal to editMediaTime + + // editDuration. endIndices[i] = Util.binarySearchCeil( timestamps, editMediaTime + editDuration, /* inclusive= */ omitZeroDurationClippedSample, /* stayInBounds= */ false); - while (startIndices[i] >= 0 && (flags[startIndices[i]] & C.BUFFER_FLAG_KEY_FRAME) == 0) { - startIndices[i]--; + if (track.type == C.TRACK_TYPE_VIDEO) { + // To account for out-of-order video frames that may have timestamps smaller than or equal + // to editMediaTime + editDuration, but still fall within the valid range, the loop walks + // forward through the timestamps array to ensure all frames with timestamps within the + // edit duration are included. + while (endIndices[i] < timestamps.length - 1 + && timestamps[endIndices[i] + 1] <= (editMediaTime + editDuration)) { + endIndices[i]++; + } } editedSampleCount += endIndices[i] - startIndices[i]; copyMetadata |= nextSampleIndex != startIndices[i]; diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.0.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.0.dump index 8d83ce17f4..2107c177f7 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.0.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.0.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 3112471 - sample count = 83 + total output bytes = 3208515 + sample count = 85 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -358,8 +358,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 82: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 83: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 84: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 45765 sample count = 112 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.1.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.1.dump index 5cd8680d0c..6d7b424e90 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.1.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.1.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 2168517 - sample count = 60 + total output bytes = 2264561 + sample count = 62 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -266,8 +266,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 59: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 60: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 61: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 30664 sample count = 76 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.2.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.2.dump index f5d65655d5..9622c90846 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.2.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.2.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 1019852 - sample count = 28 + total output bytes = 1115896 + sample count = 30 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -138,8 +138,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 27: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 28: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 29: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 15570 sample count = 39 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.3.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.3.dump index 12622c903d..b54c01e901 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.3.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.3.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 1019852 - sample count = 28 + total output bytes = 1115896 + sample count = 30 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -138,8 +138,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 27: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 28: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 29: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 1239 sample count = 3 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.0.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.0.dump index 8d83ce17f4..2107c177f7 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.0.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.0.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 3112471 - sample count = 83 + total output bytes = 3208515 + sample count = 85 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -358,8 +358,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 82: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 83: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 84: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 45765 sample count = 112 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.1.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.1.dump index 5cd8680d0c..6d7b424e90 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.1.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.1.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 2168517 - sample count = 60 + total output bytes = 2264561 + sample count = 62 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -266,8 +266,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 59: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 60: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 61: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 30664 sample count = 76 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.2.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.2.dump index f5d65655d5..9622c90846 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.2.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.2.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 1019852 - sample count = 28 + total output bytes = 1115896 + sample count = 30 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -138,8 +138,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 27: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 28: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 29: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 15570 sample count = 39 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.3.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.3.dump index 12622c903d..b54c01e901 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.3.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.3.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 1019852 - sample count = 28 + total output bytes = 1115896 + sample count = 30 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -138,8 +138,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 27: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 28: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 29: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 1239 sample count = 3 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.unknown_length.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.unknown_length.dump index 8d83ce17f4..2107c177f7 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.unknown_length.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.unknown_length.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 3112471 - sample count = 83 + total output bytes = 3208515 + sample count = 85 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -358,8 +358,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 82: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 83: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 84: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 45765 sample count = 112 diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.unknown_length.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.unknown_length.dump index 8d83ce17f4..2107c177f7 100644 --- a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.unknown_length.dump +++ b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.unknown_length.dump @@ -7,8 +7,8 @@ seekMap: getPosition(2548333) = [[timeUs=1680000, position=34939]] numberOfTracks = 2 track 0: - total output bytes = 3112471 - sample count = 83 + total output bytes = 3208515 + sample count = 85 format 0: id = 1 sampleMimeType = video/dolby-vision @@ -358,8 +358,16 @@ track 0: data = length 23136, hash 8AF1C1AD sample 82: time = 2446666 - flags = 536870912 + flags = 0 data = length 26792, hash 3157758F + sample 83: + time = 2613333 + flags = 0 + data = length 62711, hash EF9AC8F5 + sample 84: + time = 2546666 + flags = 536870912 + data = length 33333, hash 567D33D6 track 1: total output bytes = 45765 sample count = 112 diff --git a/libraries/test_data/src/test/assets/playbackdumps/mp4/sample_edit_list.mp4.dump b/libraries/test_data/src/test/assets/playbackdumps/mp4/sample_edit_list.mp4.dump index df30e3e46e..4cf262c414 100644 --- a/libraries/test_data/src/test/assets/playbackdumps/mp4/sample_edit_list.mp4.dump +++ b/libraries/test_data/src/test/assets/playbackdumps/mp4/sample_edit_list.mp4.dump @@ -793,7 +793,7 @@ MediaCodecAdapter (exotest.audio.aac): rendered = false MediaCodecAdapter (exotest.video.hevc): inputBuffers: - count = 84 + count = 86 input buffer #0: timeUs = 999999545000 contents = length 78829, hash 9265686F @@ -1044,11 +1044,17 @@ MediaCodecAdapter (exotest.video.hevc): timeUs = 1000002446666 contents = length 26792, hash 3157758F input buffer #83: + timeUs = 1000002613333 + contents = length 62711, hash EF9AC8F5 + input buffer #84: + timeUs = 1000002546666 + contents = length 33333, hash 567D33D6 + input buffer #85: timeUs = 0 flags = 4 contents = length 0, hash 1 outputBuffers: - count = 83 + count = 85 output buffer #0: timeUs = 999999545000 size = 78829 @@ -1381,6 +1387,14 @@ MediaCodecAdapter (exotest.video.hevc): timeUs = 1000002446666 size = 26792 rendered = true + output buffer #83: + timeUs = 1000002613333 + size = 62711 + rendered = true + output buffer #84: + timeUs = 1000002546666 + size = 33333 + rendered = true AudioSink: buffer count = 112 config: