Handle out-of-order frames in endIndices for MP4 with edit list

Updated the `endIndices` calculation to walk forward in the timestamps array so that all frames within the valid edit duration are included, even when frames are stored out of order. This ensures that no frames with timestamps less than `editMediaTime` + `editDuration` are incorrectly excluded.

Issue: androidx/media#1797
PiperOrigin-RevId: 686075680
This commit is contained in:
rohks 2024-10-15 06:05:46 -07:00 committed by Copybara-Service
parent 9adb3aaf41
commit 91c56335ef
13 changed files with 152 additions and 39 deletions

View file

@ -78,6 +78,8 @@
timescale, `media_time` is now properly scaled using the track
timescale, as specified by the MP4 format standard
([#1792](https://github.com/androidx/media/issues/1792)).
* Handle out-of-order frames in `endIndices` calculation for MP4 with edit
list ([#1797](https://github.com/androidx/media/issues/1797)).
* DataSource:
* Audio:
* Fix pop sounds that may occur during seeks.

View file

@ -717,22 +717,39 @@ public final class BoxParser {
Util.scaleLargeTimestamp(
track.editListDurations[i], track.timescale, track.movieTimescale);
// The timestamps array is in the order read from the media, which might not be strictly
// sorted, but will ensure that a) all sync frames are in-order and b) any out-of-order
// frames are after their respective sync frames. This means that although the result of
// this binary search might be slightly incorrect (due to out-of-order timestamps), the loop
// below that walks backward to find the previous sync frame will result in a correct start
// index.
// sorted. However, all sync frames are guaranteed to be in order, and any out-of-order
// frames appear after their respective sync frames. This means that although the result
// of the binary search might not be entirely accurate (due to the out-of-order timestamps),
// the logic below still yields correct start and end indices.
//
// The startIndices calculation finds the largest timestamp that is less than or equal to
// editMediaTime. It then walks backward to ensure the index points to a sync frame, since
// decoding must start from a keyframe.
startIndices[i] =
Util.binarySearchFloor(
timestamps, editMediaTime, /* inclusive= */ true, /* stayInBounds= */ true);
while (startIndices[i] >= 0 && (flags[startIndices[i]] & C.BUFFER_FLAG_KEY_FRAME) == 0) {
startIndices[i]--;
}
// The endIndices calculation finds the smallest timestamp that is greater than
// editMediaTime + editDuration, except when omitZeroDurationClippedSample is true, in which
// case it finds the smallest timestamp that is greater than or equal to editMediaTime +
// editDuration.
endIndices[i] =
Util.binarySearchCeil(
timestamps,
editMediaTime + editDuration,
/* inclusive= */ omitZeroDurationClippedSample,
/* stayInBounds= */ false);
while (startIndices[i] >= 0 && (flags[startIndices[i]] & C.BUFFER_FLAG_KEY_FRAME) == 0) {
startIndices[i]--;
if (track.type == C.TRACK_TYPE_VIDEO) {
// Out-of-order video frames located after the binary search result may still have
// timestamps smaller than or equal to editMediaTime + editDuration and therefore fall
// within the edit. Walk forward through the timestamps array so that all such frames are
// included.
while (endIndices[i] < timestamps.length - 1
&& timestamps[endIndices[i] + 1] <= (editMediaTime + editDuration)) {
endIndices[i]++;
}
}
editedSampleCount += endIndices[i] - startIndices[i];
copyMetadata |= nextSampleIndex != startIndices[i];

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 3112471
sample count = 83
total output bytes = 3208515
sample count = 85
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -358,8 +358,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 82:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 83:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 84:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 45765
sample count = 112

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 2168517
sample count = 60
total output bytes = 2264561
sample count = 62
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -266,8 +266,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 59:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 60:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 61:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 30664
sample count = 76

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 1019852
sample count = 28
total output bytes = 1115896
sample count = 30
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -138,8 +138,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 27:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 28:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 29:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 15570
sample count = 39

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 1019852
sample count = 28
total output bytes = 1115896
sample count = 30
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -138,8 +138,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 27:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 28:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 29:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 1239
sample count = 3

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 3112471
sample count = 83
total output bytes = 3208515
sample count = 85
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -358,8 +358,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 82:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 83:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 84:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 45765
sample count = 112

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 2168517
sample count = 60
total output bytes = 2264561
sample count = 62
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -266,8 +266,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 59:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 60:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 61:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 30664
sample count = 76

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 1019852
sample count = 28
total output bytes = 1115896
sample count = 30
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -138,8 +138,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 27:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 28:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 29:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 15570
sample count = 39

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 1019852
sample count = 28
total output bytes = 1115896
sample count = 30
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -138,8 +138,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 27:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 28:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 29:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 1239
sample count = 3

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 3112471
sample count = 83
total output bytes = 3208515
sample count = 85
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -358,8 +358,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 82:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 83:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 84:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 45765
sample count = 112

View file

@ -7,8 +7,8 @@ seekMap:
getPosition(2548333) = [[timeUs=1680000, position=34939]]
numberOfTracks = 2
track 0:
total output bytes = 3112471
sample count = 83
total output bytes = 3208515
sample count = 85
format 0:
id = 1
sampleMimeType = video/dolby-vision
@ -358,8 +358,16 @@ track 0:
data = length 23136, hash 8AF1C1AD
sample 82:
time = 2446666
flags = 536870912
flags = 0
data = length 26792, hash 3157758F
sample 83:
time = 2613333
flags = 0
data = length 62711, hash EF9AC8F5
sample 84:
time = 2546666
flags = 536870912
data = length 33333, hash 567D33D6
track 1:
total output bytes = 45765
sample count = 112

View file

@ -793,7 +793,7 @@ MediaCodecAdapter (exotest.audio.aac):
rendered = false
MediaCodecAdapter (exotest.video.hevc):
inputBuffers:
count = 84
count = 86
input buffer #0:
timeUs = 999999545000
contents = length 78829, hash 9265686F
@ -1044,11 +1044,17 @@ MediaCodecAdapter (exotest.video.hevc):
timeUs = 1000002446666
contents = length 26792, hash 3157758F
input buffer #83:
timeUs = 1000002613333
contents = length 62711, hash EF9AC8F5
input buffer #84:
timeUs = 1000002546666
contents = length 33333, hash 567D33D6
input buffer #85:
timeUs = 0
flags = 4
contents = length 0, hash 1
outputBuffers:
count = 83
count = 85
output buffer #0:
timeUs = 999999545000
size = 78829
@ -1381,6 +1387,14 @@ MediaCodecAdapter (exotest.video.hevc):
timeUs = 1000002446666
size = 26792
rendered = true
output buffer #83:
timeUs = 1000002613333
size = 62711
rendered = true
output buffer #84:
timeUs = 1000002546666
size = 33333
rendered = true
AudioSink:
buffer count = 112
config: