mirror of
https://github.com/samsonjs/media.git
synced 2026-04-26 14:57:47 +00:00
Handle out-of-order frames in endIndices for MP4 with edit list
Update the `endIndices` calculation to walk forward in the timestamps array so that all frames within the valid edit duration are included, even when frames are stored out of order. For video tracks, this ensures that no frames with timestamps less than or equal to `editMediaTime` + `editDuration` are incorrectly excluded. Issue: androidx/media#1797 PiperOrigin-RevId: 686075680
This commit is contained in:
parent
9adb3aaf41
commit
91c56335ef
13 changed files with 152 additions and 39 deletions
|
|
@ -78,6 +78,8 @@
|
||||||
timescale, `media_time` is now properly scaled using the track
|
timescale, `media_time` is now properly scaled using the track
|
||||||
timescale, as specified by the MP4 format standard
|
timescale, as specified by the MP4 format standard
|
||||||
([#1792](https://github.com/androidx/media/issues/1792)).
|
([#1792](https://github.com/androidx/media/issues/1792)).
|
||||||
|
* Handle out-of-order frames in `endIndices` calculation for MP4 with edit
|
||||||
|
list ([#1797](https://github.com/androidx/media/issues/1797)).
|
||||||
* DataSource:
|
* DataSource:
|
||||||
* Audio:
|
* Audio:
|
||||||
* Fix pop sounds that may occur during seeks.
|
* Fix pop sounds that may occur during seeks.
|
||||||
|
|
|
||||||
|
|
@ -717,22 +717,39 @@ public final class BoxParser {
|
||||||
Util.scaleLargeTimestamp(
|
Util.scaleLargeTimestamp(
|
||||||
track.editListDurations[i], track.timescale, track.movieTimescale);
|
track.editListDurations[i], track.timescale, track.movieTimescale);
|
||||||
// The timestamps array is in the order read from the media, which might not be strictly
|
// The timestamps array is in the order read from the media, which might not be strictly
|
||||||
// sorted, but will ensure that a) all sync frames are in-order and b) any out-of-order
|
// sorted. However, all sync frames are guaranteed to be in order, and any out-of-order
|
||||||
// frames are after their respective sync frames. This means that although the result of
|
// frames appear after their respective sync frames. This ensures that although the result
|
||||||
// this binary search might be slightly incorrect (due to out-of-order timestamps), the loop
|
// of the binary search might not be entirely accurate (due to the out-of-order timestamps),
|
||||||
// below that walks backward to find the previous sync frame will result in a correct start
|
// the following logic ensures correctness for both start and end indices.
|
||||||
// index.
|
//
|
||||||
|
// The startIndices calculation finds the largest timestamp that is less than or equal to
|
||||||
|
// editMediaTime. It then walks backward to ensure the index points to a sync frame, since
|
||||||
|
// decoding must start from a keyframe.
|
||||||
startIndices[i] =
|
startIndices[i] =
|
||||||
Util.binarySearchFloor(
|
Util.binarySearchFloor(
|
||||||
timestamps, editMediaTime, /* inclusive= */ true, /* stayInBounds= */ true);
|
timestamps, editMediaTime, /* inclusive= */ true, /* stayInBounds= */ true);
|
||||||
|
while (startIndices[i] >= 0 && (flags[startIndices[i]] & C.BUFFER_FLAG_KEY_FRAME) == 0) {
|
||||||
|
startIndices[i]--;
|
||||||
|
}
|
||||||
|
// The endIndices calculation finds the smallest timestamp that is greater than
|
||||||
|
// editMediaTime + editDuration, except when omitZeroDurationClippedSample is true, in which
|
||||||
|
// case it finds the smallest timestamp that is greater than or equal to editMediaTime +
|
||||||
|
// editDuration.
|
||||||
endIndices[i] =
|
endIndices[i] =
|
||||||
Util.binarySearchCeil(
|
Util.binarySearchCeil(
|
||||||
timestamps,
|
timestamps,
|
||||||
editMediaTime + editDuration,
|
editMediaTime + editDuration,
|
||||||
/* inclusive= */ omitZeroDurationClippedSample,
|
/* inclusive= */ omitZeroDurationClippedSample,
|
||||||
/* stayInBounds= */ false);
|
/* stayInBounds= */ false);
|
||||||
while (startIndices[i] >= 0 && (flags[startIndices[i]] & C.BUFFER_FLAG_KEY_FRAME) == 0) {
|
if (track.type == C.TRACK_TYPE_VIDEO) {
|
||||||
startIndices[i]--;
|
// To account for out-of-order video frames that may have timestamps smaller than or equal
|
||||||
|
// to editMediaTime + editDuration, but still fall within the valid range, the loop walks
|
||||||
|
// forward through the timestamps array to ensure all frames with timestamps within the
|
||||||
|
// edit duration are included.
|
||||||
|
while (endIndices[i] < timestamps.length - 1
|
||||||
|
&& timestamps[endIndices[i] + 1] <= (editMediaTime + editDuration)) {
|
||||||
|
endIndices[i]++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
editedSampleCount += endIndices[i] - startIndices[i];
|
editedSampleCount += endIndices[i] - startIndices[i];
|
||||||
copyMetadata |= nextSampleIndex != startIndices[i];
|
copyMetadata |= nextSampleIndex != startIndices[i];
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 3112471
|
total output bytes = 3208515
|
||||||
sample count = 83
|
sample count = 85
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -358,8 +358,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 82:
|
sample 82:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 83:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 84:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 45765
|
total output bytes = 45765
|
||||||
sample count = 112
|
sample count = 112
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 2168517
|
total output bytes = 2264561
|
||||||
sample count = 60
|
sample count = 62
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -266,8 +266,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 59:
|
sample 59:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 60:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 61:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 30664
|
total output bytes = 30664
|
||||||
sample count = 76
|
sample count = 76
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 1019852
|
total output bytes = 1115896
|
||||||
sample count = 28
|
sample count = 30
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -138,8 +138,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 27:
|
sample 27:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 28:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 29:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 15570
|
total output bytes = 15570
|
||||||
sample count = 39
|
sample count = 39
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 1019852
|
total output bytes = 1115896
|
||||||
sample count = 28
|
sample count = 30
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -138,8 +138,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 27:
|
sample 27:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 28:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 29:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 1239
|
total output bytes = 1239
|
||||||
sample count = 3
|
sample count = 3
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 3112471
|
total output bytes = 3208515
|
||||||
sample count = 83
|
sample count = 85
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -358,8 +358,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 82:
|
sample 82:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 83:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 84:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 45765
|
total output bytes = 45765
|
||||||
sample count = 112
|
sample count = 112
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 2168517
|
total output bytes = 2264561
|
||||||
sample count = 60
|
sample count = 62
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -266,8 +266,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 59:
|
sample 59:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 60:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 61:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 30664
|
total output bytes = 30664
|
||||||
sample count = 76
|
sample count = 76
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 1019852
|
total output bytes = 1115896
|
||||||
sample count = 28
|
sample count = 30
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -138,8 +138,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 27:
|
sample 27:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 28:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 29:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 15570
|
total output bytes = 15570
|
||||||
sample count = 39
|
sample count = 39
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 1019852
|
total output bytes = 1115896
|
||||||
sample count = 28
|
sample count = 30
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -138,8 +138,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 27:
|
sample 27:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 28:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 29:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 1239
|
total output bytes = 1239
|
||||||
sample count = 3
|
sample count = 3
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 3112471
|
total output bytes = 3208515
|
||||||
sample count = 83
|
sample count = 85
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -358,8 +358,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 82:
|
sample 82:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 83:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 84:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 45765
|
total output bytes = 45765
|
||||||
sample count = 112
|
sample count = 112
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,8 @@ seekMap:
|
||||||
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
getPosition(2548333) = [[timeUs=1680000, position=34939]]
|
||||||
numberOfTracks = 2
|
numberOfTracks = 2
|
||||||
track 0:
|
track 0:
|
||||||
total output bytes = 3112471
|
total output bytes = 3208515
|
||||||
sample count = 83
|
sample count = 85
|
||||||
format 0:
|
format 0:
|
||||||
id = 1
|
id = 1
|
||||||
sampleMimeType = video/dolby-vision
|
sampleMimeType = video/dolby-vision
|
||||||
|
|
@ -358,8 +358,16 @@ track 0:
|
||||||
data = length 23136, hash 8AF1C1AD
|
data = length 23136, hash 8AF1C1AD
|
||||||
sample 82:
|
sample 82:
|
||||||
time = 2446666
|
time = 2446666
|
||||||
flags = 536870912
|
flags = 0
|
||||||
data = length 26792, hash 3157758F
|
data = length 26792, hash 3157758F
|
||||||
|
sample 83:
|
||||||
|
time = 2613333
|
||||||
|
flags = 0
|
||||||
|
data = length 62711, hash EF9AC8F5
|
||||||
|
sample 84:
|
||||||
|
time = 2546666
|
||||||
|
flags = 536870912
|
||||||
|
data = length 33333, hash 567D33D6
|
||||||
track 1:
|
track 1:
|
||||||
total output bytes = 45765
|
total output bytes = 45765
|
||||||
sample count = 112
|
sample count = 112
|
||||||
|
|
|
||||||
|
|
@ -793,7 +793,7 @@ MediaCodecAdapter (exotest.audio.aac):
|
||||||
rendered = false
|
rendered = false
|
||||||
MediaCodecAdapter (exotest.video.hevc):
|
MediaCodecAdapter (exotest.video.hevc):
|
||||||
inputBuffers:
|
inputBuffers:
|
||||||
count = 84
|
count = 86
|
||||||
input buffer #0:
|
input buffer #0:
|
||||||
timeUs = 999999545000
|
timeUs = 999999545000
|
||||||
contents = length 78829, hash 9265686F
|
contents = length 78829, hash 9265686F
|
||||||
|
|
@ -1044,11 +1044,17 @@ MediaCodecAdapter (exotest.video.hevc):
|
||||||
timeUs = 1000002446666
|
timeUs = 1000002446666
|
||||||
contents = length 26792, hash 3157758F
|
contents = length 26792, hash 3157758F
|
||||||
input buffer #83:
|
input buffer #83:
|
||||||
|
timeUs = 1000002613333
|
||||||
|
contents = length 62711, hash EF9AC8F5
|
||||||
|
input buffer #84:
|
||||||
|
timeUs = 1000002546666
|
||||||
|
contents = length 33333, hash 567D33D6
|
||||||
|
input buffer #85:
|
||||||
timeUs = 0
|
timeUs = 0
|
||||||
flags = 4
|
flags = 4
|
||||||
contents = length 0, hash 1
|
contents = length 0, hash 1
|
||||||
outputBuffers:
|
outputBuffers:
|
||||||
count = 83
|
count = 85
|
||||||
output buffer #0:
|
output buffer #0:
|
||||||
timeUs = 999999545000
|
timeUs = 999999545000
|
||||||
size = 78829
|
size = 78829
|
||||||
|
|
@ -1381,6 +1387,14 @@ MediaCodecAdapter (exotest.video.hevc):
|
||||||
timeUs = 1000002446666
|
timeUs = 1000002446666
|
||||||
size = 26792
|
size = 26792
|
||||||
rendered = true
|
rendered = true
|
||||||
|
output buffer #83:
|
||||||
|
timeUs = 1000002613333
|
||||||
|
size = 62711
|
||||||
|
rendered = true
|
||||||
|
output buffer #84:
|
||||||
|
timeUs = 1000002546666
|
||||||
|
size = 33333
|
||||||
|
rendered = true
|
||||||
AudioSink:
|
AudioSink:
|
||||||
buffer count = 112
|
buffer count = 112
|
||||||
config:
|
config:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue