
src/remux/mp4-remuxer.js

/**
 * fMP4 remuxer
 */

import AAC from './aac-helper';
import MP4 from './mp4-generator';

import Event from '../events';
import { ErrorTypes, ErrorDetails } from '../errors';

import { toMsFromMpegTsClock, toMpegTsClockFromTimescale, toTimescaleFromScale } from '../utils/timescale-conversion';

import { logger } from '../utils/logger';

const MAX_SILENT_FRAME_DURATION_90KHZ = toMpegTsClockFromTimescale(10);
const PTS_DTS_SHIFT_TOLERANCE_90KHZ = toMpegTsClockFromTimescale(0.2);
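// For illustration: assuming toMpegTsClockFromTimescale(x) converts seconds to the
// 90 kHz MPEG-TS clock (x * 90000 ticks), MAX_SILENT_FRAME_DURATION_90KHZ is
// 10 s = 900000 ticks and PTS_DTS_SHIFT_TOLERANCE_90KHZ is 0.2 s = 18000 ticks.
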
class MP4Remuxer {
  constructor (observer, config, typeSupported, vendor) {
    this.observer = observer;
    this.config = config;
    this.typeSupported = typeSupported;
    const userAgent = navigator.userAgent;
    this.isSafari = vendor && vendor.indexOf('Apple') > -1 && userAgent && !userAgent.match('CriOS');
    this.ISGenerated = false;
  }

  destroy () {
  }

  resetTimeStamp (defaultTimeStamp) {
    this._initPTS = this._initDTS = defaultTimeStamp;
  }

  resetInitSegment () {
    this.ISGenerated = false;
  }

  remux (audioTrack, videoTrack, id3Track, textTrack, timeOffset, contiguous, accurateTimeOffset) {
    // generate Init Segment if needed
    if (!this.ISGenerated) {
      this.generateIS(audioTrack, videoTrack, timeOffset);
    }

    if (this.ISGenerated) {
      const nbAudioSamples = audioTrack.samples.length;
      const nbVideoSamples = videoTrack.samples.length;
      let audioTimeOffset = timeOffset;
      let videoTimeOffset = timeOffset;
      if (nbAudioSamples && nbVideoSamples) {
        // timeOffset is expected to be the offset of the first timestamp of this fragment (first DTS)
        // if the first audio DTS is not aligned with the first video DTS, we need to take that into account
        // when providing timeOffset to remuxAudio / remuxVideo. if we don't, a small but permanent
        // drift can build up between the audio and video streams
        let audiovideoDeltaDts = (audioTrack.samples[0].pts - videoTrack.samples[0].pts) / videoTrack.inputTimeScale;
        audioTimeOffset += Math.max(0, audiovideoDeltaDts);
        videoTimeOffset += Math.max(0, -audiovideoDeltaDts);
      }
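      // For illustration: with inputTimeScale = 90000, if the first audio sample
      // has pts 903600 and the first video sample has pts 900000, then
      // audiovideoDeltaDts = 3600 / 90000 = 0.04 s, so audioTimeOffset is pushed
      // forward by 40 ms while videoTimeOffset is left unchanged (hypothetical values).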
      // Purposefully remuxing audio before video, so that remuxVideo can use nextAudioPts, which is
      // calculated in remuxAudio.
      // logger.log('nb AAC samples:' + audioTrack.samples.length);
      if (nbAudioSamples) {
        // if initSegment was generated without audio samples, regenerate it
        if (!audioTrack.timescale) {
          logger.warn('regenerate InitSegment as audio detected');
          this.generateIS(audioTrack, videoTrack, timeOffset);
        }
        let audioData = this.remuxAudio(audioTrack, audioTimeOffset, contiguous, accurateTimeOffset);
        // logger.log('nb AVC samples:' + videoTrack.samples.length);
        if (nbVideoSamples) {
          let audioTrackLength;
          if (audioData) {
            audioTrackLength = audioData.endPTS - audioData.startPTS;
          }

          // if initSegment was generated without video samples, regenerate it
          if (!videoTrack.timescale) {
            logger.warn('regenerate InitSegment as video detected');
            this.generateIS(audioTrack, videoTrack, timeOffset);
          }
          this.remuxVideo(videoTrack, videoTimeOffset, contiguous, audioTrackLength, accurateTimeOffset);
        }
      } else {
        // logger.log('nb AVC samples:' + videoTrack.samples.length);
        if (nbVideoSamples) {
          let videoData = this.remuxVideo(videoTrack, videoTimeOffset, contiguous, 0, accurateTimeOffset);
          if (videoData && audioTrack.codec) {
            this.remuxEmptyAudio(audioTrack, audioTimeOffset, contiguous, videoData);
          }
        }
      }
    }
    // logger.log('nb ID3 samples:' + id3Track.samples.length);
    if (id3Track.samples.length) {
      this.remuxID3(id3Track, timeOffset);
    }

    // logger.log('nb text samples:' + textTrack.samples.length);
    if (textTrack.samples.length) {
      this.remuxText(textTrack, timeOffset);
    }

    // notify end of parsing
    this.observer.trigger(Event.FRAG_PARSED);
  }

  generateIS (audioTrack, videoTrack, timeOffset) {
    let observer = this.observer,
      audioSamples = audioTrack.samples,
      videoSamples = videoTrack.samples,
      typeSupported = this.typeSupported,
      container = 'audio/mp4',
      tracks = {},
      data = { tracks: tracks },
      computePTSDTS = (this._initPTS === undefined),
      initPTS, initDTS;

    if (computePTSDTS) {
      initPTS = initDTS = Infinity;
    }

    if (audioTrack.config && audioSamples.length) {
      // let's use the audio sampling rate as the MP4 timescale.
      // rationale: there is an integer number of PCM samples per audio frame (1024 for AAC),
      // so using the sampling rate yields an integer MP4 frame duration.
      // this avoids potential rounding issues and A/V sync drift
      audioTrack.timescale = audioTrack.samplerate;
      logger.log(`audio sampling rate : ${audioTrack.samplerate}`);
      if (!audioTrack.isAAC) {
        if (typeSupported.mpeg) { // Chrome and Safari
          container = 'audio/mpeg';
          audioTrack.codec = '';
        } else if (typeSupported.mp3) { // Firefox
          audioTrack.codec = 'mp3';
        }
      }
      tracks.audio = {
        container: container,
        codec: audioTrack.codec,
        initSegment: !audioTrack.isAAC && typeSupported.mpeg ? new Uint8Array() : MP4.initSegment([audioTrack]),
        metadata: {
          channelCount: audioTrack.channelCount
        }
      };
      if (computePTSDTS) {
        // remember the first PTS of this demuxing context. for audio, PTS = DTS
        initPTS = initDTS = audioSamples[0].pts - audioTrack.inputTimeScale * timeOffset;
      }
    }
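    // For illustration: with inputTimeScale = 90000, a first audio pts of 1800000
    // and timeOffset = 10 s give initPTS = 1800000 - 90000 * 10 = 900000, i.e. the
    // stream's own clock started 10 s "early" relative to the playlist position
    // (hypothetical values).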

    if (videoTrack.sps && videoTrack.pps && videoSamples.length) {
      // let's use the input timescale as the MP4 video timescale
      // we use the input timescale directly to avoid rounding issues on frame duration / cts computation
      const inputTimeScale = videoTrack.inputTimeScale;
      videoTrack.timescale = inputTimeScale;
      tracks.video = {
        container: 'video/mp4',
        codec: videoTrack.codec,
        initSegment: MP4.initSegment([videoTrack]),
        metadata: {
          width: videoTrack.width,
          height: videoTrack.height
        }
      };
      if (computePTSDTS) {
        initPTS = Math.min(initPTS, videoSamples[0].pts - inputTimeScale * timeOffset);
        initDTS = Math.min(initDTS, videoSamples[0].dts - inputTimeScale * timeOffset);
        this.observer.trigger(Event.INIT_PTS_FOUND, { initPTS: initPTS });
      }
    }

    if (Object.keys(tracks).length) {
      observer.trigger(Event.FRAG_PARSING_INIT_SEGMENT, data);
      this.ISGenerated = true;
      if (computePTSDTS) {
        this._initPTS = initPTS;
        this._initDTS = initDTS;
      }
    } else {
      observer.trigger(Event.ERROR, { type: ErrorTypes.MEDIA_ERROR, details: ErrorDetails.FRAG_PARSING_ERROR, fatal: false, reason: 'no audio/video samples found' });
    }
  }

  remuxVideo (track, timeOffset, contiguous, audioTrackLength, accurateTimeOffset) {
    let offset = 8;
    let mp4SampleDuration;
    let mdat;
    let moof;
    let firstPTS;
    let firstDTS;
    let lastPTS;
    let lastDTS;
    const timeScale = track.timescale;
    const inputSamples = track.samples;
    const outputSamples = [];
    const nbSamples = inputSamples.length;
    const ptsNormalize = this._PTSNormalize;
    const initPTS = this._initPTS;

    // if the parsed fragment is contiguous with the last one, let's use the last DTS value as reference
    let nextAvcDts = this.nextAvcDts;

    const isSafari = this.isSafari;

    if (nbSamples === 0) {
      return;
    }

    // Safari does not like overlapping DTS on consecutive fragments. let's use nextAvcDts to overcome this if fragments are consecutive
    if (isSafari) {
      // also consider consecutive fragments as being contiguous (even if a level switch occurs),
      // for the sake of clarity:
      // consecutive fragments are frags with
      // - less than 100 ms gap between the new time offset (if accurate) and the next expected PTS OR
      // - less than 200 ms PTS gap (timeScale/5)
      contiguous |= (inputSamples.length && nextAvcDts &&
        ((accurateTimeOffset && Math.abs(timeOffset - nextAvcDts / timeScale) < 0.1) ||
        Math.abs((inputSamples[0].pts - nextAvcDts - initPTS)) < timeScale / 5)
      );
    }

    if (!contiguous) {
      // if not contiguous, let's use the target timeOffset
      nextAvcDts = timeOffset * timeScale;
    }

    // PTS is coded on 33 bits, and can loop from -2^32 to 2^32
    // ptsNormalize will make PTS/DTS values monotonic; we use the last known DTS value as reference
    inputSamples.forEach(function (sample) {
      sample.pts = ptsNormalize(sample.pts - initPTS, nextAvcDts);
      sample.dts = ptsNormalize(sample.dts - initPTS, nextAvcDts);
    });

    // sort video samples by DTS then PTS then demux id order
    inputSamples.sort(function (a, b) {
      const deltadts = a.dts - b.dts;
      const deltapts = a.pts - b.pts;
      return deltadts || (deltapts || (a.id - b.id));
    });

    // handle broken streams with PTS < DTS, with a tolerance of up to 0.2 seconds
    let PTSDTSshift = inputSamples.reduce((prev, curr) => Math.max(Math.min(prev, curr.pts - curr.dts), -1 * PTS_DTS_SHIFT_TOLERANCE_90KHZ), 0);
    if (PTSDTSshift < 0) {
      logger.warn(`PTS < DTS detected in video samples, shifting DTS by ${toMsFromMpegTsClock(PTSDTSshift, true)} ms to overcome this issue`);
      for (let i = 0; i < inputSamples.length; i++) {
        inputSamples[i].dts += PTSDTSshift;
      }
    }
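    // For illustration: the reduce above keeps the most negative (pts - dts) seen,
    // clamped at -18000 ticks (-200 ms at 90 kHz). E.g. if one sample has
    // pts - dts = -9000 (-100 ms), every DTS is shifted back by 100 ms so that
    // pts >= dts holds again (hypothetical values).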

    // compute the first DTS and last DTS, normalized against the reference value
    let sample = inputSamples[0];
    firstDTS = Math.max(sample.dts, 0);
    firstPTS = Math.max(sample.pts, 0);

    // check timestamp continuity across consecutive fragments (this is to remove inter-fragment gaps/holes)
    let delta = firstDTS - nextAvcDts;
    // if fragments are contiguous, detect holes/overlaps between fragments
    if (contiguous) {
      if (delta) {
        if (delta > 1) {
          logger.log(`AVC: ${toMsFromMpegTsClock(delta, true)} ms hole between fragments detected, filling it`);
        } else if (delta < -1) {
          logger.log(`AVC: ${toMsFromMpegTsClock(-delta, true)} ms overlap between fragments detected`);
        }

        // remove hole/gap: set DTS to the next expected DTS
        firstDTS = nextAvcDts;
        inputSamples[0].dts = firstDTS;
        // offset PTS as well, ensuring that PTS is greater than or equal to the new DTS
        firstPTS = Math.max(firstPTS - delta, nextAvcDts);
        inputSamples[0].pts = firstPTS;
        logger.log(`Video: PTS/DTS adjusted: ${toMsFromMpegTsClock(firstPTS, true)}/${toMsFromMpegTsClock(firstDTS, true)}, delta: ${toMsFromMpegTsClock(delta, true)} ms`);
      }
    }

    // compute lastPTS/lastDTS
    sample = inputSamples[inputSamples.length - 1];
    lastDTS = Math.max(sample.dts, 0);
    lastPTS = Math.max(sample.pts, 0, lastDTS);

    // on Safari let's signal the same sample duration for all samples
    // sample duration (as expected by trun MP4 boxes) should be the delta between sample DTS
    // set this constant duration to the average delta between consecutive DTS.
    if (isSafari) {
      mp4SampleDuration = Math.round((lastDTS - firstDTS) / (inputSamples.length - 1));
    }

    let nbNalu = 0, naluLen = 0;
    for (let i = 0; i < nbSamples; i++) {
      // compute total/avc sample length and nb of NAL units
      let sample = inputSamples[i], units = sample.units, nbUnits = units.length, sampleLen = 0;
      for (let j = 0; j < nbUnits; j++) {
        sampleLen += units[j].data.length;
      }

      naluLen += sampleLen;
      nbNalu += nbUnits;
      sample.length = sampleLen;

      // normalize PTS/DTS
      if (isSafari) {
        // sample DTS is computed using a constant decoding offset (mp4SampleDuration) between samples
        sample.dts = firstDTS + i * mp4SampleDuration;
      } else {
        // ensure sample monotonic DTS
        sample.dts = Math.max(sample.dts, firstDTS);
      }
      // ensure that the computed PTS is greater than or equal to the sample DTS
      sample.pts = Math.max(sample.pts, sample.dts);
    }

    /* concatenate the video data and construct the mdat in place
       (need 8 more bytes to fill length and mdat type) */
    let mdatSize = naluLen + (4 * nbNalu) + 8;
    try {
      mdat = new Uint8Array(mdatSize);
    } catch (err) {
      this.observer.trigger(Event.ERROR, { type: ErrorTypes.MUX_ERROR, details: ErrorDetails.REMUX_ALLOC_ERROR, fatal: false, bytes: mdatSize, reason: `fail allocating video mdat ${mdatSize}` });
      return;
    }
    let view = new DataView(mdat.buffer);
    view.setUint32(0, mdatSize);
    mdat.set(MP4.types.mdat, 4);

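    // For illustration: the mdat box is [4-byte size][4-byte 'mdat' type][payload],
    // and each NAL unit gets a 4-byte length prefix (AVCC format). So a fragment
    // with 100 NAL units totalling 50000 bytes of NALU data needs
    // 50000 + 4 * 100 + 8 = 50408 bytes (hypothetical values).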
    for (let i = 0; i < nbSamples; i++) {
      let avcSample = inputSamples[i],
        avcSampleUnits = avcSample.units,
        mp4SampleLength = 0,
        compositionTimeOffset;
      // convert the NALU bitstream to MP4 format (prepend each NALU with a size field)
      for (let j = 0, nbUnits = avcSampleUnits.length; j < nbUnits; j++) {
        let unit = avcSampleUnits[j],
          unitData = unit.data,
          unitDataLen = unit.data.byteLength;
        view.setUint32(offset, unitDataLen);
        offset += 4;
        mdat.set(unitData, offset);
        offset += unitDataLen;
        mp4SampleLength += 4 + unitDataLen;
      }

      if (!isSafari) {
        // expected sample duration is the decoding timestamp diff of consecutive samples
        if (i < nbSamples - 1) {
          mp4SampleDuration = inputSamples[i + 1].dts - avcSample.dts;
        } else {
          let config = this.config,
            lastFrameDuration = avcSample.dts - inputSamples[i > 0 ? i - 1 : i].dts;
          if (config.stretchShortVideoTrack) {
            // In some cases, a segment's audio track duration may exceed the video track duration.
            // Since we've already remuxed audio, and we know how long the audio track is, we look to
            // see if the delta to the next segment is longer than maxBufferHole.
            // If so, playback would potentially get stuck, so we artificially inflate
            // the duration of the last frame to minimize any potential gap between segments.
            let maxBufferHole = config.maxBufferHole,
              gapTolerance = Math.floor(maxBufferHole * timeScale),
              deltaToFrameEnd = (audioTrackLength ? firstPTS + audioTrackLength * timeScale : this.nextAudioPts) - avcSample.pts;
            if (deltaToFrameEnd > gapTolerance) {
              // We subtract lastFrameDuration from deltaToFrameEnd to try to prevent any video
              // frame overlap. maxBufferHole should be >> lastFrameDuration anyway.
              mp4SampleDuration = deltaToFrameEnd - lastFrameDuration;
              if (mp4SampleDuration < 0) {
                mp4SampleDuration = lastFrameDuration;
              }

              logger.log(`It is approximately ${toMsFromMpegTsClock(deltaToFrameEnd, false)} ms to the next segment; using duration ${toMsFromMpegTsClock(mp4SampleDuration, false)} ms for the last video frame.`);
            } else {
              mp4SampleDuration = lastFrameDuration;
            }
          } else {
            mp4SampleDuration = lastFrameDuration;
          }
        }
        compositionTimeOffset = Math.round(avcSample.pts - avcSample.dts);
      } else {
        compositionTimeOffset = Math.max(0, mp4SampleDuration * Math.round((avcSample.pts - avcSample.dts) / mp4SampleDuration));
      }

      // console.log('PTS/DTS/initDTS/normPTS/normDTS/relative PTS : ${avcSample.pts}/${avcSample.dts}/${initDTS}/${ptsnorm}/${dtsnorm}/${(avcSample.pts/4294967296).toFixed(3)}');
      outputSamples.push({
        size: mp4SampleLength,
        // constant duration
        duration: mp4SampleDuration,
        cts: compositionTimeOffset,
        flags: {
          isLeading: 0,
          isDependedOn: 0,
          hasRedundancy: 0,
          degradPrio: 0,
          dependsOn: avcSample.key ? 2 : 1,
          isNonSync: avcSample.key ? 0 : 1
        }
      });
    }
    // the next AVC sample DTS should be equal to the last sample DTS + last sample duration (in PES timescale)
    this.nextAvcDts = lastDTS + mp4SampleDuration;
    let dropped = track.dropped;
    track.nbNalu = 0;
    track.dropped = 0;
    if (outputSamples.length && navigator.userAgent.toLowerCase().indexOf('chrome') > -1) {
      let flags = outputSamples[0].flags;
      // chrome workaround: mark the first sample as a Random Access Point to avoid a sourcebuffer append issue
      // https://code.google.com/p/chromium/issues/detail?id=229412
      flags.dependsOn = 2;
      flags.isNonSync = 0;
    }
    track.samples = outputSamples;
    moof = MP4.moof(track.sequenceNumber++, firstDTS, track);
    track.samples = [];

    let data = {
      data1: moof,
      data2: mdat,
      startPTS: firstPTS / timeScale,
      endPTS: (lastPTS + mp4SampleDuration) / timeScale,
      startDTS: firstDTS / timeScale,
      endDTS: this.nextAvcDts / timeScale,
      type: 'video',
      hasAudio: false,
      hasVideo: true,
      nb: outputSamples.length,
      dropped: dropped
    };
    this.observer.trigger(Event.FRAG_PARSING_DATA, data);
    return data;
  }

  remuxAudio (track, timeOffset, contiguous, accurateTimeOffset) {
    const inputTimeScale = track.inputTimeScale;
    const mp4timeScale = track.timescale;
    const scaleFactor = inputTimeScale / mp4timeScale;
    const mp4SampleDuration = track.isAAC ? 1024 : 1152;
    const inputSampleDuration = mp4SampleDuration * scaleFactor;
    const ptsNormalize = this._PTSNormalize;
    const initPTS = this._initPTS;
    const rawMPEG = !track.isAAC && this.typeSupported.mpeg;

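    // For illustration: an AAC frame carries 1024 PCM samples, so with a 44100 Hz
    // track (mp4timeScale = 44100) and inputTimeScale = 90000,
    // scaleFactor = 90000 / 44100 ≈ 2.041 and each frame spans
    // 1024 * 2.041 ≈ 2090 ticks of the 90 kHz input clock (hypothetical rates).
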
    let mp4Sample;
    let fillFrame;
    let mdat;
    let moof;
    let firstPTS;
    let lastPTS;
    let offset = (rawMPEG ? 0 : 8);
    let inputSamples = track.samples;
    let outputSamples = [];
    let nextAudioPts = this.nextAudioPts;

    // for audio samples, also consider consecutive fragments as being contiguous (even if a level switch occurs),
    // for the sake of clarity:
    // consecutive fragments are frags with
    // - less than 100 ms gap between the new time offset (if accurate) and the next expected PTS OR
    // - less than 20 audio frames distance
    // contiguous fragments are consecutive fragments from the same quality level (same level, new SN = old SN + 1)
    // this helps ensure audio continuity
    // and also avoids audio glitches/cuts when switching quality, or reporting a wrong duration on the first audio frame
    contiguous |= (inputSamples.length && nextAudioPts &&
      ((accurateTimeOffset && Math.abs(timeOffset - nextAudioPts / inputTimeScale) < 0.1) ||
      Math.abs((inputSamples[0].pts - nextAudioPts - initPTS)) < 20 * inputSampleDuration)
    );

    // compute normalized PTS
    inputSamples.forEach(function (sample) {
      sample.pts = sample.dts = ptsNormalize(sample.pts - initPTS, timeOffset * inputTimeScale);
    });

    // filter out samples with negative PTS, which are not playable anyway.
    // if we don't remove these negative samples, they will shift all audio samples forward,
    // leading to audio overlap between the current and next fragment
    inputSamples = inputSamples.filter(function (sample) {
      return sample.pts >= 0;
    });

    // in case all samples had negative PTS and have been filtered out, return now
    if (inputSamples.length === 0) {
      return;
    }

    if (!contiguous) {
      if (!accurateTimeOffset) {
        // if frags are not contiguous and we can't trust the time offset, let's use the first sample PTS as the next audio PTS
        nextAudioPts = inputSamples[0].pts;
      } else {
        // if timeOffset is accurate, let's use it as the predicted next audio PTS
        nextAudioPts = timeOffset * inputTimeScale;
      }
    }

    // If the audio track is missing samples, the frames seem to get "left-shifted" within the
    // resulting mp4 segment, causing sync issues and leaving gaps at the end of the audio segment.
    // In an effort to prevent this from happening, we inject frames here where there are gaps.
    // When possible, we inject a silent frame; when that's not possible, we duplicate the last
    // frame.

    if (track.isAAC) {
      const maxAudioFramesDrift = this.config.maxAudioFramesDrift;
      for (let i = 0, nextPts = nextAudioPts; i < inputSamples.length;) {
        // First, let's see how far off this frame is from where we expect it to be
        var sample = inputSamples[i], delta;
        let pts = sample.pts;
        delta = pts - nextPts;

        // If we're overlapping by more than a duration, drop this sample
        if (delta <= -maxAudioFramesDrift * inputSampleDuration) {
          logger.warn(`Dropping 1 audio frame @ ${toMsFromMpegTsClock(nextPts, true)} ms due to ${toMsFromMpegTsClock(delta, true)} ms overlap.`);
          inputSamples.splice(i, 1);
          // Don't touch nextPtsNorm or i
        } // eslint-disable-line brace-style

        // Insert missing frames if:
        // 1: We're more than maxAudioFramesDrift frames away
        // 2: Not more than MAX_SILENT_FRAME_DURATION away
        // 3: currentTime (aka nextPtsNorm) is not 0
        else if (delta >= maxAudioFramesDrift * inputSampleDuration && delta < MAX_SILENT_FRAME_DURATION_90KHZ && nextPts) {
          let missing = Math.round(delta / inputSampleDuration);
          logger.warn(`Injecting ${missing} audio frames @ ${toMsFromMpegTsClock(nextPts, true)} ms due to ${toMsFromMpegTsClock(delta, true)} ms gap.`);
          for (let j = 0; j < missing; j++) {
            let newStamp = Math.max(nextPts, 0);
            fillFrame = AAC.getSilentFrame(track.manifestCodec || track.codec, track.channelCount);
            if (!fillFrame) {
              logger.log('Unable to get silent frame for given audio codec; duplicating last frame instead.');
              fillFrame = sample.unit.subarray();
            }
            inputSamples.splice(i, 0, { unit: fillFrame, pts: newStamp, dts: newStamp });
            nextPts += inputSampleDuration;
            i++;
          }

          // Adjust the sample to the next expected pts
          sample.pts = sample.dts = nextPts;
          nextPts += inputSampleDuration;
          i++;
        } else {
          // Otherwise, just adjust pts
          if (Math.abs(delta) > (0.1 * inputSampleDuration)) {
            // logger.log(`Invalid frame delta ${Math.round(delta + inputSampleDuration)} at PTS ${Math.round(pts / 90)} (should be ${Math.round(inputSampleDuration)}).`);
          }
          sample.pts = sample.dts = nextPts;
          nextPts += inputSampleDuration;
          i++;
        }
      }
    }
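    // For illustration: with a maxAudioFramesDrift of 3 (an assumed config value)
    // and inputSampleDuration ≈ 2090 ticks, a frame arriving more than ~6270 ticks
    // (~70 ms) early is dropped, while one arriving between ~70 ms and 10 s late
    // triggers silent-frame injection (values assume the rates sketched above).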

    // compute the mdat size, as we may have filtered/added some samples
    let nbSamples = inputSamples.length;
    let mdatSize = 0;
    while (nbSamples--) {
      mdatSize += inputSamples[nbSamples].unit.byteLength;
    }

    for (let j = 0, nbSamples = inputSamples.length; j < nbSamples; j++) {
      let audioSample = inputSamples[j];
      let unit = audioSample.unit;
      let pts = audioSample.pts;

      // logger.log(`Audio/PTS:${toMsFromMpegTsClock(pts, true)}`);
      // if not the first sample

      if (lastPTS !== undefined) {
        mp4Sample.duration = Math.round((pts - lastPTS) / scaleFactor);
      } else {
        let delta = pts - nextAudioPts;
        let numMissingFrames = 0;

        // if fragments are contiguous, detect holes/overlaps between fragments
        // contiguous fragments are consecutive fragments from the same quality level (same level, new SN = old SN + 1)
        if (contiguous && track.isAAC) {
          // log delta
          if (delta) {
            if (delta > 0 && delta < MAX_SILENT_FRAME_DURATION_90KHZ) {
              // Q: why do we have to round here? shouldn't this always result in an integer if timestamps are correct,
              // and if not, shouldn't we actually Math.ceil() instead?
              numMissingFrames = Math.round((pts - nextAudioPts) / inputSampleDuration);

              logger.log(`${toMsFromMpegTsClock(delta, true)} ms hole between AAC samples detected, filling it`);
              if (numMissingFrames > 0) {
                fillFrame = AAC.getSilentFrame(track.manifestCodec || track.codec, track.channelCount);
                if (!fillFrame) {
                  fillFrame = unit.subarray();
                }

                mdatSize += numMissingFrames * fillFrame.length;
              }
              // if we have frame overlap, overlapping by more than half a frame duration
            } else if (delta < -12) {
              // drop overlapping audio frames... the browser will deal with it
              logger.log(`drop overlapping AAC sample, expected/parsed/delta: ${toMsFromMpegTsClock(nextAudioPts, true)} ms / ${toMsFromMpegTsClock(pts, true)} ms / ${toMsFromMpegTsClock(-delta, true)} ms`);
              mdatSize -= unit.byteLength;
              continue;
            }
            // set PTS/DTS to the expected PTS/DTS
            pts = nextAudioPts;
          }
        }
        // remember the first PTS of our audioSamples
        firstPTS = pts;
        if (mdatSize > 0) {
          mdatSize += offset;
          try {
            mdat = new Uint8Array(mdatSize);
          } catch (err) {
            this.observer.trigger(Event.ERROR, { type: ErrorTypes.MUX_ERROR, details: ErrorDetails.REMUX_ALLOC_ERROR, fatal: false, bytes: mdatSize, reason: `fail allocating audio mdat ${mdatSize}` });
            return;
          }
          if (!rawMPEG) {
            const view = new DataView(mdat.buffer);
            view.setUint32(0, mdatSize);
            mdat.set(MP4.types.mdat, 4);
          }
        } else {
          // no audio samples
          return;
        }
        for (let i = 0; i < numMissingFrames; i++) {
          fillFrame = AAC.getSilentFrame(track.manifestCodec || track.codec, track.channelCount);
          if (!fillFrame) {
            logger.log('Unable to get silent frame for given audio codec; duplicating this frame instead.');
            fillFrame = unit.subarray();
          }
          mdat.set(fillFrame, offset);
          offset += fillFrame.byteLength;
          mp4Sample = {
            size: fillFrame.byteLength,
            cts: 0,
            duration: 1024,
            flags: {
              isLeading: 0,
              isDependedOn: 0,
              hasRedundancy: 0,
              degradPrio: 0,
              dependsOn: 1
            }
          };
          outputSamples.push(mp4Sample);
        }
      }
      mdat.set(unit, offset);
      let unitLen = unit.byteLength;
      offset += unitLen;
      // console.log('PTS/DTS/initDTS/normPTS/normDTS/relative PTS : ${audioSample.pts}/${audioSample.dts}/${initDTS}/${ptsnorm}/${dtsnorm}/${(audioSample.pts/4294967296).toFixed(3)}');
      mp4Sample = {
        size: unitLen,
        cts: 0,
        duration: 0,
        flags: {
          isLeading: 0,
          isDependedOn: 0,
          hasRedundancy: 0,
          degradPrio: 0,
          dependsOn: 1
        }
      };
      outputSamples.push(mp4Sample);
      lastPTS = pts;
    }
    let lastSampleDuration = 0;
    nbSamples = outputSamples.length;
    // set the last sample duration to be identical to the previous sample's
    if (nbSamples >= 2) {
      lastSampleDuration = outputSamples[nbSamples - 2].duration;
      mp4Sample.duration = lastSampleDuration;
    }
    if (nbSamples) {
      // the next audio sample PTS should be equal to the last sample PTS + duration
      this.nextAudioPts = nextAudioPts = lastPTS + scaleFactor * lastSampleDuration;
      // logger.log('Audio/PTS/PTSend:' + audioSample.pts.toFixed(0) + '/' + this.nextAacDts.toFixed(0));
      track.samples = outputSamples;
      if (rawMPEG) {
        moof = new Uint8Array();
      } else {
        moof = MP4.moof(track.sequenceNumber++, firstPTS / scaleFactor, track);
      }

      track.samples = [];
      const start = firstPTS / inputTimeScale;
      const end = nextAudioPts / inputTimeScale;
      const audioData = {
        data1: moof,
        data2: mdat,
        startPTS: start,
        endPTS: end,
        startDTS: start,
        endDTS: end,
        type: 'audio',
        hasAudio: true,
        hasVideo: false,
        nb: nbSamples
      };
      this.observer.trigger(Event.FRAG_PARSING_DATA, audioData);
      return audioData;
    }
    return null;
  }

  remuxEmptyAudio (track, timeOffset, contiguous, videoData) {
    let inputTimeScale = track.inputTimeScale;
    let mp4timeScale = track.samplerate ? track.samplerate : inputTimeScale;
    let scaleFactor = inputTimeScale / mp4timeScale;
    let nextAudioPts = this.nextAudioPts;

    // sync with the video's timestamps
    let startDTS = (nextAudioPts !== undefined ? nextAudioPts : videoData.startDTS * inputTimeScale) + this._initDTS;
    let endDTS = videoData.endDTS * inputTimeScale + this._initDTS;
    // one sample's duration value
    let sampleDuration = 1024;
    let frameDuration = scaleFactor * sampleDuration;

    // number of samples covering this segment's duration
    let nbSamples = Math.ceil((endDTS - startDTS) / frameDuration);

    // silent frame
    let silentFrame = AAC.getSilentFrame(track.manifestCodec || track.codec, track.channelCount);

    logger.warn('remux empty Audio');
    // Can't remux if we can't generate a silent frame...
    if (!silentFrame) {
      logger.trace('Unable to remuxEmptyAudio since we were unable to get a silent frame for given audio codec!');
      return;
    }

    let samples = [];
    for (let i = 0; i < nbSamples; i++) {
      let stamp = startDTS + i * frameDuration;
      samples.push({ unit: silentFrame, pts: stamp, dts: stamp });
    }
    track.samples = samples;

    this.remuxAudio(track, timeOffset, contiguous);
  }
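
  // For illustration: if the video span to cover is 6 s at inputTimeScale = 90000
  // (540000 ticks) and frameDuration ≈ 2090 ticks (44.1 kHz AAC), remuxEmptyAudio
  // pushes Math.ceil(540000 / 2090) = 259 copies of the silent frame before handing
  // them to remuxAudio (hypothetical values).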

  remuxID3 (track) {
    const length = track.samples.length;
    if (!length) {
      return;
    }
    const inputTimeScale = track.inputTimeScale;
    const initPTS = this._initPTS;
    const initDTS = this._initDTS;
    // consume samples
    for (let index = 0; index < length; index++) {
      const sample = track.samples[index];
      // set id3 pts/dts to relative time,
      // using this._initPTS and this._initDTS to compute it
      sample.pts = ((sample.pts - initPTS) / inputTimeScale);
      sample.dts = ((sample.dts - initDTS) / inputTimeScale);
    }
    this.observer.trigger(Event.FRAG_PARSING_METADATA, {
      samples: track.samples
    });

    track.samples = [];
  }

  remuxText (track) {
    track.samples.sort(function (a, b) {
      return (a.pts - b.pts);
    });

    let length = track.samples.length, sample;
    const inputTimeScale = track.inputTimeScale;
    const initPTS = this._initPTS;
    // consume samples
    if (length) {
      for (let index = 0; index < length; index++) {
        sample = track.samples[index];
        // set text pts to relative time,
        // using this._initPTS to compute it
        sample.pts = ((sample.pts - initPTS) / inputTimeScale);
      }
      this.observer.trigger(Event.FRAG_PARSING_USERDATA, {
        samples: track.samples
      });
    }

    track.samples = [];
  }

  _PTSNormalize (value, reference) {
    let offset;
    if (reference === undefined) {
      return value;
    }

    if (reference < value) {
      // - 2^33
      offset = -8589934592;
    } else {
      // + 2^33
      offset = 8589934592;
    }
    /* PTS is 33 bit (from 0 to 2^33 - 1)
       if the diff between value and reference is bigger than half of the amplitude (2^32),
       it means that PTS looping occurred: fill the gap */
    while (Math.abs(value - reference) > 4294967296) {
      value += offset;
    }

    return value;
  }
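
  // For illustration: if the 33-bit PTS counter wrapped, a raw value of 4500 with
  // a reference of 8589930000 (close to 2^33) differs by more than 2^32, so the
  // loop adds 2^33 once: 4500 + 8589934592 = 8589939092, which is again within
  // 2^32 of the reference (hypothetical values).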
}

export default MP4Remuxer;