ffmpeg – variable timestamps for audio samples in MP4

Consider the following ffprobe output for an MP4 file. It shows the stream metadata followed by the first 11 frames (frame 0 through frame 10) of the audio stream. The audio stream has a sampling rate of 48000 Hz and is encoded as AAC with 1024 samples per AAC frame. This means each AAC frame should have a duration of 1024/48000 = 0.021333333 s.

ffprobe version 5.1.3 Copyright (c) 2007-2022 the FFmpeg developers
built with gcc 13 (GCC)
  configuration: --prefix=/home/thanuja/ffmpeg_build --pkg-config-flags=--static --extra-cflags=-I/home/thanuja/ffmpeg_build/include --extra-ldflags=-L/home/thanuja/ffmpeg_build/lib --extra-libs=-lpthread --extra-libs=-lm --bindir=/home/thanuja/bin --enable-gpl --enable-libfdk_aac --enable-libfreetype --enable-libmp3lame --enable-libopus --enable-libvpx --enable-libx264 --enable-libx265 --enable-nonfree --enable-openssl --enable-demuxer=spdif --enable-decoder=dolby_e --enable-decoder=ac3 --enable-decoder=eac3 --enable-indev=alsa --enable-outdev=alsa --enable-shared
  libavutil      57. 28.100 / 57. 28.100
  libavcodec     59. 37.100 / 59. 37.100
  libavformat    59. 27.100 / 59. 27.100
  libavdevice    59.  7.100 / 59.  7.100
  libavfilter     8. 44.100 /  8. 44.100
  libswscale      6.  7.100 /  6.  7.100
  libswresample   4.  7.100 /  4.  7.100
  libpostproc    56.  6.100 / 56.  6.100
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'file.mp4':
  Metadata:
    major_brand     : mp42
    minor_version   : 1
    compatible_brands: isommp41mp42
    creation_time   : 2023-04-02T23:52:12.000000Z
  Duration: 00:04:47.84, start: 0.000000, bitrate: 7374 kb/s
  Stream #0:0[0x1](und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709/unknown/bt709, progressive), 1920x1080 [SAR 1:1 DAR 16:9], 7198 kb/s, 59.94 fps, 59.94 tbr, 90k tbn (default)
    Metadata:
      creation_time   : 2023-04-02T23:52:12.000000Z
      handler_name    : Core Media Video
      vendor_id       : [0][0][0][0]
  Stream #0:1[0x2](eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 125 kb/s (default)
    Metadata:
      creation_time   : 2023-04-02T23:52:12.000000Z
      handler_name    : Core Media Audio
      vendor_id       : [0][0][0][0]

frames.frame.0.media_type="audio"
frames.frame.0.stream_index=1
frames.frame.0.key_frame=1
frames.frame.0.pts=-1
frames.frame.0.pts_time="-0.000021"
frames.frame.0.pkt_dts=-1
frames.frame.0.pkt_dts_time="-0.000021"
frames.frame.0.best_effort_timestamp=-1
frames.frame.0.best_effort_timestamp_time="-0.000021"
frames.frame.0.pkt_duration=325
frames.frame.0.pkt_duration_time="0.006771"
frames.frame.0.pkt_pos="379"
frames.frame.0.pkt_size="334"
frames.frame.0.sample_fmt="fltp"
frames.frame.0.nb_samples=326
frames.frame.0.channels=2
frames.frame.0.channel_layout="stereo"
frames.frame.1.media_type="audio"
frames.frame.1.stream_index=1
frames.frame.1.key_frame=1
frames.frame.1.pts=324
frames.frame.1.pts_time="0.006750"
frames.frame.1.pkt_dts=324
frames.frame.1.pkt_dts_time="0.006750"
frames.frame.1.best_effort_timestamp=324
frames.frame.1.best_effort_timestamp_time="0.006750"
frames.frame.1.pkt_duration=1023
frames.frame.1.pkt_duration_time="0.021312"
frames.frame.1.pkt_pos="713"
frames.frame.1.pkt_size="334"
frames.frame.1.sample_fmt="fltp"
frames.frame.1.nb_samples=1024
frames.frame.1.channels=2
frames.frame.1.channel_layout="stereo"
frames.frame.2.media_type="audio"
frames.frame.2.stream_index=1
frames.frame.2.key_frame=1
frames.frame.2.pts=1347
frames.frame.2.pts_time="0.028063"
frames.frame.2.pkt_dts=1347
frames.frame.2.pkt_dts_time="0.028063"
frames.frame.2.best_effort_timestamp=1347
frames.frame.2.best_effort_timestamp_time="0.028063"
frames.frame.2.pkt_duration=1026
frames.frame.2.pkt_duration_time="0.021375"
frames.frame.2.pkt_pos="1047"
frames.frame.2.pkt_size="335"
frames.frame.2.sample_fmt="fltp"
frames.frame.2.nb_samples=1024
frames.frame.2.channels=2
frames.frame.2.channel_layout="stereo"
frames.frame.3.media_type="audio"
frames.frame.3.stream_index=1
frames.frame.3.key_frame=1
frames.frame.3.pts=2373
frames.frame.3.pts_time="0.049437"
frames.frame.3.pkt_dts=2373
frames.frame.3.pkt_dts_time="0.049437"
frames.frame.3.best_effort_timestamp=2373
frames.frame.3.best_effort_timestamp_time="0.049437"
frames.frame.3.pkt_duration=1023
frames.frame.3.pkt_duration_time="0.021312"
frames.frame.3.pkt_pos="1382"
frames.frame.3.pkt_size="334"
frames.frame.3.sample_fmt="fltp"
frames.frame.3.nb_samples=1024
frames.frame.3.channels=2
frames.frame.3.channel_layout="stereo"
frames.frame.4.media_type="audio"
frames.frame.4.stream_index=1
frames.frame.4.key_frame=1
frames.frame.4.pts=3396
frames.frame.4.pts_time="0.070750"
frames.frame.4.pkt_dts=3396
frames.frame.4.pkt_dts_time="0.070750"
frames.frame.4.best_effort_timestamp=3396
frames.frame.4.best_effort_timestamp_time="0.070750"
frames.frame.4.pkt_duration=1023
frames.frame.4.pkt_duration_time="0.021312"
frames.frame.4.pkt_pos="1716"
frames.frame.4.pkt_size="334"
frames.frame.4.sample_fmt="fltp"
frames.frame.4.nb_samples=1024
frames.frame.4.channels=2
frames.frame.4.channel_layout="stereo"
frames.frame.5.media_type="audio"
frames.frame.5.stream_index=1
frames.frame.5.key_frame=1
frames.frame.5.pts=4419
frames.frame.5.pts_time="0.092062"
frames.frame.5.pkt_dts=4419
frames.frame.5.pkt_dts_time="0.092062"
frames.frame.5.best_effort_timestamp=4419
frames.frame.5.best_effort_timestamp_time="0.092062"
frames.frame.5.pkt_duration=1026
frames.frame.5.pkt_duration_time="0.021375"
frames.frame.5.pkt_pos="2050"
frames.frame.5.pkt_size="335"
frames.frame.5.sample_fmt="fltp"
frames.frame.5.nb_samples=1024
frames.frame.5.channels=2
frames.frame.5.channel_layout="stereo"
frames.frame.6.media_type="audio"
frames.frame.6.stream_index=1
frames.frame.6.key_frame=1
frames.frame.6.pts=5445
frames.frame.6.pts_time="0.113437"
frames.frame.6.pkt_dts=5445
frames.frame.6.pkt_dts_time="0.113437"
frames.frame.6.best_effort_timestamp=5445
frames.frame.6.best_effort_timestamp_time="0.113437"
frames.frame.6.pkt_duration=1023
frames.frame.6.pkt_duration_time="0.021312"
frames.frame.6.pkt_pos="2385"
frames.frame.6.pkt_size="334"
frames.frame.6.sample_fmt="fltp"
frames.frame.6.nb_samples=1024
frames.frame.6.channels=2
frames.frame.6.channel_layout="stereo"
frames.frame.7.media_type="audio"
frames.frame.7.stream_index=1
frames.frame.7.key_frame=1
frames.frame.7.pts=6468
frames.frame.7.pts_time="0.134750"
frames.frame.7.pkt_dts=6468
frames.frame.7.pkt_dts_time="0.134750"
frames.frame.7.best_effort_timestamp=6468
frames.frame.7.best_effort_timestamp_time="0.134750"
frames.frame.7.pkt_duration=1023
frames.frame.7.pkt_duration_time="0.021312"
frames.frame.7.pkt_pos="2719"
frames.frame.7.pkt_size="334"
frames.frame.7.sample_fmt="fltp"
frames.frame.7.nb_samples=1024
frames.frame.7.channels=2
frames.frame.7.channel_layout="stereo"
frames.frame.8.media_type="audio"
frames.frame.8.stream_index=1
frames.frame.8.key_frame=1
frames.frame.8.pts=7491
frames.frame.8.pts_time="0.156062"
frames.frame.8.pkt_dts=7491
frames.frame.8.pkt_dts_time="0.156062"
frames.frame.8.best_effort_timestamp=7491
frames.frame.8.best_effort_timestamp_time="0.156062"
frames.frame.8.pkt_duration=1026
frames.frame.8.pkt_duration_time="0.021375"
frames.frame.8.pkt_pos="3053"
frames.frame.8.pkt_size="335"
frames.frame.8.sample_fmt="fltp"
frames.frame.8.nb_samples=1024
frames.frame.8.channels=2
frames.frame.8.channel_layout="stereo"
frames.frame.9.media_type="audio"
frames.frame.9.stream_index=1
frames.frame.9.key_frame=1
frames.frame.9.pts=8517
frames.frame.9.pts_time="0.177437"
frames.frame.9.pkt_dts=8517
frames.frame.9.pkt_dts_time="0.177437"
frames.frame.9.best_effort_timestamp=8517
frames.frame.9.best_effort_timestamp_time="0.177437"
frames.frame.9.pkt_duration=1023
frames.frame.9.pkt_duration_time="0.021312"
frames.frame.9.pkt_pos="3388"
frames.frame.9.pkt_size="334"
frames.frame.9.sample_fmt="fltp"
frames.frame.9.nb_samples=1024
frames.frame.9.channels=2
frames.frame.9.channel_layout="stereo"
frames.frame.10.media_type="audio"
frames.frame.10.stream_index=1
frames.frame.10.key_frame=1
frames.frame.10.pts=9540
frames.frame.10.pts_time="0.198750"
frames.frame.10.pkt_dts=9540
frames.frame.10.pkt_dts_time="0.198750"
frames.frame.10.best_effort_timestamp=9540
frames.frame.10.best_effort_timestamp_time="0.198750"
frames.frame.10.pkt_duration=1023
frames.frame.10.pkt_duration_time="0.021312"
frames.frame.10.pkt_pos="3722"
frames.frame.10.pkt_size="334"
frames.frame.10.sample_fmt="fltp"
frames.frame.10.nb_samples=1024
frames.frame.10.channels=2
frames.frame.10.channel_layout="stereo"

However, if we check pkt_duration (which equals the difference between consecutive PTS values) for the first few frames, we see the following:

frames.frame.1.pkt_duration=1023 
frames.frame.1.pkt_duration_time="0.021312" 
frames.frame.1.nb_samples=1024 

frames.frame.2.pkt_duration=1026 
frames.frame.2.pkt_duration_time="0.021375" 
frames.frame.2.nb_samples=1024 

frames.frame.3.pkt_duration=1023 
frames.frame.3.pkt_duration_time="0.021312" 
frames.frame.3.nb_samples=1024 

frames.frame.4.pkt_duration=1023 
frames.frame.4.pkt_duration_time="0.021312" 
frames.frame.4.nb_samples=1024 

frames.frame.5.pkt_duration=1026 
frames.frame.5.pkt_duration_time="0.021375" 
frames.frame.5.nb_samples=1024

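As I understand it, this means that frame 1 has 1024 samples but a duration of only 0.021312 s (1023 time-base units of 1/48000 s), which is less than the 0.021333 s it should have. This implies that the per-frame timing doesn’t strictly adhere to the 48 kHz sampling rate. Note, however, that any three consecutive durations above sum to 3072 = 3 × 1024 units, so the timestamps are exact on average even though individual frames deviate.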

Can you please help me understand the following:

What are the possible reasons for MP4 timestamps to give audio frames varying durations even though every frame contains the same number of samples?
How does a media player deal with this? For instance, frame 2 has 1024 samples, so if the 48 kHz sampling rate is honoured it should have a duration of 0.021333 s. However, frame 2 has a duration of 0.021375 s, so with respect to timing it is two sample periods longer. Does the media player respect these durations, or does it ignore them and simply play the samples at the 48 kHz sampling rate?