| Index: webrtc/modules/audio_coding/codecs/opus/opus/src/doc/draft-ietf-codec-oggopus.xml
|
| diff --git a/webrtc/modules/audio_coding/codecs/opus/opus/src/doc/draft-ietf-codec-oggopus.xml b/webrtc/modules/audio_coding/codecs/opus/opus/src/doc/draft-ietf-codec-oggopus.xml
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..7489c20146e32c2ee71b56b73a190ae2925a7bfc
|
| --- /dev/null
|
| +++ b/webrtc/modules/audio_coding/codecs/opus/opus/src/doc/draft-ietf-codec-oggopus.xml
|
| @@ -0,0 +1,1751 @@
|
| +<?xml version="1.0" encoding="utf-8"?>
|
| +<!DOCTYPE rfc SYSTEM 'rfc2629.dtd' [
|
| +<!ENTITY rfc2119 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml'>
|
| +<!ENTITY rfc3533 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3533.xml'>
|
| +<!ENTITY rfc3629 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3629.xml'>
|
| +<!ENTITY rfc4732 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4732.xml'>
|
| +<!ENTITY rfc5226 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5226.xml'>
|
| +<!ENTITY rfc5334 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5334.xml'>
|
| +<!ENTITY rfc6381 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6381.xml'>
|
| +<!ENTITY rfc6716 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6716.xml'>
|
| +<!ENTITY rfc6982 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6982.xml'>
|
| +<!ENTITY rfc7587 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.7587.xml'>
|
| +]>
|
| +<?rfc toc="yes" symrefs="yes" ?>
|
| +
|
| +<rfc ipr="trust200902" category="std" docName="draft-ietf-codec-oggopus-09"
|
| + updates="5334">
|
| +
|
| +<front>
|
| +<title abbrev="Ogg Opus">Ogg Encapsulation for the Opus Audio Codec</title>
|
| +<author initials="T.B." surname="Terriberry" fullname="Timothy B. Terriberry">
|
| +<organization>Mozilla Corporation</organization>
|
| +<address>
|
| +<postal>
|
| +<street>650 Castro Street</street>
|
| +<city>Mountain View</city>
|
| +<region>CA</region>
|
| +<code>94041</code>
|
| +<country>USA</country>
|
| +</postal>
|
| +<phone>+1 650 903-0800</phone>
|
| +<email>tterribe@xiph.org</email>
|
| +</address>
|
| +</author>
|
| +
|
| +<author initials="R." surname="Lee" fullname="Ron Lee">
|
| +<organization>Voicetronix</organization>
|
| +<address>
|
| +<postal>
|
| +<street>246 Pulteney Street, Level 1</street>
|
| +<city>Adelaide</city>
|
| +<region>SA</region>
|
| +<code>5000</code>
|
| +<country>Australia</country>
|
| +</postal>
|
| +<phone>+61 8 8232 9112</phone>
|
| +<email>ron@debian.org</email>
|
| +</address>
|
| +</author>
|
| +
|
| +<author initials="R." surname="Giles" fullname="Ralph Giles">
|
| +<organization>Mozilla Corporation</organization>
|
| +<address>
|
| +<postal>
|
| +<street>163 West Hastings Street</street>
|
| +<city>Vancouver</city>
|
| +<region>BC</region>
|
| +<code>V6B 1H5</code>
|
| +<country>Canada</country>
|
| +</postal>
|
| +<phone>+1 778 785 1540</phone>
|
| +<email>giles@xiph.org</email>
|
| +</address>
|
| +</author>
|
| +
|
| +<date day="23" month="November" year="2015"/>
|
| +<area>RAI</area>
|
| +<workgroup>codec</workgroup>
|
| +
|
| +<abstract>
|
| +<t>
|
| +This document defines the Ogg encapsulation for the Opus interactive speech and
|
| + audio codec.
|
| +This allows data encoded in the Opus format to be stored in an Ogg logical
|
| + bitstream.
|
| +</t>
|
| +</abstract>
|
| +</front>
|
| +
|
| +<middle>
|
| +<section anchor="intro" title="Introduction">
|
| +<t>
|
| +The IETF Opus codec is a low-latency audio codec optimized for both voice and
|
| + general-purpose audio.
|
| +See <xref target="RFC6716"/> for technical details.
|
| +This document defines the encapsulation of Opus in a continuous, logical Ogg
|
| + bitstream <xref target="RFC3533"/>.
|
| +Ogg encapsulation provides Opus with a long-term storage format supporting
|
| + all of the essential features, including metadata, fast and accurate seeking,
|
| + corruption detection, recapture after errors, low overhead, and the ability to
|
| + multiplex Opus with other codecs (including video) with minimal buffering.
|
| +It also provides a live streamable format, capable of delivery over a reliable
|
| + stream-oriented transport, without requiring all the data, or even the total
|
| + length of the data, up-front, in a form that is identical to the on-disk
|
| + storage format.
|
| +</t>
|
| +<t>
|
| +Ogg bitstreams are made up of a series of 'pages', each of which contains data
|
| + from one or more 'packets'.
|
| +Pages are the fundamental unit of multiplexing in an Ogg stream.
|
| +Each page is associated with a particular logical stream and contains a capture
|
| + pattern and checksum, flags to mark the beginning and end of the logical
|
| + stream, and a 'granule position' that represents an absolute position in the
|
| + stream, to aid seeking.
|
| +A single page can contain up to 65,025 octets of packet data from up to 255
|
| + different packets.
|
| +Packets can be split arbitrarily across pages, and continued from one page to
|
| + the next (allowing packets much larger than would fit on a single page).
|
| +Each page contains 'lacing values' that indicate how the data is partitioned
|
| + into packets, allowing a demultiplexer (demuxer) to recover the packet
|
| + boundaries without examining the encoded data.
|
| +A packet is said to 'complete' on a page when the page contains the final
|
| + lacing value corresponding to that packet.
|
| +</t>
|
| +<t>
|
| +This encapsulation defines the contents of the packet data, including
|
| + the necessary headers, the organization of those packets into a logical
|
| + stream, and the interpretation of the codec-specific granule position field.
|
| +It does not attempt to describe or specify the existing Ogg container format.
|
| +Readers unfamiliar with the basic concepts mentioned above are encouraged to
|
| + review the details in <xref target="RFC3533"/>.
|
| +</t>
|
| +
|
| +</section>
|
| +
|
| +<section anchor="terminology" title="Terminology">
|
| +<t>
|
| +The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
|
| + "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this
|
| + document are to be interpreted as described in <xref target="RFC2119"/>.
|
| +</t>
|
| +
|
| +</section>
|
| +
|
| +<section anchor="packet_organization" title="Packet Organization">
|
| +<t>
|
| +An Ogg Opus stream is organized as follows.
|
| +</t>
|
| +<t>
|
| +There are two mandatory header packets.
|
| +The first packet in the logical Ogg bitstream MUST contain the identification
|
| + (ID) header, which uniquely identifies a stream as Opus audio.
|
| +The format of this header is defined in <xref target="id_header"/>.
|
| +It is placed alone (without any other packet data) on the first page of
|
| + the logical Ogg bitstream, and completes on that page.
|
| +This page has its 'beginning of stream' flag set.
|
| +</t>
|
| +<t>
|
| +The second packet in the logical Ogg bitstream MUST contain the comment header,
|
| + which contains user-supplied metadata.
|
| +The format of this header is defined in <xref target="comment_header"/>.
|
| +It MAY span multiple pages, beginning on the second page of the logical
|
| + stream.
|
| +However many pages it spans, the comment header packet MUST finish the page on
|
| + which it completes.
|
| +</t>
|
| +<t>
|
| +All subsequent pages are audio data pages, and the Ogg packets they contain are
|
| + audio data packets.
|
| +Each audio data packet contains one Opus packet for each of N different
|
| + streams, where N is typically one for mono or stereo, but MAY be greater than
|
| + one for multichannel audio.
|
| +The value N is specified in the ID header (see
|
| + <xref target="channel_mapping"/>), and is fixed over the entire length of the
|
| + logical Ogg bitstream.
|
| +</t>
|
| +<t>
|
| +The first (N - 1) Opus packets, if any, are packed one after another
|
| + into the Ogg packet, using the self-delimiting framing from Appendix B of
|
| + <xref target="RFC6716"/>.
|
| +The remaining Opus packet is packed at the end of the Ogg packet using the
|
| + regular, undelimited framing from Section 3 of <xref target="RFC6716"/>.
|
| +All of the Opus packets in a single Ogg packet MUST be constrained to have the
|
| + same duration.
|
| +An implementation of this specification SHOULD treat any Opus packet whose
|
| + duration is different from that of the first Opus packet in an Ogg packet as
|
| + if it were a malformed Opus packet with an invalid Table Of Contents (TOC)
|
| + sequence.
|
| +</t>
|
| +<t>
|
| +The TOC sequence at the beginning of each Opus packet indicates the coding
|
| + mode, audio bandwidth, channel count, duration (frame size), and number of
|
| + frames per packet, as described in Section 3.1
|
| + of <xref target="RFC6716"/>.
|
| +The coding mode is one of SILK, Hybrid, or Constrained Energy Lapped Transform
|
| + (CELT).
|
| +The combination of coding mode, audio bandwidth, and frame size is referred to
|
| + as the configuration of an Opus packet.
|
| +</t>
|
| +<t>
|
| +Packets are placed into Ogg pages in order until the end of stream.
|
| +Audio data packets might span page boundaries.
|
| +The first audio data page could have the 'continued packet' flag set
|
| + (indicating the first audio data packet is continued from a previous page) if,
|
| + for example, it was a live stream joined mid-broadcast, with the headers
|
| + pasted on the front.
|
| +A demuxer SHOULD NOT attempt to decode the data for the first packet on a page
|
| + with the 'continued packet' flag set if the previous page with packet data
|
| + does not end in a continued packet (i.e., did not end with a lacing value of
|
| + 255) or if the page sequence numbers are not consecutive, unless the demuxer
|
| + has some special knowledge that would allow it to interpret this data
|
| + despite the missing pieces.
|
| +An implementation MUST treat a zero-octet audio data packet as if it were a
|
| + malformed Opus packet as described in
|
| + Section 3.4 of <xref target="RFC6716"/>.
|
| +</t>
|
| +<t>
|
| +A logical stream ends with a page with the 'end of stream' flag set, but
|
| + implementations need to be prepared to deal with truncated streams that do not
|
| + have a page marked 'end of stream'.
|
| +There is no reason for the final packet on the last page to be a continued
|
| + packet, i.e., for the final lacing value to be 255.
|
| +However, demuxers might encounter such streams, possibly as the result of a
|
| + transfer that did not complete or of corruption.
|
| +A demuxer SHOULD NOT attempt to decode the data from a packet that continues
|
| + onto a subsequent page (i.e., when the page ends with a lacing value of 255)
|
| + if the next page with packet data does not have the 'continued packet' flag
|
| + set or does not exist, or if the page sequence numbers are not consecutive,
|
| + unless the demuxer has some special knowledge that would allow it to interpret
|
| + this data despite the missing pieces.
|
| +There MUST NOT be any more pages in an Opus logical bitstream after a page
|
| + marked 'end of stream'.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="granpos" title="Granule Position">
|
| +<t>
|
| +The granule position MUST be zero for the ID header page and the
|
| + page where the comment header completes.
|
| +That is, the first page in the logical stream, and the last header
|
| + page before the first audio data page both have a granule position of zero.
|
| +</t>
|
| +<t>
|
| +The granule position of an audio data page encodes the total number of PCM
|
| + samples in the stream up to and including the last fully-decodable sample from
|
| + the last packet completed on that page.
|
| +The granule position of the first audio data page will usually be larger than
|
| + zero, as described in <xref target="start_granpos_restrictions"/>.
|
| +</t>
|
| +
|
| +<t>
|
| +A page that is entirely spanned by a single packet (that completes on a
|
| + subsequent page) has no granule position, and the granule position field is
|
| + set to the special value '-1' in two's complement.
|
| +</t>
|
| +
|
| +<t>
|
| +The granule position of an audio data page is in units of PCM audio samples at
|
| + a fixed rate of 48 kHz (per channel; a stereo stream's granule position
|
| + does not increment at twice the speed of a mono stream).
|
| +It is possible to run an Opus decoder at other sampling rates, but the value
|
| + in the granule position field always counts samples assuming a 48 kHz
|
| + decoding rate, and the rest of this specification makes the same assumption.
|
| +</t>
|
| +
|
| +<t>
|
| +The duration of an Opus packet can be any multiple of 2.5 ms, up to a
|
| + maximum of 120 ms.
|
| +This duration is encoded in the TOC sequence at the beginning of each packet.
|
| +The number of samples returned by a decoder corresponds to this duration
|
| + exactly, even for the first few packets.
|
| +For example, a 20 ms packet fed to a decoder running at 48 kHz will
|
| + always return 960 samples.
|
| +A demuxer can parse the TOC sequence at the beginning of each Ogg packet to
|
| + work backwards or forwards from a packet with a known granule position (i.e.,
|
| + the last packet completed on some page) in order to assign granule positions
|
| + to every packet, or even every individual sample.
|
| +The one exception is the last page in the stream, as described below.
|
| +</t>
|
| +
|
| +<t>
|
| +All other pages with completed packets after the first MUST have a granule
|
| + position equal to the number of samples contained in packets that complete on
|
| + that page plus the granule position of the most recent page with completed
|
| + packets.
|
| +This guarantees that a demuxer can assign individual packets the same granule
|
| + position when working forwards as when working backwards.
|
| +For this to work, there cannot be any gaps.
|
| +</t>
|
| +
|
| +<section anchor="gap-repair" title="Repairing Gaps in Real-time Streams">
|
| +<t>
|
| +In order to support capturing a real-time stream that has lost or not
|
| + transmitted packets, a multiplexer (muxer) SHOULD emit packets that explicitly
|
| + request the use of Packet Loss Concealment (PLC) in place of the missing
|
| + packets.
|
| +Implementations that fail to do so still MUST NOT increment the granule
|
| + position for a page by anything other than the number of samples contained in
|
| + packets that actually complete on that page.
|
| +</t>
|
| +<t>
|
| +Only gaps that are a multiple of 2.5 ms are repairable, as these are the
|
| + only durations that can be created by packet loss or discontinuous
|
| + transmission.
|
| +Muxers need not handle other gap sizes.
|
| +Creating the necessary packets involves synthesizing a TOC byte (defined in
|
| +Section 3.1 of <xref target="RFC6716"/>)—and whatever
|
| + additional internal framing is needed—to indicate the packet duration
|
| + for each stream.
|
| +The actual length of each missing Opus frame inside the packet is zero bytes,
|
| + as defined in Section 3.2.1 of <xref target="RFC6716"/>.
|
| +</t>
|
| +
|
| +<t>
|
| +Zero-byte frames MAY be packed into packets using any of codes 0, 1,
|
| + 2, or 3.
|
| +When successive frames have the same configuration, the higher code packings
|
| + reduce overhead.
|
| +Likewise, if the TOC configuration matches, the muxer MAY further combine the
|
| + empty frames with previous or subsequent non-zero-length frames (using
|
| + code 2 or VBR code 3).
|
| +</t>
|
| +
|
| +<t>
|
| +<xref target="RFC6716"/> does not impose any requirements on the PLC, but this
|
| + section outlines choices that are expected to have a positive influence on
|
| + most PLC implementations, including the reference implementation.
|
| +Synthesized TOC sequences SHOULD maintain the same mode, audio bandwidth,
|
| + channel count, and frame size as the previous packet (if any).
|
| +This is the simplest and usually the most well-tested case for the PLC to
|
| + handle and it covers all losses that do not include a configuration switch,
|
| + as defined in Section 4.5 of <xref target="RFC6716"/>.
|
| +</t>
|
| +
|
| +<t>
|
| +When a previous packet is available, keeping the audio bandwidth and channel
|
| + count the same allows the PLC to provide maximum continuity in the concealment
|
| + data it generates.
|
| +However, if the size of the gap is not a multiple of the most recent frame
|
| + size, then the frame size will have to change for at least some frames.
|
| +Such changes SHOULD be delayed as long as possible to simplify
|
| + things for PLC implementations.
|
| +</t>
|
| +
|
| +<t>
|
| +As an example, a 95 ms gap could be encoded as nineteen 5 ms frames
|
| + in two bytes with a single CBR code 3 packet.
|
| +If the previous frame size was 20 ms, using four 20 ms frames
|
| + followed by three 5 ms frames requires 4 bytes (plus an extra byte
|
| + of Ogg lacing overhead), but allows the PLC to use its well-tested steady
|
| + state behavior for as long as possible.
|
| +The total bitrate of the latter approach, including Ogg overhead, is about
|
| + 0.4 kbps, so the impact on file size is minimal.
|
| +</t>
|
| +
|
| +<t>
|
| +Changing modes is discouraged, since this causes some decoder implementations
|
| + to reset their PLC state.
|
| +However, SILK and Hybrid mode frames cannot fill gaps that are not a multiple
|
| + of 10 ms.
|
| +If switching to CELT mode is needed to match the gap size, a muxer SHOULD do
|
| + so at the end of the gap to allow the PLC to function for as long as possible.
|
| +</t>
|
| +
|
| +<t>
|
| +In the example above, if the previous frame was a 20 ms SILK mode frame,
|
| + the better solution is to synthesize a packet describing four 20 ms SILK
|
| + frames, followed by a packet with a single 10 ms SILK
|
| + frame, and finally a packet with a 5 ms CELT frame, to fill the 95 ms
|
| + gap.
|
| +This also requires four bytes to describe the synthesized packet data (two
|
| + bytes for a CBR code 3 and one byte each for two code 0 packets) but three
|
| + bytes of Ogg lacing overhead are needed to mark the packet boundaries.
|
| +At 0.6 kbps, this is still a minimal bitrate impact over a naive, low quality
|
| + solution.
|
| +</t>
|
| +
|
| +<t>
|
| +Since medium-band audio is an option only in the SILK mode, wideband frames
|
| + SHOULD be generated if switching from that configuration to CELT mode, to
|
| + ensure that any PLC implementation which does try to migrate state between
|
| + the modes will be able to preserve all of the available audio bandwidth.
|
| +</t>
|
| +
|
| +</section>
|
| +
|
| +<section anchor="preskip" title="Pre-skip">
|
| +<t>
|
| +There is some amount of latency introduced during the decoding process, to
|
| + allow for overlap in the CELT mode, stereo mixing in the SILK mode, and
|
| + resampling.
|
| +The encoder might have introduced additional latency through its own resampling
|
| + and analysis (though the exact amount is not specified).
|
| +Therefore, the first few samples produced by the decoder do not correspond to
|
| + real input audio, but are instead composed of padding inserted by the encoder
|
| + to compensate for this latency.
|
| +These samples need to be stored and decoded, as Opus is an asymptotically
|
| + convergent predictive codec, meaning the decoded contents of each frame depend
|
| + on the recent history of decoder inputs.
|
| +However, a player will want to skip these samples after decoding them.
|
| +</t>
|
| +
|
| +<t>
|
| +A 'pre-skip' field in the ID header (see <xref target="id_header"/>) signals
|
| + the number of samples that SHOULD be skipped (decoded but discarded) at the
|
| + beginning of the stream, though some specific applications might have a reason
|
| + for looking at that data.
|
| +This amount need not be a multiple of 2.5 ms, MAY be smaller than a single
|
| + packet, or MAY span the contents of several packets.
|
| +These samples are not valid audio.
|
| +</t>
|
| +
|
| +<t>
|
| +For example, if the first Opus frame uses the CELT mode, it will always
|
| + produce 120 samples of windowed overlap-add data.
|
| +However, the overlap data is initially all zeros (since there is no prior
|
| + frame), meaning this cannot, in general, accurately represent the original
|
| + audio.
|
| +The SILK mode requires additional delay to account for its analysis and
|
| + resampling latency.
|
| +The encoder delays the original audio to avoid this problem.
|
| +</t>
|
| +
|
| +<t>
|
| +The pre-skip field MAY also be used to perform sample-accurate cropping of
|
| + already encoded streams.
|
| +In this case, a value of at least 3,840 samples (80 ms) provides
|
| + sufficient history to the decoder that it will have converged
|
| + before the stream's output begins.
|
| +</t>
|
| +
|
| +</section>
|
| +
|
| +<section anchor="pcm_sample_position" title="PCM Sample Position">
|
| +<t>
|
| +The PCM sample position is determined from the granule position using the
|
| + formula
|
| +</t>
|
| +<figure align="center">
|
| +<artwork align="center"><![CDATA[
|
| +'PCM sample position' = 'granule position' - 'pre-skip' .
|
| +]]></artwork>
|
| +</figure>
|
| +
|
| +<t>
|
| +For example, if the granule position of the first audio data page is 59,971,
|
| + and the pre-skip is 11,971, then the PCM sample position of the last decoded
|
| + sample from that page is 48,000.
|
| +</t>
|
| +<t>
|
| +This can be converted into a playback time using the formula
|
| +</t>
|
| +<figure align="center">
|
| +<artwork align="center"><![CDATA[
|
| + 'PCM sample position'
|
| +'playback time' = --------------------- .
|
| + 48000.0
|
| +]]></artwork>
|
| +</figure>
|
| +
|
| +<t>
|
| +The initial PCM sample position before any samples are played is normally '0'.
|
| +In this case, the PCM sample position of the first audio sample to be played
|
| + starts at '1', because it marks the time on the clock
|
| + <spanx style="emph">after</spanx> that sample has been played, and a stream
|
| + that is exactly one second long has a final PCM sample position of '48000',
|
| + as in the example here.
|
| +</t>
|
| +
|
| +<t>
|
| +Vorbis streams use a granule position smaller than the number of audio samples
|
| + contained in the first audio data page to indicate that some of those samples
|
| + are trimmed from the output (see <xref target="vorbis-trim"/>).
|
| +However, to do so, Vorbis requires that the first audio data page contains
|
| + exactly two packets, in order to allow the decoder to perform PCM position
|
| + adjustments before needing to return any PCM data.
|
| +Opus uses the pre-skip mechanism for this purpose instead, since the encoder
|
| + might introduce more than a single packet's worth of latency, and since very
|
| + large packets in streams with a very large number of channels might not fit
|
| + on a single page.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="end_trimming" title="End Trimming">
|
| +<t>
|
| +The page with the 'end of stream' flag set MAY have a granule position that
|
| + indicates the page contains less audio data than would normally be returned by
|
| + decoding up through the final packet.
|
| +This is used to end the stream somewhere other than an even frame boundary.
|
| +The granule position of the most recent audio data page with completed packets
|
| + is used to make this determination, or '0' is used if there were no previous
|
| + audio data pages with a completed packet.
|
| +The difference between these granule positions indicates how many samples to
|
| + keep after decoding the packets that completed on the final page.
|
| +The remaining samples are discarded.
|
| +The number of discarded samples SHOULD be no larger than the number decoded
|
| + from the last packet.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="start_granpos_restrictions"
|
| + title="Restrictions on the Initial Granule Position">
|
| +<t>
|
| +The granule position of the first audio data page with a completed packet MAY
|
| + be larger than the number of samples contained in packets that complete on
|
| + that page; however, it MUST NOT be smaller, unless that page has the 'end of
|
| + stream' flag set.
|
| +Allowing a granule position larger than the number of samples allows the
|
| + beginning of a stream to be cropped or a live stream to be joined without
|
| + rewriting the granule position of all the remaining pages.
|
| +This means that the PCM sample position just before the first sample to be
|
| + played MAY be larger than '0'.
|
| +Synchronization when multiplexing with other logical streams still uses the PCM
|
| + sample position relative to '0' to compute sample times.
|
| +This does not affect the behavior of pre-skip: exactly 'pre-skip' samples
|
| + SHOULD be skipped from the beginning of the decoded output, even if the
|
| + initial PCM sample position is greater than zero.
|
| +</t>
|
| +
|
| +<t>
|
| +On the other hand, a granule position that is smaller than the number of
|
| + decoded samples prevents a demuxer from working backwards to assign each
|
| + packet or each individual sample a valid granule position, since granule
|
| + positions are non-negative.
|
| +An implementation MUST reject as invalid any stream where the granule position
|
| + is smaller than the number of samples contained in packets that complete on
|
| + the first audio data page with a completed packet, unless that page has the
|
| + 'end of stream' flag set.
|
| +It MAY defer this action until it decodes the last packet completed on that
|
| + page.
|
| +</t>
|
| +
|
| +<t>
|
| +If that page has the 'end of stream' flag set, a demuxer MUST reject as invalid
|
| + any stream where its granule position is smaller than the 'pre-skip' amount.
|
| +This would indicate that there are more samples to be skipped from the initial
|
| + decoded output than exist in the stream.
|
| +If the granule position is smaller than the number of decoded samples produced
|
| + by the packets that complete on that page, then a demuxer MUST use an initial
|
| + granule position of '0', and can work forwards from '0' to timestamp
|
| + individual packets.
|
| +If the granule position is larger than the number of decoded samples available,
|
| + then the demuxer MUST still work backwards as described above, even if the
|
| + 'end of stream' flag is set, to determine the initial granule position, and
|
| + thus the initial PCM sample position.
|
| +Both of these will be greater than '0' in this case.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="seeking_and_preroll" title="Seeking and Pre-roll">
|
| +<t>
|
| +Seeking in Ogg files is best performed using a bisection search for a page
|
| + whose granule position corresponds to a PCM position at or before the seek
|
| + target.
|
| +With appropriately weighted bisection, accurate seeking can be performed in
|
| + just one or two bisections on average, even in multi-gigabyte files.
|
| +See <xref target="seeking"/> for an example of general implementation guidance.
|
| +</t>
|
| +
|
| +<t>
|
| +When seeking within an Ogg Opus stream, an implementation SHOULD start decoding
|
| + (and discarding the output) at least 3,840 samples (80 ms) prior to
|
| + the seek target in order to ensure that the output audio is correct by the
|
| + time it reaches the seek target.
|
| +This 'pre-roll' is separate from, and unrelated to, the 'pre-skip' used at the
|
| + beginning of the stream.
|
| +If the point 80 ms prior to the seek target comes before the initial PCM
|
| + sample position, an implementation SHOULD start decoding from the beginning of
|
| + the stream, applying pre-skip as normal, regardless of whether the pre-skip is
|
| + larger or smaller than 80 ms, and then continue to discard samples
|
| + to reach the seek target (if any).
|
| +</t>
|
| +</section>
|
| +
|
| +</section>
|
| +
|
| +<section anchor="headers" title="Header Packets">
|
| +<t>
|
| +An Ogg Opus logical stream contains exactly two mandatory header packets:
|
| + an identification header and a comment header.
|
| +</t>
|
| +
|
| +<section anchor="id_header" title="Identification Header">
|
| +
|
| +<figure anchor="id_header_packet" title="ID Header Packet" align="center">
|
| +<artwork align="center"><![CDATA[
|
| + 0 1 2 3
|
| + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| 'O' | 'p' | 'u' | 's' |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| 'H' | 'e' | 'a' | 'd' |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| Version = 1 | Channel Count | Pre-skip |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| Input Sample Rate (Hz) |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| Output Gain (Q7.8 in dB) | Mapping Family| |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ :
|
| +| |
|
| +: Optional Channel Mapping Table... :
|
| +| |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +]]></artwork>
|
| +</figure>
|
| +
|
| +<t>
|
| +The fields in the identification (ID) header have the following meaning:
|
| +<list style="numbers">
|
| +<t>Magic Signature:
|
| +<vspace blankLines="1"/>
|
| +This is an 8-octet (64-bit) field that allows codec identification and is
|
| + human-readable.
|
| +It contains, in order, the magic numbers:
|
| +<list style="empty">
|
| +<t>0x4F 'O'</t>
|
| +<t>0x70 'p'</t>
|
| +<t>0x75 'u'</t>
|
| +<t>0x73 's'</t>
|
| +<t>0x48 'H'</t>
|
| +<t>0x65 'e'</t>
|
| +<t>0x61 'a'</t>
|
| +<t>0x64 'd'</t>
|
| +</list>
|
| +Starting with "Op" helps distinguish it from audio data packets, as this is an
|
| + invalid TOC sequence.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Version (8 bits, unsigned):
|
| +<vspace blankLines="1"/>
|
| +The version number MUST always be '1' for this version of the encapsulation
|
| + specification.
|
| +Implementations SHOULD treat streams where the upper four bits of the version
|
| + number match that of a recognized specification as backwards-compatible with
|
| + that specification.
|
| +That is, the version number can be split into "major" and "minor" version
|
| + sub-fields, with changes to the "minor" sub-field (in the lower four bits)
|
| + signaling compatible changes.
|
| +For example, an implementation of this specification SHOULD accept any stream
|
| + with a version number of '15' or less, and SHOULD assume any stream with a
|
| + version number '16' or greater is incompatible.
|
| +The initial version '1' was chosen to keep implementations from relying on this
|
| + octet as a null terminator for the "OpusHead" string.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Output Channel Count 'C' (8 bits, unsigned):
|
| +<vspace blankLines="1"/>
|
| +This is the number of output channels.
|
| +This might be different than the number of encoded channels, which can change
|
| + on a packet-by-packet basis.
|
| +This value MUST NOT be zero.
|
| +The maximum allowable value depends on the channel mapping family, and might be
|
| + as large as 255.
|
| +See <xref target="channel_mapping"/> for details.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Pre-skip (16 bits, unsigned, little
|
| + endian):
|
| +<vspace blankLines="1"/>
|
| +This is the number of samples (at 48 kHz) to discard from the decoder
|
| + output when starting playback, and also the number to subtract from a page's
|
| + granule position to calculate its PCM sample position.
|
| +When cropping the beginning of existing Ogg Opus streams, a pre-skip of at
|
| + least 3,840 samples (80 ms) is RECOMMENDED to ensure complete
|
| + convergence in the decoder.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Input Sample Rate (32 bits, unsigned, little
|
| + endian):
|
| +<vspace blankLines="1"/>
|
| +This is the sample rate of the original input (before encoding), in Hz.
|
| +This field is <spanx style="emph">not</spanx> the sample rate to use for
|
| + playback of the encoded data.
|
| +<vspace blankLines="1"/>
|
| +Opus can switch between internal audio bandwidths of 4, 6, 8, 12, and
|
| + 20 kHz.
|
| +Each packet in the stream can have a different audio bandwidth.
|
| +Regardless of the audio bandwidth, the reference decoder supports decoding any
|
| + stream at a sample rate of 8, 12, 16, 24, or 48 kHz.
|
| +The original sample rate of the audio passed to the encoder is not preserved
|
| + by the lossy compression.
|
| +<vspace blankLines="1"/>
|
| +An Ogg Opus player SHOULD select the playback sample rate according to the
|
| + following procedure:
|
| +<list style="numbers">
|
| +<t>If the hardware supports 48 kHz playback, decode at 48 kHz.</t>
|
| +<t>Otherwise, if the hardware's highest available sample rate is a supported
|
| + rate, decode at this sample rate.</t>
|
| +<t>Otherwise, if the hardware's highest available sample rate is less than
|
| + 48 kHz, decode at the next higher Opus supported rate above the highest
|
| + available hardware rate and resample.</t>
|
| +<t>Otherwise, decode at 48 kHz and resample.</t>
|
| +</list>
|
| +However, the 'Input Sample Rate' field allows the muxer to pass the sample
|
| + rate of the original input stream as metadata.
|
| +This is useful when the user requires the output sample rate to match the
|
| + input sample rate.
|
| +For example, when not playing the output, an implementation writing PCM format
|
| + samples to disk might choose to resample the audio back to the original input
|
| + sample rate to reduce surprise to the user, who might reasonably expect to get
|
| + back a file with the same sample rate.
|
| +<vspace blankLines="1"/>
|
| +A value of zero indicates 'unspecified'.
|
| +Muxers SHOULD write the actual input sample rate or zero, but implementations
|
| + which do something with this field SHOULD take care to behave sanely if given
|
| + crazy values (e.g., do not actually upsample the output to 10 MHz if
|
| + requested).
|
| +Implementations SHOULD support input sample rates between 8 kHz and
|
| + 192 kHz (inclusive).
|
| +Rates outside this range MAY be ignored by falling back to the default rate of
|
| + 48 kHz instead.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Output Gain (16 bits, signed, little endian):
|
| +<vspace blankLines="1"/>
|
| +This is a gain to be applied when decoding.
|
| +It is 20*log10 of the factor by which to scale the decoder output to achieve
|
| + the desired playback volume, stored in a 16-bit, signed, two's complement
|
| + fixed-point value with 8 fractional bits (i.e., Q7.8).
|
| +<vspace blankLines="1"/>
|
| +To apply the gain, an implementation could use
|
| +<figure align="center">
|
| +<artwork align="center"><![CDATA[
|
| +sample *= pow(10, output_gain/(20.0*256)) ,
|
| +]]></artwork>
|
| +</figure>
|
| + where output_gain is the raw 16-bit value from the header.
|
| +<vspace blankLines="1"/>
|
| +Players and media frameworks SHOULD apply it by default.
|
| +If a player chooses to apply any volume adjustment or gain modification, such
|
| + as the R128_TRACK_GAIN (see <xref target="comment_header"/>), the adjustment
|
| + MUST be applied in addition to this output gain in order to achieve playback
|
| + at the normalized volume.
|
| +<vspace blankLines="1"/>
|
| +A muxer SHOULD set this field to zero, and instead apply any gain prior to
|
| + encoding, when this is possible and does not conflict with the user's wishes.
|
| +A nonzero output gain indicates the gain was adjusted after encoding, or that
|
| + a user wished to adjust the gain for playback while preserving the ability
|
| + to recover the original signal amplitude.
|
| +<vspace blankLines="1"/>
|
| +Although the output gain has enormous range (+/- 128 dB, enough to amplify
|
| + inaudible sounds to the threshold of physical pain), most applications can
|
| + only reasonably use a small portion of this range around zero.
|
| +The large range serves in part to ensure that gain can always be losslessly
|
| + transferred between OpusHead and R128 gain tags (see below) without
|
| + saturating.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Channel Mapping Family (8 bits, unsigned):
|
| +<vspace blankLines="1"/>
|
| +This octet indicates the order and semantic meaning of the output channels.
|
| +<vspace blankLines="1"/>
|
| +Each currently specified value of this octet indicates a mapping family, which
|
| + defines a set of allowed channel counts, and the ordered set of channel names
|
| + for each allowed channel count.
|
| +The details are described in <xref target="channel_mapping"/>.
|
| +</t>
|
| +<t>Channel Mapping Table:
|
| +<vspace blankLines="1"/>
|
| +This table defines the mapping from encoded streams to output channels.
|
| +Its contents are specified in <xref target="channel_mapping"/>.
|
| +</t>
|
| +</list>
|
| +</t>
|
| +
|
| +<t>
|
| +All fields in the ID headers are REQUIRED, except for the channel mapping
|
| + table, which MUST be omitted when the channel mapping family is 0, but
|
| + is REQUIRED otherwise.
|
| +Implementations SHOULD reject streams with ID headers that do not contain
|
| + enough data for these fields, even if they contain a valid Magic Signature.
|
| +Future versions of this specification, even backwards-compatible versions,
|
| + might include additional fields in the ID header.
|
| +If an ID header has a compatible major version, but a larger minor version,
|
| + an implementation MUST NOT reject it for containing additional data not
|
| + specified here, provided it still completes on the first page.
|
| +</t>
|
| +
|
| +<section anchor="channel_mapping" title="Channel Mapping">
|
| +<t>
|
| +An Ogg Opus stream allows mapping one number of Opus streams (N) to a possibly
|
| + larger number of decoded channels (M + N) to yet another number of
|
| + output channels (C), which might be larger or smaller than the number of
|
| + decoded channels.
|
| +The order and meaning of these channels are defined by a channel mapping,
|
| + which consists of the 'channel mapping family' octet and, for channel mapping
|
| + families other than family 0, a channel mapping table, as illustrated in
|
| + <xref target="channel_mapping_table"/>.
|
| +</t>
|
| +
|
| +<figure anchor="channel_mapping_table" title="Channel Mapping Table"
|
| + align="center">
|
| +<artwork align="center"><![CDATA[
|
| + 0 1 2 3
|
| + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
| + +-+-+-+-+-+-+-+-+
|
| + | Stream Count |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| Coupled Count | Channel Mapping... :
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +]]></artwork>
|
| +</figure>
|
| +
|
| +<t>
|
| +The fields in the channel mapping table have the following meaning:
|
| +<list style="numbers" counter="8">
|
| +<t>Stream Count 'N' (8 bits, unsigned):
|
| +<vspace blankLines="1"/>
|
| +This is the total number of streams encoded in each Ogg packet.
|
| +This value is necessary to correctly parse the packed Opus packets inside an
|
| + Ogg packet, as described in <xref target="packet_organization"/>.
|
| +This value MUST NOT be zero, as without at least one Opus packet with a valid
|
| + TOC sequence, a demuxer cannot recover the duration of an Ogg packet.
|
| +<vspace blankLines="1"/>
|
| +For channel mapping family 0, this value defaults to 1, and is not coded.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Coupled Stream Count 'M' (8 bits, unsigned):
|
| +<vspace blankLines="1"/>
|
| +This is the number of streams whose decoders are to be configured to produce
|
| + two channels (stereo).
|
| +This MUST be no larger than the total number of streams, N.
|
| +<vspace blankLines="1"/>
|
| +Each packet in an Opus stream has an internal channel count of 1 or 2, which
|
| + can change from packet to packet.
|
| +This is selected by the encoder depending on the bitrate and the audio being
|
| + encoded.
|
| +The original channel count of the audio passed to the encoder is not
|
| + necessarily preserved by the lossy compression.
|
| +<vspace blankLines="1"/>
|
| +Regardless of the internal channel count, any Opus stream can be decoded as
|
| + mono (a single channel) or stereo (two channels) by appropriate initialization
|
| + of the decoder.
|
| +The 'coupled stream count' field indicates that the decoders for the first M
|
| + Opus streams are to be initialized for stereo (two-channel) output, and the
|
| + remaining (N - M) decoders are to be initialized for mono (a single
|
| + channel) only.
|
| +The total number of decoded channels, (M + N), MUST be no larger than
|
| + 255, as there is no way to index more channels than that in the channel
|
| + mapping.
|
| +<vspace blankLines="1"/>
|
| +For channel mapping family 0, this value defaults to (C - 1)
|
| + (i.e., 0 for mono and 1 for stereo), and is not coded.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Channel Mapping (8*C bits):
|
| +<vspace blankLines="1"/>
|
| +This contains one octet per output channel, indicating which decoded channel
|
| + is to be used for each one.
|
| +Let 'index' be the value of this octet for a particular output channel.
|
| +This value MUST either be smaller than (M + N), or be the special
|
| + value 255.
|
| +If 'index' is less than 2*M, the output MUST be taken from decoding stream
|
| + ('index'/2) as stereo and selecting the left channel if 'index' is even, and
|
| + the right channel if 'index' is odd.
|
| +If 'index' is 2*M or larger, but less than 255, the output MUST be taken from
|
| + decoding stream ('index' - M) as mono.
|
| +If 'index' is 255, the corresponding output channel MUST contain pure silence.
|
| +<vspace blankLines="1"/>
|
| +The number of output channels, C, is not constrained to match the number of
|
| + decoded channels (M + N).
|
| +A single index value MAY appear multiple times, i.e., the same decoded channel
|
| + might be mapped to multiple output channels.
|
| +Some decoded channels might not be assigned to any output channel, as well.
|
| +<vspace blankLines="1"/>
|
| +For channel mapping family 0, the first index defaults to 0, and if
|
| + C == 2, the second index defaults to 1.
|
| +Neither index is coded.
|
| +</t>
|
| +</list>
|
| +</t>
|
| +
|
| +<t>
|
| +After producing the output channels, the channel mapping family determines the
|
| + semantic meaning of each one.
|
| +There are three defined mapping families in this specification.
|
| +</t>
|
| +
|
| +<section anchor="channel_mapping_0" title="Channel Mapping Family 0">
|
| +<t>
|
| +Allowed numbers of channels: 1 or 2.
|
| +RTP mapping.
|
| +This is the same channel interpretation as <xref target="RFC7587"/>.
|
| +</t>
|
| +<t>
|
| +<list style="symbols">
|
| +<t>1 channel: monophonic (mono).</t>
|
| +<t>2 channels: stereo (left, right).</t>
|
| +</list>
|
| +Special mapping: This channel mapping value also
|
| + indicates that the content consists of a single Opus stream that is stereo if
|
| + and only if C == 2, with stream index 0 mapped to output
|
| + channel 0 (mono, or left channel) and stream index 1 mapped to
|
| + output channel 1 (right channel) if stereo.
|
| +When the 'channel mapping family' octet has this value, the channel mapping
|
| + table MUST be omitted from the ID header packet.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="channel_mapping_1" title="Channel Mapping Family 1">
|
| +<t>
|
| +Allowed numbers of channels: 1...8.
|
| +Vorbis channel order (see below).
|
| +</t>
|
| +<t>
|
| +Each channel is assigned to a speaker location in a conventional surround
|
| + arrangement.
|
| +Specific locations depend on the number of channels, and are given below
|
| + in order of the corresponding channel indices.
|
| +<list style="symbols">
|
| + <t>1 channel: monophonic (mono).</t>
|
| + <t>2 channels: stereo (left, right).</t>
|
| + <t>3 channels: linear surround (left, center, right).</t>
|
| + <t>4 channels: quadraphonic (front left, front right, rear left, rear right).</t>
|
| + <t>5 channels: 5.0 surround (front left, front center, front right, rear left, rear right).</t>
|
| + <t>6 channels: 5.1 surround (front left, front center, front right, rear left, rear right, LFE).</t>
|
| + <t>7 channels: 6.1 surround (front left, front center, front right, side left, side right, rear center, LFE).</t>
|
| + <t>8 channels: 7.1 surround (front left, front center, front right, side left, side right, rear left, rear right, LFE).</t>
|
| +</list>
|
| +</t>
|
| +<t>
|
| +This set of surround options and speaker location orderings is the same
|
| + as those used by the Vorbis codec <xref target="vorbis-mapping"/>.
|
| +The ordering is different from the one used by the
|
| + WAVE <xref target="wave-multichannel"/> and
|
| + Free Lossless Audio Codec (FLAC) <xref target="flac"/> formats,
|
| + so correct ordering requires permutation of the output channels when decoding
|
| + to or encoding from those formats.
|
| +'LFE' here refers to a Low Frequency Effects channel, often mapped to a
|
| + subwoofer with no particular spatial position.
|
| +Implementations SHOULD identify 'side' or 'rear' speaker locations with
|
| + 'surround' and 'back' as appropriate when interfacing with audio formats
|
| + or systems which prefer that terminology.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="channel_mapping_255"
|
| + title="Channel Mapping Family 255">
|
| +<t>
|
| +Allowed numbers of channels: 1...255.
|
| +No defined channel meaning.
|
| +</t>
|
| +<t>
|
| +Channels are unidentified.
|
| +General-purpose players SHOULD NOT attempt to play these streams.
|
| +Offline implementations MAY deinterleave the output into separate PCM files,
|
| + one per channel.
|
| +Implementations SHOULD NOT produce output for channels mapped to stream index
|
| + 255 (pure silence) unless they have no other way to indicate the index of
|
| + non-silent channels.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="channel_mapping_undefined"
|
| + title="Undefined Channel Mappings">
|
| +<t>
|
| +The remaining channel mapping families (2...254) are reserved.
|
| +A demuxer implementation encountering a reserved channel mapping family value
|
| + SHOULD act as though the value is 255.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="downmix" title="Downmixing">
|
| +<t>
|
| +An Ogg Opus player MUST support any valid channel mapping with a channel
|
| + mapping family of 0 or 1, even if the number of channels does not match the
|
| + physically connected audio hardware.
|
| +Players SHOULD perform channel mixing to increase or reduce the number of
|
| + channels as needed.
|
| +</t>
|
| +
|
| +<t>
|
| +Implementations MAY use the following matrices to implement downmixing from
|
| + multichannel files using <xref target="channel_mapping_1">Channel Mapping
|
| + Family 1</xref>, which are known to give acceptable results for stereo.
|
| +Matrices for 3 and 4 channels are normalized so each coefficient row sums
|
| + to 1 to avoid clipping.
|
| +For 5 or more channels they are normalized to 2 as a compromise between
|
| + clipping and dynamic range reduction.
|
| +</t>
|
| +<t>
|
| +In these matrices the front left and front right channels are generally
|
| +passed through directly.
|
| +When a surround channel is split between both the left and right stereo
|
| + channels, coefficients are chosen so their squares sum to 1, which
|
| + helps preserve the perceived intensity.
|
| +Rear channels are mixed more diffusely or attenuated to maintain focus
|
| + on the front channels.
|
| +</t>
|
| +
|
| +<figure anchor="downmix-matrix-3"
|
| + title="Stereo downmix matrix for the linear surround channel mapping"
|
| + align="center">
|
| +<artwork align="center"><![CDATA[
|
| +L output = ( 0.585786 * left + 0.414214 * center )
|
| +R output = ( 0.414214 * center + 0.585786 * right )
|
| +]]></artwork>
|
| +<postamble>
|
| +Exact coefficient values are 1 and 1/sqrt(2), multiplied by
|
| + 1/(1 + 1/sqrt(2)) for normalization.
|
| +</postamble>
|
| +</figure>
|
| +
|
| +<figure anchor="downmix-matrix-4"
|
| + title="Stereo downmix matrix for the quadraphonic channel mapping"
|
| + align="center">
|
| +<artwork align="center"><![CDATA[
|
| +/ \ / \ / FL \
|
| +| L output | | 0.422650 0.000000 0.366025 0.211325 | | FR |
|
| +| R output | = | 0.000000 0.422650 0.211325 0.366025 | | RL |
|
| +\ / \ / \ RR /
|
| +]]></artwork>
|
| +<postamble>
|
| +Exact coefficient values are 1, sqrt(3)/2 and 1/2, multiplied by
|
| + 1/(1 + sqrt(3)/2 + 1/2) for normalization.
|
| +</postamble>
|
| +</figure>
|
| +
|
| +<figure anchor="downmix-matrix-5"
|
| + title="Stereo downmix matrix for the 5.0 surround mapping"
|
| + align="center">
|
| +<artwork align="center"><![CDATA[
|
| + / FL \
|
| +/ \ / \ | FC |
|
| +| L | | 0.650802 0.460186 0.000000 0.563611 0.325401 | | FR |
|
| +| R | = | 0.000000 0.460186 0.650802 0.325401 0.563611 | | RL |
|
| +\ / \ / | RR |
|
| + \ /
|
| +]]></artwork>
|
| +<postamble>
|
| +Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by
|
| + 2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2)
|
| + for normalization.
|
| +</postamble>
|
| +</figure>
|
| +
|
| +<figure anchor="downmix-matrix-6"
|
| + title="Stereo downmix matrix for the 5.1 surround mapping"
|
| + align="center">
|
| +<artwork align="center"><![CDATA[
|
| + /FL \
|
| +/ \ / \ |FC |
|
| +|L| | 0.529067 0.374107 0.000000 0.458186 0.264534 0.374107 | |FR |
|
| +|R| = | 0.000000 0.374107 0.529067 0.264534 0.458186 0.374107 | |RL |
|
| +\ / \ / |RR |
|
| + \LFE/
|
| +]]></artwork>
|
| +<postamble>
|
| +Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by
|
| +2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2 + 1/sqrt(2))
|
| + for normalization.
|
| +</postamble>
|
| +</figure>
|
| +
|
| +<figure anchor="downmix-matrix-7"
|
| + title="Stereo downmix matrix for the 6.1 surround mapping"
|
| + align="center">
|
| +<artwork align="center"><![CDATA[
|
| + / \
|
| + | 0.455310 0.321953 0.000000 0.394310 0.227655 0.278819 0.321953 |
|
| + | 0.000000 0.321953 0.455310 0.227655 0.394310 0.278819 0.321953 |
|
| + \ /
|
| +]]></artwork>
|
| +<postamble>
|
| +Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2, 1/2 and
|
| + sqrt(3)/2/sqrt(2), multiplied by
|
| + 2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2 +
|
| + sqrt(3)/2/sqrt(2) + 1/sqrt(2)) for normalization.
|
| +The coefficients are in the same order as in <xref target="channel_mapping_1" />,
|
| + and the matrices above.
|
| +</postamble>
|
| +</figure>
|
| +
|
| +<figure anchor="downmix-matrix-8"
|
| + title="Stereo downmix matrix for the 7.1 surround mapping"
|
| + align="center">
|
| +<artwork align="center"><![CDATA[
|
| +/ \
|
| +| .388631 .274804 .000000 .336565 .194316 .336565 .194316 .274804 |
|
| +| .000000 .274804 .388631 .194316 .336565 .194316 .336565 .274804 |
|
| +\ /
|
| +]]></artwork>
|
| +<postamble>
|
| +Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by
|
| + 2/(2 + 2/sqrt(2) + sqrt(3)) for normalization.
|
| +The coefficients are in the same order as in <xref target="channel_mapping_1" />,
|
| + and the matrices above.
|
| +</postamble>
|
| +</figure>
|
| +
|
| +</section>
|
| +
|
| +</section> <!-- end channel_mapping -->
|
| +
|
| +</section> <!-- end id_header -->
|
| +
|
| +<section anchor="comment_header" title="Comment Header">
|
| +
|
| +<figure anchor="comment_header_packet" title="Comment Header Packet"
|
| + align="center">
|
| +<artwork align="center"><![CDATA[
|
| + 0 1 2 3
|
| + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| 'O' | 'p' | 'u' | 's' |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| 'T' | 'a' | 'g' | 's' |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| Vendor String Length |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| |
|
| +: Vendor String... :
|
| +| |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| User Comment List Length |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| User Comment #0 String Length |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| |
|
| +: User Comment #0 String... :
|
| +| |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +| User Comment #1 String Length |
|
| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
| +: :
|
| +]]></artwork>
|
| +</figure>
|
| +
|
| +<t>
|
| +The comment header consists of a 64-bit magic signature, followed by data in
|
| + the same format as the <xref target="vorbis-comment"/> header used in Ogg
|
| + Vorbis, except (like Ogg Theora and Speex) the final "framing bit" specified
|
| + in the Vorbis spec is not present.
|
| +<list style="numbers">
|
| +<t>Magic Signature:
|
| +<vspace blankLines="1"/>
|
| +This is an 8-octet (64-bit) field that allows codec identification and is
|
| + human-readable.
|
| +It contains, in order, the magic numbers:
|
| +<list style="empty">
|
| +<t>0x4F 'O'</t>
|
| +<t>0x70 'p'</t>
|
| +<t>0x75 'u'</t>
|
| +<t>0x73 's'</t>
|
| +<t>0x54 'T'</t>
|
| +<t>0x61 'a'</t>
|
| +<t>0x67 'g'</t>
|
| +<t>0x73 's'</t>
|
| +</list>
|
| +Starting with "Op" helps distinguish it from audio data packets, as this is an
|
| + invalid TOC sequence.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Vendor String Length (32 bits, unsigned, little endian):
|
| +<vspace blankLines="1"/>
|
| +This field gives the length of the following vendor string, in octets.
|
| +It MUST NOT indicate that the vendor string is longer than the rest of the
|
| + packet.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>Vendor String (variable length, UTF-8 vector):
|
| +<vspace blankLines="1"/>
|
| +This is a simple human-readable tag for vendor information, encoded as a UTF-8
|
| + string <xref target="RFC3629"/>.
|
| +No terminating null octet is necessary.
|
| +<vspace blankLines="1"/>
|
| +This tag is intended to identify the codec encoder and encapsulation
|
| + implementations, for tracing differences in technical behavior.
|
| +User-facing applications can use the 'ENCODER' user comment tag to identify
|
| + themselves.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>User Comment List Length (32 bits, unsigned, little endian):
|
| +<vspace blankLines="1"/>
|
| +This field indicates the number of user-supplied comments.
|
| +It MAY indicate there are zero user-supplied comments, in which case there are
|
| + no additional fields in the packet.
|
| +It MUST NOT indicate that there are so many comments that the comment string
|
| + lengths would require more data than is available in the rest of the packet.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>User Comment #i String Length (32 bits, unsigned, little endian):
|
| +<vspace blankLines="1"/>
|
| +This field gives the length of the following user comment string, in octets.
|
| +There is one for each user comment indicated by the 'user comment list length'
|
| + field.
|
| +It MUST NOT indicate that the string is longer than the rest of the packet.
|
| +<vspace blankLines="1"/>
|
| +</t>
|
| +<t>User Comment #i String (variable length, UTF-8 vector):
|
| +<vspace blankLines="1"/>
|
| +This field contains a single user comment string.
|
| +There is one for each user comment indicated by the 'user comment list length'
|
| + field.
|
| +</t>
|
| +</list>
|
| +</t>
|
| +
|
| +<t>
|
| +The vendor string length and user comment list length are REQUIRED, and
|
| + implementations SHOULD reject comment headers that do not contain enough data
|
| + for these fields, or that do not contain enough data for the corresponding
|
| + vendor string or user comments they describe.
|
| +Making this check before allocating the associated memory to contain the data
|
| + helps prevent a possible Denial-of-Service (DoS) attack from small comment
|
| + headers that claim to contain strings longer than the entire packet or more
|
| + user comments than could possibly fit in the packet.
|
| +</t>
|
| +
|
| +<t>
|
| +Immediately following the user comment list, the comment header MAY
|
| + contain zero-padding or other binary data which is not specified here.
|
| +If the least-significant bit of the first byte of this data is 1, then editors
|
| + SHOULD preserve the contents of this data when updating the tags, but if this
|
| + bit is 0, all such data MAY be treated as padding, and truncated or discarded
|
| + as desired.
|
| +This allows informal experimentation with the format of this binary data until
|
| + it can be specified later.
|
| +</t>
|
| +
|
| +<t>
|
| +The comment header can be arbitrarily large and might be spread over a large
|
| + number of Ogg pages.
|
| +Implementations MUST avoid attempting to allocate excessive amounts of memory
|
| + when presented with a very large comment header.
|
| +To accomplish this, implementations MAY reject a comment header larger than
|
| + 125,829,120 octets, and MAY ignore individual comments that are not fully
|
| + contained within the first 61,440 octets of the comment header.
|
| +</t>
|
| +
|
| +<section anchor="comment_format" title="Tag Definitions">
|
| +<t>
|
| +The user comment strings follow the NAME=value format described by
|
| + <xref target="vorbis-comment"/> with the same recommended tag names:
|
| + ARTIST, TITLE, DATE, ALBUM, and so on.
|
| +</t>
|
| +<t>
|
| +Two new comment tags are introduced here:
|
| +</t>
|
| +
|
| +<t>First, an optional gain for track normalization:</t>
|
| +<figure align="center">
|
| +<artwork align="left"><![CDATA[
|
| +R128_TRACK_GAIN=-573
|
| +]]></artwork>
|
| +</figure>
|
| +<t>
|
| + representing the volume shift needed to normalize the track's volume
|
| + during isolated playback, in random shuffle, and so on.
|
| +The gain is a Q7.8 fixed-point number in dB, as in the ID header's 'output
|
| + gain' field.
|
| +This tag is similar to the REPLAYGAIN_TRACK_GAIN tag in
|
| + Vorbis <xref target="replay-gain"/>, except that the normal volume
|
| + reference is the <xref target="EBU-R128"/> standard.
|
| +</t>
|
| +<t>Second, an optional gain for album normalization:</t>
|
| +<figure align="center">
|
| +<artwork align="left"><![CDATA[
|
| +R128_ALBUM_GAIN=111
|
| +]]></artwork>
|
| +</figure>
|
| +<t>
|
| + representing the volume shift needed to normalize the overall volume when
|
| + played as part of a particular collection of tracks.
|
| +The gain is also a Q7.8 fixed-point number in dB, as in the ID header's
|
| + 'output gain' field.
|
| +</t>
|
| +<t>
|
| +An Ogg Opus stream MUST NOT have more than one of each of these tags, and if
|
| + present their values MUST be an integer from -32768 to 32767, inclusive,
|
| + represented in ASCII as a base 10 number with no whitespace.
|
| +A leading '+' or '-' character is valid.
|
| +Leading zeros are also permitted, but the value MUST be represented by
|
| + no more than 6 characters.
|
| +Other non-digit characters MUST NOT be present.
|
| +</t>
|
| +<t>
|
| +If present, R128_TRACK_GAIN and R128_ALBUM_GAIN MUST correctly represent
|
| + the R128 normalization gain relative to the 'output gain' field specified
|
| + in the ID header.
|
| +If a player chooses to make use of the R128_TRACK_GAIN tag or the
|
| + R128_ALBUM_GAIN tag, it MUST apply those gains
|
| + <spanx style="emph">in addition</spanx> to the 'output gain' value.
|
| +If a tool modifies the ID header's 'output gain' field, it MUST also update or
|
| + remove the R128_TRACK_GAIN and R128_ALBUM_GAIN comment tags if present.
|
| +A muxer SHOULD place the gain it wants other tools to use by default into the
|
| + 'output gain' field, and not the comment tag.
|
| +</t>
|
| +<t>
|
| +To avoid confusion with multiple normalization schemes, an Opus comment header
|
| + SHOULD NOT contain any of the REPLAYGAIN_TRACK_GAIN, REPLAYGAIN_TRACK_PEAK,
|
| + REPLAYGAIN_ALBUM_GAIN, or REPLAYGAIN_ALBUM_PEAK tags, unless they are only
|
| + to be used in some context where there is guaranteed to be no such confusion.
|
| +<xref target="EBU-R128"/> normalization is preferred to the earlier
|
| + REPLAYGAIN schemes because of its clear definition and adoption by industry.
|
| +Peak normalizations are difficult to calculate reliably for lossy codecs
|
| + because of variation in excursion heights due to decoder differences.
|
| +In the authors' investigations they were not applied consistently or broadly
|
| + enough to merit inclusion here.
|
| +</t>
|
| +</section> <!-- end comment_format -->
|
| +</section> <!-- end comment_header -->
|
| +
|
| +</section> <!-- end headers -->
|
| +
|
| +<section anchor="packet_size_limits" title="Packet Size Limits">
|
| +<t>
|
| +Technically, valid Opus packets can be arbitrarily large due to the padding
|
| + format, although the amount of non-padding data they can contain is bounded.
|
| +These packets might be spread over a similarly enormous number of Ogg pages.
|
| +When encoding, implementations SHOULD limit the use of padding in audio data
|
| + packets to no more than is necessary to make a variable bitrate (VBR) stream
|
| + constant bitrate (CBR), unless they have no reasonable way to determine what
|
| + is necessary.
|
| +Demuxers SHOULD reject audio data packets (treat them as if they were malformed
|
| + Opus packets with an invalid TOC sequence) larger than 61,440 octets per
|
| + Opus stream, unless they have a specific reason for allowing extra padding.
|
| +Such packets necessarily contain more padding than needed to make a stream CBR.
|
| +Demuxers MUST avoid attempting to allocate excessive amounts of memory when
|
| + presented with a very large packet.
|
| +Demuxers MAY reject or partially process audio data packets larger than
|
| + 61,440 octets in an Ogg Opus stream with channel mapping families 0
|
| + or 1.
|
| +Demuxers MAY reject or partially process audio data packets in any Ogg Opus
|
| + stream if the packet is larger than 61,440 octets and also larger than
|
| + 7,680 octets per Opus stream.
|
| +The presence of an extremely large packet in the stream could indicate a
|
| + memory exhaustion attack or stream corruption.
|
| +</t>
|
| +<t>
|
| +In an Ogg Opus stream, the largest possible valid packet that does not use
|
| + padding has a size of (61,298*N - 2) octets.
|
| +With 255 streams, this is 15,630,988 octets and can
|
| + span up to 61,298 Ogg pages, all but one of which will have a granule
|
| + position of -1.
|
| +This is of course a very extreme packet, consisting of 255 streams, each
|
| + containing 120 ms of audio encoded as 2.5 ms frames, each frame
|
| + using the maximum possible number of octets (1275) and stored in the least
|
| + efficient manner allowed (a VBR code 3 Opus packet).
|
| +Even in such a packet, most of the data will be zeros as 2.5 ms frames
|
| + cannot actually use all 1275 octets.
|
| +</t>
|
| +<t>
|
| +The largest packet consisting of entirely useful data is
|
| + (15,326*N - 2) octets.
|
| +This corresponds to 120 ms of audio encoded as 10 ms frames in either
|
| + SILK or Hybrid mode, but at a data rate of over 1 Mbps, which makes little
|
| + sense for the quality achieved.
|
| +</t>
|
| +<t>
|
| +A more reasonable limit is (7,664*N - 2) octets.
|
| +This corresponds to 120 ms of audio encoded as 20 ms stereo CELT mode
|
| + frames, with a total bitrate just under 511 kbps (not counting the Ogg
|
| + encapsulation overhead).
|
| +For channel mapping family 1, N=8 provides a reasonable upper bound, as it
|
| + allows for each of the 8 possible output channels to be decoded from a
|
| + separate stereo Opus stream.
|
| +This gives a size of 61,310 octets, which is rounded up to a multiple of
|
| + 1,024 octets to yield the audio data packet size of 61,440 octets
|
| + that any implementation is expected to be able to process successfully.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="encoder" title="Encoder Guidelines">
|
| +<t>
|
| +When encoding Opus streams, Ogg muxers SHOULD take into account the
|
| + algorithmic delay of the Opus encoder.
|
| +</t>
|
| +<t>
|
| +In encoders derived from the reference
|
| + implementation <xref target="RFC6716"/>, the number of samples can be
|
| + queried with:
|
| +</t>
|
| +<figure align="center">
|
| +<artwork align="center"><![CDATA[
|
| + opus_encoder_ctl(encoder_state, OPUS_GET_LOOKAHEAD(&delay_samples));
|
| +]]></artwork>
|
| +</figure>
|
| +<t>
|
| +To achieve good quality in the very first samples of a stream, implementations
|
| + MAY use linear predictive coding (LPC) extrapolation to generate at least 120
|
| + extra samples at the beginning to avoid the Opus encoder having to encode a
|
| + discontinuous signal.
|
| +For more information on linear prediction, see
|
| + <xref target="linear-prediction"/>.
|
| +For an input file containing 'length' samples, the implementation SHOULD set
|
| + the pre-skip header value to (delay_samples + extra_samples), encode
|
| + at least (length + delay_samples + extra_samples)
|
| + samples, and set the granule position of the last page to
|
| + (length + delay_samples + extra_samples).
|
| +This ensures that the encoded file has the same duration as the original, with
|
| + no time offset. The best way to pad the end of the stream is to also use LPC
|
| + extrapolation, but zero-padding is also acceptable.
|
| +</t>
|
| +
|
| +<section anchor="lpc" title="LPC Extrapolation">
|
| +<t>
|
| +The first step in LPC extrapolation is to compute linear prediction
|
| + coefficients <xref target="lpc-sample"/>.
|
| +When extending the end of the signal, order-N (typically with N ranging from 8
|
| + to 40) LPC analysis is performed on a window near the end of the signal.
|
| +The last N samples are used as memory to an infinite impulse response (IIR)
|
| + filter.
|
| +</t>
|
| +<t>
|
| +The filter is then applied on a zero input to extrapolate the end of the signal.
|
| +Let a(k) be the kth LPC coefficient and x(n) be the nth sample of the signal.
|
| +Each new sample past the end of the signal is computed as:
|
| +</t>
|
| +<figure align="center">
|
| +<artwork align="center"><![CDATA[
|
| +        N
|
| +       ---
|
| +x(n) =  \   a(k)*x(n-k)
|
| +       /
|
| +       ---
|
| +       k=1
|
| +]]></artwork>
|
| +</figure>
|
| +<t>
|
| +The process is repeated independently for each channel.
|
| +It is possible to extend the beginning of the signal by applying the same
|
| + process backward in time.
|
| +When extending the beginning of the signal, it is best to apply a "fade in" to
|
| + the extrapolated signal, e.g. by multiplying it by a half-Hanning window
|
| + <xref target="hanning"/>.
|
| +</t>
|
| +
|
| +</section>
|
| +
|
| +<section anchor="continuous_chaining" title="Continuous Chaining">
|
| +<t>
|
| +In some applications, such as Internet radio, it is desirable to cut a long
|
| + stream into smaller chains, e.g. so the comment header can be updated.
|
| +This can be done simply by separating the input streams into segments and
|
| + encoding each segment independently.
|
| +The drawback of this approach is that it creates a small discontinuity
|
| + at the boundary due to the lossy nature of Opus.
|
| +A muxer MAY avoid this discontinuity by using the following procedure:
|
| +<list style="numbers">
|
| +<t>Encode the last frame of the first segment as an independent frame by
|
| + turning off all forms of inter-frame prediction.
|
| +De-emphasis is allowed.</t>
|
| +<t>Set the granule position of the last page to a point near the end of the
|
| + last frame.</t>
|
| +<t>Begin the second segment with a copy of the last frame of the first
|
| + segment.</t>
|
| +<t>Set the pre-skip value of the second stream in such a way as to properly
|
| + join the two streams.</t>
|
| +<t>Continue the encoding process normally from there, without any reset to
|
| + the encoder.</t>
|
| +</list>
|
| +</t>
|
| +<t>
|
| +In encoders derived from the reference implementation, inter-frame prediction
|
| + can be turned off by calling:
|
| +</t>
|
| +<figure align="center">
|
| +<artwork align="center"><![CDATA[
|
| + opus_encoder_ctl(encoder_state, OPUS_SET_PREDICTION_DISABLED(1));
|
| +]]></artwork>
|
| +</figure>
|
| +<t>
|
| +For best results, this implementation requires that prediction be explicitly
|
| + enabled again before resuming normal encoding, even after a reset.
|
| +</t>
|
| +
|
| +</section>
|
| +
|
| +</section>
|
| +
|
| +<section anchor="implementation" title="Implementation Status">
|
| +<t>
|
| +A brief summary of major implementations of this draft is available
|
| + at <eref target="https://wiki.xiph.org/OggOpusImplementation"/>,
|
| + along with their status.
|
| +</t>
|
| +<t>
|
| +[Note to RFC Editor: please remove this entire section before
|
| + final publication per <xref target="RFC6982"/>, along with
|
| + its references.]
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="security" title="Security Considerations">
|
| +<t>
|
| +Implementations of the Opus codec need to take appropriate security
|
| + considerations into account, as outlined in <xref target="RFC4732"/>.
|
| +This is just as much a problem for the container as it is for the codec itself.
|
| +Robustness against malicious payloads is extremely important.
|
| +Malicious payloads MUST NOT cause an implementation to overrun its allocated
|
| + memory or to take an excessive amount of resources to decode.
|
| +Although problems in encoding applications are typically rarer, the same
|
| + applies to the muxer.
|
| +Malicious audio input streams MUST NOT cause an implementation to overrun its
|
| + allocated memory or consume excessive resources because this would allow an
|
| + attacker to attack transcoding gateways.
|
| +</t>
|
| +
|
| +<t>
|
| +Like most other container formats, Ogg Opus streams SHOULD NOT be used with
|
| + insecure ciphers or cipher modes that are vulnerable to known-plaintext
|
| + attacks.
|
| +Elements such as the Ogg page capture pattern and the magic signatures in the
|
| + ID header and the comment header all have easily predictable values, in
|
| + addition to various elements of the codec data itself.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="content_type" title="Content Type">
|
| +<t>
|
| +An "Ogg Opus file" consists of one or more sequentially multiplexed segments,
|
| + each containing exactly one Ogg Opus stream.
|
| +The RECOMMENDED mime-type for Ogg Opus files is "audio/ogg".
|
| +</t>
|
| +
|
| +<t>
|
| +If more specificity is desired, one MAY indicate the presence of Opus streams
|
| + using the codecs parameter defined in <xref target="RFC6381"/> and
|
| + <xref target="RFC5334"/>, e.g.,
|
| +</t>
|
| +<figure>
|
| +<artwork align="center"><![CDATA[
|
| + audio/ogg; codecs=opus
|
| +]]></artwork>
|
| +</figure>
|
| +<t>
|
| + for an Ogg Opus file.
|
| +</t>
|
| +
|
| +<t>
|
| +The RECOMMENDED filename extension for Ogg Opus files is '.opus'.
|
| +</t>
|
| +
|
| +<t>
|
| +When Opus is concurrently multiplexed with other streams in an Ogg container,
|
| + one SHOULD use one of the "audio/ogg", "video/ogg", or "application/ogg"
|
| + mime-types, as defined in <xref target="RFC5334"/>.
|
| +Such streams are not strictly "Ogg Opus files" as described above,
|
| + since they contain more than a single Opus stream per sequentially
|
| + multiplexed segment, e.g. video or multiple audio tracks.
|
| +In such cases the '.opus' filename extension is NOT RECOMMENDED.
|
| +</t>
|
| +
|
| +<t>
|
| +In either case, this document updates <xref target="RFC5334"/>
|
| + to add 'opus' as a codecs parameter value with char[8]: 'OpusHead'
|
| + as Codec Identifier.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="iana" title="IANA Considerations">
|
| +<t>
|
| +This document updates the IANA Media Types registry to add .opus
|
| + as a file extension for "audio/ogg", and to add itself as a reference
|
| + alongside <xref target="RFC5334"/> for "audio/ogg", "video/ogg", and
|
| + "application/ogg" Media Types.
|
| +</t>
|
| +<t>
|
| +This document defines a new registry "Opus Channel Mapping Families" to
|
| + indicate how the semantic meanings of the channels in a multi-channel Opus
|
| + stream are described.
|
| +IANA SHALL create a new name space of "Opus Channel Mapping Families".
|
| +All maintenance within and additions to the contents of this name space MUST be
|
| + according to the "Specification Required with Expert Review" registration
|
| + policy as defined in <xref target="RFC5226"/>.
|
| +Each registry entry consists of a Channel Mapping Family Number, which is
|
| + specified in decimal in the range 0 to 255, inclusive, and a Reference (or
|
| + list of references).
|
| +Each Reference must point to sufficient documentation to describe what
|
| + information is coded in the Opus identification header for this channel
|
| + mapping family, how a demuxer determines the Stream Count ('N') and Coupled
|
| + Stream Count ('M') from this information, and how it determines the proper
|
| + interpretation of each of the decoded channels.
|
| +</t>
|
| +<t>
|
| +This document defines three initial assignments for this registry.
|
| +</t>
|
| +<texttable>
|
| +<ttcol>Value</ttcol><ttcol>Reference</ttcol>
|
| +<c>0</c><c>[RFCXXXX] <xref target="channel_mapping_0"/></c>
|
| +<c>1</c><c>[RFCXXXX] <xref target="channel_mapping_1"/></c>
|
| +<c>255</c><c>[RFCXXXX] <xref target="channel_mapping_255"/></c>
|
| +</texttable>
|
| +<t>
|
| +The designated expert will determine if the Reference points to a specification
|
| + that meets the requirements for permanence and ready availability laid out
|
| + in <xref target="RFC5226"/> and that it specifies the information
|
| + described above with sufficient clarity to allow interoperable
|
| + implementations.
|
| +</t>
|
| +</section>
|
| +
|
| +<section anchor="Acknowledgments" title="Acknowledgments">
|
| +<t>
|
| +Thanks to Ben Campbell, Mark Harris, Greg Maxwell, Christopher "Monty"
|
| + Montgomery, Jean-Marc Valin, and Mo Zanaty for their valuable contributions to
|
| + this document.
|
| +Additional thanks to Andrew D'Addesio, Greg Maxwell, and Vincent Penquerc'h for
|
| + their feedback based on early implementations.
|
| +</t>
|
| +</section>
|
| +
|
| +<section title="RFC Editor Notes">
|
| +<t>
|
| +In <xref target="iana"/>, "RFCXXXX" is to be replaced with the RFC number
|
| + assigned to this draft.
|
| +</t>
|
| +<t>
|
| +In the Copyright Notice at the start of the document, the following paragraph
|
| + is to be appended after the regular copyright notice text:
|
| +</t>
|
| +<t>
|
| +"The licenses granted by the IETF Trust to this RFC under Section 3.c of
|
| + the Trust Legal Provisions shall also include the right to extract text from
|
| + Sections 1 through 14 of this RFC and create derivative works from
|
| + these extracts, and to copy, publish, display, and distribute such derivative
|
| + works in any medium and for any purpose, provided that no such derivative work
|
| + shall be presented, displayed, or published in a manner that states or implies
|
| + that it is part of this RFC or any other IETF Document."
|
| +</t>
|
| +</section>
|
| +
|
| +</middle>
|
| +<back>
|
| +<references title="Normative References">
|
| + &rfc2119;
|
| + &rfc3533;
|
| + &rfc3629;
|
| + &rfc4732;
|
| + &rfc5226;
|
| + &rfc5334;
|
| + &rfc6381;
|
| + &rfc6716;
|
| +
|
| +<reference anchor="EBU-R128" target="https://tech.ebu.ch/loudness">
|
| +<front>
|
| + <title>Loudness Recommendation EBU R128</title>
|
| + <author>
|
| + <organization>EBU Technical Committee</organization>
|
| + </author>
|
| + <date month="August" year="2011"/>
|
| +</front>
|
| +</reference>
|
| +
|
| +<reference anchor="vorbis-comment"
|
| + target="https://www.xiph.org/vorbis/doc/v-comment.html">
|
| +<front>
|
| +<title>Ogg Vorbis I Format Specification: Comment Field and Header
|
| + Specification</title>
|
| +<author initials="C." surname="Montgomery"
|
| + fullname="Christopher &quot;Monty&quot; Montgomery"/>
|
| +<date month="July" year="2002"/>
|
| +</front>
|
| +</reference>
|
| +
|
| +</references>
|
| +
|
| +<references title="Informative References">
|
| +
|
| +<!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.3550.xml"?-->
|
| + &rfc6982;
|
| + &rfc7587;
|
| +
|
| +<reference anchor="flac"
|
| + target="https://xiph.org/flac/format.html">
|
| + <front>
|
| + <title>FLAC - Free Lossless Audio Codec Format Description</title>
|
| + <author initials="J." surname="Coalson" fullname="Josh Coalson"/>
|
| + <date month="January" year="2008"/>
|
| + </front>
|
| +</reference>
|
| +
|
| +<reference anchor="hanning"
|
| + target="https://en.wikipedia.org/wiki/Hamming_function#Hann_.28Hanning.29_window">
|
| + <front>
|
| + <title>Hann window</title>
|
| + <author>
|
| + <organization>Wikipedia</organization>
|
| + </author>
|
| + <date month="May" year="2013"/>
|
| + </front>
|
| +</reference>
|
| +
|
| +<reference anchor="linear-prediction"
|
| + target="https://en.wikipedia.org/wiki/Linear_predictive_coding">
|
| + <front>
|
| + <title>Linear Predictive Coding</title>
|
| + <author>
|
| + <organization>Wikipedia</organization>
|
| + </author>
|
| + <date month="January" year="2014"/>
|
| + </front>
|
| +</reference>
|
| +
|
| +<reference anchor="lpc-sample"
|
| + target="https://svn.xiph.org/trunk/vorbis/lib/lpc.c">
|
| +<front>
|
| + <title>Autocorrelation LPC coeff generation algorithm
|
| + (Vorbis source code)</title>
|
| +<author initials="J." surname="Degener" fullname="Jutta Degener"/>
|
| +<author initials="C." surname="Bormann" fullname="Carsten Bormann"/>
|
| +<date month="November" year="1994"/>
|
| +</front>
|
| +</reference>
|
| +
|
| +
|
| +<reference anchor="replay-gain"
|
| + target="https://wiki.xiph.org/VorbisComment#Replay_Gain">
|
| +<front>
|
| +<title>VorbisComment: Replay Gain</title>
|
| +<author initials="C." surname="Parker" fullname="Conrad Parker"/>
|
| +<author initials="M." surname="Leese" fullname="Martin Leese"/>
|
| +<date month="June" year="2009"/>
|
| +</front>
|
| +</reference>
|
| +
|
| +<reference anchor="seeking"
|
| + target="https://wiki.xiph.org/Seeking">
|
| +<front>
|
| +<title>Granulepos Encoding and How Seeking Really Works</title>
|
| +<author initials="S." surname="Pfeiffer" fullname="Silvia Pfeiffer"/>
|
| +<author initials="C." surname="Parker" fullname="Conrad Parker"/>
|
| +<author initials="G." surname="Maxwell" fullname="Greg Maxwell"/>
|
| +<date month="May" year="2012"/>
|
| +</front>
|
| +</reference>
|
| +
|
| +<reference anchor="vorbis-mapping"
|
| + target="https://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-810004.3.9">
|
| +<front>
|
| +<title>The Vorbis I Specification, Section 4.3.9 Output Channel Order</title>
|
| +<author initials="C." surname="Montgomery"
|
| + fullname="Christopher &quot;Monty&quot; Montgomery"/>
|
| +<date month="January" year="2010"/>
|
| +</front>
|
| +</reference>
|
| +
|
| +<reference anchor="vorbis-trim"
|
| + target="https://xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-132000A.2">
|
| + <front>
|
| + <title>The Vorbis I Specification, Appendix A: Embedding Vorbis
|
| + into an Ogg stream</title>
|
| + <author initials="C." surname="Montgomery"
|
| + fullname="Christopher &quot;Monty&quot; Montgomery"/>
|
| + <date month="November" year="2008"/>
|
| + </front>
|
| +</reference>
|
| +
|
| +<reference anchor="wave-multichannel"
|
| + target="http://msdn.microsoft.com/en-us/windows/hardware/gg463006.aspx">
|
| + <front>
|
| + <title>Multiple Channel Audio Data and WAVE Files</title>
|
| + <author>
|
| + <organization>Microsoft Corporation</organization>
|
| + </author>
|
| + <date month="March" year="2007"/>
|
| + </front>
|
| +</reference>
|
| +
|
| +</references>
|
| +
|
| +</back>
|
| +</rfc>
|
|
|