Index: webrtc/modules/audio_coding/codecs/opus/opus/src/doc/draft-ietf-codec-oggopus.xml |
diff --git a/webrtc/modules/audio_coding/codecs/opus/opus/src/doc/draft-ietf-codec-oggopus.xml b/webrtc/modules/audio_coding/codecs/opus/opus/src/doc/draft-ietf-codec-oggopus.xml |
new file mode 100644 |
index 0000000000000000000000000000000000000000..7489c20146e32c2ee71b56b73a190ae2925a7bfc |
--- /dev/null |
+++ b/webrtc/modules/audio_coding/codecs/opus/opus/src/doc/draft-ietf-codec-oggopus.xml |
@@ -0,0 +1,1751 @@ |
+<?xml version="1.0" encoding="utf-8"?> |
+<!DOCTYPE rfc SYSTEM 'rfc2629.dtd' [ |
+<!ENTITY rfc2119 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml'> |
+<!ENTITY rfc3533 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3533.xml'> |
+<!ENTITY rfc3629 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3629.xml'> |
+<!ENTITY rfc4732 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4732.xml'> |
+<!ENTITY rfc5226 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5226.xml'> |
+<!ENTITY rfc5334 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5334.xml'> |
+<!ENTITY rfc6381 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6381.xml'> |
+<!ENTITY rfc6716 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6716.xml'> |
+<!ENTITY rfc6982 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6982.xml'> |
+<!ENTITY rfc7587 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.7587.xml'> |
+]> |
+<?rfc toc="yes" symrefs="yes" ?> |
+ |
+<rfc ipr="trust200902" category="std" docName="draft-ietf-codec-oggopus-09" |
+ updates="5334"> |
+ |
+<front> |
+<title abbrev="Ogg Opus">Ogg Encapsulation for the Opus Audio Codec</title> |
+<author initials="T.B." surname="Terriberry" fullname="Timothy B. Terriberry"> |
+<organization>Mozilla Corporation</organization> |
+<address> |
+<postal> |
+<street>650 Castro Street</street> |
+<city>Mountain View</city> |
+<region>CA</region> |
+<code>94041</code> |
+<country>USA</country> |
+</postal> |
+<phone>+1 650 903-0800</phone> |
+<email>tterribe@xiph.org</email> |
+</address> |
+</author> |
+ |
+<author initials="R." surname="Lee" fullname="Ron Lee"> |
+<organization>Voicetronix</organization> |
+<address> |
+<postal> |
+<street>246 Pulteney Street, Level 1</street> |
+<city>Adelaide</city> |
+<region>SA</region> |
+<code>5000</code> |
+<country>Australia</country> |
+</postal> |
+<phone>+61 8 8232 9112</phone> |
+<email>ron@debian.org</email> |
+</address> |
+</author> |
+ |
+<author initials="R." surname="Giles" fullname="Ralph Giles"> |
+<organization>Mozilla Corporation</organization> |
+<address> |
+<postal> |
+<street>163 West Hastings Street</street> |
+<city>Vancouver</city> |
+<region>BC</region> |
+<code>V6B 1H5</code> |
+<country>Canada</country> |
+</postal> |
+<phone>+1 778 785 1540</phone> |
+<email>giles@xiph.org</email> |
+</address> |
+</author> |
+ |
+<date day="23" month="November" year="2015"/> |
+<area>RAI</area> |
+<workgroup>codec</workgroup> |
+ |
+<abstract> |
+<t> |
+This document defines the Ogg encapsulation for the Opus interactive speech and |
+ audio codec. |
+This allows data encoded in the Opus format to be stored in an Ogg logical |
+ bitstream. |
+</t> |
+</abstract> |
+</front> |
+ |
+<middle> |
+<section anchor="intro" title="Introduction"> |
+<t> |
+The IETF Opus codec is a low-latency audio codec optimized for both voice and |
+ general-purpose audio. |
+See <xref target="RFC6716"/> for technical details. |
+This document defines the encapsulation of Opus in a continuous, logical Ogg |
+ bitstream <xref target="RFC3533"/>. |
+Ogg encapsulation provides Opus with a long-term storage format supporting |
+ all of the essential features, including metadata, fast and accurate seeking, |
+ corruption detection, recapture after errors, low overhead, and the ability to |
+ multiplex Opus with other codecs (including video) with minimal buffering. |
+It also provides a live streamable format, capable of delivery over a reliable |
+ stream-oriented transport, without requiring all the data, or even the total |
+ length of the data, up-front, in a form that is identical to the on-disk |
+ storage format. |
+</t> |
+<t> |
+Ogg bitstreams are made up of a series of 'pages', each of which contains data |
+ from one or more 'packets'. |
+Pages are the fundamental unit of multiplexing in an Ogg stream. |
+Each page is associated with a particular logical stream and contains a capture |
+ pattern and checksum, flags to mark the beginning and end of the logical |
+ stream, and a 'granule position' that represents an absolute position in the |
+ stream, to aid seeking. |
+A single page can contain up to 65,025 octets of packet data from up to 255 |
+ different packets. |
+Packets can be split arbitrarily across pages, and continued from one page to |
+ the next (allowing packets much larger than would fit on a single page). |
+Each page contains 'lacing values' that indicate how the data is partitioned |
+ into packets, allowing a demultiplexer (demuxer) to recover the packet |
+ boundaries without examining the encoded data. |
+A packet is said to 'complete' on a page when the page contains the final |
+ lacing value corresponding to that packet. |
+</t> |
+<t> |
+This encapsulation defines the contents of the packet data, including |
+ the necessary headers, the organization of those packets into a logical |
+ stream, and the interpretation of the codec-specific granule position field. |
+It does not attempt to describe or specify the existing Ogg container format. |
+Readers unfamiliar with the basic concepts mentioned above are encouraged to |
+ review the details in <xref target="RFC3533"/>. |
+</t> |
+ |
+</section> |
+ |
+<section anchor="terminology" title="Terminology"> |
+<t> |
+The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", |
+ "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this |
+ document are to be interpreted as described in <xref target="RFC2119"/>. |
+</t> |
+ |
+</section> |
+ |
+<section anchor="packet_organization" title="Packet Organization"> |
+<t> |
+An Ogg Opus stream is organized as follows. |
+</t> |
+<t> |
+There are two mandatory header packets. |
+The first packet in the logical Ogg bitstream MUST contain the identification |
+ (ID) header, which uniquely identifies a stream as Opus audio. |
+The format of this header is defined in <xref target="id_header"/>. |
+It is placed alone (without any other packet data) on the first page of |
+ the logical Ogg bitstream, and completes on that page. |
+This page has its 'beginning of stream' flag set. |
+</t> |
+<t> |
+The second packet in the logical Ogg bitstream MUST contain the comment header, |
+ which contains user-supplied metadata. |
+The format of this header is defined in <xref target="comment_header"/>. |
+It MAY span multiple pages, beginning on the second page of the logical |
+ stream. |
+However many pages it spans, the comment header packet MUST finish the page on |
+ which it completes. |
+</t> |
+<t> |
+All subsequent pages are audio data pages, and the Ogg packets they contain are |
+ audio data packets. |
+Each audio data packet contains one Opus packet for each of N different |
+ streams, where N is typically one for mono or stereo, but MAY be greater than |
+ one for multichannel audio. |
+The value N is specified in the ID header (see |
+ <xref target="channel_mapping"/>), and is fixed over the entire length of the |
+ logical Ogg bitstream. |
+</t> |
+<t> |
+The first (N - 1) Opus packets, if any, are packed one after another |
+ into the Ogg packet, using the self-delimiting framing from Appendix B of |
+ <xref target="RFC6716"/>. |
+The remaining Opus packet is packed at the end of the Ogg packet using the |
+ regular, undelimited framing from Section 3 of <xref target="RFC6716"/>. |
+All of the Opus packets in a single Ogg packet MUST be constrained to have the |
+ same duration. |
+An implementation of this specification SHOULD treat any Opus packet whose |
+ duration is different from that of the first Opus packet in an Ogg packet as |
+ if it were a malformed Opus packet with an invalid Table Of Contents (TOC) |
+ sequence. |
+</t> |
+<t> |
+The TOC sequence at the beginning of each Opus packet indicates the coding |
+ mode, audio bandwidth, channel count, duration (frame size), and number of |
+ frames per packet, as described in Section 3.1 |
+ of <xref target="RFC6716"/>. |
+The coding mode is one of SILK, Hybrid, or Constrained Energy Lapped Transform |
+ (CELT). |
+The combination of coding mode, audio bandwidth, and frame size is referred to |
+ as the configuration of an Opus packet. |
+</t> |
+<t> |
+Packets are placed into Ogg pages in order until the end of stream. |
+Audio data packets might span page boundaries. |
+The first audio data page could have the 'continued packet' flag set |
+ (indicating the first audio data packet is continued from a previous page) if, |
+ for example, it was a live stream joined mid-broadcast, with the headers |
+ pasted on the front. |
+A demuxer SHOULD NOT attempt to decode the data for the first packet on a page |
+ with the 'continued packet' flag set if the previous page with packet data |
+ does not end in a continued packet (i.e., did not end with a lacing value of |
+ 255) or if the page sequence numbers are not consecutive, unless the demuxer |
+ has some special knowledge that would allow it to interpret this data |
+ despite the missing pieces. |
+An implementation MUST treat a zero-octet audio data packet as if it were a |
+ malformed Opus packet as described in |
+ Section 3.4 of <xref target="RFC6716"/>. |
+</t> |
+<t> |
+A logical stream ends with a page with the 'end of stream' flag set, but |
+ implementations need to be prepared to deal with truncated streams that do not |
+ have a page marked 'end of stream'. |
+There is no reason for the final packet on the last page to be a continued |
+ packet, i.e., for the final lacing value to be 255. |
+However, demuxers might encounter such streams, possibly as the result of a |
+ transfer that did not complete or of corruption. |
+A demuxer SHOULD NOT attempt to decode the data from a packet that continues |
+ onto a subsequent page (i.e., when the page ends with a lacing value of 255) |
+ if the next page with packet data does not have the 'continued packet' flag |
+ set or does not exist, or if the page sequence numbers are not consecutive, |
+ unless the demuxer has some special knowledge that would allow it to interpret |
+ this data despite the missing pieces. |
+There MUST NOT be any more pages in an Opus logical bitstream after a page |
+ marked 'end of stream'. |
+</t> |
+</section> |
+ |
+<section anchor="granpos" title="Granule Position"> |
+<t> |
+The granule position MUST be zero for the ID header page and the |
+ page where the comment header completes. |
+That is, the first page in the logical stream, and the last header |
+ page before the first audio data page both have a granule position of zero. |
+</t> |
+<t> |
+The granule position of an audio data page encodes the total number of PCM |
+ samples in the stream up to and including the last fully-decodable sample from |
+ the last packet completed on that page. |
+The granule position of the first audio data page will usually be larger than |
+ zero, as described in <xref target="start_granpos_restrictions"/>. |
+</t> |
+ |
+<t> |
+A page that is entirely spanned by a single packet (that completes on a |
+ subsequent page) has no granule position, and the granule position field is |
+ set to the special value '-1' in two's complement. |
+</t> |
+ |
+<t> |
+The granule position of an audio data page is in units of PCM audio samples at |
+ a fixed rate of 48 kHz (per channel; a stereo stream's granule position |
+ does not increment at twice the speed of a mono stream). |
+It is possible to run an Opus decoder at other sampling rates, but the value |
+ in the granule position field always counts samples assuming a 48 kHz |
+ decoding rate, and the rest of this specification makes the same assumption. |
+</t> |
+ |
+<t> |
+The duration of an Opus packet can be any multiple of 2.5 ms, up to a |
+ maximum of 120 ms. |
+This duration is encoded in the TOC sequence at the beginning of each packet. |
+The number of samples returned by a decoder corresponds to this duration |
+ exactly, even for the first few packets. |
+For example, a 20 ms packet fed to a decoder running at 48 kHz will |
+ always return 960 samples. |
+A demuxer can parse the TOC sequence at the beginning of each Ogg packet to |
+ work backwards or forwards from a packet with a known granule position (i.e., |
+ the last packet completed on some page) in order to assign granule positions |
+ to every packet, or even every individual sample. |
+The one exception is the last page in the stream, as described below. |
+</t> |
+ |
+<t> |
+All other pages with completed packets after the first MUST have a granule |
+ position equal to the number of samples contained in packets that complete on |
+ that page plus the granule position of the most recent page with completed |
+ packets. |
+This guarantees that a demuxer can assign individual packets the same granule |
+ position when working forwards as when working backwards. |
+For this to work, there cannot be any gaps. |
+</t> |
+ |
+<section anchor="gap-repair" title="Repairing Gaps in Real-time Streams"> |
+<t> |
+In order to support capturing a real-time stream that has lost or not |
+ transmitted packets, a multiplexer (muxer) SHOULD emit packets that explicitly |
+ request the use of Packet Loss Concealment (PLC) in place of the missing |
+ packets. |
+Implementations that fail to do so still MUST NOT increment the granule |
+ position for a page by anything other than the number of samples contained in |
+ packets that actually complete on that page. |
+</t> |
+<t> |
+Only gaps that are a multiple of 2.5 ms are repairable, as these are the |
+ only durations that can be created by packet loss or discontinuous |
+ transmission. |
+Muxers need not handle other gap sizes. |
+Creating the necessary packets involves synthesizing a TOC byte (defined in |
+Section 3.1 of <xref target="RFC6716"/>)&#8212;and whatever |
+ additional internal framing is needed&#8212;to indicate the packet duration |
+ for each stream. |
+The actual length of each missing Opus frame inside the packet is zero bytes, |
+ as defined in Section 3.2.1 of <xref target="RFC6716"/>. |
+</t> |
+ |
+<t> |
+Zero-byte frames MAY be packed into packets using any of codes 0, 1, |
+ 2, or 3. |
+When successive frames have the same configuration, the higher code packings |
+ reduce overhead. |
+Likewise, if the TOC configuration matches, the muxer MAY further combine the |
+ empty frames with previous or subsequent non-zero-length frames (using |
+ code 2 or VBR code 3). |
+</t> |
+ |
+<t> |
+<xref target="RFC6716"/> does not impose any requirements on the PLC, but this |
+ section outlines choices that are expected to have a positive influence on |
+ most PLC implementations, including the reference implementation. |
+Synthesized TOC sequences SHOULD maintain the same mode, audio bandwidth, |
+ channel count, and frame size as the previous packet (if any). |
+This is the simplest and usually the most well-tested case for the PLC to |
+ handle and it covers all losses that do not include a configuration switch, |
+ as defined in Section 4.5 of <xref target="RFC6716"/>. |
+</t> |
+ |
+<t> |
+When a previous packet is available, keeping the audio bandwidth and channel |
+ count the same allows the PLC to provide maximum continuity in the concealment |
+ data it generates. |
+However, if the size of the gap is not a multiple of the most recent frame |
+ size, then the frame size will have to change for at least some frames. |
+Such changes SHOULD be delayed as long as possible to simplify |
+ things for PLC implementations. |
+</t> |
+ |
+<t> |
+As an example, a 95 ms gap could be encoded as nineteen 5 ms frames |
+ in two bytes with a single CBR code 3 packet. |
+If the previous frame size was 20 ms, using four 20 ms frames |
+ followed by three 5 ms frames requires 4 bytes (plus an extra byte |
+ of Ogg lacing overhead), but allows the PLC to use its well-tested steady |
+ state behavior for as long as possible. |
+The total bitrate of the latter approach, including Ogg overhead, is about |
+ 0.4 kbps, so the impact on file size is minimal. |
+</t> |
+ |
+<t> |
+Changing modes is discouraged, since this causes some decoder implementations |
+ to reset their PLC state. |
+However, SILK and Hybrid mode frames cannot fill gaps that are not a multiple |
+ of 10 ms. |
+If switching to CELT mode is needed to match the gap size, a muxer SHOULD do |
+ so at the end of the gap to allow the PLC to function for as long as possible. |
+</t> |
+ |
+<t> |
+In the example above, if the previous frame was a 20 ms SILK mode frame, |
+ the better solution is to synthesize a packet describing four 20 ms SILK |
+ frames, followed by a packet with a single 10 ms SILK |
+ frame, and finally a packet with a 5 ms CELT frame, to fill the 95 ms |
+ gap. |
+This also requires four bytes to describe the synthesized packet data (two |
+ bytes for a CBR code 3 and one byte each for two code 0 packets) but three |
+ bytes of Ogg lacing overhead are needed to mark the packet boundaries. |
+At 0.6 kbps, this is still a minimal bitrate impact over a naive, low quality |
+ solution. |
+</t> |
+ |
+<t> |
+Since medium-band audio is an option only in the SILK mode, wideband frames |
+ SHOULD be generated if switching from that configuration to CELT mode, to |
+ ensure that any PLC implementation which does try to migrate state between |
+ the modes will be able to preserve all of the available audio bandwidth. |
+</t> |
+ |
+</section> |
+ |
+<section anchor="preskip" title="Pre-skip"> |
+<t> |
+There is some amount of latency introduced during the decoding process, to |
+ allow for overlap in the CELT mode, stereo mixing in the SILK mode, and |
+ resampling. |
+The encoder might have introduced additional latency through its own resampling |
+ and analysis (though the exact amount is not specified). |
+Therefore, the first few samples produced by the decoder do not correspond to |
+ real input audio, but are instead composed of padding inserted by the encoder |
+ to compensate for this latency. |
+These samples need to be stored and decoded, as Opus is an asymptotically |
+ convergent predictive codec, meaning the decoded contents of each frame depend |
+ on the recent history of decoder inputs. |
+However, a player will want to skip these samples after decoding them. |
+</t> |
+ |
+<t> |
+A 'pre-skip' field in the ID header (see <xref target="id_header"/>) signals |
+ the number of samples that SHOULD be skipped (decoded but discarded) at the |
+ beginning of the stream, though some specific applications might have a reason |
+ for looking at that data. |
+This amount need not be a multiple of 2.5 ms, MAY be smaller than a single |
+ packet, or MAY span the contents of several packets. |
+These samples are not valid audio. |
+</t> |
+ |
+<t> |
+For example, if the first Opus frame uses the CELT mode, it will always |
+ produce 120 samples of windowed overlap-add data. |
+However, the overlap data is initially all zeros (since there is no prior |
+ frame), meaning this cannot, in general, accurately represent the original |
+ audio. |
+The SILK mode requires additional delay to account for its analysis and |
+ resampling latency. |
+The encoder delays the original audio to avoid this problem. |
+</t> |
+ |
+<t> |
+The pre-skip field MAY also be used to perform sample-accurate cropping of |
+ already encoded streams. |
+In this case, a value of at least 3,840 samples (80 ms) provides |
+ sufficient history to the decoder that it will have converged |
+ before the stream's output begins. |
+</t> |
+ |
+</section> |
+ |
+<section anchor="pcm_sample_position" title="PCM Sample Position"> |
+<t> |
+The PCM sample position is determined from the granule position using the |
+ formula |
+</t> |
+<figure align="center"> |
+<artwork align="center"><![CDATA[ |
+'PCM sample position' = 'granule position' - 'pre-skip' . |
+]]></artwork> |
+</figure> |
+ |
+<t> |
+For example, if the granule position of the first audio data page is 59,971, |
+ and the pre-skip is 11,971, then the PCM sample position of the last decoded |
+ sample from that page is 48,000. |
+</t> |
+<t> |
+This can be converted into a playback time using the formula |
+</t> |
+<figure align="center"> |
+<artwork align="center"><![CDATA[ |
+ 'PCM sample position' |
+'playback time' = --------------------- . |
+ 48000.0 |
+]]></artwork> |
+</figure> |
+ |
+<t> |
+The initial PCM sample position before any samples are played is normally '0'. |
+In this case, the PCM sample position of the first audio sample to be played |
+ starts at '1', because it marks the time on the clock |
+ <spanx style="emph">after</spanx> that sample has been played, and a stream |
+ that is exactly one second long has a final PCM sample position of '48000', |
+ as in the example here. |
+</t> |
+ |
+<t> |
+Vorbis streams use a granule position smaller than the number of audio samples |
+ contained in the first audio data page to indicate that some of those samples |
+ are trimmed from the output (see <xref target="vorbis-trim"/>). |
+However, to do so, Vorbis requires that the first audio data page contains |
+ exactly two packets, in order to allow the decoder to perform PCM position |
+ adjustments before needing to return any PCM data. |
+Opus uses the pre-skip mechanism for this purpose instead, since the encoder |
+ might introduce more than a single packet's worth of latency, and since very |
+ large packets in streams with a very large number of channels might not fit |
+ on a single page. |
+</t> |
+</section> |
+ |
+<section anchor="end_trimming" title="End Trimming"> |
+<t> |
+The page with the 'end of stream' flag set MAY have a granule position that |
+ indicates the page contains less audio data than would normally be returned by |
+ decoding up through the final packet. |
+This is used to end the stream somewhere other than an even frame boundary. |
+The granule position of the most recent audio data page with completed packets |
+ is used to make this determination, or '0' is used if there were no previous |
+ audio data pages with a completed packet. |
+The difference between these granule positions indicates how many samples to |
+ keep after decoding the packets that completed on the final page. |
+The remaining samples are discarded. |
+The number of discarded samples SHOULD be no larger than the number decoded |
+ from the last packet. |
+</t> |
+</section> |
+ |
+<section anchor="start_granpos_restrictions" |
+ title="Restrictions on the Initial Granule Position"> |
+<t> |
+The granule position of the first audio data page with a completed packet MAY |
+ be larger than the number of samples contained in packets that complete on |
+ that page; however, it MUST NOT be smaller, unless that page has the 'end of |
+ stream' flag set. |
+Allowing a granule position larger than the number of samples allows the |
+ beginning of a stream to be cropped or a live stream to be joined without |
+ rewriting the granule position of all the remaining pages. |
+This means that the PCM sample position just before the first sample to be |
+ played MAY be larger than '0'. |
+Synchronization when multiplexing with other logical streams still uses the PCM |
+ sample position relative to '0' to compute sample times. |
+This does not affect the behavior of pre-skip: exactly 'pre-skip' samples |
+ SHOULD be skipped from the beginning of the decoded output, even if the |
+ initial PCM sample position is greater than zero. |
+</t> |
+ |
+<t> |
+On the other hand, a granule position that is smaller than the number of |
+ decoded samples prevents a demuxer from working backwards to assign each |
+ packet or each individual sample a valid granule position, since granule |
+ positions are non-negative. |
+An implementation MUST reject as invalid any stream where the granule position |
+ is smaller than the number of samples contained in packets that complete on |
+ the first audio data page with a completed packet, unless that page has the |
+ 'end of stream' flag set. |
+It MAY defer this action until it decodes the last packet completed on that |
+ page. |
+</t> |
+ |
+<t> |
+If that page has the 'end of stream' flag set, a demuxer MUST reject as invalid |
+ any stream where its granule position is smaller than the 'pre-skip' amount. |
+This would indicate that there are more samples to be skipped from the initial |
+ decoded output than exist in the stream. |
+If the granule position is smaller than the number of decoded samples produced |
+ by the packets that complete on that page, then a demuxer MUST use an initial |
+ granule position of '0', and can work forwards from '0' to timestamp |
+ individual packets. |
+If the granule position is larger than the number of decoded samples available, |
+ then the demuxer MUST still work backwards as described above, even if the |
+ 'end of stream' flag is set, to determine the initial granule position, and |
+ thus the initial PCM sample position. |
+Both of these will be greater than '0' in this case. |
+</t> |
+</section> |
+ |
+<section anchor="seeking_and_preroll" title="Seeking and Pre-roll"> |
+<t> |
+Seeking in Ogg files is best performed using a bisection search for a page |
+ whose granule position corresponds to a PCM position at or before the seek |
+ target. |
+With appropriately weighted bisection, accurate seeking can be performed in |
+ just one or two bisections on average, even in multi-gigabyte files. |
+See <xref target="seeking"/> for an example of general implementation guidance. |
+</t> |
+ |
+<t> |
+When seeking within an Ogg Opus stream, an implementation SHOULD start decoding |
+ (and discarding the output) at least 3,840 samples (80 ms) prior to |
+ the seek target in order to ensure that the output audio is correct by the |
+ time it reaches the seek target. |
+This 'pre-roll' is separate from, and unrelated to, the 'pre-skip' used at the |
+ beginning of the stream. |
+If the point 80 ms prior to the seek target comes before the initial PCM |
+ sample position, an implementation SHOULD start decoding from the beginning of |
+ the stream, applying pre-skip as normal, regardless of whether the pre-skip is |
+ larger or smaller than 80 ms, and then continue to discard samples |
+ to reach the seek target (if any). |
+</t> |
+</section> |
+ |
+</section> |
+ |
+<section anchor="headers" title="Header Packets"> |
+<t> |
+An Ogg Opus logical stream contains exactly two mandatory header packets: |
+ an identification header and a comment header. |
+</t> |
+ |
+<section anchor="id_header" title="Identification Header"> |
+ |
+<figure anchor="id_header_packet" title="ID Header Packet" align="center"> |
+<artwork align="center"><![CDATA[ |
+ 0 1 2 3 |
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+| 'O' | 'p' | 'u' | 's' | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+| 'H' | 'e' | 'a' | 'd' | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+| Version = 1 | Channel Count | Pre-skip | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+| Input Sample Rate (Hz) | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+| Output Gain (Q7.8 in dB) | Mapping Family| | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ : |
+| | |
+: Optional Channel Mapping Table... : |
+| | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+]]></artwork> |
+</figure> |
+ |
+<t> |
+The fields in the identification (ID) header have the following meaning: |
+<list style="numbers"> |
+<t>Magic Signature: |
+<vspace blankLines="1"/> |
+This is an 8-octet (64-bit) field that allows codec identification and is |
+ human-readable. |
+It contains, in order, the magic numbers: |
+<list style="empty"> |
+<t>0x4F 'O'</t> |
+<t>0x70 'p'</t> |
+<t>0x75 'u'</t> |
+<t>0x73 's'</t> |
+<t>0x48 'H'</t> |
+<t>0x65 'e'</t> |
+<t>0x61 'a'</t> |
+<t>0x64 'd'</t> |
+</list> |
+Starting with "Op" helps distinguish it from audio data packets, as this is an |
+ invalid TOC sequence. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Version (8 bits, unsigned): |
+<vspace blankLines="1"/> |
+The version number MUST always be '1' for this version of the encapsulation |
+ specification. |
+Implementations SHOULD treat streams where the upper four bits of the version |
+ number match that of a recognized specification as backwards-compatible with |
+ that specification. |
+That is, the version number can be split into "major" and "minor" version |
+ sub-fields, with changes to the "minor" sub-field (in the lower four bits) |
+ signaling compatible changes. |
+For example, an implementation of this specification SHOULD accept any stream |
+ with a version number of '15' or less, and SHOULD assume any stream with a |
+ version number '16' or greater is incompatible. |
+The initial version '1' was chosen to keep implementations from relying on this |
+ octet as a null terminator for the "OpusHead" string. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Output Channel Count 'C' (8 bits, unsigned): |
+<vspace blankLines="1"/> |
+This is the number of output channels. |
+This might be different than the number of encoded channels, which can change |
+ on a packet-by-packet basis. |
+This value MUST NOT be zero. |
+The maximum allowable value depends on the channel mapping family, and might be |
+ as large as 255. |
+See <xref target="channel_mapping"/> for details. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Pre-skip (16 bits, unsigned, little |
+ endian): |
+<vspace blankLines="1"/> |
+This is the number of samples (at 48 kHz) to discard from the decoder |
+ output when starting playback, and also the number to subtract from a page's |
+ granule position to calculate its PCM sample position. |
+When cropping the beginning of existing Ogg Opus streams, a pre-skip of at |
+ least 3,840 samples (80 ms) is RECOMMENDED to ensure complete |
+ convergence in the decoder. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Input Sample Rate (32 bits, unsigned, little |
+ endian): |
+<vspace blankLines="1"/> |
+This is the sample rate of the original input (before encoding), in Hz. |
+This field is <spanx style="emph">not</spanx> the sample rate to use for |
+ playback of the encoded data. |
+<vspace blankLines="1"/> |
+Opus can switch between internal audio bandwidths of 4, 6, 8, 12, and |
+ 20 kHz. |
+Each packet in the stream can have a different audio bandwidth. |
+Regardless of the audio bandwidth, the reference decoder supports decoding any |
+ stream at a sample rate of 8, 12, 16, 24, or 48 kHz. |
+The original sample rate of the audio passed to the encoder is not preserved |
+ by the lossy compression. |
+<vspace blankLines="1"/> |
+An Ogg Opus player SHOULD select the playback sample rate according to the |
+ following procedure: |
+<list style="numbers"> |
+<t>If the hardware supports 48 kHz playback, decode at 48 kHz.</t> |
+<t>Otherwise, if the hardware's highest available sample rate is a supported |
+ rate, decode at this sample rate.</t> |
+<t>Otherwise, if the hardware's highest available sample rate is less than |
+ 48 kHz, decode at the next higher Opus supported rate above the highest |
+ available hardware rate and resample.</t> |
+<t>Otherwise, decode at 48 kHz and resample.</t> |
+</list> |
+However, the 'Input Sample Rate' field allows the muxer to pass the sample |
+ rate of the original input stream as metadata. |
+This is useful when the user requires the output sample rate to match the |
+ input sample rate. |
+For example, when not playing the output, an implementation writing PCM format |
+ samples to disk might choose to resample the audio back to the original input |
+ sample rate to reduce surprise to the user, who might reasonably expect to get |
+ back a file with the same sample rate. |
+<vspace blankLines="1"/> |
+A value of zero indicates 'unspecified'. |
+Muxers SHOULD write the actual input sample rate or zero, but implementations |
+ which do something with this field SHOULD take care to behave sanely if given |
+ crazy values (e.g., do not actually upsample the output to 10 MHz if |
+ requested). |
+Implementations SHOULD support input sample rates between 8 kHz and |
+ 192 kHz (inclusive). |
+Rates outside this range MAY be ignored by falling back to the default rate of |
+ 48 kHz instead. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Output Gain (16 bits, signed, little endian): |
+<vspace blankLines="1"/> |
+This is a gain to be applied when decoding. |
+It is 20*log10 of the factor by which to scale the decoder output to achieve |
+ the desired playback volume, stored in a 16-bit, signed, two's complement |
+ fixed-point value with 8 fractional bits (i.e., Q7.8). |
+<vspace blankLines="1"/> |
+To apply the gain, an implementation could use |
+<figure align="center"> |
+<artwork align="center"><![CDATA[ |
+sample *= pow(10, output_gain/(20.0*256)) , |
+]]></artwork> |
+</figure> |
+ where output_gain is the raw 16-bit value from the header. |
+<vspace blankLines="1"/> |
+Players and media frameworks SHOULD apply it by default. |
+If a player chooses to apply any volume adjustment or gain modification, such |
+ as the R128_TRACK_GAIN (see <xref target="comment_header"/>), the adjustment |
+ MUST be applied in addition to this output gain in order to achieve playback |
+ at the normalized volume. |
+<vspace blankLines="1"/> |
+A muxer SHOULD set this field to zero, and instead apply any gain prior to |
+ encoding, when this is possible and does not conflict with the user's wishes. |
+A nonzero output gain indicates the gain was adjusted after encoding, or that |
+ a user wished to adjust the gain for playback while preserving the ability |
+ to recover the original signal amplitude. |
+<vspace blankLines="1"/> |
+Although the output gain has enormous range (+/- 128 dB, enough to amplify |
+ inaudible sounds to the threshold of physical pain), most applications can |
+ only reasonably use a small portion of this range around zero. |
+The large range serves in part to ensure that gain can always be losslessly |
+ transferred between OpusHead and R128 gain tags (see below) without |
+ saturating. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Channel Mapping Family (8 bits, unsigned): |
+<vspace blankLines="1"/> |
+This octet indicates the order and semantic meaning of the output channels. |
+<vspace blankLines="1"/> |
+Each currently specified value of this octet indicates a mapping family, which |
+ defines a set of allowed channel counts, and the ordered set of channel names |
+ for each allowed channel count. |
+The details are described in <xref target="channel_mapping"/>. |
+</t> |
+<t>Channel Mapping Table: |
+This table defines the mapping from encoded streams to output channels. |
+Its contents are specified in <xref target="channel_mapping"/>. |
+</t> |
+</list> |
+</t> |
+ |
+<t> |
+All fields in the ID headers are REQUIRED, except for the channel mapping |
+ table, which MUST be omitted when the channel mapping family is 0, but |
+ is REQUIRED otherwise. |
+Implementations SHOULD reject streams with ID headers that do not contain |
+ enough data for these fields, even if they contain a valid Magic Signature. |
+Future versions of this specification, even backwards-compatible versions, |
+ might include additional fields in the ID header. |
+If an ID header has a compatible major version, but a larger minor version, |
+ an implementation MUST NOT reject it for containing additional data not |
+ specified here, provided it still completes on the first page. |
+</t> |
+ |
+<section anchor="channel_mapping" title="Channel Mapping"> |
+<t> |
+An Ogg Opus stream allows mapping one number of Opus streams (N) to a possibly |
+ larger number of decoded channels (M + N) to yet another number of |
+ output channels (C), which might be larger or smaller than the number of |
+ decoded channels. |
+The order and meaning of these channels are defined by a channel mapping, |
+ which consists of the 'channel mapping family' octet and, for channel mapping |
+ families other than family 0, a channel mapping table, as illustrated in |
+ <xref target="channel_mapping_table"/>. |
+</t> |
+ |
+<figure anchor="channel_mapping_table" title="Channel Mapping Table" |
+ align="center"> |
+<artwork align="center"><![CDATA[ |
+ 0                   1                   2                   3 |
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 |
+                                                +-+-+-+-+-+-+-+-+ |
+                                                | Stream Count  | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+| Coupled Count |              Channel Mapping...               : |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+]]></artwork> |
+</figure> |
+ |
+<t> |
+The fields in the channel mapping table have the following meaning: |
+<list style="numbers" counter="8"> |
+<t>Stream Count 'N' (8 bits, unsigned): |
+<vspace blankLines="1"/> |
+This is the total number of streams encoded in each Ogg packet. |
+This value is necessary to correctly parse the packed Opus packets inside an |
+ Ogg packet, as described in <xref target="packet_organization"/>. |
+This value MUST NOT be zero, as without at least one Opus packet with a valid |
+ TOC sequence, a demuxer cannot recover the duration of an Ogg packet. |
+<vspace blankLines="1"/> |
+For channel mapping family 0, this value defaults to 1, and is not coded. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Coupled Stream Count 'M' (8 bits, unsigned): |
+This is the number of streams whose decoders are to be configured to produce |
+ two channels (stereo). |
+This MUST be no larger than the total number of streams, N. |
+<vspace blankLines="1"/> |
+Each packet in an Opus stream has an internal channel count of 1 or 2, which |
+ can change from packet to packet. |
+This is selected by the encoder depending on the bitrate and the audio being |
+ encoded. |
+The original channel count of the audio passed to the encoder is not |
+ necessarily preserved by the lossy compression. |
+<vspace blankLines="1"/> |
+Regardless of the internal channel count, any Opus stream can be decoded as |
+ mono (a single channel) or stereo (two channels) by appropriate initialization |
+ of the decoder. |
+The 'coupled stream count' field indicates that the decoders for the first M |
+ Opus streams are to be initialized for stereo (two-channel) output, and the |
+ remaining (N - M) decoders are to be initialized for mono (a single |
+ channel) only. |
+The total number of decoded channels, (M + N), MUST be no larger than |
+ 255, as there is no way to index more channels than that in the channel |
+ mapping. |
+<vspace blankLines="1"/> |
+For channel mapping family 0, this value defaults to (C - 1) |
+ (i.e., 0 for mono and 1 for stereo), and is not coded. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Channel Mapping (8*C bits): |
+This contains one octet per output channel, indicating which decoded channel |
+ is to be used for each one. |
+Let 'index' be the value of this octet for a particular output channel. |
+This value MUST either be smaller than (M + N), or be the special |
+ value 255. |
+If 'index' is less than 2*M, the output MUST be taken from decoding stream |
+ ('index'/2) as stereo and selecting the left channel if 'index' is even, and |
+ the right channel if 'index' is odd. |
+If 'index' is 2*M or larger, but less than 255, the output MUST be taken from |
+ decoding stream ('index' - M) as mono. |
+If 'index' is 255, the corresponding output channel MUST contain pure silence. |
+<vspace blankLines="1"/> |
+The number of output channels, C, is not constrained to match the number of |
+ decoded channels (M + N). |
+A single index value MAY appear multiple times, i.e., the same decoded channel |
+ might be mapped to multiple output channels. |
+Some decoded channels might not be assigned to any output channel, as well. |
+<vspace blankLines="1"/> |
+For channel mapping family 0, the first index defaults to 0, and if |
+ C == 2, the second index defaults to 1. |
+Neither index is coded. |
+</t> |
+</list> |
+</t> |
+ |
+<t> |
+After producing the output channels, the channel mapping family determines the |
+ semantic meaning of each one. |
+There are three defined mapping families in this specification. |
+</t> |
+ |
+<section anchor="channel_mapping_0" title="Channel Mapping Family 0"> |
+<t> |
+Allowed numbers of channels: 1 or 2. |
+RTP mapping. |
+This is the same channel interpretation as <xref target="RFC7587"/>. |
+</t> |
+<t> |
+<list style="symbols"> |
+<t>1 channel: monophonic (mono).</t> |
+<t>2 channels: stereo (left, right).</t> |
+</list> |
+Special mapping: This channel mapping value also |
+ indicates that the content consists of a single Opus stream that is stereo if |
+ and only if C == 2, with stream index 0 mapped to output |
+ channel 0 (mono, or left channel) and stream index 1 mapped to |
+ output channel 1 (right channel) if stereo. |
+When the 'channel mapping family' octet has this value, the channel mapping |
+ table MUST be omitted from the ID header packet. |
+</t> |
+</section> |
+ |
+<section anchor="channel_mapping_1" title="Channel Mapping Family 1"> |
+<t> |
+Allowed numbers of channels: 1...8. |
+Vorbis channel order (see below). |
+</t> |
+<t> |
+Each channel is assigned to a speaker location in a conventional surround |
+ arrangement. |
+Specific locations depend on the number of channels, and are given below |
+ in order of the corresponding channel indices. |
+<list style="symbols"> |
+ <t>1 channel: monophonic (mono).</t> |
+ <t>2 channels: stereo (left, right).</t> |
+ <t>3 channels: linear surround (left, center, right).</t> |
+ <t>4 channels: quadraphonic (front left, front right, rear left, rear right).</t> |
+ <t>5 channels: 5.0 surround (front left, front center, front right, rear left, rear right).</t> |
+ <t>6 channels: 5.1 surround (front left, front center, front right, rear left, rear right, LFE).</t> |
+ <t>7 channels: 6.1 surround (front left, front center, front right, side left, side right, rear center, LFE).</t> |
+ <t>8 channels: 7.1 surround (front left, front center, front right, side left, side right, rear left, rear right, LFE).</t> |
+</list> |
+</t> |
+<t> |
+This set of surround options and speaker location orderings is the same |
+ as those used by the Vorbis codec <xref target="vorbis-mapping"/>. |
+The ordering is different from the one used by the |
+ WAVE <xref target="wave-multichannel"/> and |
+ Free Lossless Audio Codec (FLAC) <xref target="flac"/> formats, |
+ so correct ordering requires permutation of the output channels when decoding |
+ to or encoding from those formats. |
+'LFE' here refers to a Low Frequency Effects channel, often mapped to a |
+ subwoofer with no particular spatial position. |
+Implementations SHOULD identify 'side' or 'rear' speaker locations with |
+ 'surround' and 'back' as appropriate when interfacing with audio formats |
+ or systems which prefer that terminology. |
+</t> |
+</section> |
+ |
+<section anchor="channel_mapping_255" |
+ title="Channel Mapping Family 255"> |
+<t> |
+Allowed numbers of channels: 1...255. |
+No defined channel meaning. |
+</t> |
+<t> |
+Channels are unidentified. |
+General-purpose players SHOULD NOT attempt to play these streams. |
+Offline implementations MAY deinterleave the output into separate PCM files, |
+ one per channel. |
+Implementations SHOULD NOT produce output for channels mapped to stream index |
+ 255 (pure silence) unless they have no other way to indicate the index of |
+ non-silent channels. |
+</t> |
+</section> |
+ |
+<section anchor="channel_mapping_undefined" |
+ title="Undefined Channel Mappings"> |
+<t> |
+The remaining channel mapping families (2...254) are reserved. |
+A demuxer implementation encountering a reserved channel mapping family value |
+ SHOULD act as though the value is 255. |
+</t> |
+</section> |
+ |
+<section anchor="downmix" title="Downmixing"> |
+<t> |
+An Ogg Opus player MUST support any valid channel mapping with a channel |
+ mapping family of 0 or 1, even if the number of channels does not match the |
+ physically connected audio hardware. |
+Players SHOULD perform channel mixing to increase or reduce the number of |
+ channels as needed. |
+</t> |
+ |
+<t> |
+Implementations MAY use the following matrices to implement downmixing from |
+ multichannel files using <xref target="channel_mapping_1">Channel Mapping |
+ Family 1</xref>, which are known to give acceptable results for stereo. |
+Matrices for 3 and 4 channels are normalized so each coefficient row sums |
+ to 1 to avoid clipping. |
+For 5 or more channels they are normalized to 2 as a compromise between |
+ clipping and dynamic range reduction. |
+</t> |
+<t> |
+In these matrices the front left and front right channels are generally |
+passed through directly. |
+When a surround channel is split between both the left and right stereo |
+ channels, coefficients are chosen so their squares sum to 1, which |
+ helps preserve the perceived intensity. |
+Rear channels are mixed more diffusely or attenuated to maintain focus |
+ on the front channels. |
+</t> |
+ |
+<figure anchor="downmix-matrix-3" |
+ title="Stereo downmix matrix for the linear surround channel mapping" |
+ align="center"> |
+<artwork align="center"><![CDATA[ |
+L output = ( 0.585786 * left + 0.414214 * center ) |
+R output = ( 0.414214 * center + 0.585786 * right ) |
+]]></artwork> |
+<postamble> |
+Exact coefficient values are 1 and 1/sqrt(2), multiplied by |
+ 1/(1 + 1/sqrt(2)) for normalization. |
+</postamble> |
+</figure> |
+ |
+<figure anchor="downmix-matrix-4" |
+ title="Stereo downmix matrix for the quadraphonic channel mapping" |
+ align="center"> |
+<artwork align="center"><![CDATA[ |
+/          \   /                                     \ / FL \ |
+| L output |   | 0.422650 0.000000 0.366025 0.211325 | | FR | |
+| R output | = | 0.000000 0.422650 0.211325 0.366025 | | RL | |
+\          /   \                                     / \ RR / |
+]]></artwork> |
+<postamble> |
+Exact coefficient values are 1, sqrt(3)/2 and 1/2, multiplied by |
+ 1/(1 + sqrt(3)/2 + 1/2) for normalization. |
+</postamble> |
+</figure> |
+ |
+<figure anchor="downmix-matrix-5" |
+ title="Stereo downmix matrix for the 5.0 surround mapping" |
+ align="center"> |
+<artwork align="center"><![CDATA[ |
+                                                         / FL \ |
+/   \   /                                              \ | FC | |
+| L |   | 0.650802 0.460186 0.000000 0.563611 0.325401 | | FR | |
+| R | = | 0.000000 0.460186 0.650802 0.325401 0.563611 | | RL | |
+\   /   \                                              / | RR | |
+                                                         \    / |
+]]></artwork> |
+<postamble> |
+Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by |
+ 2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2) |
+ for normalization. |
+</postamble> |
+</figure> |
+ |
+<figure anchor="downmix-matrix-6" |
+ title="Stereo downmix matrix for the 5.1 surround mapping" |
+ align="center"> |
+<artwork align="center"><![CDATA[ |
+                                                                /FL \ |
+/ \   /                                                       \ |FC | |
+|L|   | 0.529067 0.374107 0.000000 0.458186 0.264534 0.374107 | |FR | |
+|R| = | 0.000000 0.374107 0.529067 0.264534 0.458186 0.374107 | |RL | |
+\ /   \                                                       / |RR | |
+                                                                \LFE/ |
+]]></artwork> |
+<postamble> |
+Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by |
+2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2 + 1/sqrt(2)) |
+ for normalization. |
+</postamble> |
+</figure> |
+ |
+<figure anchor="downmix-matrix-7" |
+ title="Stereo downmix matrix for the 6.1 surround mapping" |
+ align="center"> |
+<artwork align="center"><![CDATA[ |
+ /                                                                \ |
+ | 0.455310 0.321953 0.000000 0.394310 0.227655 0.278819 0.321953 | |
+ | 0.000000 0.321953 0.455310 0.227655 0.394310 0.278819 0.321953 | |
+ \                                                                / |
+]]></artwork> |
+<postamble> |
+Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2, 1/2 and |
+ sqrt(3)/2/sqrt(2), multiplied by |
+ 2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2 + |
+ sqrt(3)/2/sqrt(2) + 1/sqrt(2)) for normalization. |
+The coefficients are in the same order as in <xref target="channel_mapping_1" />, |
+ and the matrices above. |
+</postamble> |
+</figure> |
+ |
+<figure anchor="downmix-matrix-8" |
+ title="Stereo downmix matrix for the 7.1 surround mapping" |
+ align="center"> |
+<artwork align="center"><![CDATA[ |
+/                                                                 \ |
+| .388631 .274804 .000000 .336565 .194316 .336565 .194316 .274804 | |
+| .000000 .274804 .388631 .194316 .336565 .194316 .336565 .274804 | |
+\                                                                 / |
+]]></artwork> |
+<postamble> |
+Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by |
+ 2/(2 + 2/sqrt(2) + sqrt(3)) for normalization. |
+The coefficients are in the same order as in <xref target="channel_mapping_1" />, |
+ and the matrices above. |
+</postamble> |
+</figure> |
+ |
+</section> |
+ |
+</section> <!-- end channel_mapping --> |
+ |
+</section> <!-- end id_header --> |
+ |
+<section anchor="comment_header" title="Comment Header"> |
+ |
+<figure anchor="comment_header_packet" title="Comment Header Packet" |
+ align="center"> |
+<artwork align="center"><![CDATA[ |
+ 0                   1                   2                   3 |
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+|      'O'      |      'p'      |      'u'      |      's'      | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+|      'T'      |      'a'      |      'g'      |      's'      | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+|                     Vendor String Length                      | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+|                                                               | |
+:                       Vendor String...                        : |
+|                                                               | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+|                   User Comment List Length                    | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+|                 User Comment #0 String Length                 | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+|                                                               | |
+:                   User Comment #0 String...                   : |
+|                                                               | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+|                 User Comment #1 String Length                 | |
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+:                                                               : |
+]]></artwork> |
+</figure> |
+ |
+<t> |
+The comment header consists of a 64-bit magic signature, followed by data in |
+ the same format as the <xref target="vorbis-comment"/> header used in Ogg |
+ Vorbis, except (like Ogg Theora and Speex) the final "framing bit" specified |
+ in the Vorbis spec is not present. |
+<list style="numbers"> |
+<t>Magic Signature: |
+<vspace blankLines="1"/> |
+This is an 8-octet (64-bit) field that allows codec identification and is |
+ human-readable. |
+It contains, in order, the magic numbers: |
+<list style="empty"> |
+<t>0x4F 'O'</t> |
+<t>0x70 'p'</t> |
+<t>0x75 'u'</t> |
+<t>0x73 's'</t> |
+<t>0x54 'T'</t> |
+<t>0x61 'a'</t> |
+<t>0x67 'g'</t> |
+<t>0x73 's'</t> |
+</list> |
+Starting with "Op" helps distinguish it from audio data packets, as this is an |
+ invalid TOC sequence. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Vendor String Length (32 bits, unsigned, little endian): |
+<vspace blankLines="1"/> |
+This field gives the length of the following vendor string, in octets. |
+It MUST NOT indicate that the vendor string is longer than the rest of the |
+ packet. |
+<vspace blankLines="1"/> |
+</t> |
+<t>Vendor String (variable length, UTF-8 vector): |
+<vspace blankLines="1"/> |
+This is a simple human-readable tag for vendor information, encoded as a UTF-8 |
+ string <xref target="RFC3629"/>. |
+No terminating null octet is necessary. |
+<vspace blankLines="1"/> |
+This tag is intended to identify the codec encoder and encapsulation |
+ implementations, for tracing differences in technical behavior. |
+User-facing applications can use the 'ENCODER' user comment tag to identify |
+ themselves. |
+<vspace blankLines="1"/> |
+</t> |
+<t>User Comment List Length (32 bits, unsigned, little endian): |
+<vspace blankLines="1"/> |
+This field indicates the number of user-supplied comments. |
+It MAY indicate there are zero user-supplied comments, in which case there are |
+ no additional fields in the packet. |
+It MUST NOT indicate that there are so many comments that the comment string |
+ lengths would require more data than is available in the rest of the packet. |
+<vspace blankLines="1"/> |
+</t> |
+<t>User Comment #i String Length (32 bits, unsigned, little endian): |
+<vspace blankLines="1"/> |
+This field gives the length of the following user comment string, in octets. |
+There is one for each user comment indicated by the 'user comment list length' |
+ field. |
+It MUST NOT indicate that the string is longer than the rest of the packet. |
+<vspace blankLines="1"/> |
+</t> |
+<t>User Comment #i String (variable length, UTF-8 vector): |
+<vspace blankLines="1"/> |
+This field contains a single user comment string. |
+There is one for each user comment indicated by the 'user comment list length' |
+ field. |
+</t> |
+</list> |
+</t> |
+ |
+<t> |
+The vendor string length and user comment list length are REQUIRED, and |
+ implementations SHOULD reject comment headers that do not contain enough data |
+ for these fields, or that do not contain enough data for the corresponding |
+ vendor string or user comments they describe. |
+Making this check before allocating the associated memory to contain the data |
+ helps prevent a possible Denial-of-Service (DoS) attack from small comment |
+ headers that claim to contain strings longer than the entire packet or more |
+ user comments than could possibly fit in the packet. |
+</t> |
+ |
+<t> |
+Immediately following the user comment list, the comment header MAY |
+ contain zero-padding or other binary data which is not specified here. |
+If the least-significant bit of the first byte of this data is 1, then editors |
+ SHOULD preserve the contents of this data when updating the tags, but if this |
+ bit is 0, all such data MAY be treated as padding, and truncated or discarded |
+ as desired. |
+This allows informal experimentation with the format of this binary data until |
+ it can be specified later. |
+</t> |
+ |
+<t> |
+The comment header can be arbitrarily large and might be spread over a large |
+ number of Ogg pages. |
+Implementations MUST avoid attempting to allocate excessive amounts of memory |
+ when presented with a very large comment header. |
+To accomplish this, implementations MAY reject a comment header larger than |
+ 125,829,120 octets, and MAY ignore individual comments that are not fully |
+ contained within the first 61,440 octets of the comment header. |
+</t> |
+ |
+<section anchor="comment_format" title="Tag Definitions"> |
+<t> |
+The user comment strings follow the NAME=value format described by |
+ <xref target="vorbis-comment"/> with the same recommended tag names: |
+ ARTIST, TITLE, DATE, ALBUM, and so on. |
+</t> |
+<t> |
+Two new comment tags are introduced here: |
+</t> |
+ |
+<t>First, an optional gain for track normalization:</t> |
+<figure align="center"> |
+<artwork align="left"><![CDATA[ |
+R128_TRACK_GAIN=-573 |
+]]></artwork> |
+</figure> |
+<t> |
+ representing the volume shift needed to normalize the track's volume |
+ during isolated playback, in random shuffle, and so on. |
+The gain is a Q7.8 fixed point number in dB, as in the ID header's 'output |
+ gain' field. |
+This tag is similar to the REPLAYGAIN_TRACK_GAIN tag in |
+ Vorbis <xref target="replay-gain"/>, except that the normal volume |
+ reference is the <xref target="EBU-R128"/> standard. |
+</t> |
+<t>Second, an optional gain for album normalization:</t> |
+<figure align="center"> |
+<artwork align="left"><![CDATA[ |
+R128_ALBUM_GAIN=111 |
+]]></artwork> |
+</figure> |
+<t> |
+ representing the volume shift needed to normalize the overall volume when |
+ played as part of a particular collection of tracks. |
+The gain is also a Q7.8 fixed point number in dB, as in the ID header's |
+ 'output gain' field. |
+</t> |
+<t> |
+An Ogg Opus stream MUST NOT have more than one of each of these tags, and if |
+ present their values MUST be an integer from -32768 to 32767, inclusive, |
+ represented in ASCII as a base 10 number with no whitespace. |
+A leading '+' or '-' character is valid. |
+Leading zeros are also permitted, but the value MUST be represented by |
+ no more than 6 characters. |
+Other non-digit characters MUST NOT be present. |
+</t> |
+<t> |
+If present, R128_TRACK_GAIN and R128_ALBUM_GAIN MUST correctly represent |
+ the R128 normalization gain relative to the 'output gain' field specified |
+ in the ID header. |
+If a player chooses to make use of the R128_TRACK_GAIN tag or the |
+ R128_ALBUM_GAIN tag, it MUST apply those gains |
+ <spanx style="emph">in addition</spanx> to the 'output gain' value. |
+If a tool modifies the ID header's 'output gain' field, it MUST also update or |
+ remove the R128_TRACK_GAIN and R128_ALBUM_GAIN comment tags if present. |
+A muxer SHOULD place the gain it wants other tools to use by default into the |
+ 'output gain' field, and not the comment tag. |
+</t> |
+<t> |
+To avoid confusion with multiple normalization schemes, an Opus comment header |
+ SHOULD NOT contain any of the REPLAYGAIN_TRACK_GAIN, REPLAYGAIN_TRACK_PEAK, |
+ REPLAYGAIN_ALBUM_GAIN, or REPLAYGAIN_ALBUM_PEAK tags, unless they are only |
+ to be used in some context where there is guaranteed to be no such confusion. |
+<xref target="EBU-R128"/> normalization is preferred to the earlier |
+ REPLAYGAIN schemes because of its clear definition and adoption by industry. |
+Peak normalizations are difficult to calculate reliably for lossy codecs |
+ because of variation in excursion heights due to decoder differences. |
+In the authors' investigations they were not applied consistently or broadly |
+ enough to merit inclusion here. |
+</t> |
+</section> <!-- end comment_format --> |
+</section> <!-- end comment_header --> |
+ |
+</section> <!-- end headers --> |
+ |
+<section anchor="packet_size_limits" title="Packet Size Limits"> |
+<t> |
+Technically, valid Opus packets can be arbitrarily large due to the padding |
+ format, although the amount of non-padding data they can contain is bounded. |
+These packets might be spread over a similarly enormous number of Ogg pages. |
+When encoding, implementations SHOULD limit the use of padding in audio data |
+ packets to no more than is necessary to make a variable bitrate (VBR) stream |
+ constant bitrate (CBR), unless they have no reasonable way to determine what |
+ is necessary. |
+Demuxers SHOULD reject audio data packets (treat them as if they were malformed |
+ Opus packets with an invalid TOC sequence) larger than 61,440 octets per |
+ Opus stream, unless they have a specific reason for allowing extra padding. |
+Such packets necessarily contain more padding than needed to make a stream CBR. |
+Demuxers MUST avoid attempting to allocate excessive amounts of memory when |
+ presented with a very large packet. |
+Demuxers MAY reject or partially process audio data packets larger than |
+ 61,440 octets in an Ogg Opus stream with channel mapping families 0 |
+ or 1. |
+Demuxers MAY reject or partially process audio data packets in any Ogg Opus |
+ stream if the packet is larger than 61,440 octets and also larger than |
+ 7,680 octets per Opus stream. |
+The presence of an extremely large packet in the stream could indicate a |
+ memory exhaustion attack or stream corruption. |
+</t> |
+<t> |
+In an Ogg Opus stream, the largest possible valid packet that does not use |
+ padding has a size of (61,298*N - 2) octets. |
+With 255 streams, this is 15,630,988 octets and can |
+ span up to 61,298 Ogg pages, all but one of which will have a granule |
+ position of -1. |
+This is of course a very extreme packet, consisting of 255 streams, each |
+ containing 120 ms of audio encoded as 2.5 ms frames, each frame |
+ using the maximum possible number of octets (1275) and stored in the least |
+ efficient manner allowed (a VBR code 3 Opus packet). |
+Even in such a packet, most of the data will be zeros as 2.5 ms frames |
+ cannot actually use all 1275 octets. |
+</t> |
+<t> |
+The largest packet consisting of entirely useful data is |
+ (15,326*N - 2) octets. |
+This corresponds to 120 ms of audio encoded as 10 ms frames in either |
+ SILK or Hybrid mode, but at a data rate of over 1 Mbps, which makes little |
+ sense for the quality achieved. |
+</t> |
+<t> |
+A more reasonable limit is (7,664*N - 2) octets. |
+This corresponds to 120 ms of audio encoded as 20 ms stereo CELT mode |
+ frames, with a total bitrate just under 511 kbps (not counting the Ogg |
+ encapsulation overhead). |
+For channel mapping family 1, N=8 provides a reasonable upper bound, as it |
+ allows for each of the 8 possible output channels to be decoded from a |
+ separate stereo Opus stream. |
+This gives a size of 61,310 octets, which is rounded up to a multiple of |
+ 1,024 octets to yield the audio data packet size of 61,440 octets |
+ that any implementation is expected to be able to process successfully. |
+</t> |
+</section> |
+ |
+<section anchor="encoder" title="Encoder Guidelines"> |
+<t> |
+When encoding Opus streams, Ogg muxers SHOULD take into account the |
+ algorithmic delay of the Opus encoder. |
+</t> |
+<t> |
+In encoders derived from the reference |
+ implementation <xref target="RFC6716"/>, the number of samples of |
+ algorithmic delay can be queried with: |
+</t> |
+<figure align="center"> |
+<artwork align="center"><![CDATA[ |
+ opus_encoder_ctl(encoder_state, OPUS_GET_LOOKAHEAD(&delay_samples)); |
+]]></artwork> |
+</figure> |
+<t> |
+To achieve good quality in the very first samples of a stream, implementations |
+ MAY use linear predictive coding (LPC) extrapolation to generate at least 120 |
+ extra samples at the beginning to avoid the Opus encoder having to encode a |
+ discontinuous signal. |
+For more information on linear prediction, see |
+ <xref target="linear-prediction"/>. |
+For an input file containing 'length' samples, the implementation SHOULD set |
+ the pre-skip header value to (delay_samples + extra_samples), encode |
+ at least (length + delay_samples + extra_samples) |
+ samples, and set the granule position of the last page to |
+ (length + delay_samples + extra_samples). |
+This ensures that the encoded file has the same duration as the original, with |
+ no time offset. The best way to pad the end of the stream is to also use LPC |
+ extrapolation, but zero-padding is also acceptable. |
+</t> |
+ |
+<section anchor="lpc" title="LPC Extrapolation"> |
+<t> |
+The first step in LPC extrapolation is to compute linear prediction |
+ coefficients. <xref target="lpc-sample"/> |
+When extending the end of the signal, order-N (typically with N ranging from 8 |
+ to 40) LPC analysis is performed on a window near the end of the signal. |
+The last N samples are used as memory to an infinite impulse response (IIR) |
+ filter. |
+</t> |
+<t> |
+The filter is then applied on a zero input to extrapolate the end of the signal. |
+Let a(k) be the kth LPC coefficient and x(n) be the nth sample of the signal. |
+Each new sample past the end of the signal is computed as: |
+</t> |
+<figure align="center"> |
+<artwork align="center"><![CDATA[ |
+        N |
+       --- |
+x(n) = \   a(k)*x(n-k) |
+       / |
+       --- |
+       k=1 |
+]]></artwork> |
+</figure> |
+<t> |
+The process is repeated independently for each channel. |
+It is possible to extend the beginning of the signal by applying the same |
+ process backward in time. |
+When extending the beginning of the signal, it is best to apply a "fade in" to |
+ the extrapolated signal, e.g., by multiplying it by a half-Hanning window |
+ <xref target="hanning"/>. |
+</t> |
+ |
+</section> |
+ |
+<section anchor="continuous_chaining" title="Continuous Chaining"> |
+<t> |
+In some applications, such as Internet radio, it is desirable to cut a long |
+ stream into smaller chains, e.g. so the comment header can be updated. |
+This can be done simply by separating the input streams into segments and |
+ encoding each segment independently. |
+The drawback of this approach is that it creates a small discontinuity |
+ at the boundary due to the lossy nature of Opus. |
+A muxer MAY avoid this discontinuity by using the following procedure: |
+<list style="numbers"> |
+<t>Encode the last frame of the first segment as an independent frame by |
+ turning off all forms of inter-frame prediction. |
+De-emphasis is allowed.</t> |
+<t>Set the granule position of the last page to a point near the end of the |
+ last frame.</t> |
+<t>Begin the second segment with a copy of the last frame of the first |
+ segment.</t> |
+<t>Set the pre-skip value of the second stream in such a way as to properly |
+ join the two streams.</t> |
+<t>Continue the encoding process normally from there, without any reset to |
+ the encoder.</t> |
+</list> |
+</t> |
+<t> |
+In encoders derived from the reference implementation, inter-frame prediction |
+ can be turned off by calling: |
+</t> |
+<figure align="center"> |
+<artwork align="center"><![CDATA[ |
+ opus_encoder_ctl(encoder_state, OPUS_SET_PREDICTION_DISABLED(1)); |
+]]></artwork> |
+</figure> |
+<t> |
+For best results, this implementation requires that prediction be explicitly |
+ enabled again before resuming normal encoding, even after a reset. |
+</t> |
+ |
+</section> |
+ |
+</section> |
+ |
+<section anchor="implementation" title="Implementation Status"> |
+<t> |
+A brief summary of major implementations of this draft is available |
+ at <eref target="https://wiki.xiph.org/OggOpusImplementation"/>, |
+ along with their status. |
+</t> |
+<t> |
+[Note to RFC Editor: please remove this entire section before |
+ final publication per <xref target="RFC6982"/>, along with |
+ its references.] |
+</t> |
+</section> |
+ |
+<section anchor="security" title="Security Considerations"> |
+<t> |
+Implementations of the Opus codec need to take appropriate security |
+ considerations into account, as outlined in <xref target="RFC4732"/>. |
+This is just as much a problem for the container as it is for the codec itself. |
+Robustness against malicious payloads is extremely important. |
+Malicious payloads MUST NOT cause an implementation to overrun its allocated |
+ memory or to take an excessive amount of resources to decode. |
+Although problems in encoding applications are typically rarer, the same |
+ applies to the muxer. |
+Malicious audio input streams MUST NOT cause an implementation to overrun its |
+ allocated memory or consume excessive resources because this would allow an |
+ attacker to attack transcoding gateways. |
+</t> |
+ |
+<t> |
+Like most other container formats, Ogg Opus streams SHOULD NOT be used with |
+ insecure ciphers or cipher modes that are vulnerable to known-plaintext |
+ attacks. |
+Elements such as the Ogg page capture pattern and the magic signatures in the |
+ ID header and the comment header all have easily predictable values, in |
+ addition to various elements of the codec data itself. |
+</t> |
+</section> |
+ |
+<section anchor="content_type" title="Content Type"> |
+<t> |
+An "Ogg Opus file" consists of one or more sequentially multiplexed segments, |
+ each containing exactly one Ogg Opus stream. |
+The RECOMMENDED mime-type for Ogg Opus files is "audio/ogg". |
+</t> |
+ |
+<t> |
+If more specificity is desired, one MAY indicate the presence of Opus streams |
+ using the codecs parameter defined in <xref target="RFC6381"/> and |
+ <xref target="RFC5334"/>, e.g., |
+</t> |
+<figure> |
+<artwork align="center"><![CDATA[ |
+ audio/ogg; codecs=opus |
+]]></artwork> |
+</figure> |
+<t> |
+ for an Ogg Opus file. |
+</t> |
+ |
+<t> |
+The RECOMMENDED filename extension for Ogg Opus files is '.opus'. |
+</t> |
+ |
+<t> |
+When Opus is concurrently multiplexed with other streams in an Ogg container, |
+ one SHOULD use one of the "audio/ogg", "video/ogg", or "application/ogg" |
+ mime-types, as defined in <xref target="RFC5334"/>. |
+Such streams are not strictly "Ogg Opus files" as described above, |
+ since they contain more than a single Opus stream per sequentially |
+ multiplexed segment, e.g. video or multiple audio tracks. |
+In such cases the '.opus' filename extension is NOT RECOMMENDED. |
+</t> |
+ |
+<t> |
+In either case, this document updates <xref target="RFC5334"/> |
+ to add 'opus' as a codecs parameter value with char[8]: 'OpusHead' |
+ as Codec Identifier. |
+</t> |
+</section> |
+ |
+<section anchor="iana" title="IANA Considerations"> |
+<t> |
+This document updates the IANA Media Types registry to add .opus |
+ as a file extension for "audio/ogg", and to add itself as a reference |
+ alongside <xref target="RFC5334"/> for "audio/ogg", "video/ogg", and |
+ "application/ogg" Media Types. |
+</t> |
+<t> |
+This document defines a new registry "Opus Channel Mapping Families" to |
+ indicate how the semantic meanings of the channels in a multi-channel Opus |
+ stream are described. |
+IANA SHALL create a new name space of "Opus Channel Mapping Families". |
+All maintenance within and additions to the contents of this name space MUST be |
+ according to the "Specification Required with Expert Review" registration |
+ policy as defined in <xref target="RFC5226"/>. |
+Each registry entry consists of a Channel Mapping Family Number, which is |
+ specified in decimal in the range 0 to 255, inclusive, and a Reference (or |
+ list of references). |
+Each Reference must point to sufficient documentation to describe what |
+ information is coded in the Opus identification header for this channel |
+ mapping family, how a demuxer determines the Stream Count ('N') and Coupled |
+ Stream Count ('M') from this information, and how it determines the proper |
+ interpretation of each of the decoded channels. |
+</t> |
+<t> |
+This document defines three initial assignments for this registry. |
+</t> |
+<texttable> |
+<ttcol>Value</ttcol><ttcol>Reference</ttcol> |
+<c>0</c><c>[RFCXXXX] <xref target="channel_mapping_0"/></c> |
+<c>1</c><c>[RFCXXXX] <xref target="channel_mapping_1"/></c> |
+<c>255</c><c>[RFCXXXX] <xref target="channel_mapping_255"/></c> |
+</texttable> |
+<t> |
+The designated expert will determine if the Reference points to a specification |
+ that meets the requirements for permanence and ready availability laid out |
+ in <xref target="RFC5226"/> and that it specifies the information |
+ described above with sufficient clarity to allow interoperable |
+ implementations. |
+</t> |
+</section> |
+ |
+<section anchor="Acknowledgments" title="Acknowledgments"> |
+<t> |
+Thanks to Ben Campbell, Mark Harris, Greg Maxwell, Christopher "Monty" |
+ Montgomery, Jean-Marc Valin, and Mo Zanaty for their valuable contributions to |
+ this document. |
+Additional thanks to Andrew D'Addesio, Greg Maxwell, and Vincent Penquerc'h for |
+ their feedback based on early implementations. |
+</t> |
+</section> |
+ |
+<section title="RFC Editor Notes"> |
+<t> |
+In <xref target="iana"/>, "RFCXXXX" is to be replaced with the RFC number |
+ assigned to this draft. |
+</t> |
+<t> |
+In the Copyright Notice at the start of the document, the following paragraph |
+ is to be appended after the regular copyright notice text: |
+</t> |
+<t> |
+"The licenses granted by the IETF Trust to this RFC under Section 3.c of |
+ the Trust Legal Provisions shall also include the right to extract text from |
+ Sections 1 through 14 of this RFC and create derivative works from |
+ these extracts, and to copy, publish, display, and distribute such derivative |
+ works in any medium and for any purpose, provided that no such derivative work |
+ shall be presented, displayed, or published in a manner that states or implies |
+ that it is part of this RFC or any other IETF Document." |
+</t> |
+</section> |
+ |
+</middle> |
+<back> |
+<references title="Normative References"> |
+ &rfc2119; |
+ &rfc3533; |
+ &rfc3629; |
+ &rfc4732; |
+ &rfc5226; |
+ &rfc5334; |
+ &rfc6381; |
+ &rfc6716; |
+ |
+<reference anchor="EBU-R128" target="https://tech.ebu.ch/loudness"> |
+<front> |
+ <title>Loudness Recommendation EBU R128</title> |
+ <author> |
+ <organization>EBU Technical Committee</organization> |
+ </author> |
+ <date month="August" year="2011"/> |
+</front> |
+</reference> |
+ |
+<reference anchor="vorbis-comment" |
+ target="https://www.xiph.org/vorbis/doc/v-comment.html"> |
+<front> |
+<title>Ogg Vorbis I Format Specification: Comment Field and Header |
+ Specification</title> |
+<author initials="C." surname="Montgomery" |
+ fullname="Christopher &quot;Monty&quot; Montgomery"/> |
+<date month="July" year="2002"/> |
+</front> |
+</reference> |
+ |
+</references> |
+ |
+<references title="Informative References"> |
+ |
+<!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.3550.xml"?--> |
+ &rfc6982; |
+ &rfc7587; |
+ |
+<reference anchor="flac" |
+ target="https://xiph.org/flac/format.html"> |
+ <front> |
+ <title>FLAC - Free Lossless Audio Codec Format Description</title> |
+ <author initials="J." surname="Coalson" fullname="Josh Coalson"/> |
+ <date month="January" year="2008"/> |
+ </front> |
+</reference> |
+ |
+<reference anchor="hanning" |
+ target="https://en.wikipedia.org/wiki/Hamming_function#Hann_.28Hanning.29_window"> |
+ <front> |
+ <title>Hann window</title> |
+ <author> |
+ <organization>Wikipedia</organization> |
+ </author> |
+ <date month="May" year="2013"/> |
+ </front> |
+</reference> |
+ |
+<reference anchor="linear-prediction" |
+ target="https://en.wikipedia.org/wiki/Linear_predictive_coding"> |
+ <front> |
+ <title>Linear Predictive Coding</title> |
+ <author> |
+ <organization>Wikipedia</organization> |
+ </author> |
+ <date month="January" year="2014"/> |
+ </front> |
+</reference> |
+ |
+<reference anchor="lpc-sample" |
+ target="https://svn.xiph.org/trunk/vorbis/lib/lpc.c"> |
+<front> |
+ <title>Autocorrelation LPC coeff generation algorithm |
+ (Vorbis source code)</title> |
+<author initials="J." surname="Degener" fullname="Jutta Degener"/> |
+<author initials="C." surname="Bormann" fullname="Carsten Bormann"/> |
+<date month="November" year="1994"/> |
+</front> |
+</reference> |
+ |
+ |
+<reference anchor="replay-gain" |
+ target="https://wiki.xiph.org/VorbisComment#Replay_Gain"> |
+<front> |
+<title>VorbisComment: Replay Gain</title> |
+<author initials="C." surname="Parker" fullname="Conrad Parker"/> |
+<author initials="M." surname="Leese" fullname="Martin Leese"/> |
+<date month="June" year="2009"/> |
+</front> |
+</reference> |
+ |
+<reference anchor="seeking" |
+ target="https://wiki.xiph.org/Seeking"> |
+<front> |
+<title>Granulepos Encoding and How Seeking Really Works</title> |
+<author initials="S." surname="Pfeiffer" fullname="Silvia Pfeiffer"/> |
+<author initials="C." surname="Parker" fullname="Conrad Parker"/> |
+<author initials="G." surname="Maxwell" fullname="Greg Maxwell"/> |
+<date month="May" year="2012"/> |
+</front> |
+</reference> |
+ |
+<reference anchor="vorbis-mapping" |
+ target="https://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-810004.3.9"> |
+<front> |
+<title>The Vorbis I Specification, Section 4.3.9 Output Channel Order</title> |
+<author initials="C." surname="Montgomery" |
+ fullname="Christopher &quot;Monty&quot; Montgomery"/> |
+<date month="January" year="2010"/> |
+</front> |
+</reference> |
+ |
+<reference anchor="vorbis-trim" |
+ target="https://xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-132000A.2"> |
+ <front> |
+ <title>The Vorbis I Specification, Appendix A: Embedding Vorbis |
+ into an Ogg stream</title> |
+ <author initials="C." surname="Montgomery" |
+ fullname="Christopher &quot;Monty&quot; Montgomery"/> |
+ <date month="November" year="2008"/> |
+ </front> |
+</reference> |
+ |
+<reference anchor="wave-multichannel" |
+ target="http://msdn.microsoft.com/en-us/windows/hardware/gg463006.aspx"> |
+ <front> |
+ <title>Multiple Channel Audio Data and WAVE Files</title> |
+ <author> |
+ <organization>Microsoft Corporation</organization> |
+ </author> |
+ <date month="March" year="2007"/> |
+ </front> |
+</reference> |
+ |
+</references> |
+ |
+</back> |
+</rfc> |