Skip to content

Latest commit

 

History

History
13243 lines (12931 loc) · 566 KB

Vorbis_I_spec.html

File metadata and controls

13243 lines (12931 loc) · 566 KB
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head><title>Vorbis I specification</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="generator" content="TeX4ht (http://www.tug.org/tex4ht/)">
<meta name="originator" content="TeX4ht (http://www.tug.org/tex4ht/)">
<!-- html -->
<meta name="src" content="Vorbis_I_spec.tex">
<meta name="date" content="2015-02-27 13:18:00">
<link rel="stylesheet" type="text/css" href="Vorbis_I_spec.css">
</head><body
>
<div class="maketitle">
<h2 class="titleHead">Vorbis I specification</h2>
<div class="author" ><span
class="cmr-17">Xiph.Org Foundation</span></div><br />
<div class="date" ><span
class="cmr-17">February 27, 2015</span></div>
</div>
<h3 class="likesectionHead"><a
id="x1-1000"></a>Contents</h3>
<div class="tableofcontents">
&#x00A0;<span class="sectionToc" >1 <a
href="#x1-20001" id="QQ2-1-2">Introduction and Description</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >1.1 <a
href="#x1-30001.1" id="QQ2-1-3">Overview</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.1.1 <a
href="#x1-40001.1.1" id="QQ2-1-4">Application</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.1.2 <a
href="#x1-50001.1.2" id="QQ2-1-5">Classification</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.1.3 <a
href="#x1-60001.1.3" id="QQ2-1-6">Assumptions</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.1.4 <a
href="#x1-70001.1.4" id="QQ2-1-7">Codec Setup and Probability Model</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.1.5 <a
href="#x1-90001.1.5" id="QQ2-1-9">Format Specification</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.1.6 <a
href="#x1-100001.1.6" id="QQ2-1-10">Hardware Profile</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >1.2 <a
href="#x1-110001.2" id="QQ2-1-11">Decoder Configuration</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.2.1 <a
href="#x1-120001.2.1" id="QQ2-1-13">Global Config</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.2.2 <a
href="#x1-130001.2.2" id="QQ2-1-14">Mode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.2.3 <a
href="#x1-140001.2.3" id="QQ2-1-15">Mapping</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.2.4 <a
href="#x1-150001.2.4" id="QQ2-1-16">Floor</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.2.5 <a
href="#x1-160001.2.5" id="QQ2-1-17">Residue</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.2.6 <a
href="#x1-170001.2.6" id="QQ2-1-18">Codebooks</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >1.3 <a
href="#x1-180001.3" id="QQ2-1-19">High-level Decode Process</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.3.1 <a
href="#x1-190001.3.1" id="QQ2-1-20">Decode Setup</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >1.3.2 <a
href="#x1-230001.3.2" id="QQ2-1-24">Decode Procedure</a></span>
<br />&#x00A0;<span class="sectionToc" >2 <a
href="#x1-360002" id="QQ2-1-39">Bitpacking Convention</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >2.1 <a
href="#x1-370002.1" id="QQ2-1-40">Overview</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.1 <a
href="#x1-380002.1.1" id="QQ2-1-41">octets, bytes and words</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.2 <a
href="#x1-390002.1.2" id="QQ2-1-42">bit order</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.3 <a
href="#x1-400002.1.3" id="QQ2-1-43">byte order</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.4 <a
href="#x1-410002.1.4" id="QQ2-1-44">coding bits into byte sequences</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.5 <a
href="#x1-420002.1.5" id="QQ2-1-45">signedness</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.6 <a
href="#x1-430002.1.6" id="QQ2-1-46">coding example</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.7 <a
href="#x1-440002.1.7" id="QQ2-1-47">decoding example</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.8 <a
href="#x1-450002.1.8" id="QQ2-1-48">end-of-packet alignment</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >2.1.9 <a
href="#x1-460002.1.9" id="QQ2-1-49">reading zero bits</a></span>
<br />&#x00A0;<span class="sectionToc" >3 <a
href="#x1-470003" id="QQ2-1-50">Probability Model and Codebooks</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >3.1 <a
href="#x1-480003.1" id="QQ2-1-51">Overview</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >3.1.1 <a
href="#x1-490003.1.1" id="QQ2-1-52">Bitwise operation</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >3.2 <a
href="#x1-500003.2" id="QQ2-1-53">Packed codebook format</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >3.2.1 <a
href="#x1-510003.2.1" id="QQ2-1-54">codebook decode</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >3.3 <a
href="#x1-580003.3" id="QQ2-1-63">Use of the codebook abstraction</a></span>
<br />&#x00A0;<span class="sectionToc" >4 <a
href="#x1-590004" id="QQ2-1-64">Codec Setup and Packet Decode</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >4.1 <a
href="#x1-600004.1" id="QQ2-1-65">Overview</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >4.2 <a
href="#x1-610004.2" id="QQ2-1-66">Header decode and decode setup</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.2.1 <a
href="#x1-620004.2.1" id="QQ2-1-67">Common header decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.2.2 <a
href="#x1-630004.2.2" id="QQ2-1-68">Identification header</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.2.3 <a
href="#x1-640004.2.3" id="QQ2-1-69">Comment header</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.2.4 <a
href="#x1-650004.2.4" id="QQ2-1-70">Setup header</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >4.3 <a
href="#x1-720004.3" id="QQ2-1-78">Audio packet decode and synthesis</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.1 <a
href="#x1-730004.3.1" id="QQ2-1-79">packet type, mode and window decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.2 <a
href="#x1-740004.3.2" id="QQ2-1-80">floor curve decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.3 <a
href="#x1-750004.3.3" id="QQ2-1-81">nonzero vector propagate</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.4 <a
href="#x1-760004.3.4" id="QQ2-1-82">residue decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.5 <a
href="#x1-770004.3.5" id="QQ2-1-83">inverse coupling</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.6 <a
href="#x1-780004.3.6" id="QQ2-1-84">dot product</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.7 <a
href="#x1-790004.3.7" id="QQ2-1-85">inverse MDCT</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.8 <a
href="#x1-800004.3.8" id="QQ2-1-86">overlap_add</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >4.3.9 <a
href="#x1-810004.3.9" id="QQ2-1-87">output channel order</a></span>
<br />&#x00A0;<span class="sectionToc" >5 <a
href="#x1-820005" id="QQ2-1-88">comment field and header specification</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >5.1 <a
href="#x1-830005.1" id="QQ2-1-89">Overview</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >5.2 <a
href="#x1-840005.2" id="QQ2-1-90">Comment encoding</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >5.2.1 <a
href="#x1-850005.2.1" id="QQ2-1-91">Structure</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >5.2.2 <a
href="#x1-860005.2.2" id="QQ2-1-92">Content vector format</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >5.2.3 <a
href="#x1-890005.2.3" id="QQ2-1-95">Encoding</a></span>
<br />&#x00A0;<span class="sectionToc" >6 <a
href="#x1-900006" id="QQ2-1-96">Floor type 0 setup and decode</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >6.1 <a
href="#x1-910006.1" id="QQ2-1-97">Overview</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >6.2 <a
href="#x1-920006.2" id="QQ2-1-98">Floor 0 format</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >6.2.1 <a
href="#x1-930006.2.1" id="QQ2-1-99">header decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >6.2.2 <a
href="#x1-940006.2.2" id="QQ2-1-100">packet decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >6.2.3 <a
href="#x1-950006.2.3" id="QQ2-1-101">curve computation</a></span>
<br />&#x00A0;<span class="sectionToc" >7 <a
href="#x1-970007" id="QQ2-1-103">Floor type 1 setup and decode</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >7.1 <a
href="#x1-980007.1" id="QQ2-1-104">Overview</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >7.2 <a
href="#x1-990007.2" id="QQ2-1-105">Floor 1 format</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >7.2.1 <a
href="#x1-1000007.2.1" id="QQ2-1-106">model</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >7.2.2 <a
href="#x1-1010007.2.2" id="QQ2-1-111">header decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >7.2.3 <a
href="#x1-1020007.2.3" id="QQ2-1-112">packet decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >7.2.4 <a
href="#x1-1030007.2.4" id="QQ2-1-113">curve computation</a></span>
<br />&#x00A0;<span class="sectionToc" >8 <a
href="#x1-1040008" id="QQ2-1-114">Residue setup and decode</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >8.1 <a
href="#x1-1050008.1" id="QQ2-1-115">Overview</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >8.2 <a
href="#x1-1060008.2" id="QQ2-1-116">Residue format</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >8.3 <a
href="#x1-1070008.3" id="QQ2-1-118">residue 0</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >8.4 <a
href="#x1-1080008.4" id="QQ2-1-119">residue 1</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >8.5 <a
href="#x1-1090008.5" id="QQ2-1-120">residue 2</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >8.6 <a
href="#x1-1100008.6" id="QQ2-1-122">Residue decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >8.6.1 <a
href="#x1-1110008.6.1" id="QQ2-1-123">header decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >8.6.2 <a
href="#x1-1120008.6.2" id="QQ2-1-124">packet decode</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >8.6.3 <a
href="#x1-1130008.6.3" id="QQ2-1-125">format 0 specifics</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >8.6.4 <a
href="#x1-1140008.6.4" id="QQ2-1-126">format 1 specifics</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >8.6.5 <a
href="#x1-1150008.6.5" id="QQ2-1-127">format 2 specifics</a></span>
<br />&#x00A0;<span class="sectionToc" >9 <a
href="#x1-1160009" id="QQ2-1-128">Helper equations</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >9.1 <a
href="#x1-1170009.1" id="QQ2-1-129">Overview</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >9.2 <a
href="#x1-1180009.2" id="QQ2-1-130">Functions</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >9.2.1 <a
href="#x1-1190009.2.1" id="QQ2-1-131">ilog</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >9.2.2 <a
href="#x1-1200009.2.2" id="QQ2-1-132">float32_unpack</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >9.2.3 <a
href="#x1-1210009.2.3" id="QQ2-1-133">lookup1_values</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >9.2.4 <a
href="#x1-1220009.2.4" id="QQ2-1-134">low_neighbor</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >9.2.5 <a
href="#x1-1230009.2.5" id="QQ2-1-135">high_neighbor</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >9.2.6 <a
href="#x1-1240009.2.6" id="QQ2-1-136">render_point</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >9.2.7 <a
href="#x1-1250009.2.7" id="QQ2-1-137">render_line</a></span>
<br />&#x00A0;<span class="sectionToc" >10 <a
href="#x1-12600010" id="QQ2-1-138">Tables</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >10.1 <a
href="#x1-12700010.1" id="QQ2-1-139">floor1_inverse_dB_table</a></span>
<br />&#x00A0;<span class="sectionToc" >A <a
href="#x1-128000A" id="QQ2-1-140">Embedding Vorbis into an Ogg stream</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >A.1 <a
href="#x1-129000A.1" id="QQ2-1-141">Overview</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >A.1.1 <a
href="#x1-130000A.1.1" id="QQ2-1-142">Restrictions</a></span>
<br />&#x00A0;&#x00A0;&#x00A0;<span class="subsubsectionToc" >A.1.2 <a
href="#x1-131000A.1.2" id="QQ2-1-143">MIME type</a></span>
<br />&#x00A0;&#x00A0;<span class="subsectionToc" >A.2 <a
href="#x1-132000A.2" id="QQ2-1-144">Encapsulation</a></span>
<br />&#x00A0;<span class="sectionToc" >B <a
href="#x1-134000B" id="QQ2-1-146">Vorbis encapsulation in RTP</a></span>
</div>
<h3 class="sectionHead"><span class="titlemark">1. </span> <a
id="x1-20001"></a>Introduction and Description</h3>
<!--l. 6--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">1.1. </span> <a
id="x1-30001.1"></a>Overview</h4>
<!--l. 8--><p class="noindent" >This document provides a high level description of the Vorbis codec&#8217;s construction. A bit-by-bit
specification appears beginning in <a
href="#x1-590004">section&#x00A0;4</a>, &#8220;<a
href="#x1-590004">Codec Setup and Packet Decode<!--tex4ht:ref: vorbis:spec:codec --></a>&#8221;. The later
sections assume a high-level understanding of the Vorbis decode process, which is provided
here.
<!--l. 15--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.1.1. </span> <a
id="x1-40001.1.1"></a>Application</h5>
<!--l. 16--><p class="noindent" >Vorbis is a general purpose perceptual audio CODEC intended to allow maximum encoder
flexibility, thus allowing it to scale competitively over an exceptionally wide range of bitrates. At
the high quality/bitrate end of the scale (CD or DAT rate stereo, 16/24 bits) it is in the same
league as MPEG-2 and MPC. Similarly, the 1.0 encoder can encode high-quality CD and DAT
rate stereo at below 48kbps without resampling to a lower rate. Vorbis is also intended for lower
and higher sample rates (from 8kHz telephony to 192kHz digital masters) and a range of channel
representations (monaural, polyphonic, stereo, quadraphonic, 5.1, ambisonic, or up to 255
discrete channels).
<!--l. 29--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.1.2. </span> <a
id="x1-50001.1.2"></a>Classification</h5>
<!--l. 30--><p class="noindent" >Vorbis I is a forward-adaptive monolithic transform CODEC based on the Modified Discrete
Cosine Transform. The codec is structured to allow addition of a hybrid wavelet filterbank in
Vorbis II to offer better transient response and reproduction using a transform better suited to
localized time events.
<!--l. 37--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.1.3. </span> <a
id="x1-60001.1.3"></a>Assumptions</h5>
<!--l. 39--><p class="noindent" >The Vorbis CODEC design assumes a complex, psychoacoustically-aware encoder and simple,
low-complexity decoder. Vorbis decode is computationally simpler than mp3, although it does
require more working memory as Vorbis has no static probability model; the vector codebooks
used in the first stage of decoding from the bitstream are packed in their entirety into the Vorbis
bitstream headers. In packed form, these codebooks occupy only a few kilobytes; the extent to
which they are pre-decoded into a cache is the dominant factor in decoder memory
usage.
<!--l. 50--><p class="noindent" >Vorbis provides none of its own framing, synchronization or protection against errors; it
is solely a method of accepting input audio, dividing it into individual frames and
compressing these frames into raw, unformatted &#8216;packets&#8217;. The decoder then accepts
these raw packets in sequence, decodes them, synthesizes audio frames from them, and
reassembles the frames into a facsimile of the original audio stream. Vorbis is a free-form
variable bit rate (VBR) codec and packets have no minimum size, maximum size, or
fixed/expected size. Packets are designed so that they may be truncated (or padded)
and remain decodable; this is not to be considered an error condition and is used
extensively in bitrate management in peeling. Both the transport mechanism and
decoder must allow that a packet may be any size, or end before or after packet decode
expects.
<!--l. 64--><p class="noindent" >Vorbis packets are thus intended to be used with a transport mechanism that provides free-form
framing, sync, positioning and error correction in accordance with these design assumptions, such
as Ogg (for file transport) or RTP (for network multicast). For purposes of a few examples in this
document, we will assume that Vorbis is to be embedded in an Ogg stream specifically,
although this is by no means a requirement or fundamental assumption in the Vorbis
design.
<!--l. 72--><p class="noindent" >The specification for embedding Vorbis into an Ogg transport stream is in <a
href="#x1-128000A">section&#x00A0;A</a>,
&#8220;<a
href="#x1-128000A">Embedding Vorbis into an Ogg stream<!--tex4ht:ref: vorbis:over:ogg --></a>&#8221;.
<!--l. 77--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.1.4. </span> <a
id="x1-70001.1.4"></a>Codec Setup and Probability Model</h5>
<!--l. 79--><p class="noindent" >Vorbis&#8217; heritage is as a research CODEC and its current design reflects a desire to allow multiple
decades of continuous encoder improvement before running out of room within the codec
specification. For these reasons, configurable aspects of codec setup intentionally lean toward the
extreme of forward adaptive.
<!--l. 85--><p class="noindent" >The single most controversial design decision in Vorbis (and the most unusual for a Vorbis
developer to keep in mind) is that the entire probability model of the codec, the Huffman and
VQ codebooks, is packed into the bitstream header along with extensive CODEC setup
parameters (often several hundred fields). Unlike with MPEG audio layers, this makes it
impossible to embed a simple frame type flag in each audio packet, or to begin
decode at any frame in the stream without having previously fetched the codec setup
header.
<!--l. 95--><p class="noindent" ><span class="likesubparagraphHead"><a
id="x1-80001.1.4"></a><span
class="cmbx-12">Note:</span></span> Vorbis <span
class="cmti-12">can </span>initiate decode at any arbitrary packet within a bitstream so long as the codec
has been initialized/setup with the setup headers.
<!--l. 101--><p class="noindent" >Thus, Vorbis headers are both required for decode to begin and relatively large as bitstream
headers go. The header size is unbounded, although for streaming a rule-of-thumb of 4kB or less
is recommended (and Xiph.Org&#8217;s Vorbis encoder follows this suggestion).
<!--l. 106--><p class="noindent" >Our own design work indicates the primary liability of the required header is in mindshare; it is
an unusual design and thus causes some amount of complaint among engineers as this runs
against current design trends (and also points out limitations in some existing software/interface
designs, such as Windows&#8217; ACM codec framework). However, we find that it does not
fundamentally limit Vorbis&#8217; suitable application space.
<!--l. 115--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.1.5. </span> <a
id="x1-90001.1.5"></a>Format Specification</h5>
<!--l. 116--><p class="noindent" >The Vorbis format is well-defined by its decode specification; any encoder that produces packets
that are correctly decoded by the reference Vorbis decoder described below may be considered
a proper Vorbis encoder. A decoder must faithfully and completely implement the
specification defined below (except where noted) to be considered a proper Vorbis
decoder.
<!--l. 123--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.1.6. </span> <a
id="x1-100001.1.6"></a>Hardware Profile</h5>
<!--l. 124--><p class="noindent" >Although Vorbis decode is computationally simple, it may still run into specific limitations of an
embedded design. For this reason, embedded designs are allowed to deviate in limited ways from
the &#8216;full&#8217; decode specification yet still be certified compliant. These optional omissions are
labelled in the spec where relevant.
<!--l. 131--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">1.2. </span> <a
id="x1-110001.2"></a>Decoder Configuration</h4>
<!--l. 133--><p class="noindent" >Decoder setup consists of configuration of multiple, self-contained component abstractions that
perform specific functions in the decode pipeline. Each different component instance of a specific
type is semantically interchangeable; decoder configuration consists both of internal component
configuration, as well as arrangement of specific instances into a decode pipeline. Componentry
arrangement is roughly as follows:
<div class="center"
>
<!--l. 141--><p class="noindent" >
<!--l. 142--><p class="noindent" ><img
src="components.png" alt="PIC"
>
<br /> <div class="caption"
><span class="id">Figure&#x00A0;1: </span><span
class="content">decoder pipeline configuration</span></div><!--tex4ht:label?: x1-110011 -->
</div>
<!--l. 146--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.2.1. </span> <a
id="x1-120001.2.1"></a>Global Config</h5>
<!--l. 147--><p class="noindent" >Global codec configuration consists of a few audio related fields (sample rate, channels), Vorbis
version (always &#8217;0&#8217; in Vorbis I), bitrate hints, and the lists of component instances. All other
configuration is in the context of specific components.
<!--l. 152--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.2.2. </span> <a
id="x1-130001.2.2"></a>Mode</h5>
<!--l. 154--><p class="noindent" >Each Vorbis frame is coded according to a master &#8217;mode&#8217;. A bitstream may use one or many
modes.
<!--l. 157--><p class="noindent" >The mode mechanism is used to encode a frame according to one of multiple possible
methods with the intention of choosing a method best suited to that frame. Switching between
modes is, e.g., how frame size is changed from frame to frame. The mode number of a
frame serves as a top level configuration switch for all other specific aspects of frame
decode.
<!--l. 164--><p class="noindent" >A &#8217;mode&#8217; configuration consists of a frame size setting, window type (always 0, the Vorbis
window, in Vorbis I), transform type (always type 0, the MDCT, in Vorbis I) and a mapping
number. The mapping number specifies which mapping configuration instance to use for low-level
packet decode and synthesis.
<!--l. 171--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.2.3. </span> <a
id="x1-140001.2.3"></a>Mapping</h5>
<!--l. 173--><p class="noindent" >A mapping contains a channel coupling description and a list of &#8217;submaps&#8217; that bundle sets
of channel vectors together for grouped encoding and decoding. These submaps are
not references to external components; the submap list is internal and specific to a
mapping.
<!--l. 178--><p class="noindent" >A &#8217;submap&#8217; is a configuration/grouping that applies to a subset of floor and residue vectors
within a mapping. The submap functions as a last layer of indirection such that specific special
floor or residue settings can be applied not only to all the vectors in a given mode, but also
specific vectors in a specific mode. Each submap specifies the proper floor and residue
instance number to use for decoding that submap&#8217;s spectral floor and spectral residue
vectors.
<!--l. 186--><p class="noindent" >As an example:
<!--l. 188--><p class="noindent" >Assume a Vorbis stream that contains six channels in the standard 5.1 format. The sixth
channel, as is normal in 5.1, is bass only. Therefore it would be wasteful to encode a
full-spectrum version of it as with the other channels. The submapping mechanism can be used
to apply a full range floor and residue encoding to channels 0 through 4, and a bass-only
representation to the bass channel, thus saving space. In this example, channels 0-4 belong to
submap 0 (which indicates use of a full-range floor) and channel 5 belongs to submap 1, which
uses a bass-only representation.
<!--l. 199--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.2.4. </span> <a
id="x1-150001.2.4"></a>Floor</h5>
<!--l. 201--><p class="noindent" >Vorbis encodes a spectral &#8217;floor&#8217; vector for each PCM channel. This vector is a low-resolution
representation of the audio spectrum for the given channel in the current frame, generally used
akin to a whitening filter. It is named a &#8217;floor&#8217; because the Xiph.Org reference encoder has
historically used it as a unit-baseline for spectral resolution.
<!--l. 208--><p class="noindent" >A floor encoding may be of two types. Floor 0 uses a packed LSP representation on a dB
amplitude scale and Bark frequency scale. Floor 1 represents the curve as a piecewise linear
interpolated representation on a dB amplitude scale and linear frequency scale. The two floors
are semantically interchangeable in encoding/decoding. However, floor type 1 provides more
stable inter-frame behavior, and so is the preferred choice in all coupled-stereo and
high bitrate modes. Floor 1 is also considerably less expensive to decode than floor
0.
<!--l. 218--><p class="noindent" >Floor 0 is not to be considered deprecated, but it is of limited modern use. No known Vorbis
encoder past Xiph.Org&#8217;s own beta 4 makes use of floor 0.
<!--l. 222--><p class="noindent" >The values coded/decoded by a floor are both compactly formatted and make use of entropy
coding to save space. For this reason, a floor configuration generally refers to multiple
codebooks in the codebook component list. Entropy coding is thus provided as an
abstraction, and each floor instance may choose from any and all available codebooks when
coding/decoding.
<!--l. 230--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.2.5. </span> <a
id="x1-160001.2.5"></a>Residue</h5>
<!--l. 231--><p class="noindent" >The spectral residue is the fine structure of the audio spectrum once the floor curve has been
subtracted out. In simplest terms, it is coded in the bitstream using cascaded (multi-pass) vector
quantization according to one of three specific packing/coding algorithms numbered
0 through 2. The packing algorithm details are configured by residue instance. As
with the floor components, the final VQ/entropy encoding is provided by external
codebook instances and each residue instance may choose from any and all available
codebooks.
<!--l. 241--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.2.6. </span> <a
id="x1-170001.2.6"></a>Codebooks</h5>
<!--l. 243--><p class="noindent" >Codebooks are a self-contained abstraction that perform entropy decoding and, optionally, use
the entropy-decoded integer value as an offset into an index of output value vectors, returning
the indicated vector of values.
<!--l. 248--><p class="noindent" >The entropy coding in a Vorbis I codebook is provided by a standard Huffman binary tree
representation. This tree is tightly packed using one of several methods, depending on whether
codeword lengths are ordered or unordered, or the tree is sparse.
<!--l. 253--><p class="noindent" >The codebook vector index is similarly packed according to index characteristic. Most commonly,
the vector index is encoded as a single list of possible values that are then permuted
into a list of n-dimensional rows (lattice VQ).
<!--l. 260--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">1.3. </span> <a
id="x1-180001.3"></a>High-level Decode Process</h4>
<!--l. 262--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.3.1. </span> <a
id="x1-190001.3.1"></a>Decode Setup</h5>
<!--l. 264--><p class="noindent" >Before decoding can begin, a decoder must initialize using the bitstream headers matching the
stream to be decoded. Vorbis uses three header packets; all are required, in-order, by
this specification. Once set up, decode may begin at any audio packet belonging to
the Vorbis stream. In Vorbis I, all packets after the three initial headers are audio
packets.
<!--l. 271--><p class="noindent" >The header packets are, in order, the identification header, the comment header, and the setup
header.
<!--l. 274--><p class="noindent" ><span class="paragraphHead"><a
id="x1-200001.3.1"></a><span
class="cmbx-12">Identification Header</span></span>
The identification header identifies the bitstream as Vorbis and gives the Vorbis version and the simple audio
characteristics of the stream such as sample rate and number of channels.
<!--l. 279--><p class="noindent" ><span class="paragraphHead"><a
id="x1-210001.3.1"></a><span
class="cmbx-12">Comment Header</span></span>
The comment header includes user text comments (&#8220;tags&#8221;) and a vendor string for the
application/library that produced the bitstream. The encoding and proper use of the comment
header is described in <a
href="#x1-820005">section&#x00A0;5</a>, &#8220;<a
href="#x1-820005">comment field and header specification<!--tex4ht:ref: vorbis:spec:comment --></a>&#8221;.
<!--l. 284--><p class="noindent" ><span class="paragraphHead"><a
id="x1-220001.3.1"></a><span
class="cmbx-12">Setup Header</span></span>
The setup header includes extensive CODEC setup information as well as the complete VQ and
Huffman codebooks needed for decode.
<!--l. 289--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">1.3.2. </span> <a
id="x1-230001.3.2"></a>Decode Procedure</h5>
<!--l. 291--><p class="noindent" >The decoding and synthesis procedure for all audio packets is fundamentally the same.
<dl class="enumerate-enumitem"><dt class="enumerate-enumitem">
1. </dt><dd
class="enumerate-enumitem">decode packet type flag
</dd><dt class="enumerate-enumitem">
2. </dt><dd
class="enumerate-enumitem">decode mode number
</dd><dt class="enumerate-enumitem">
3. </dt><dd
class="enumerate-enumitem">decode window shape (long windows only)
</dd><dt class="enumerate-enumitem">
4. </dt><dd
class="enumerate-enumitem">decode floor
</dd><dt class="enumerate-enumitem">
5. </dt><dd
class="enumerate-enumitem">decode residue into residue vectors
</dd><dt class="enumerate-enumitem">
6. </dt><dd
class="enumerate-enumitem">inverse channel coupling of residue vectors
</dd><dt class="enumerate-enumitem">
7. </dt><dd
class="enumerate-enumitem">generate floor curve from decoded floor data
</dd><dt class="enumerate-enumitem">
8. </dt><dd
class="enumerate-enumitem">compute dot product of floor and residue, producing audio spectrum vector
</dd><dt class="enumerate-enumitem">
9. </dt><dd
class="enumerate-enumitem">inverse monolithic transform of audio spectrum vector, always an MDCT in Vorbis
I
</dd><dt class="enumerate-enumitem">
10. </dt><dd
class="enumerate-enumitem">overlap/add left-hand output of transform with right-hand output of previous frame
</dd><dt class="enumerate-enumitem">
11. </dt><dd
class="enumerate-enumitem">store right-hand data from transform of current frame for future lapping
</dd><dt class="enumerate-enumitem">
12. </dt><dd
class="enumerate-enumitem">if not first frame, return results of overlap/add as audio result of current frame</dd></dl>
<!--l. 308--><p class="noindent" >Note that clever rearrangement of the synthesis arithmetic is possible; as an example, one can
take advantage of symmetries in the MDCT to store the right-hand transform data of a partial
MDCT for a 50% inter-frame buffer space savings, and then complete the transform later before
overlap/add with the next frame. This optimization produces entirely equivalent output and is
naturally perfectly legal. The decoder must be <span
class="cmti-12">entirely mathematically equivalent </span>to the
specification; it need not be a literal semantic implementation.
<!--l. 317--><p class="noindent" ><span class="paragraphHead"><a
id="x1-240001.3.2"></a><span
class="cmbx-12">Packet type decode</span></span>
Vorbis I uses four packet types. The first three packet types mark each of the three Vorbis
headers described above. The fourth packet type marks an audio packet. All other packet types
are reserved; packets marked with a reserved type should be ignored.
<!--l. 324--><p class="noindent" >Following the three header packets, all packets in a Vorbis I stream are audio. The first step of
audio packet decode is to read and verify the packet type; <span
class="cmti-12">a non-audio packet when audio is</span>
<span
class="cmti-12">expected indicates stream corruption or a non-compliant stream. The decoder must ignore the</span>
<span
class="cmti-12">packet and not attempt decoding it to audio</span>.
<!--l. 334--><p class="noindent" ><span class="paragraphHead"><a
id="x1-250001.3.2"></a><span
class="cmbx-12">Mode decode</span></span>
Vorbis allows an encoder to set up multiple, numbered packet &#8217;modes&#8217;, as described earlier, all of
which may be used in a given Vorbis stream. The mode is encoded as an integer used as a direct
offset into the mode instance index.
<!--l. 341--><p class="noindent" ><span class="paragraphHead"><a
id="x1-260001.3.2"></a><span
class="cmbx-12">Window shape decode (long windows only)</span></span>
Vorbis frames may be one of two PCM sample sizes specified during codec setup. In Vorbis I,
legal frame sizes are powers of two from 64 to 8192 samples. Aside from coupling, Vorbis
handles channels as independent vectors and these frame sizes are in samples per
channel.
<!--l. 348--><p class="noindent" >Vorbis uses an overlapping transform, namely the MDCT, to blend one frame into the next,
avoiding most inter-frame block boundary artifacts. The MDCT output of one frame is windowed
according to MDCT requirements, overlapped 50% with the output of the previous frame and
added. The window shape assures seamless reconstruction.
<!--l. 354--><p class="noindent" >This is easy to visualize in the case of equal-sized windows:
<div class="center"
>
<!--l. 356--><p class="noindent" >
<!--l. 357--><p class="noindent" ><img
src="window1.png" alt="PIC"
>
<br /> <div class="caption"
><span class="id">Figure&#x00A0;2: </span><span
class="content">overlap of two equal-sized windows</span></div><!--tex4ht:label?: x1-260012 -->
</div>
<!--l. 361--><p class="noindent" >And slightly more complex in the case of overlapping unequal-sized windows:
<div class="center"
>
<!--l. 364--><p class="noindent" >
<!--l. 365--><p class="noindent" ><img
src="window2.png" alt="PIC"
>
<br /> <div class="caption"
><span class="id">Figure&#x00A0;3: </span><span
class="content">overlap of a long and a short window</span></div><!--tex4ht:label?: x1-260023 -->
</div>
<!--l. 369--><p class="noindent" >In the unequal-sized window case, the window shape of the long window must be modified for
seamless lapping as above. It is possible to correctly infer window shape to be applied to the
current window from knowing the sizes of the current, previous and next window. It is legal for a
decoder to use this method. However, in the case of a long window (short windows require no
modification), Vorbis also codes two flag bits to specify pre- and post- window shape. Although
not strictly necessary for function, this minor redundancy allows a packet to be fully decoded to
the point of lapping entirely independently of any other packet, allowing easier abstraction of
decode layers as well as allowing a greater level of easy parallelism in encode and
decode.
<!--l. 382--><p class="noindent" >A description of valid window functions for use with an inverse MDCT can be found in <span class="cite">[<a
href="#XSporer/Brandenburg/Edler">1</a>]</span>.
Vorbis windows all use the slope function
<center class="math-display" >
<img
src="Vorbis_I_spec0x.png" alt="y = sin(.5 * &#x03C0; * sin&#x00B2;((x + .5)&#x2215;n * &#x03C0;)).
" class="math-display" ></center>
<!--l. 385--><p class="nopar" >
<!--l. 389--><p class="noindent" ><span class="paragraphHead"><a
id="x1-270001.3.2"></a><span
class="cmbx-12">floor decode</span></span>
Each floor is encoded/decoded in channel order; however, each floor belongs to a &#8217;submap&#8217; that
specifies which floor configuration to use. All floors are decoded before residue decode
begins.
<!--l. 395--><p class="noindent" ><span class="paragraphHead"><a
id="x1-280001.3.2"></a><span
class="cmbx-12">residue decode</span></span>
Although the number of residue vectors equals the number of channels, channel coupling may
mean that the raw residue vectors extracted during decode do not map directly to specific
channels. When channel coupling is in use, some vectors will correspond to coupled magnitude or
angle. The coupling relationships are described in the codec setup and may differ from frame to
frame, due to different mode numbers.
<!--l. 404--><p class="noindent" >Vorbis codes residue vectors in groups by submap; the coding is done in submap order from
submap 0 through n-1. This differs from floors which are coded using a configuration provided by
submap number, but are coded individually in channel order.
<!--l. 411--><p class="noindent" ><span class="paragraphHead"><a
id="x1-290001.3.2"></a><span
class="cmbx-12">inverse channel coupling</span></span>
A detailed discussion of stereo in the Vorbis codec can be found in the document
<a
href="stereo.html" >Stereo Channel Coupling in the Vorbis CODEC</a>. Vorbis is not limited to only stereo
coupling, but the stereo document also gives a good overview of the generic coupling
mechanism.
<!--l. 419--><p class="noindent" >Vorbis coupling applies to pairs of residue vectors at a time; decoupling is done in-place a
pair at a time in the order and using the vectors specified in the current mapping
configuration. The decoupling operation is the same for all pairs, converting square polar
representation (where one vector is magnitude and the second angle) back to Cartesian
representation.
<!--l. 426--><p class="noindent" >After decoupling, in order, each pair of vectors on the coupling list, the resulting residue vectors
represent the fine spectral detail of each output channel.
<!--l. 432--><p class="noindent" ><span class="paragraphHead"><a
id="x1-300001.3.2"></a><span
class="cmbx-12">generate floor curve</span></span>
The decoder may choose to generate the floor curve at any appropriate time. It is reasonable to
generate the output curve when the floor data is decoded from the raw packet, or it
can be generated after inverse coupling and applied to the spectral residue directly,
combining generation and the dot product into one step and eliminating some working
space.
<!--l. 441--><p class="noindent" >Both floor 0 and floor 1 generate a linear-range, linear-domain output vector to be multiplied
(dot product) by the linear-range, linear-domain spectral residue.
<!--l. 447--><p class="noindent" ><span class="paragraphHead"><a
id="x1-310001.3.2"></a><span
class="cmbx-12">compute floor/residue dot product</span></span>
This step is straightforward; for each output channel, the decoder multiplies the floor curve and
residue vectors element by element, producing the finished audio spectrum of each
channel.
<!--l. 455--><p class="noindent" >One point is worth mentioning about this dot product; a common mistake in a fixed point
implementation might be to assume that a 32 bit fixed-point representation for floor and
residue and direct multiplication of the vectors is sufficient for acceptable spectral depth
in all cases because it happens to mostly work with the current Xiph.Org reference
encoder.
<!--l. 462--><p class="noindent" >However, floor vector values can span <span
class="cmsy-10x-x-120">~</span>140dB (<span
class="cmsy-10x-x-120">~</span>24 bits unsigned), and the audio spectrum
vector should represent a minimum of 120dB (<span
class="cmsy-10x-x-120">~</span>21 bits with sign), even when output is to a 16
bit PCM device. For the residue vector to represent full scale if the floor is nailed
to <span
class="cmsy-10x-x-120">-</span>140dB, it must be able to span 0 to +140dB. For the residue vector to reach
full scale if the floor is nailed at 0dB, it must be able to represent <span
class="cmsy-10x-x-120">-</span>140dB to +0dB.
Thus, in order to handle full range dynamics, a residue vector may span <span
class="cmsy-10x-x-120">-</span>140dB to
+140dB entirely within spec. A 280dB range is approximately 48 bits with sign; thus the
residue vector must be able to represent a 48 bit range and the dot product must
be able to handle an effective 48 bit times 24 bit multiplication. This range may be
achieved using large (64 bit or larger) integers, or implementing a movable binary point
representation.
<!--l. 479--><p class="noindent" ><span class="paragraphHead"><a
id="x1-320001.3.2"></a><span
class="cmbx-12">inverse monolithic transform (MDCT)</span></span>
The audio spectrum is converted back into time domain PCM audio via an inverse Modified
Discrete Cosine Transform (MDCT). A detailed description of the MDCT is available in
<span class="cite">[<a
href="#XSporer/Brandenburg/Edler">1</a>]</span>.
<!--l. 485--><p class="noindent" >Note that the PCM produced directly from the MDCT is not yet finished audio; it must be
lapped with surrounding frames using an appropriate window (such as the Vorbis window) before
the MDCT can be considered orthogonal.
<!--l. 492--><p class="noindent" ><span class="paragraphHead"><a
id="x1-330001.3.2"></a><span
class="cmbx-12">overlap/add data</span></span>
Windowed MDCT output is overlapped and added with the right hand data of the previous
window such that the 3/4 point of the previous window is aligned with the 1/4 point of the
current window (as illustrated in the window overlap diagram). At this point, the audio data
between the center of the previous frame and the center of the current frame is now finished and
ready to be returned.
<!--l. 501--><p class="noindent" ><span class="paragraphHead"><a
id="x1-340001.3.2"></a><span
class="cmbx-12">cache right hand data</span></span>
The decoder must cache the right hand portion of the current frame to be lapped with the left
hand portion of the next frame.
<!--l. 507--><p class="noindent" ><span class="paragraphHead"><a
id="x1-350001.3.2"></a><span
class="cmbx-12">return finished audio data</span></span>
The overlapped portion produced from overlapping the previous and current frame data
is finished data to be returned by the decoder. This data spans from the center of
the previous window to the center of the current window. In the case of same-sized
windows, the amount of data to return is one-half block consisting of and only of the
overlapped portions. When overlapping a short and long window, much of the returned
range is not actually overlap. This does not damage transform orthogonality. Pay
attention however to returning the correct data range; the amount of data to be returned
is:
<!--l. 519--><p class="noindent" >
<div class="fancyvrb" id="fancyvrb1"><a
id="x1-35002r1"></a><span
class="cmr-6">1</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;window_blocksize(previous_window)/4+window_blocksize(current_window)/4</span></div>
<!--l. 523--><p class="noindent" >from the center of the previous window to the center of the current window.
<!--l. 526--><p class="noindent" >Data is not returned from the first frame; it must be used to &#8217;prime&#8217; the decode engine. The
encoder accounts for this priming when calculating PCM offsets; after the first frame, the proper
PCM output offset is &#8217;0&#8217; (as no data has been returned yet).
<h3 class="sectionHead"><span class="titlemark">2. </span> <a
id="x1-360002"></a>Bitpacking Convention</h3>
<!--l. 6--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">2.1. </span> <a
id="x1-370002.1"></a>Overview</h4>
<!--l. 8--><p class="noindent" >The Vorbis codec uses relatively unstructured raw packets containing arbitrary-width binary
integer fields. Logically, these packets are a bitstream in which bits are coded one-by-one by the
encoder and then read one-by-one in the same monotonically increasing order by the decoder.
Most current binary storage arrangements group bits into a native word size of eight bits
(octets), sixteen bits, thirty-two bits or, less commonly, other fixed word sizes. The Vorbis
bitpacking convention specifies the correct mapping of the logical packet bitstream into an actual
representation in fixed-width words.
<!--l. 19--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">2.1.1. </span> <a
id="x1-380002.1.1"></a>octets, bytes and words</h5>
<!--l. 21--><p class="noindent" >In most contemporary architectures, a &#8217;byte&#8217; is synonymous with an &#8217;octet&#8217;, that is, eight bits.
This has not always been the case; seven, ten, eleven and sixteen bit &#8217;bytes&#8217; have been used.
For purposes of the bitpacking convention, a byte implies the native, smallest integer
storage representation offered by a platform. On modern platforms, this is generally
assumed to be eight bits (not necessarily because of the processor but because of the
filesystem/memory architecture. Modern filesystems invariably offer bytes as the fundamental
atom of storage). A &#8217;word&#8217; is an integer size that is a grouped multiple of this smallest
size.
<!--l. 32--><p class="noindent" >The most ubiquitous architectures today consider a &#8217;byte&#8217; to be an octet (eight bits) and a word
to be a group of two, four or eight bytes (16, 32 or 64 bits). Note however that the Vorbis
bitpacking convention is still well defined for any native byte size; Vorbis uses the native
bit-width of a given storage system. This document assumes that a byte is one octet for purposes
of example.
<!--l. 39--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">2.1.2. </span> <a
id="x1-390002.1.2"></a>bit order</h5>
<!--l. 41--><p class="noindent" >A byte has a well-defined &#8217;least significant&#8217; bit (LSb), which is the only bit set when the byte is
storing the two&#8217;s complement integer value +1. A byte&#8217;s &#8217;most significant&#8217; bit (MSb) is at the
opposite end of the byte. Bits in a byte are numbered from zero at the LSb to <span
class="cmmi-12">n </span>(<span
class="cmmi-12">n </span>= 7 in an
octet) for the MSb.
<!--l. 50--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">2.1.3. </span> <a
id="x1-400002.1.3"></a>byte order</h5>
<!--l. 52--><p class="noindent" >Words are native groupings of multiple bytes. Several byte orderings are possible in a word; the
common ones are 3-2-1-0 (&#8217;big endian&#8217; or &#8217;most significant byte first&#8217; in which the
highest-valued byte comes first), 0-1-2-3 (&#8217;little endian&#8217; or &#8217;least significant byte first&#8217; in
which the lowest value byte comes first) and less commonly 3-1-2-0 and 0-2-1-3 (&#8217;mixed
endian&#8217;).
<!--l. 59--><p class="noindent" >The Vorbis bitpacking convention specifies storage and bitstream manipulation at the byte, not
word, level, thus host word ordering is of concern only during optimization when writing high
performance code that operates on a word of storage at a time rather than by byte.
Logically, bytes are always coded and decoded in order from byte zero through byte
<span
class="cmmi-12">n</span>.
<!--l. 68--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">2.1.4. </span> <a
id="x1-410002.1.4"></a>coding bits into byte sequences</h5>
<!--l. 70--><p class="noindent" >The Vorbis codec has need to code arbitrary bit-width integers, from zero to 32 bits
wide, into packets. These integer fields are not aligned to the boundaries of the byte
representation; the next field is written at the bit position at which the previous field
ends.
<!--l. 75--><p class="noindent" >The encoder logically packs integers by writing the LSb of a binary integer to the logical
bitstream first, followed by next least significant bit, etc, until the requested number of bits
have been coded. When packing the bits into bytes, the encoder begins by placing
the LSb of the integer to be written into the least significant unused bit position of
the destination byte, followed by the next-least significant bit of the source integer
and so on up to the requested number of bits. When all bits of the destination byte
have been filled, encoding continues by zeroing all bits of the next byte and writing
the next bit into the bit position 0 of that byte. Decoding follows the same process
as encoding, but by reading bits from the byte stream and reassembling them into
integers.
<!--l. 90--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">2.1.5. </span> <a
id="x1-420002.1.5"></a>signedness</h5>
<!--l. 92--><p class="noindent" >The signedness of a specific number resulting from decode is to be interpreted by the decoder
given decode context. That is, the three bit binary pattern &#8217;b111&#8217; can be taken to represent
either &#8217;seven&#8217; as an unsigned integer, or &#8217;-1&#8217; as a signed, two&#8217;s complement integer. The
encoder and decoder are responsible for knowing if fields are to be treated as signed or
unsigned.
<!--l. 101--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">2.1.6. </span> <a
id="x1-430002.1.6"></a>coding example</h5>
<!--l. 103--><p class="noindent" >Code the 4 bit integer value &#8217;12&#8217; [b1100] into an empty bytestream. Bytestream result:
<!--l. 106--><p class="noindent" >
<div class="fancyvrb" id="fancyvrb2"><a
id="x1-43002r1"></a><span
class="cmr-6">1</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;|</span><br class="fancyvrb" /><a
id="x1-43004r2"></a><span
class="cmr-6">2</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;V</span><br class="fancyvrb" /><a
id="x1-43006r3"></a><span
class="cmr-6">3</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><br class="fancyvrb" /><a
id="x1-43008r4"></a><span
class="cmr-6">4</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;7</span><span
class="cmtt-8">&#x00A0;6</span><span
class="cmtt-8">&#x00A0;5</span><span
class="cmtt-8">&#x00A0;4</span><span
class="cmtt-8">&#x00A0;3</span><span
class="cmtt-8">&#x00A0;2</span><span
class="cmtt-8">&#x00A0;1</span><span
class="cmtt-8">&#x00A0;0</span><br class="fancyvrb" /><a
id="x1-43010r5"></a><span
class="cmr-6">5</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;byte</span><span
class="cmtt-8">&#x00A0;0</span><span
class="cmtt-8">&#x00A0;[0</span><span
class="cmtt-8">&#x00A0;0</span><span
class="cmtt-8">&#x00A0;0</span><span
class="cmtt-8">&#x00A0;0</span><span
class="cmtt-8">&#x00A0;1</span><span
class="cmtt-8">&#x00A0;1</span><span
class="cmtt-8">&#x00A0;0</span><span
class="cmtt-8">&#x00A0;0]</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;&#x003C;-</span><br class="fancyvrb" /><a
id="x1-43012r6"></a><span
class="cmr-6">6</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;byte</span><span
class="cmtt-8">&#x00A0;1</span><span
class="cmtt-8">&#x00A0;[</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;]</span>
<br class="fancyvrb" /><a
id="x1-43014r7"></a><span
class="cmr-6">7</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;byte</span><span
class="cmtt-8">&#x00A0;2</span><span
class="cmtt-8">&#x00A0;[</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;]</span><br class="fancyvrb" /><a
id="x1-43016r8"></a><span
class="cmr-6">8</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;byte</span><span
class="cmtt-8">&#x00A0;3</span><span
class="cmtt-8">&#x00A0;[</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;]</span><br class="fancyvrb" /><a
id="x1-43018r9"></a><span
class="cmr-6">9</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span
class="cmtt-8">&#x00A0;</span><span