snow.txt 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. =============================================
  2. Snow Video Codec Specification Draft 20080110
  3. =============================================
  4. Introduction:
  5. =============
  6. This specification describes the Snow bitstream syntax and semantics as
  7. well as the formal Snow decoding process.
  8. The decoding process is described precisely and any compliant decoder
  9. MUST produce the exact same output for a spec-conformant Snow stream.
  10. For encoding, though, any process which generates a stream compliant to
  11. the syntactical and semantic requirements and which is decodable by
  12. the process described in this spec shall be considered a conformant
  13. Snow encoder.
  14. Definitions:
  15. ============
  16. MUST the specific part must be done to conform to this standard
  17. SHOULD it is recommended to be done that way, but not strictly required
  18. ilog2(x) is the logarithm of x to base 2, rounded down
  19. ilog2(0) = 0
  20. Type definitions:
  21. =================
  22. b 1-bit range coded
  23. u unsigned scalar value range coded
  24. s signed scalar value range coded
  25. Bitstream syntax:
  26. =================
  27. frame:
  28. header
  29. prediction
  30. residual
  31. header:
  32. keyframe b MID_STATE
  33. if(keyframe || always_reset)
  34. reset_contexts
  35. if(keyframe){
  36. version u header_state
  37. always_reset b header_state
  38. temporal_decomposition_type u header_state
  39. temporal_decomposition_count u header_state
  40. spatial_decomposition_count u header_state
  41. colorspace_type u header_state
  42. chroma_h_shift u header_state
  43. chroma_v_shift u header_state
  44. spatial_scalability b header_state
  45. max_ref_frames-1 u header_state
  46. qlogs
  47. }
  48. if(!keyframe){
  49. update_mc b header_state
  50. if(update_mc){
  51. for(plane=0; plane<2; plane++){
  52. diag_mc b header_state
  53. htaps/2-1 u header_state
  54. for(i= p->htaps/2; i; i--)
  55. |hcoeff[i]| u header_state
  56. }
  57. }
  58. update_qlogs b header_state
  59. if(update_qlogs){
  60. spatial_decomposition_count u header_state
  61. qlogs
  62. }
  63. }
  64. spatial_decomposition_type s header_state
  65. qlog s header_state
  66. mv_scale s header_state
  67. qbias s header_state
  68. block_max_depth s header_state
  69. qlogs:
  70. for(plane=0; plane<2; plane++){
  71. quant_table[plane][0][0] s header_state
  72. for(level=0; level < spatial_decomposition_count; level++){
  73. quant_table[plane][level][1]s header_state
  74. quant_table[plane][level][3]s header_state
  75. }
  76. }
  77. reset_contexts
  78. *_state[*]= MID_STATE
  79. prediction:
  80. for(y=0; y<block_count_vertical; y++)
  81. for(x=0; x<block_count_horizontal; x++)
  82. block(0)
  83. block(level):
  84. mvx_diff=mvy_diff=y_diff=cb_diff=cr_diff=0
  85. if(keyframe){
  86. intra=1
  87. }else{
  88. if(level!=block_max_depth){
  89. s_context= 2*left->level + 2*top->level + topleft->level + topright->level
  90. leaf b block_state[4 + s_context]
  91. }
  92. if(level==block_max_depth || leaf){
  93. intra b block_state[1 + left->intra + top->intra]
  94. if(intra){
  95. y_diff s block_state[32]
  96. cb_diff s block_state[64]
  97. cr_diff s block_state[96]
  98. }else{
  99. ref_context= ilog2(2*left->ref) + ilog2(2*top->ref)
  100. if(ref_frames > 1)
  101. ref u block_state[128 + 1024 + 32*ref_context]
  102. mx_context= ilog2(2*abs(left->mx - top->mx))
  103. my_context= ilog2(2*abs(left->my - top->my))
  104. mvx_diff s block_state[128 + 32*(mx_context + 16*!!ref)]
  105. mvy_diff s block_state[128 + 32*(my_context + 16*!!ref)]
  106. }
  107. }else{
  108. block(level+1)
  109. block(level+1)
  110. block(level+1)
  111. block(level+1)
  112. }
  113. }
  114. residual:
  115. residual2(luma)
  116. residual2(chroma_cr)
  117. residual2(chroma_cb)
  118. residual2:
  119. for(level=0; level<spatial_decomposition_count; level++){
  120. if(level==0)
  121. subband(LL, 0)
  122. subband(HL, level)
  123. subband(LH, level)
  124. subband(HH, level)
  125. }
  126. subband:
  127. FIXME
  128. Tag description:
  129. ----------------
  130. version
  131. 0
  132. this MUST NOT change within a bitstream
  133. always_reset
  134. if 1 then the range coder contexts will be reset after each frame
  135. temporal_decomposition_type
  136. 0
  137. temporal_decomposition_count
  138. 0
  139. spatial_decomposition_count
  140. FIXME
  141. colorspace_type
  142. 0
  143. this MUST NOT change within a bitstream
  144. chroma_h_shift
  145. log2(luma.width / chroma.width)
  146. this MUST NOT change within a bitstream
  147. chroma_v_shift
  148. log2(luma.height / chroma.height)
  149. this MUST NOT change within a bitstream
  150. spatial_scalability
  151. 0
  152. max_ref_frames
  153. maximum number of reference frames
  154. this MUST NOT change within a bitstream
  155. update_mc
  156. indicates that motion compensation filter parameters are stored in the
  157. header
  158. diag_mc
  159. flag to enable faster diagonal interpolation
  160. this SHOULD be 1 unless it turns out to be covered by a valid patent
  161. htaps
  162. number of half pel interpolation filter taps, MUST be even, >0 and <10
  163. hcoeff
  164. half pel interpolation filter coefficients, hcoeff[0] are the 2 middle
  165. coefficients, hcoeff[1] are the next outer ones and so on, resulting in a filter
  166. like: ... hcoeff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ...
  167. the sign of the coefficients is not explicitly stored but alternates
  168. after each coefficient, and hcoeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,...
  169. hcoeff[0] is not explicitly stored but found by subtracting the sum
  170. of all stored coefficients with signs from 32
  171. hcoeff[0]= 32 - hcoeff[1] - hcoeff[2] - ...
  172. a good choice for hcoeff and htaps is
  173. htaps= 6
  174. hcoeff={40,-10,2}
  175. an alternative which requires more computations at both encoder and
  176. decoder side and may or may not be better is
  177. htaps= 8
  178. hcoeff={42,-14,6,-2}
  179. ref_frames
  180. minimum of the number of available reference frames and max_ref_frames
  181. for example the first frame after a key frame always has ref_frames=1
  182. spatial_decomposition_type
  183. wavelet type
  184. 0 is a 9/7 symmetric compact integer wavelet
  185. 1 is a 5/3 symmetric compact integer wavelet
  186. others are reserved
  187. stored as delta from last, last is reset to 0 if always_reset || keyframe
  188. qlog
  189. quality (logarithmic quantizer scale)
  190. stored as delta from last, last is reset to 0 if always_reset || keyframe
  191. mv_scale
  192. stored as delta from last, last is reset to 0 if always_reset || keyframe
  193. FIXME check that everything works fine if this changes between frames
  194. qbias
  195. dequantization bias
  196. stored as delta from last, last is reset to 0 if always_reset || keyframe
  197. block_max_depth
  198. maximum depth of the block tree
  199. stored as delta from last, last is reset to 0 if always_reset || keyframe
  200. quant_table
  201. quantization table
  202. Highlevel bitstream structure:
  203. =============================
  204. --------------------------------------------
  205. | Header |
  206. --------------------------------------------
  207. | ------------------------------------ |
  208. | | Block0 | |
  209. | | split? | |
  210. | | yes no | |
  211. | | ......... intra? | |
  212. | | : Block01 : yes no | |
  213. | | : Block02 : ....... .......... | |
  214. | | : Block03 : : y DC : : ref index: | |
  215. | | : Block04 : : cb DC : : motion x : | |
  216. | | ......... : cr DC : : motion y : | |
  217. | | ....... .......... | |
  218. | ------------------------------------ |
  219. | ------------------------------------ |
  220. | | Block1 | |
  221. | ... |
  222. --------------------------------------------
  223. | ------------ ------------ ------------ |
  224. || Y subbands | | Cb subbands| | Cr subbands||
  225. || --- --- | | --- --- | | --- --- ||
  226. || |LL0||HL0| | | |LL0||HL0| | | |LL0||HL0| ||
  227. || --- --- | | --- --- | | --- --- ||
  228. || --- --- | | --- --- | | --- --- ||
  229. || |LH0||HH0| | | |LH0||HH0| | | |LH0||HH0| ||
  230. || --- --- | | --- --- | | --- --- ||
  231. || --- --- | | --- --- | | --- --- ||
  232. || |HL1||LH1| | | |HL1||LH1| | | |HL1||LH1| ||
  233. || --- --- | | --- --- | | --- --- ||
  234. || --- --- | | --- --- | | --- --- ||
  235. || |HH1||HL2| | | |HH1||HL2| | | |HH1||HL2| ||
  236. || ... | | ... | | ... ||
  237. | ------------ ------------ ------------ |
  238. --------------------------------------------
  239. Decoding process:
  240. =================
  241. ------------
  242. | |
  243. | Subbands |
  244. ------------ | |
  245. | | ------------
  246. | Intra DC | |
  247. | | LL0 subband prediction
  248. ------------ |
  249. \ Dequantization
  250. ------------------- \ |
  251. | Reference frames | \ IDWT
  252. | ------- ------- | Motion \ |
  253. ||Frame 0| |Frame 1|| Compensation . OBMC v -------
  254. | ------- ------- | --------------. \------> + --->|Frame n|-->output
  255. | ------- ------- | -------
  256. ||Frame 2| |Frame 3||<----------------------------------/
  257. | ... |
  258. -------------------
  259. Range Coder:
  260. ============
  261. Binary Range Coder:
  262. -------------------
  263. The implemented range coder is an adapted version based upon "Range encoding:
  264. an algorithm for removing redundancy from a digitised message." by G. N. N.
  265. Martin.
  266. The symbols encoded by the Snow range coder are bits (0|1). The
  267. associated probabilities are not fixed but change depending on the symbol mix
  268. seen so far.
  269. bit seen | new state
  270. ---------+-----------------------------------------------
  271. 0 | 256 - state_transition_table[256 - old_state];
  272. 1 | state_transition_table[ old_state];
  273. state_transition_table = {
  274. 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27,
  275. 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42,
  276. 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57,
  277. 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
  278. 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
  279. 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
  280. 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118,
  281. 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133,
  282. 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
  283. 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
  284. 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179,
  285. 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194,
  286. 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209,
  287. 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225,
  288. 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240,
  289. 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};
  290. FIXME
  291. Range Coding of integers:
  292. -------------------------
  293. FIXME
  294. Neighboring Blocks:
  295. ===================
  296. left and top are set to the respective blocks unless they are outside of
  297. the image in which case they are set to the Null block
  298. top-left is set to the top left block unless it is outside of the image in
  299. which case it is set to the left block
  300. if this block has no larger parent block or it is at the left side of its
  301. parent block and the top right block is not outside of the image then the
  302. top right block is used for top-right else the top-left block is used
  303. Null block
  304. y,cb,cr are 128
  305. level, ref, mx and my are 0
  306. Motion Vector Prediction:
  307. =========================
  308. 1. the motion vectors of all the neighboring blocks are scaled to
  309. compensate for the difference of reference frames
  310. scaled_mv= (mv * (256 * (current_reference+1) / (mv.reference+1)) + 128)>>8
  311. 2. the median of the scaled left, top and top-right vectors is used as
  312. motion vector prediction
  313. 3. the used motion vector is the sum of the predictor and
  314. (mvx_diff, mvy_diff)*mv_scale
  315. Intra DC Prediction:
  316. ======================
  317. the luma and chroma values of the left block are used as predictors
  318. the used luma and chroma is the sum of the predictor and y_diff, cb_diff, cr_diff
  319. to reverse this in the decoder apply the following:
  320. block[y][x].dc[0] = block[y][x-1].dc[0] + y_diff;
  321. block[y][x].dc[1] = block[y][x-1].dc[1] + cb_diff;
  322. block[y][x].dc[2] = block[y][x-1].dc[2] + cr_diff;
  323. block[*][-1].dc[*]= 128;
  324. Motion Compensation:
  325. ====================
  326. Halfpel interpolation:
  327. ----------------------
  328. halfpel interpolation is done by convolution with the halfpel filter stored
  329. in the header:
  330. horizontal halfpel samples are found by
  331. H1[y][x] = hcoeff[0]*(F[y][x ] + F[y][x+1])
  332. + hcoeff[1]*(F[y][x-1] + F[y][x+2])
  333. + hcoeff[2]*(F[y][x-2] + F[y][x+3])
  334. + ...
  335. h1[y][x] = (H1[y][x] + 32)>>6;
  336. vertical halfpel samples are found by
  337. H2[y][x] = hcoeff[0]*(F[y ][x] + F[y+1][x])
  338. + hcoeff[1]*(F[y-1][x] + F[y+2][x])
  339. + ...
  340. h2[y][x] = (H2[y][x] + 32)>>6;
  341. vertical+horizontal halfpel samples are found by
  342. H3[y][x] = hcoeff[0]*(H2[y][x ] + H2[y][x+1])
  343. + hcoeff[1]*(H2[y][x-1] + H2[y][x+2])
  344. + ...
  345. H3[y][x] = hcoeff[0]*(H1[y ][x] + H1[y+1][x])
  346. + hcoeff[1]*(H1[y-1][x] + H1[y+2][x])
  347. + ...
  348. h3[y][x] = (H3[y][x] + 2048)>>12;
  349. F H1 F
  350. | | |
  351. | | |
  352. | | |
  353. F H1 F
  354. | | |
  355. | | |
  356. | | |
  357. F-------F-------F-> H1<-F-------F-------F
  358. v v v
  359. H2 H3 H2
  360. ^ ^ ^
  361. F-------F-------F-> H1<-F-------F-------F
  362. | | |
  363. | | |
  364. | | |
  365. F H1 F
  366. | | |
  367. | | |
  368. | | |
  369. F H1 F
  370. unavailable fullpel samples (outside the picture for example) shall be equal
  371. to the closest available fullpel sample
  372. Smaller pel interpolation:
  373. --------------------------
  374. if diag_mc is set then points which lie on a line between 2 vertically,
  375. horizontally or diagonally adjacent halfpel points shall be interpolated
  376. linearly with rounding to nearest and halfway values rounded up.
  377. points which lie on 2 diagonals at the same time should only use the one
  378. diagonal not containing the fullpel point
  379. F-->O---q---O<--h1->O---q---O<--F
  380. v \ / v \ / v
  381. O O O O O O O
  382. | / | \ |
  383. q q q q q
  384. | / | \ |
  385. O O O O O O O
  386. ^ / \ ^ / \ ^
  387. h2-->O---q---O<--h3->O---q---O<--h2
  388. v \ / v \ / v
  389. O O O O O O O
  390. | \ | / |
  391. q q q q q
  392. | \ | / |
  393. O O O O O O O
  394. ^ / \ ^ / \ ^
  395. F-->O---q---O<--h1->O---q---O<--F
  396. the remaining points shall be bilinearly interpolated from the
  397. up to 4 surrounding halfpel and fullpel points, again rounding should be to
  398. nearest and halfway values rounded up
  399. compliant Snow decoders MUST support 1-1/8 pel luma and 1/2-1/16 pel chroma
  400. interpolation at least
  401. Overlapped block motion compensation:
  402. -------------------------------------
  403. FIXME
  404. LL band prediction:
  405. ===================
  406. Each sample in the LL0 subband is predicted by the median of the left, top and
  407. left+top-topleft samples, samples outside the subband shall be considered to
  408. be 0. To reverse this prediction in the decoder apply the following.
  409. for(y=0; y<height; y++){
  410. for(x=0; x<width; x++){
  411. sample[y][x] += median(sample[y-1][x],
  412. sample[y][x-1],
  413. sample[y-1][x]+sample[y][x-1]-sample[y-1][x-1]);
  414. }
  415. }
  416. sample[-1][*]=sample[*][-1]= 0;
  417. width,height here are the width and height of the LL0 subband not of the final
  418. video
  419. Dequantization:
  420. ===============
  421. FIXME
  422. Wavelet Transform:
  423. ==================
  424. Snow supports 2 wavelet transforms, the symmetric biorthogonal 5/3 integer
  425. transform and an integer approximation of the symmetric biorthogonal 9/7
  426. Daubechies wavelet.
  427. 2D IDWT (inverse discrete wavelet transform)
  428. --------------------------------------------
  429. The 2D IDWT applies a 2D filter recursively, each time combining the
  430. 4 lowest frequency subbands into a single subband until only 1 subband
  431. remains.
  432. The 2D filter is done by first applying a 1D filter in the vertical direction
  433. and then applying it in the horizontal one.
  434. --------------- --------------- --------------- ---------------
  435. |LL0|HL0| | | | | | | | | | | |
  436. |---+---| HL1 | | L0|H0 | HL1 | | LL1 | HL1 | | | |
  437. |LH0|HH0| | | | | | | | | | | |
  438. |-------+-------|->|-------+-------|->|-------+-------|->| L1 | H1 |->...
  439. | | | | | | | | | | | |
  440. | LH1 | HH1 | | LH1 | HH1 | | LH1 | HH1 | | | |
  441. | | | | | | | | | | | |
  442. --------------- --------------- --------------- ---------------
  443. 1D Filter:
  444. ----------
  445. 1. interleave the samples of the low and high frequency subbands like
  446. s={L0, H0, L1, H1, L2, H2, L3, H3, ... }
  447. note, this can end with an L or an H, the number of elements shall be w
  448. s[-1] shall be considered equivalent to s[1 ]
  449. s[w ] shall be considered equivalent to s[w-2]
  450. 2. perform the lifting steps in order as described below
  451. 5/3 Integer filter:
  452. 1. s[i] -= (s[i-1] + s[i+1] + 2)>>2; for all even i < w
  453. 2. s[i] += (s[i-1] + s[i+1] )>>1; for all odd i < w
  454. \ | /|\ | /|\ | /|\ | /|\
  455. \|/ | \|/ | \|/ | \|/ |
  456. + | + | + | + | -1/4
  457. /|\ | /|\ | /|\ | /|\ |
  458. / | \|/ | \|/ | \|/ | \|/
  459. | + | + | + | + +1/2
  460. Snow's 9/7 Integer filter:
  461. 1. s[i] -= (3*(s[i-1] + s[i+1]) + 4)>>3; for all even i < w
  462. 2. s[i] -= s[i-1] + s[i+1] ; for all odd i < w
  463. 3. s[i] += ( s[i-1] + s[i+1] + 4*s[i] + 8)>>4; for all even i < w
  464. 4. s[i] += (3*(s[i-1] + s[i+1]) )>>1; for all odd i < w
  465. \ | /|\ | /|\ | /|\ | /|\
  466. \|/ | \|/ | \|/ | \|/ |
  467. + | + | + | + | -3/8
  468. /|\ | /|\ | /|\ | /|\ |
  469. / | \|/ | \|/ | \|/ | \|/
  470. (| + (| + (| + (| + -1
  471. \ + /|\ + /|\ + /|\ + /|\ +1/4
  472. \|/ | \|/ | \|/ | \|/ |
  473. + | + | + | + | +1/16
  474. /|\ | /|\ | /|\ | /|\ |
  475. / | \|/ | \|/ | \|/ | \|/
  476. | + | + | + | + +3/2
  477. optimization tips:
  478. following are exactly identical
  479. (3a)>>1 == a + (a>>1)
  480. (a + 4b + 8)>>4 == ((a>>2) + b + 2)>>2
  481. 16bit implementation note:
  482. The IDWT can be implemented with 16 bits, but this requires some care to
  483. prevent overflows. The following list gives the minimum number of bits needed
  484. for some terms
  485. 1. lifting step
  486. A= s[i-1] + s[i+1] 16bit
  487. 3*A + 4 18bit
  488. A + (A>>1) + 2 17bit
  489. 3. lifting step
  490. s[i-1] + s[i+1] 17bit
  491. 4. lifting step
  492. 3*(s[i-1] + s[i+1]) 17bit
  493. TODO:
  494. =====
  495. Important:
  496. finetune initial contexts
  497. flip wavelet?
  498. try to use the wavelet transformed predicted image (motion compensated image) as context for coding the residual coefficients
  499. try the MV length as context for coding the residual coefficients
  500. use extradata for stuff which is in the keyframes now?
  501. the MV median predictor is patented IIRC
  502. implement per picture halfpel interpolation
  503. try different range coder state transition tables for different contexts
  504. Not Important:
  505. compare the 6 tap and 8 tap hpel filters (psnr/bitrate and subjective quality)
  506. spatial_scalability b vs u (!= 0 breaks syntax anyway so we can add a u later)
  507. Credits:
  508. ========
  509. Michael Niedermayer
  510. Loren Merritt
  511. Copyright:
  512. ==========
  513. GPL + GFDL + whatever is needed to make this a RFC