snow.txt 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637
  1. =============================================
  2. Snow Video Codec Specification Draft 20080110
  3. =============================================
  4. Introduction:
  5. =============
  6. This specification describes the Snow bitstream syntax and semantics as
  7. well as the formal Snow decoding process.
  8. The decoding process is described precisely and any compliant decoder
  9. MUST produce the exact same output for a spec-conformant Snow stream.
  10. For encoding, though, any process which generates a stream compliant to
  11. the syntactical and semantic requirements and which is decodable by
  12. the process described in this spec shall be considered a conformant
  13. Snow encoder.
  14. Definitions:
  15. ============
  16. MUST the specific part must be done to conform to this standard
  17. SHOULD it is recommended to be done that way, but not strictly required
  18. ilog2(x) is the rounded down logarithm of x with base 2
  19. ilog2(0) = 0
  20. Type definitions:
  21. =================
  22. b 1-bit range coded
  23. u unsigned scalar value range coded
  24. s signed scalar value range coded
  25. Bitstream syntax:
  26. =================
  27. frame:
  28. header
  29. prediction
  30. residual
  31. header:
  32. keyframe b MID_STATE
  33. if(keyframe || always_reset)
  34. reset_contexts
  35. if(keyframe){
  36. version u header_state
  37. always_reset b header_state
  38. temporal_decomposition_type u header_state
  39. temporal_decomposition_count u header_state
  40. spatial_decomposition_count u header_state
  41. colorspace_type u header_state
  42. if (nb_planes > 2) {
  43. chroma_h_shift u header_state
  44. chroma_v_shift u header_state
  45. }
  46. spatial_scalability b header_state
  47. max_ref_frames-1 u header_state
  48. qlogs
  49. }
  50. if(!keyframe){
  51. update_mc b header_state
  52. if(update_mc){
  53. for(plane=0; plane<nb_plane_types; plane++){
  54. diag_mc b header_state
  55. htaps/2-1 u header_state
  56. for(i= p->htaps/2; i; i--)
  57. |hcoeff[i]| u header_state
  58. }
  59. }
  60. update_qlogs b header_state
  61. if(update_qlogs){
  62. spatial_decomposition_count u header_state
  63. qlogs
  64. }
  65. }
  66. spatial_decomposition_type s header_state
  67. qlog s header_state
  68. mv_scale s header_state
  69. qbias s header_state
  70. block_max_depth s header_state
  71. qlogs:
  72. for(plane=0; plane<nb_plane_types; plane++){
  73. quant_table[plane][0][0] s header_state
  74. for(level=0; level < spatial_decomposition_count; level++){
  75. quant_table[plane][level][1]s header_state
  76. quant_table[plane][level][3]s header_state
  77. }
  78. }
  79. reset_contexts
  80. *_state[*]= MID_STATE
  81. prediction:
  82. for(y=0; y<block_count_vertical; y++)
  83. for(x=0; x<block_count_horizontal; x++)
  84. block(0)
  85. block(level):
  86. mvx_diff=mvy_diff=y_diff=cb_diff=cr_diff=0
  87. if(keyframe){
  88. intra=1
  89. }else{
  90. if(level!=max_block_depth){
  91. s_context= 2*left->level + 2*top->level + topleft->level + topright->level
  92. leaf b block_state[4 + s_context]
  93. }
  94. if(level==max_block_depth || leaf){
  95. intra b block_state[1 + left->intra + top->intra]
  96. if(intra){
  97. y_diff s block_state[32]
  98. cb_diff s block_state[64]
  99. cr_diff s block_state[96]
  100. }else{
  101. ref_context= ilog2(2*left->ref) + ilog2(2*top->ref)
  102. if(ref_frames > 1)
  103. ref u block_state[128 + 1024 + 32*ref_context]
  104. mx_context= ilog2(2*abs(left->mx - top->mx))
  105. my_context= ilog2(2*abs(left->my - top->my))
  106. mvx_diff s block_state[128 + 32*(mx_context + 16*!!ref)]
  107. mvy_diff s block_state[128 + 32*(my_context + 16*!!ref)]
  108. }
  109. }else{
  110. block(level+1)
  111. block(level+1)
  112. block(level+1)
  113. block(level+1)
  114. }
  115. }
  116. residual:
  117. residual2(luma)
  118. if (nb_planes > 2) {
  119. residual2(chroma_cr)
  120. residual2(chroma_cb)
  121. }
  122. residual2:
  123. for(level=0; level<spatial_decomposition_count; level++){
  124. if(level==0)
  125. subband(LL, 0)
  126. subband(HL, level)
  127. subband(LH, level)
  128. subband(HH, level)
  129. }
  130. subband:
  131. FIXME
  132. nb_plane_types = gray ? 1 : 2;
  133. Tag description:
  134. ----------------
  135. version
  136. 0
  137. this MUST NOT change within a bitstream
  138. always_reset
  139. if 1 then the range coder contexts will be reset after each frame
  140. temporal_decomposition_type
  141. 0
  142. temporal_decomposition_count
  143. 0
  144. spatial_decomposition_count
  145. FIXME
  146. colorspace_type
  147. 0 unspecified YCbCr
  148. 1 Gray
  149. 2 Gray + Alpha
  150. 3 GBR
  151. 4 GBRA
  152. this MUST NOT change within a bitstream
  153. chroma_h_shift
  154. log2(luma.width / chroma.width)
  155. this MUST NOT change within a bitstream
  156. chroma_v_shift
  157. log2(luma.height / chroma.height)
  158. this MUST NOT change within a bitstream
  159. spatial_scalability
  160. 0
  161. max_ref_frames
  162. maximum number of reference frames
  163. this MUST NOT change within a bitstream
  164. update_mc
  165. indicates that motion compensation filter parameters are stored in the
  166. header
  167. diag_mc
  168. flag to enable faster diagonal interpolation
  169. this SHOULD be 1 unless it turns out to be covered by a valid patent
  170. htaps
  171. number of half pel interpolation filter taps, MUST be even, >0 and <10
  172. hcoeff
  173. half pel interpolation filter coefficients, hcoeff[0] are the 2 middle
  174. coefficients [1] are the next outer ones and so on, resulting in a filter
  175. like: ...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ...
  176. the sign of the coefficients is not explicitly stored but alternates
  177. after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,...
  178. hcoeff[0] is not explicitly stored but found by subtracting the sum
  179. of all stored coefficients with signs from 32
  180. hcoeff[0]= 32 - hcoeff[1] - hcoeff[2] - ...
  181. a good choice for hcoeff and htaps is
  182. htaps= 6
  183. hcoeff={40,-10,2}
  184. an alternative which requires more computations at both encoder and
  185. decoder side and may or may not be better is
  186. htaps= 8
  187. hcoeff={42,-14,6,-2}
  188. ref_frames
  189. minimum of the number of available reference frames and max_ref_frames
  190. for example the first frame after a key frame always has ref_frames=1
  191. spatial_decomposition_type
  192. wavelet type
  193. 0 is a 9/7 symmetric compact integer wavelet
  194. 1 is a 5/3 symmetric compact integer wavelet
  195. others are reserved
  196. stored as delta from last, last is reset to 0 if always_reset || keyframe
  197. qlog
  198. quality (logarithmic quantizer scale)
  199. stored as delta from last, last is reset to 0 if always_reset || keyframe
  200. mv_scale
  201. stored as delta from last, last is reset to 0 if always_reset || keyframe
  202. FIXME check that everything works fine if this changes between frames
  203. qbias
  204. dequantization bias
  205. stored as delta from last, last is reset to 0 if always_reset || keyframe
  206. block_max_depth
  207. maximum depth of the block tree
  208. stored as delta from last, last is reset to 0 if always_reset || keyframe
  209. quant_table
  210. quantization table
  211. Highlevel bitstream structure:
  212. =============================
  213. --------------------------------------------
  214. | Header |
  215. --------------------------------------------
  216. | ------------------------------------ |
  217. | | Block0 | |
  218. | | split? | |
  219. | | yes no | |
  220. | | ......... intra? | |
  221. | | : Block01 : yes no | |
  222. | | : Block02 : ....... .......... | |
  223. | | : Block03 : : y DC : : ref index: | |
  224. | | : Block04 : : cb DC : : motion x : | |
  225. | | ......... : cr DC : : motion y : | |
  226. | | ....... .......... | |
  227. | ------------------------------------ |
  228. | ------------------------------------ |
  229. | | Block1 | |
  230. | ... |
  231. --------------------------------------------
  232. | ------------ ------------ ------------ |
  233. || Y subbands | | Cb subbands| | Cr subbands||
  234. || --- --- | | --- --- | | --- --- ||
  235. || |LL0||HL0| | | |LL0||HL0| | | |LL0||HL0| ||
  236. || --- --- | | --- --- | | --- --- ||
  237. || --- --- | | --- --- | | --- --- ||
  238. || |LH0||HH0| | | |LH0||HH0| | | |LH0||HH0| ||
  239. || --- --- | | --- --- | | --- --- ||
  240. || --- --- | | --- --- | | --- --- ||
  241. || |HL1||LH1| | | |HL1||LH1| | | |HL1||LH1| ||
  242. || --- --- | | --- --- | | --- --- ||
  243. || --- --- | | --- --- | | --- --- ||
  244. || |HH1||HL2| | | |HH1||HL2| | | |HH1||HL2| ||
  245. || ... | | ... | | ... ||
  246. | ------------ ------------ ------------ |
  247. --------------------------------------------
  248. Decoding process:
  249. =================
  250. ------------
  251. | |
  252. | Subbands |
  253. ------------ | |
  254. | | ------------
  255. | Intra DC | |
  256. | | LL0 subband prediction
  257. ------------ |
  258. \ Dequantization
  259. ------------------- \ |
  260. | Reference frames | \ IDWT
  261. | ------- ------- | Motion \ |
  262. ||Frame 0| |Frame 1|| Compensation . OBMC v -------
  263. | ------- ------- | --------------. \------> + --->|Frame n|-->output
  264. | ------- ------- | -------
  265. ||Frame 2| |Frame 3||<----------------------------------/
  266. | ... |
  267. -------------------
  268. Range Coder:
  269. ============
  270. Binary Range Coder:
  271. -------------------
  272. The implemented range coder is an adapted version based upon "Range encoding:
  273. an algorithm for removing redundancy from a digitised message." by G. N. N.
  274. Martin.
  275. The symbols encoded by the Snow range coder are bits (0|1). The
  276. associated probabilities are not fixed but change depending on the symbol mix
  277. seen so far.
  278. bit seen | new state
  279. ---------+-----------------------------------------------
  280. 0 | 256 - state_transition_table[256 - old_state];
  281. 1 | state_transition_table[ old_state];
  282. state_transition_table = {
  283. 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27,
  284. 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42,
  285. 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57,
  286. 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
  287. 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
  288. 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
  289. 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118,
  290. 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133,
  291. 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
  292. 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
  293. 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179,
  294. 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194,
  295. 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209,
  296. 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225,
  297. 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240,
  298. 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};
  299. FIXME
  300. Range Coding of integers:
  301. -------------------------
  302. FIXME
  303. Neighboring Blocks:
  304. ===================
  305. left and top are set to the respective blocks unless they are outside of
  306. the image in which case they are set to the Null block
  307. top-left is set to the top left block unless it is outside of the image in
  308. which case it is set to the left block
  309. if this block has no larger parent block or it is at the left side of its
  310. parent block and the top right block is not outside of the image then the
  311. top right block is used for top-right else the top-left block is used
  312. Null block
  313. y,cb,cr are 128
  314. level, ref, mx and my are 0
  315. Motion Vector Prediction:
  316. =========================
  317. 1. the motion vectors of all the neighboring blocks are scaled to
  318. compensate for the difference of reference frames
  319. scaled_mv= (mv * (256 * (current_reference+1) / (mv.reference+1)) + 128)>>8
  320. 2. the median of the scaled left, top and top-right vectors is used as
  321. motion vector prediction
  322. 3. the used motion vector is the sum of the predictor and
  323. (mvx_diff, mvy_diff)*mv_scale
  324. Intra DC Prediction:
  325. ======================
  326. the luma and chroma values of the left block are used as predictors
  327. the used luma and chroma is the sum of the predictor and y_diff, cb_diff, cr_diff
  328. to reverse this in the decoder apply the following:
  329. block[y][x].dc[0] = block[y][x-1].dc[0] + y_diff;
  330. block[y][x].dc[1] = block[y][x-1].dc[1] + cb_diff;
  331. block[y][x].dc[2] = block[y][x-1].dc[2] + cr_diff;
  332. block[*][-1].dc[*]= 128;
  333. Motion Compensation:
  334. ====================
  335. Halfpel interpolation:
  336. ----------------------
  337. halfpel interpolation is done by convolution with the halfpel filter stored
  338. in the header:
  339. horizontal halfpel samples are found by
  340. H1[y][x] = hcoeff[0]*(F[y][x ] + F[y][x+1])
  341. + hcoeff[1]*(F[y][x-1] + F[y][x+2])
  342. + hcoeff[2]*(F[y][x-2] + F[y][x+3])
  343. + ...
  344. h1[y][x] = (H1[y][x] + 32)>>6;
  345. vertical halfpel samples are found by
  346. H2[y][x] = hcoeff[0]*(F[y ][x] + F[y+1][x])
  347. + hcoeff[1]*(F[y-1][x] + F[y+2][x])
  348. + ...
  349. h2[y][x] = (H2[y][x] + 32)>>6;
  350. vertical+horizontal halfpel samples are found by
  351. H3[y][x] = hcoeff[0]*(H2[y][x ] + H2[y][x+1])
  352. + hcoeff[1]*(H2[y][x-1] + H2[y][x+2])
  353. + ...
  354. H3[y][x] = hcoeff[0]*(H1[y ][x] + H1[y+1][x])
  355. + hcoeff[1]*(H1[y-1][x] + H1[y+2][x])
  356. + ...
  357. h3[y][x] = (H3[y][x] + 2048)>>12;
  358. F H1 F
  359. | | |
  360. | | |
  361. | | |
  362. F H1 F
  363. | | |
  364. | | |
  365. | | |
  366. F-------F-------F-> H1<-F-------F-------F
  367. v v v
  368. H2 H3 H2
  369. ^ ^ ^
  370. F-------F-------F-> H1<-F-------F-------F
  371. | | |
  372. | | |
  373. | | |
  374. F H1 F
  375. | | |
  376. | | |
  377. | | |
  378. F H1 F
  379. unavailable fullpel samples (outside the picture for example) shall be equal
  380. to the closest available fullpel sample
  381. Smaller pel interpolation:
  382. --------------------------
  383. if diag_mc is set then points which lie on a line between 2 vertically,
  384. horizontally or diagonally adjacent halfpel points shall be interpolated
  385. linearly with rounding to nearest and halfway values rounded up.
  386. points which lie on 2 diagonals at the same time should only use the one
  387. diagonal not containing the fullpel point
  388. F-->O---q---O<--h1->O---q---O<--F
  389. v \ / v \ / v
  390. O O O O O O O
  391. | / | \ |
  392. q q q q q
  393. | / | \ |
  394. O O O O O O O
  395. ^ / \ ^ / \ ^
  396. h2-->O---q---O<--h3->O---q---O<--h2
  397. v \ / v \ / v
  398. O O O O O O O
  399. | \ | / |
  400. q q q q q
  401. | \ | / |
  402. O O O O O O O
  403. ^ / \ ^ / \ ^
  404. F-->O---q---O<--h1->O---q---O<--F
  405. the remaining points shall be bilinearly interpolated from the
  406. up to 4 surrounding halfpel and fullpel points, again rounding should be to
  407. nearest and halfway values rounded up
  408. compliant Snow decoders MUST support 1-1/8 pel luma and 1/2-1/16 pel chroma
  409. interpolation at least
  410. Overlapped block motion compensation:
  411. -------------------------------------
  412. FIXME
  413. LL band prediction:
  414. ===================
  415. Each sample in the LL0 subband is predicted by the median of the left, top and
  416. left+top-topleft samples, samples outside the subband shall be considered to
  417. be 0. To reverse this prediction in the decoder apply the following.
  418. for(y=0; y<height; y++){
  419. for(x=0; x<width; x++){
  420. sample[y][x] += median(sample[y-1][x],
  421. sample[y][x-1],
  422. sample[y-1][x]+sample[y][x-1]-sample[y-1][x-1]);
  423. }
  424. }
  425. sample[-1][*]=sample[*][-1]= 0;
  426. width,height here are the width and height of the LL0 subband not of the final
  427. video
  428. Dequantization:
  429. ==============
  430. FIXME
  431. Wavelet Transform:
  432. ==================
  433. Snow supports 2 wavelet transforms, the symmetric biorthogonal 5/3 integer
  434. transform and a integer approximation of the symmetric biorthogonal 9/7
  435. daubechies wavelet.
  436. 2D IDWT (inverse discrete wavelet transform)
  437. --------------------------------------------
  438. The 2D IDWT applies a 2D filter recursively, each time combining the
  439. 4 lowest frequency subbands into a single subband until only 1 subband
  440. remains.
  441. The 2D filter is done by first applying a 1D filter in the vertical direction
  442. and then applying it in the horizontal one.
  443. --------------- --------------- --------------- ---------------
  444. |LL0|HL0| | | | | | | | | | | |
  445. |---+---| HL1 | | L0|H0 | HL1 | | LL1 | HL1 | | | |
  446. |LH0|HH0| | | | | | | | | | | |
  447. |-------+-------|->|-------+-------|->|-------+-------|->| L1 | H1 |->...
  448. | | | | | | | | | | | |
  449. | LH1 | HH1 | | LH1 | HH1 | | LH1 | HH1 | | | |
  450. | | | | | | | | | | | |
  451. --------------- --------------- --------------- ---------------
  452. 1D Filter:
  453. ----------
  454. 1. interleave the samples of the low and high frequency subbands like
  455. s={L0, H0, L1, H1, L2, H2, L3, H3, ... }
  456. note, this can end with a L or a H, the number of elements shall be w
  457. s[-1] shall be considered equivalent to s[1 ]
  458. s[w ] shall be considered equivalent to s[w-2]
  459. 2. perform the lifting steps in order as described below
  460. 5/3 Integer filter:
  461. 1. s[i] -= (s[i-1] + s[i+1] + 2)>>2; for all even i < w
  462. 2. s[i] += (s[i-1] + s[i+1] )>>1; for all odd i < w
  463. \ | /|\ | /|\ | /|\ | /|\
  464. \|/ | \|/ | \|/ | \|/ |
  465. + | + | + | + | -1/4
  466. /|\ | /|\ | /|\ | /|\ |
  467. / | \|/ | \|/ | \|/ | \|/
  468. | + | + | + | + +1/2
  469. Snow's 9/7 Integer filter:
  470. 1. s[i] -= (3*(s[i-1] + s[i+1]) + 4)>>3; for all even i < w
  471. 2. s[i] -= s[i-1] + s[i+1] ; for all odd i < w
  472. 3. s[i] += ( s[i-1] + s[i+1] + 4*s[i] + 8)>>4; for all even i < w
  473. 4. s[i] += (3*(s[i-1] + s[i+1]) )>>1; for all odd i < w
  474. \ | /|\ | /|\ | /|\ | /|\
  475. \|/ | \|/ | \|/ | \|/ |
  476. + | + | + | + | -3/8
  477. /|\ | /|\ | /|\ | /|\ |
  478. / | \|/ | \|/ | \|/ | \|/
  479. (| + (| + (| + (| + -1
  480. \ + /|\ + /|\ + /|\ + /|\ +1/4
  481. \|/ | \|/ | \|/ | \|/ |
  482. + | + | + | + | +1/16
  483. /|\ | /|\ | /|\ | /|\ |
  484. / | \|/ | \|/ | \|/ | \|/
  485. | + | + | + | + +3/2
  486. optimization tips:
  487. following are exactly identical
  488. (3a)>>1 == a + (a>>1)
  489. (a + 4b + 8)>>4 == ((a>>2) + b + 2)>>2
  490. 16bit implementation note:
  491. The IDWT can be implemented with 16bits, but this requires some care to
  492. prevent overflows, the following list, lists the minimum number of bits needed
  493. for some terms
  494. 1. lifting step
  495. A= s[i-1] + s[i+1] 16bit
  496. 3*A + 4 18bit
  497. A + (A>>1) + 2 17bit
  498. 3. lifting step
  499. s[i-1] + s[i+1] 17bit
  500. 4. lifting step
  501. 3*(s[i-1] + s[i+1]) 17bit
  502. TODO:
  503. =====
  504. Important:
  505. finetune initial contexts
  506. flip wavelet?
  507. try to use the wavelet transformed predicted image (motion compensated image) as context for coding the residual coefficients
  508. try the MV length as context for coding the residual coefficients
  509. use extradata for stuff which is in the keyframes now?
  510. implement per picture halfpel interpolation
  511. try different range coder state transition tables for different contexts
  512. Not Important:
  513. compare the 6 tap and 8 tap hpel filters (psnr/bitrate and subjective quality)
  514. spatial_scalability b vs u (!= 0 breaks syntax anyway so we can add a u later)
  515. Credits:
  516. ========
  517. Michael Niedermayer
  518. Loren Merritt
  519. Copyright:
  520. ==========
  521. GPL + GFDL + whatever is needed to make this a RFC