# utf_8_sig.py (4.0 KB)

""" Python 'utf-8-sig' Codec

This works similarly to UTF-8, with the following changes:

* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs

  10. def encode(input, errors='strict'):
  11. return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
  12. len(input))
  13. def decode(input, errors='strict'):
  14. prefix = 0
  15. if input[:3] == codecs.BOM_UTF8:
  16. input = input[3:]
  17. prefix = 3
  18. (output, consumed) = codecs.utf_8_decode(input, errors, True)
  19. return (output, consumed+prefix)
  20. class IncrementalEncoder(codecs.IncrementalEncoder):
  21. def __init__(self, errors='strict'):
  22. codecs.IncrementalEncoder.__init__(self, errors)
  23. self.first = 1
  24. def encode(self, input, final=False):
  25. if self.first:
  26. self.first = 0
  27. return codecs.BOM_UTF8 + \
  28. codecs.utf_8_encode(input, self.errors)[0]
  29. else:
  30. return codecs.utf_8_encode(input, self.errors)[0]
  31. def reset(self):
  32. codecs.IncrementalEncoder.reset(self)
  33. self.first = 1
  34. def getstate(self):
  35. return self.first
  36. def setstate(self, state):
  37. self.first = state
  38. class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
  39. def __init__(self, errors='strict'):
  40. codecs.BufferedIncrementalDecoder.__init__(self, errors)
  41. self.first = 1
  42. def _buffer_decode(self, input, errors, final):
  43. if self.first:
  44. if len(input) < 3:
  45. if codecs.BOM_UTF8.startswith(input):
  46. # not enough data to decide if this really is a BOM
  47. # => try again on the next call
  48. return ("", 0)
  49. else:
  50. self.first = 0
  51. else:
  52. self.first = 0
  53. if input[:3] == codecs.BOM_UTF8:
  54. (output, consumed) = \
  55. codecs.utf_8_decode(input[3:], errors, final)
  56. return (output, consumed+3)
  57. return codecs.utf_8_decode(input, errors, final)
  58. def reset(self):
  59. codecs.BufferedIncrementalDecoder.reset(self)
  60. self.first = 1
  61. def getstate(self):
  62. state = codecs.BufferedIncrementalDecoder.getstate(self)
  63. # state[1] must be 0 here, as it isn't passed along to the caller
  64. return (state[0], self.first)
  65. def setstate(self, state):
  66. # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
  67. codecs.BufferedIncrementalDecoder.setstate(self, state)
  68. self.first = state[1]
  69. class StreamWriter(codecs.StreamWriter):
  70. def reset(self):
  71. codecs.StreamWriter.reset(self)
  72. try:
  73. del self.encode
  74. except AttributeError:
  75. pass
  76. def encode(self, input, errors='strict'):
  77. self.encode = codecs.utf_8_encode
  78. return encode(input, errors)
  79. class StreamReader(codecs.StreamReader):
  80. def reset(self):
  81. codecs.StreamReader.reset(self)
  82. try:
  83. del self.decode
  84. except AttributeError:
  85. pass
  86. def decode(self, input, errors='strict'):
  87. if len(input) < 3:
  88. if codecs.BOM_UTF8.startswith(input):
  89. # not enough data to decide if this is a BOM
  90. # => try again on the next call
  91. return ("", 0)
  92. elif input[:3] == codecs.BOM_UTF8:
  93. self.decode = codecs.utf_8_decode
  94. (output, consumed) = codecs.utf_8_decode(input[3:],errors)
  95. return (output, consumed+3)
  96. # (else) no BOM present
  97. self.decode = codecs.utf_8_decode
  98. return codecs.utf_8_decode(input, errors)

### encodings module API

  100. def getregentry():
  101. return codecs.CodecInfo(
  102. name='utf-8-sig',
  103. encode=encode,
  104. decode=decode,
  105. incrementalencoder=IncrementalEncoder,
  106. incrementaldecoder=IncrementalDecoder,
  107. streamreader=StreamReader,
  108. streamwriter=StreamWriter,
  109. )