tools.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. import six
  2. import sys
  3. def to_utf8(value):
  4. """
  5. Converts value to string encoded into utf-8
  6. :param value:
  7. :return:
  8. """
  9. if sys.version_info[0] < 3:
  10. if not isinstance(value, basestring): # noqa
  11. value = unicode(value) # noqa
  12. if isinstance(value, str):
  13. value = value.decode("utf-8", errors="ignore")
  14. return value.encode('utf-8', 'ignore')
  15. else:
  16. return str(value)
  17. def trim_string(s, max_bytes):
  18. """
  19. Adjusts the length of the string s in order to fit it
  20. into max_bytes bytes of storage after encoding as UTF-8.
  21. Useful when cutting filesystem paths.
  22. :param s: unicode string
  23. :param max_bytes: number of bytes
  24. :return the prefix of s
  25. """
  26. if isinstance(s, six.text_type):
  27. return _trim_unicode_string(s, max_bytes)
  28. if isinstance(s, six.binary_type):
  29. if len(s) <= max_bytes:
  30. return s
  31. s = s.decode('utf-8', errors='ignore')
  32. s = _trim_unicode_string(s, max_bytes)
  33. s = s.encode('utf-8', errors='ignore')
  34. return s
  35. raise TypeError('a string is expected')
  36. def _trim_unicode_string(s, max_bytes):
  37. if len(s) * 4 <= max_bytes:
  38. # UTF-8 uses at most 4 bytes per character
  39. return s
  40. result = []
  41. cur_byte_length = 0
  42. for ch in s:
  43. cur_byte_length += len(ch.encode('utf-8'))
  44. if cur_byte_length > max_bytes:
  45. break
  46. result.append(ch)
  47. return ''.join(result)
  48. def to_str(s):
  49. if six.PY2 and isinstance(s, six.text_type):
  50. return s.encode('utf8')
  51. return s