gen_tld.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import sys
  4. def main():
  5. alphabet = 'abcdefghijklmnopqrstuvwxyz'
  6. tlds = dict()
  7. for s in alphabet:
  8. tlds[s] = list()
  9. tlds['xn--'] = list()
  10. tld_file = open(sys.argv[1], 'rb')
  11. for line in tld_file.readlines():
  12. domain = line.strip().lower()
  13. for label in tlds:
  14. if domain.startswith(b'xn--'):
  15. tlds['xn--'].append(domain)
  16. break
  17. elif domain.startswith(b'x'):
  18. tlds['x'].append(domain)
  19. break
  20. else:
  21. if domain.startswith(label.encode('utf-8')):
  22. tlds[label].append(domain)
  23. break
  24. stdout = open(sys.stdout.fileno(), "w", encoding="utf-8", closefd=False)
  25. stdout.write('// actual list can be found at http://data.iana.org/TLD/tlds-alpha-by-domain.txt\n')
  26. stdout.write('static const char* const TopLevelDomains[] = {\n')
  27. for label, value in sorted(tlds.items()):
  28. if label == 'xn--':
  29. stdout.write(' /* ')
  30. str = ''
  31. for n in value:
  32. unicode_domain = n.decode('idna')
  33. str += ('%s, ' % unicode_domain)
  34. stdout.write('%s*/\n' % str.rstrip())
  35. stdout.write(' ')
  36. str = ''
  37. for n in value:
  38. str += ('"%s", ' % n.decode('utf-8'))
  39. stdout.write('%s\n' % str.rstrip())
  40. else:
  41. stdout.write(' ')
  42. str = ''
  43. for n in value:
  44. str += ('"%s", ' % n.decode('utf-8'))
  45. stdout.write('%s\n' % str.rstrip())
  46. stdout.write(' 0\n')
  47. stdout.write('};\n')
  48. if __name__ == '__main__':
  49. main()