#!/usr/bin/env python3
"""
This script employs a VERY basic heuristic ('porn' in webpage.lower()) to find
test cases for porn sites that are missing the 'age_limit' tag.

A second approach relies on a list of porn domains; to activate it, pass the
list filename as the only argument.
"""

# Allow direct execution
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import urllib.parse
import urllib.request


def load_domain_list(path):
    """Read the domain list at *path* into one newline-delimited string.

    The file is decoded as UTF-8, one entry per line. The stripped content is
    wrapped in a leading and a trailing newline so that the first and the last
    entry are delimited exactly like every entry in between; without that
    padding `domain_is_listed` could never match them.
    """
    with open(path, encoding='utf-8') as list_file:
        return '\n' + list_file.read().strip() + '\n'


def domain_is_listed(domain, domain_list):
    """Tell whether *domain* is covered by *domain_list*.

    *domain_list* is a newline-delimited string as produced by
    `load_domain_list`. A domain counts as listed when some line equals
    *domain* or ends with '.' + *domain* (i.e. a subdomain entry).
    """
    return ('.' + domain + '\n' in domain_list
            or '\n' + domain + '\n' in domain_list)


def main():
    """Check every extractor test case and report suspicious 'age_limit' values.

    With a list filename in sys.argv[1] the test URL's domain is looked up in
    that list; otherwise the page is downloaded and searched for 'porn'.
    Prints one '.' per unremarkable test case and a message line for each
    failure or mismatch.
    """
    # Project-local helper: imported here so the helpers above stay importable
    # without the project's test package. The sys.path tweak at module level
    # has already run by the time main() is called.
    from test.helper import gettestcases

    # None selects the heuristic mode; a string selects the list mode.
    domain_list = load_domain_list(sys.argv[1]) if len(sys.argv) > 1 else None

    for testcase in gettestcases():
        if domain_list is None:
            try:
                with urllib.request.urlopen(testcase['url'], timeout=10) as response:
                    webpage = response.read()
            except Exception:
                # Best effort: an unreachable page is reported, not fatal.
                print('\nFail: {}'.format(testcase['name']))
                continue

            looks_adult = 'porn' in webpage.decode('utf8', 'replace').lower()
        else:
            domain = urllib.parse.urlparse(testcase['url']).netloc
            if not domain:
                print('\nFail: {}'.format(testcase['name']))
                continue
            # Only the last two labels are compared (e.g. 'www.example.com'
            # is looked up as 'example.com').
            domain = '.'.join(domain.split('.')[-2:])
            looks_adult = domain_is_listed(domain, domain_list)

        marked_adult = testcase.get('info_dict', {}).get('age_limit') == 18

        if looks_adult and not marked_adult:
            print('\nPotential missing age_limit check: {}'.format(testcase['name']))
        elif marked_adult and not looks_adult:
            print('\nPotential false negative: {}'.format(testcase['name']))
        else:
            sys.stdout.write('.')
        sys.stdout.flush()

    print()


if __name__ == '__main__':
    main()