web_log.chart.py

# -*- coding: utf-8 -*-
# Description: web log netdata python.d module
# Author: ilyam8
# SPDX-License-Identifier: GPL-3.0-or-later

import bisect
import os
import re

from collections import namedtuple, defaultdict
from copy import deepcopy

try:
    from itertools import filterfalse
except ImportError:
    from itertools import ifilter as filter
    from itertools import ifilterfalse as filterfalse

try:
    from sys import maxint
except ImportError:
    from sys import maxsize as maxint

from bases.collection import read_last_line
from bases.FrameworkServices.LogService import LogService

ORDER_APACHE_CACHE = [
    'apache_cache',
]

ORDER_WEB = [
    'response_statuses',
    'response_codes',
    'bandwidth',
    'response_time',
    'response_time_hist',
    'response_time_upstream',
    'response_time_upstream_hist',
    'requests_per_url',
    'requests_per_user_defined',
    'http_method',
    'vhost',
    'port',
    'http_version',
    'requests_per_ipproto',
    'clients',
    'clients_all'
]

ORDER_SQUID = [
    'squid_response_statuses',
    'squid_response_codes',
    'squid_detailed_response_codes',
    'squid_method',
    'squid_mime_type',
    'squid_hier_code',
    'squid_transport_methods',
    'squid_transport_errors',
    'squid_code',
    'squid_handling_opts',
    'squid_object_types',
    'squid_cache_events',
    'squid_bytes',
    'squid_duration',
    'squid_clients',
    'squid_clients_all'
]

CHARTS_WEB = {
    'response_codes': {
        'options': [None, 'Response Codes', 'requests/s', 'responses', 'web_log.response_codes', 'stacked'],
        'lines': [
            ['2xx', None, 'incremental'],
            ['5xx', None, 'incremental'],
            ['3xx', None, 'incremental'],
            ['4xx', None, 'incremental'],
            ['1xx', None, 'incremental'],
            ['0xx', 'other', 'incremental'],
            ['unmatched', None, 'incremental']
        ]
    },
    'bandwidth': {
        'options': [None, 'Bandwidth', 'kilobits/s', 'bandwidth', 'web_log.bandwidth', 'area'],
        'lines': [
            ['resp_length', 'received', 'incremental', 8, 1000],
            ['bytes_sent', 'sent', 'incremental', -8, 1000]
        ]
    },
    'response_time': {
        'options': [None, 'Processing Time', 'milliseconds', 'timings', 'web_log.response_time', 'area'],
        'lines': [
            ['resp_time_min', 'min', 'incremental', 1, 1000],
            ['resp_time_max', 'max', 'incremental', 1, 1000],
            ['resp_time_avg', 'avg', 'incremental', 1, 1000]
        ]
    },
    'response_time_hist': {
        'options': [None, 'Processing Time Histogram', 'requests/s', 'timings', 'web_log.response_time_hist', 'line'],
        'lines': []
    },
    'response_time_upstream': {
        'options': [None, 'Processing Time Upstream', 'milliseconds', 'timings',
                    'web_log.response_time_upstream', 'area'],
        'lines': [
            ['resp_time_upstream_min', 'min', 'incremental', 1, 1000],
            ['resp_time_upstream_max', 'max', 'incremental', 1, 1000],
            ['resp_time_upstream_avg', 'avg', 'incremental', 1, 1000]
        ]
    },
    'response_time_upstream_hist': {
        'options': [None, 'Processing Time Histogram', 'requests/s', 'timings',
                    'web_log.response_time_upstream_hist', 'line'],
        'lines': []
    },
    'clients': {
        'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'clients', 'web_log.clients', 'stacked'],
        'lines': [
            ['unique_cur_ipv4', 'ipv4', 'incremental', 1, 1],
            ['unique_cur_ipv6', 'ipv6', 'incremental', 1, 1]
        ]
    },
    'clients_all': {
        'options': [None, 'All Time Unique Client IPs', 'unique ips', 'clients', 'web_log.clients_all', 'stacked'],
        'lines': [
            ['unique_tot_ipv4', 'ipv4', 'absolute', 1, 1],
            ['unique_tot_ipv6', 'ipv6', 'absolute', 1, 1]
        ]
    },
    'http_method': {
        'options': [None, 'Requests Per HTTP Method', 'requests/s', 'http methods', 'web_log.http_method', 'stacked'],
        'lines': [
            ['GET', 'GET', 'incremental', 1, 1]
        ]
    },
    'http_version': {
        'options': [None, 'Requests Per HTTP Version', 'requests/s', 'http versions',
                    'web_log.http_version', 'stacked'],
        'lines': []
    },
    'requests_per_ipproto': {
        'options': [None, 'Requests Per IP Protocol', 'requests/s', 'ip protocols', 'web_log.requests_per_ipproto',
                    'stacked'],
        'lines': [
            ['req_ipv4', 'ipv4', 'incremental', 1, 1],
            ['req_ipv6', 'ipv6', 'incremental', 1, 1]
        ]
    },
    'response_statuses': {
        'options': [None, 'Response Statuses', 'requests/s', 'responses', 'web_log.response_statuses', 'stacked'],
        'lines': [
            ['successful_requests', 'success', 'incremental', 1, 1],
            ['server_errors', 'error', 'incremental', 1, 1],
            ['redirects', 'redirect', 'incremental', 1, 1],
            ['bad_requests', 'bad', 'incremental', 1, 1],
            ['other_requests', 'other', 'incremental', 1, 1]
        ]
    },
    'requests_per_url': {
        'options': [None, 'Requests Per Url', 'requests/s', 'urls', 'web_log.requests_per_url', 'stacked'],
        'lines': [
            ['url_pattern_other', 'other', 'incremental', 1, 1]
        ]
    },
    'requests_per_user_defined': {
        'options': [None, 'Requests Per User Defined Pattern', 'requests/s', 'user defined',
                    'web_log.requests_per_user_defined', 'stacked'],
        'lines': [
            ['user_pattern_other', 'other', 'incremental', 1, 1]
        ]
    },
    'port': {
        'options': [None, 'Requests Per Port', 'requests/s', 'port', 'web_log.port', 'stacked'],
        'lines': [
            ['port_80', 'http', 'incremental', 1, 1],
            ['port_443', 'https', 'incremental', 1, 1]
        ]
    },
    'vhost': {
        'options': [None, 'Requests Per Vhost', 'requests/s', 'vhost', 'web_log.vhost', 'stacked'],
        'lines': []
    }
}

CHARTS_APACHE_CACHE = {
    'apache_cache': {
        'options': [None, 'Apache Cached Responses', 'percentage', 'cached', 'web_log.apache_cache_cache',
                    'stacked'],
        'lines': [
            ['hit', 'cache', 'percentage-of-absolute-row'],
            ['miss', None, 'percentage-of-absolute-row'],
            ['other', None, 'percentage-of-absolute-row']
        ]
    }
}
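
# Note (sketch of netdata chart semantics): 'percentage-of-absolute-row'
# charts each dimension as its share of the per-row total, so the
# hit/miss/other dimensions above always sum to 100%.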

CHARTS_SQUID = {
    'squid_duration': {
        'options': [None, 'Elapsed Time The Transaction Busied The Cache',
                    'milliseconds', 'squid_timings', 'web_log.squid_duration', 'area'],
        'lines': [
            ['duration_min', 'min', 'incremental', 1, 1000],
            ['duration_max', 'max', 'incremental', 1, 1000],
            ['duration_avg', 'avg', 'incremental', 1, 1000]
        ]
    },
    'squid_bytes': {
        'options': [None, 'Amount Of Data Delivered To The Clients',
                    'kilobits/s', 'squid_bandwidth', 'web_log.squid_bytes', 'area'],
        'lines': [
            ['bytes', 'sent', 'incremental', 8, 1000]
        ]
    },
    'squid_response_statuses': {
        'options': [None, 'Response Statuses', 'responses/s', 'squid_responses', 'web_log.squid_response_statuses',
                    'stacked'],
        'lines': [
            ['successful_requests', 'success', 'incremental', 1, 1],
            ['server_errors', 'error', 'incremental', 1, 1],
            ['redirects', 'redirect', 'incremental', 1, 1],
            ['bad_requests', 'bad', 'incremental', 1, 1],
            ['other_requests', 'other', 'incremental', 1, 1]
        ]
    },
    'squid_response_codes': {
        'options': [None, 'Response Codes', 'responses/s', 'squid_responses',
                    'web_log.squid_response_codes', 'stacked'],
        'lines': [
            ['2xx', None, 'incremental'],
            ['5xx', None, 'incremental'],
            ['3xx', None, 'incremental'],
            ['4xx', None, 'incremental'],
            ['1xx', None, 'incremental'],
            ['0xx', None, 'incremental'],
            ['other', None, 'incremental'],
            ['unmatched', None, 'incremental']
        ]
    },
    'squid_code': {
        'options': [None, 'Responses Per Cache Result Of The Request',
                    'requests/s', 'squid_squid_cache', 'web_log.squid_code', 'stacked'],
        'lines': []
    },
    'squid_detailed_response_codes': {
        'options': [None, 'Detailed Response Codes',
                    'responses/s', 'squid_responses', 'web_log.squid_detailed_response_codes', 'stacked'],
        'lines': []
    },
    'squid_hier_code': {
        'options': [None, 'Responses Per Hierarchy Code',
                    'requests/s', 'squid_hierarchy', 'web_log.squid_hier_code', 'stacked'],
        'lines': []
    },
    'squid_method': {
        'options': [None, 'Requests Per Method',
                    'requests/s', 'squid_requests', 'web_log.squid_method', 'stacked'],
        'lines': []
    },
    'squid_mime_type': {
        'options': [None, 'Requests Per MIME Type',
                    'requests/s', 'squid_requests', 'web_log.squid_mime_type', 'stacked'],
        'lines': []
    },
    'squid_clients': {
        'options': [None, 'Current Poll Unique Client IPs', 'unique ips', 'squid_clients',
                    'web_log.squid_clients', 'stacked'],
        'lines': [
            ['unique_ipv4', 'ipv4', 'incremental'],
            ['unique_ipv6', 'ipv6', 'incremental']
        ]
    },
    'squid_clients_all': {
        'options': [None, 'All Time Unique Client IPs', 'unique ips', 'squid_clients',
                    'web_log.squid_clients_all', 'stacked'],
        'lines': [
            ['unique_tot_ipv4', 'ipv4', 'absolute'],
            ['unique_tot_ipv6', 'ipv6', 'absolute']
        ]
    },
    'squid_transport_methods': {
        'options': [None, 'Transport Methods', 'requests/s', 'squid_squid_transport',
                    'web_log.squid_transport_methods', 'stacked'],
        'lines': []
    },
    'squid_transport_errors': {
        'options': [None, 'Transport Errors', 'requests/s', 'squid_squid_transport',
                    'web_log.squid_transport_errors', 'stacked'],
        'lines': []
    },
    'squid_handling_opts': {
        'options': [None, 'Handling Opts', 'requests/s', 'squid_squid_cache',
                    'web_log.squid_handling_opts', 'stacked'],
        'lines': []
    },
    'squid_object_types': {
        'options': [None, 'Object Types', 'objects/s', 'squid_squid_cache',
                    'web_log.squid_object_types', 'stacked'],
        'lines': []
    },
    'squid_cache_events': {
        'options': [None, 'Cache Events', 'events/s', 'squid_squid_cache',
                    'web_log.squid_cache_events', 'stacked'],
        'lines': []
    }
}

NAMED_PATTERN = namedtuple('PATTERN', ['description', 'func'])

DET_RESP_AGGR = ['', '_1xx', '_2xx', '_3xx', '_4xx', '_5xx', '_Other']
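
# DET_RESP_AGGR suffixes name the detailed_response_codes charts: '' is the
# single aggregated chart, '_1xx'..'_5xx' split per status class, and
# '_Other' catches the rest (see Web.get_data_per_response_codes_detailed).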

SQUID_CODES = {
    'TCP': 'squid_transport_methods',
    'UDP': 'squid_transport_methods',
    'NONE': 'squid_transport_methods',
    'CLIENT': 'squid_handling_opts',
    'IMS': 'squid_handling_opts',
    'ASYNC': 'squid_handling_opts',
    'SWAPFAIL': 'squid_handling_opts',
    'REFRESH': 'squid_handling_opts',
    'SHARED': 'squid_handling_opts',
    'REPLY': 'squid_handling_opts',
    'NEGATIVE': 'squid_object_types',
    'STALE': 'squid_object_types',
    'OFFLINE': 'squid_object_types',
    'INVALID': 'squid_object_types',
    'FAIL': 'squid_object_types',
    'MODIFIED': 'squid_object_types',
    'UNMODIFIED': 'squid_object_types',
    'REDIRECT': 'squid_object_types',
    'HIT': 'squid_cache_events',
    'MEM': 'squid_cache_events',
    'MISS': 'squid_cache_events',
    'DENIED': 'squid_cache_events',
    'NOFETCH': 'squid_cache_events',
    'TUNNEL': 'squid_cache_events',
    'ABORTED': 'squid_transport_errors',
    'TIMEOUT': 'squid_transport_errors'
}
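
# Example (illustrative only): a code such as 'TCP_MISS_ABORTED' is split on
# '_' into tags; 'TCP' feeds squid_transport_methods, 'MISS' feeds
# squid_cache_events and 'ABORTED' feeds squid_transport_errors
# (see Squid.get_data_per_squid_code).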

# Note: the dot in the version group is escaped so it matches a literal '.'
REQUEST_REGEX = re.compile(r'(?P<method>[A-Z]+) (?P<url>[^ ]+) [A-Z]+/(?P<http_version>\d(?:\.\d)?)')

MIME_TYPES = ['application', 'audio', 'example', 'font', 'image', 'message', 'model', 'multipart', 'text', 'video']


class Service(LogService):
    def __init__(self, configuration=None, name=None):
        """
        :param configuration:
        :param name:
        """
        LogService.__init__(self, configuration=configuration, name=name)
        self.configuration = configuration
        self.log_path = self.configuration.get('path')
        self.job = None

    def check(self):
        """
        :return: bool
        1. "log_path" is specified in the module configuration file
        2. "log_path" must exist and be readable by the netdata user
        3. "log_path" must not be empty. We need at least 1 line to find an appropriate pattern to parse
        4. other checks depend on the log "type"
        """
        log_type = self.configuration.get('type', 'web')
        log_types = dict(web=Web, apache_cache=ApacheCache, squid=Squid)

        if log_type not in log_types:
            self.error('bad log type {log_type}. Supported types: {types}'.format(log_type=log_type,
                                                                                  types=log_types.keys()))
            return False

        if not self.log_path:
            self.error('log path is not specified')
            return False

        if not (self._find_recent_log_file() and os.access(self.log_path, os.R_OK)):
            self.error('{log_file} does not exist or is not readable'.format(log_file=self.log_path))
            return False

        if not os.path.getsize(self.log_path):
            self.error('{log_file} is empty'.format(log_file=self.log_path))
            return False

        self.job = log_types[log_type](self)
        if self.job.check():
            self.order = self.job.order
            self.definitions = self.job.definitions
            return True
        return False

    def _get_data(self):
        return self.job.get_data(self._get_raw_data())


class Web:
    def __init__(self, service):
        self.service = service
        self.order = ORDER_WEB[:]
        self.definitions = deepcopy(CHARTS_WEB)
        self.pre_filter = check_patterns('filter', self.configuration.get('filter'))
        self.storage = dict()
        self.data = {
            'bytes_sent': 0,
            'resp_length': 0,
            'resp_time_min': 0,
            'resp_time_max': 0,
            'resp_time_avg': 0,
            'resp_time_upstream_min': 0,
            'resp_time_upstream_max': 0,
            'resp_time_upstream_avg': 0,
            'unique_cur_ipv4': 0,
            'unique_cur_ipv6': 0,
            '2xx': 0,
            '5xx': 0,
            '3xx': 0,
            '4xx': 0,
            '1xx': 0,
            '0xx': 0,
            'unmatched': 0,
            'req_ipv4': 0,
            'req_ipv6': 0,
            'unique_tot_ipv4': 0,
            'unique_tot_ipv6': 0,
            'successful_requests': 0,
            'redirects': 0,
            'bad_requests': 0,
            'server_errors': 0,
            'other_requests': 0,
            'GET': 0
        }

    def __getattr__(self, item):
        return getattr(self.service, item)

    def check(self):
        last_line = read_last_line(self.log_path)
        if not last_line:
            return False

        # Custom_log_format or predefined log format.
        if self.configuration.get('custom_log_format'):
            match_dict, error = self.find_regex_custom(last_line)
        else:
            match_dict, error = self.find_regex(last_line)

        # "match_dict" is None if there are any problems
        if match_dict is None:
            self.error(error)
            return False

        self.storage['unique_all_time'] = list()
        self.storage['url_pattern'] = check_patterns('url_pattern', self.configuration.get('categories'))
        self.storage['user_pattern'] = check_patterns('user_pattern', self.configuration.get('user_defined'))

        self.create_web_charts(match_dict)  # Create charts
        self.info('Collected data: %s' % list(match_dict.keys()))
        return True

    def create_web_charts(self, match_dict):
        """
        :param match_dict: dict: regex.search.groupdict(). Ex. {'address': '127.0.0.1', 'code': '200', 'method': 'GET'}
        :return:
        Create/remove additional charts depending on the 'match_dict' keys and configuration file options
        """
        if 'resp_time' not in match_dict:
            self.order.remove('response_time')
            self.order.remove('response_time_hist')
        if 'resp_time_upstream' not in match_dict:
            self.order.remove('response_time_upstream')
            self.order.remove('response_time_upstream_hist')

        # Add 'response_time_hist' and 'response_time_upstream_hist' charts if specified in the configuration
        histogram = self.configuration.get('histogram', None)
        if isinstance(histogram, list):
            self.storage['bucket_index'] = histogram[:]
            self.storage['bucket_index'].append(maxint)
            self.storage['buckets'] = [0] * (len(histogram) + 1)
            self.storage['upstream_buckets'] = [0] * (len(histogram) + 1)
            hist_lines = self.definitions['response_time_hist']['lines']
            upstream_hist_lines = self.definitions['response_time_upstream_hist']['lines']
            for i, le in enumerate(histogram):
                hist_key = 'response_time_hist_%d' % i
                upstream_hist_key = 'response_time_upstream_hist_%d' % i
                hist_lines.append([hist_key, str(le), 'incremental', 1, 1])
                upstream_hist_lines.append([upstream_hist_key, str(le), 'incremental', 1, 1])
            hist_lines.append(['response_time_hist_%d' % len(histogram), '+Inf', 'incremental', 1, 1])
            upstream_hist_lines.append(['response_time_upstream_hist_%d' % len(histogram), '+Inf', 'incremental', 1, 1])
        elif histogram is not None:
            self.error('expected a histogram list, but got {0}'.format(type(histogram)))
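
        # Example (sketch, python.d job option): histogram: [1, 5, 10, 50, 100]
        # creates cumulative buckets <=1ms ... <=100ms plus a final +Inf bucket
        # (get_hist is fed resp_time / 1000, i.e. milliseconds).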
        if not self.configuration.get('all_time', True):
            self.order.remove('clients_all')

        # Add 'detailed_response_codes' chart if specified in the configuration
        if self.configuration.get('detailed_response_codes', True):
            if self.configuration.get('detailed_response_aggregate', True):
                codes = DET_RESP_AGGR[:1]
            else:
                codes = DET_RESP_AGGR[1:]

            for code in codes:
                self.order.append('detailed_response_codes%s' % code)
                self.definitions['detailed_response_codes%s' % code] = {
                    'options': [None, 'Detailed Response Codes %s' % code[1:], 'requests/s', 'responses',
                                'web_log.detailed_response_codes%s' % code, 'stacked'],
                    'lines': []
                }

        # Add 'requests_per_url' chart if specified in the configuration
        if self.storage['url_pattern']:
            for elem in self.storage['url_pattern']:
                dim = [elem.description, elem.description[12:], 'incremental']
                self.definitions['requests_per_url']['lines'].append(dim)
                self.data[elem.description] = 0
            self.data['url_pattern_other'] = 0
        else:
            self.order.remove('requests_per_url')

        # Add 'requests_per_user_defined' chart if specified in the configuration
        if self.storage['user_pattern'] and 'user_defined' in match_dict:
            for elem in self.storage['user_pattern']:
                dim = [elem.description, elem.description[13:], 'incremental']
                self.definitions['requests_per_user_defined']['lines'].append(dim)
                self.data[elem.description] = 0
            self.data['user_pattern_other'] = 0
        else:
            self.order.remove('requests_per_user_defined')

    def get_data(self, raw_data=None):
        """
        Parses new log lines
        :return: dict OR None
        None if _get_raw_data method fails.
        In all other cases - dict.
        """
        if not raw_data:
            return None if raw_data is None else self.data

        filtered_data = filter_data(raw_data=raw_data, pre_filter=self.pre_filter)

        unique_current = set()
        timings = defaultdict(lambda: dict(minimum=None, maximum=0, summary=0, count=0))

        for line in filtered_data:
            match = self.storage['regex'].search(line)
            if match:
                match_dict = match.groupdict()
                try:
                    code = match_dict['code'][0] + 'xx'
                    self.data[code] += 1
                except KeyError:
                    self.data['0xx'] += 1
                # detailed response code
                if self.configuration.get('detailed_response_codes', True):
                    self.get_data_per_response_codes_detailed(code=match_dict['code'])
                # response statuses
                self.get_data_per_statuses(code=match_dict['code'])
                # requests per user defined pattern
                if self.storage['user_pattern'] and 'user_defined' in match_dict:
                    self.get_data_per_pattern(row=match_dict['user_defined'],
                                              other='user_pattern_other',
                                              pattern=self.storage['user_pattern'])
                # method, url, http version
                self.get_data_from_request_field(match_dict=match_dict)
                # bandwidth sent
                bytes_sent = match_dict['bytes_sent'] if '-' not in match_dict['bytes_sent'] else 0
                self.data['bytes_sent'] += int(bytes_sent)
                # request processing time and bandwidth received
                if 'resp_length' in match_dict:
                    resp_length = match_dict['resp_length'] if '-' not in match_dict['resp_length'] else 0
                    self.data['resp_length'] += int(resp_length)
                if 'resp_time' in match_dict:
                    resp_time = self.storage['func_resp_time'](float(match_dict['resp_time']))
                    get_timings(timings=timings['resp_time'], time=resp_time)
                    if 'bucket_index' in self.storage:
                        get_hist(self.storage['bucket_index'], self.storage['buckets'], resp_time / 1000)
                if 'resp_time_upstream' in match_dict and match_dict['resp_time_upstream'] != '-':
                    resp_time_upstream = self.storage['func_resp_time'](float(match_dict['resp_time_upstream']))
                    get_timings(timings=timings['resp_time_upstream'], time=resp_time_upstream)
                    if 'bucket_index' in self.storage:
                        # bucket the upstream time, not the total response time
                        get_hist(self.storage['bucket_index'], self.storage['upstream_buckets'],
                                 resp_time_upstream / 1000)
                # requests per ip proto
                proto = 'ipv6' if ':' in match_dict['address'] else 'ipv4'
                self.data['req_' + proto] += 1
                # unique clients ips
                if self.configuration.get('all_time', True):
                    if address_not_in_pool(pool=self.storage['unique_all_time'],
                                           address=match_dict['address'],
                                           pool_size=self.data['unique_tot_ipv4'] + self.data['unique_tot_ipv6']):
                        self.data['unique_tot_' + proto] += 1
                if match_dict['address'] not in unique_current:
                    self.data['unique_cur_' + proto] += 1
                    unique_current.add(match_dict['address'])
            else:
                self.data['unmatched'] += 1

        # timings
        for elem in timings:
            self.data[elem + '_min'] += timings[elem]['minimum']
            self.data[elem + '_avg'] += timings[elem]['summary'] / timings[elem]['count']
            self.data[elem + '_max'] += timings[elem]['maximum']

        # histogram
        if 'bucket_index' in self.storage:
            buckets = self.storage['buckets']
            upstream_buckets = self.storage['upstream_buckets']
            for i in range(0, len(self.storage['bucket_index'])):
                hist_key = 'response_time_hist_%d' % i
                upstream_hist_key = 'response_time_upstream_hist_%d' % i
                self.data[hist_key] = buckets[i]
                self.data[upstream_hist_key] = upstream_buckets[i]

        return self.data

    def find_regex(self, last_line):
        """
        :param last_line: str: literally the last line from the log file
        :return: tuple where:
        [0]: dict or None: match_dict or None
        [1]: str: error description
        We need to find an appropriate pattern for the current log file.
        The logic is to do a regex search through the string for all predefined patterns
        until we find something or fail.
        """
        # REGEX: 1.IPv4 address 2.HTTP method 3. URL 4. Response code
        # 5. Bytes sent 6. Response length 7. Response process time
        default = re.compile(r'(?P<address>[\da-f.:]+|localhost)'
                             r' -.*?"(?P<request>[^"]*)"'
                             r' (?P<code>[1-9]\d{2})'
                             r' (?P<bytes_sent>\d+|-)')

        apache_ext_insert = re.compile(r'(?P<address>[\da-f.:]+|localhost)'
                                       r' -.*?"(?P<request>[^"]*)"'
                                       r' (?P<code>[1-9]\d{2})'
                                       r' (?P<bytes_sent>\d+|-)'
                                       r' (?P<resp_length>\d+|-)'
                                       r' (?P<resp_time>\d+) ')

        apache_ext_append = re.compile(r'(?P<address>[\da-f.:]+|localhost)'
                                       r' -.*?"(?P<request>[^"]*)"'
                                       r' (?P<code>[1-9]\d{2})'
                                       r' (?P<bytes_sent>\d+|-)'
                                       r' .*?'
                                       r' (?P<resp_length>\d+|-)'
                                       r' (?P<resp_time>\d+)'
                                       r'(?: |$)')

        nginx_ext_insert = re.compile(r'(?P<address>[\da-f.:]+)'
                                      r' -.*?"(?P<request>[^"]*)"'
                                      r' (?P<code>[1-9]\d{2})'
                                      r' (?P<bytes_sent>\d+)'
                                      r' (?P<resp_length>\d+)'
                                      r' (?P<resp_time>\d+\.\d+) ')

        nginx_ext2_insert = re.compile(r'(?P<address>[\da-f.:]+)'
                                       r' -.*?"(?P<request>[^"]*)"'
                                       r' (?P<code>[1-9]\d{2})'
                                       r' (?P<bytes_sent>\d+)'
                                       r' (?P<resp_length>\d+)'
                                       r' (?P<resp_time>\d+\.\d+)'
                                       r' (?P<resp_time_upstream>[\d.-]+)')

        nginx_ext_append = re.compile(r'(?P<address>[\da-f.:]+)'
                                      r' -.*?"(?P<request>[^"]*)"'
                                      r' (?P<code>[1-9]\d{2})'
                                      r' (?P<bytes_sent>\d+)'
                                      r' .*?'
                                      r' (?P<resp_length>\d+)'
                                      r' (?P<resp_time>\d+\.\d+)')

        def func_usec(time):
            return time

        def func_sec(time):
            return time * 1000000

        r_regex = [apache_ext_insert, apache_ext_append,
                   nginx_ext2_insert, nginx_ext_insert, nginx_ext_append,
                   default]
        r_function = [func_usec, func_usec, func_sec, func_sec, func_sec, func_usec]
        regex_function = zip(r_regex, r_function)

        match_dict = dict()
        for regex, func in regex_function:
            match = regex.search(last_line)
            if match:
                self.storage['regex'] = regex
                self.storage['func_resp_time'] = func
                match_dict = match.groupdict()
                break

        return find_regex_return(match_dict=match_dict or None,
                                 msg='Unknown log format. You need to use "custom_log_format" feature.')
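
    # Example (illustrative only): the 'default' pattern above matches a
    # common/combined-format line such as
    #   127.0.0.1 - - [01/Jan/2019:00:00:00 +0000] "GET /index.html HTTP/1.1" 200 1234
    # capturing address, request, code and bytes_sent.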

    def find_regex_custom(self, last_line):
        """
        :param last_line: str: literally the last line from the log file
        :return: tuple where:
        [0]: dict or None: match_dict or None
        [1]: str: error description
        We are here only if "custom_log_format" is in the job configuration. We need to make sure:
        1. "custom_log_format" is a dict
        2. "pattern" is in "custom_log_format" and is a <str> instance
        3. if "time_multiplier" is in "custom_log_format" it must be an <int> or <float> instance
        If all parameters are ok we need to make sure:
        1. Pattern search is successful
        2. Pattern search contains named subgroups (?P<subgroup_name>) (= "match_dict")
        If pattern search is successful we need to make sure:
        1. All mandatory keys ['address', 'code', 'bytes_sent'] are in "match_dict"
        If this is True we need to make sure:
        1. All mandatory key values from "match_dict" have the correct format
        ("code" is an integer, "method" is an uppercase word, etc)
        If non-mandatory keys are in "match_dict" we need to make sure:
        1. All non-mandatory key values from match_dict ['resp_length', 'resp_time'] have the correct format
        ("resp_length" is an integer or "-", "resp_time" is an integer or a float)
        """
        if not hasattr(self.configuration.get('custom_log_format'), 'keys'):
            return find_regex_return(msg='Custom log: "custom_log_format" is not a <dict>')

        pattern = self.configuration.get('custom_log_format', dict()).get('pattern')
        if not (pattern and isinstance(pattern, str)):
            return find_regex_return(msg='Custom log: "pattern" option is not specified or type is not <str>')

        resp_time_func = self.configuration.get('custom_log_format', dict()).get('time_multiplier') or 0
        if not isinstance(resp_time_func, (int, float)):
            return find_regex_return(msg='Custom log: "time_multiplier" is not an integer or a float')

        try:
            regex = re.compile(pattern)
        except re.error as error:
            return find_regex_return(msg='Pattern compile error: %s' % str(error))

        match = regex.search(last_line)
        if not match:
            return find_regex_return(msg='Custom log: pattern search FAILED')

        match_dict = match.groupdict() or None
        if match_dict is None:
            return find_regex_return(msg='Custom log: search OK but contains no named subgroups'
                                         ' (you need to use ?P<subgroup_name>)')

        mandatory_dict = {'address': r'[\w.:-]+',
                          'code': r'[1-9]\d{2}',
                          'bytes_sent': r'\d+|-'}
        optional_dict = {'resp_length': r'\d+|-',
                         'resp_time': r'[\d.]+',
                         'resp_time_upstream': r'[\d.-]+',
                         'method': r'[A-Z]+',
                         'http_version': r'\d(?:\.\d)?'}

        mandatory_values = set(mandatory_dict) - set(match_dict)
        if mandatory_values:
            return find_regex_return(msg='Custom log: search OK but some mandatory keys (%s) are missing'
                                         % list(mandatory_values))
        for key in mandatory_dict:
            if not re.search(mandatory_dict[key], match_dict[key]):
                return find_regex_return(msg='Custom log: can\'t parse "%s": %s'
                                             % (key, match_dict[key]))

        optional_values = set(optional_dict) & set(match_dict)
        for key in optional_values:
            if not re.search(optional_dict[key], match_dict[key]):
                return find_regex_return(msg='Custom log: can\'t parse "%s": %s'
                                             % (key, match_dict[key]))

        dot_in_time = '.' in match_dict.get('resp_time', '')
        if dot_in_time:
            self.storage['func_resp_time'] = lambda time: time * (resp_time_func or 1000000)
        else:
            self.storage['func_resp_time'] = lambda time: time * (resp_time_func or 1)

        self.storage['regex'] = regex
        return find_regex_return(match_dict=match_dict)
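
    # Example (sketch, python.d job options; group names per the checks above):
    #   custom_log_format:
    #     pattern: '(?P<address>[\da-f.:]+) .+? "(?P<method>[A-Z]+) (?P<url>[^ ]+)" (?P<code>[1-9]\d{2}) (?P<bytes_sent>\d+)'
    # "address", "code" and "bytes_sent" are mandatory; adding an optional
    # (?P<resp_time>...) group plus time_multiplier: 1000000 would convert a
    # seconds-based response time to microseconds.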

    def get_data_from_request_field(self, match_dict):
        if match_dict.get('request'):
            match_dict = REQUEST_REGEX.search(match_dict['request'])
            if match_dict:
                match_dict = match_dict.groupdict()
            else:
                return
        # requests per url
        if match_dict.get('url') and self.storage['url_pattern']:
            self.get_data_per_pattern(row=match_dict['url'],
                                      other='url_pattern_other',
                                      pattern=self.storage['url_pattern'])
        # requests per http method
        if match_dict.get('method'):
            if match_dict['method'] not in self.data:
                self.charts['http_method'].add_dimension([match_dict['method'],
                                                          match_dict['method'],
                                                          'incremental'])
                self.data[match_dict['method']] = 0
            self.data[match_dict['method']] += 1
        # requests per http version
        if match_dict.get('http_version'):
            dim_id = match_dict['http_version'].replace('.', '_')
            if dim_id not in self.data:
                self.charts['http_version'].add_dimension([dim_id,
                                                           match_dict['http_version'],
                                                           'incremental'])
                self.data[dim_id] = 0
            self.data[dim_id] += 1
        # requests per port number
        if match_dict.get('port'):
            if match_dict['port'] not in self.data:
                self.charts['port'].add_dimension([match_dict['port'],
                                                   match_dict['port'],
                                                   'incremental'])
                self.data[match_dict['port']] = 0
            self.data[match_dict['port']] += 1
        # requests per vhost
        if match_dict.get('vhost'):
            dim_id = match_dict['vhost'].replace('.', '_')
            if dim_id not in self.data:
                self.charts['vhost'].add_dimension([dim_id,
                                                    match_dict['vhost'],
                                                    'incremental'])
                self.data[dim_id] = 0
            self.data[dim_id] += 1

    def get_data_per_response_codes_detailed(self, code):
        """
        :param code: str: CODE from parsed line. Ex.: '202', '499'
        :return:
        Calls add_dimension if the value is seen for the first time
        """
        if code not in self.data:
            if self.configuration.get('detailed_response_aggregate', True):
                self.charts['detailed_response_codes'].add_dimension([code, code, 'incremental'])
                self.data[code] = 0
            else:
                code_index = int(code[0]) if int(code[0]) < 6 else 6
                chart_key = 'detailed_response_codes' + DET_RESP_AGGR[code_index]
                self.charts[chart_key].add_dimension([code, code, 'incremental'])
                self.data[code] = 0
        self.data[code] += 1

    def get_data_per_pattern(self, row, other, pattern):
        """
        :param row: str:
        :param other: str: fallback counter key
        :param pattern: list of named tuples: (description, func)
        :return:
        Scan through the user-defined patterns and count the row against the
        first one that matches, or against 'other' if none match
        """
        match = None
        for elem in pattern:
            if elem.func(row):
                self.data[elem.description] += 1
                match = True
                break
        if not match:
            self.data[other] += 1

    def get_data_per_statuses(self, code):
        """
        :param code: str: response status code. Ex.: '202', '499'
        :return:
        """
        code_class = code[0]
        if code_class == '2' or code == '304' or code_class == '1' or code == '401':
            self.data['successful_requests'] += 1
        elif code_class == '3':
            self.data['redirects'] += 1
        elif code_class == '4':
            self.data['bad_requests'] += 1
        elif code_class == '5':
            self.data['server_errors'] += 1
        else:
            self.data['other_requests'] += 1


class ApacheCache:
    def __init__(self, service):
        self.service = service
        self.order = ORDER_APACHE_CACHE
        self.definitions = CHARTS_APACHE_CACHE

    @staticmethod
    def check():
        return True

    @staticmethod
    def get_data(raw_data=None):
        data = dict(hit=0, miss=0, other=0)
        if not raw_data:
            return None if raw_data is None else data

        for line in raw_data:
            if 'cache hit' in line:
                data['hit'] += 1
            elif 'cache miss' in line:
                data['miss'] += 1
            else:
                data['other'] += 1
        return data


class Squid:
    def __init__(self, service):
        self.service = service
        self.order = ORDER_SQUID
        self.definitions = CHARTS_SQUID
        self.pre_filter = check_patterns('filter', self.configuration.get('filter'))
        self.storage = dict()
        self.data = {
            'duration_max': 0,
            'duration_avg': 0,
            'duration_min': 0,
            'bytes': 0,
            '0xx': 0,
            '1xx': 0,
            '2xx': 0,
            '3xx': 0,
            '4xx': 0,
            '5xx': 0,
            'other': 0,
            'unmatched': 0,
            'unique_ipv4': 0,
            'unique_ipv6': 0,
            'unique_tot_ipv4': 0,
            'unique_tot_ipv6': 0,
            'successful_requests': 0,
            'redirects': 0,
            'bad_requests': 0,
            'server_errors': 0,
            'other_requests': 0
        }

    def __getattr__(self, item):
        return getattr(self.service, item)

    def check(self):
        last_line = read_last_line(self.log_path)
        if not last_line:
            return False

        self.storage['unique_all_time'] = list()
        self.storage['regex'] = re.compile(r'[0-9.]+\s+(?P<duration>[0-9]+)'
                                           r' (?P<client_address>[\da-f.:]+)'
                                           r' (?P<squid_code>[A-Z_]+)/'
                                           r'(?P<http_code>[0-9]+)'
                                           r' (?P<bytes>[0-9]+)'
                                           r' (?P<method>[A-Z_]+)'
                                           r' (?P<url>[^ ]+)'
                                           r' (?P<user>[^ ]+)'
                                           r' (?P<hier_code>[A-Z_]+)/[\da-z.:-]+'
                                           r' (?P<mime_type>[A-Za-z-]*)')

        match = self.storage['regex'].search(last_line)
        if not match:
            self.error('regex does not match (%s)' % self.storage['regex'].pattern)
            return False

        self.storage['dynamic'] = {
            'http_code': {
                'chart': 'squid_detailed_response_codes',
                'func_dim_id': None,
                'func_dim': None
            },
            'hier_code': {
                'chart': 'squid_hier_code',
                'func_dim_id': None,
                'func_dim': lambda v: v.replace('HIER_', '')
            },
            'method': {
                'chart': 'squid_method',
                'func_dim_id': None,
                'func_dim': None
            },
            'mime_type': {
                'chart': 'squid_mime_type',
                'func_dim_id': lambda v: str.lower(v) if str.lower(v) in MIME_TYPES else 'unknown',
                'func_dim': None
            }
        }
        if not self.configuration.get('all_time', True):
            self.order.remove('squid_clients_all')
        return True

    def get_data(self, raw_data=None):
        if not raw_data:
            return None if raw_data is None else self.data

        filtered_data = filter_data(raw_data=raw_data, pre_filter=self.pre_filter)

        unique_ip = set()
        timings = defaultdict(lambda: dict(minimum=None, maximum=0, summary=0, count=0))

        for row in filtered_data:
            match = self.storage['regex'].search(row)
            if match:
                match = match.groupdict()
                if match['duration'] != '0':
                    get_timings(timings=timings['duration'], time=float(match['duration']) * 1000)
                try:
                    self.data[match['http_code'][0] + 'xx'] += 1
                except KeyError:
                    self.data['other'] += 1

                self.get_data_per_statuses(match['http_code'])
                self.get_data_per_squid_code(match['squid_code'])

                self.data['bytes'] += int(match['bytes'])

                proto = 'ipv4' if '.' in match['client_address'] else 'ipv6'
                # unique clients ips
                if self.configuration.get('all_time', True):
                    if address_not_in_pool(pool=self.storage['unique_all_time'],
                                           address=match['client_address'],
                                           pool_size=self.data['unique_tot_ipv4'] + self.data['unique_tot_ipv6']):
                        self.data['unique_tot_' + proto] += 1

                if match['client_address'] not in unique_ip:
                    self.data['unique_' + proto] += 1
                    unique_ip.add(match['client_address'])

                for key, values in self.storage['dynamic'].items():
                    if match[key] == '-':
                        continue
                    dimension_id = values['func_dim_id'](match[key]) if values['func_dim_id'] else match[key]
                    if dimension_id not in self.data:
                        dimension = values['func_dim'](match[key]) if values['func_dim'] else dimension_id
                        self.charts[values['chart']].add_dimension([dimension_id,
                                                                    dimension,
                                                                    'incremental'])
                        self.data[dimension_id] = 0
                    self.data[dimension_id] += 1
            else:
                self.data['unmatched'] += 1

        for elem in timings:
            self.data[elem + '_min'] += timings[elem]['minimum']
            self.data[elem + '_avg'] += timings[elem]['summary'] / timings[elem]['count']
            self.data[elem + '_max'] += timings[elem]['maximum']
        return self.data

    def get_data_per_statuses(self, code):
        """
        :param code: str: response status code. Ex.: '202', '499'
        :return:
        """
        code_class = code[0]
        if code_class == '2' or code == '304' or code_class == '1' or code == '000':
            self.data['successful_requests'] += 1
        elif code_class == '3':
            self.data['redirects'] += 1
        elif code_class == '4':
            self.data['bad_requests'] += 1
        elif code_class == '5' or code_class == '6':
            self.data['server_errors'] += 1
        else:
            self.data['other_requests'] += 1

    def get_data_per_squid_code(self, code):
        """
        :param code: str: squid response code. Ex.: 'TCP_MISS', 'TCP_MISS_ABORTED'
        :return:
        """
        if code not in self.data:
            self.charts['squid_code'].add_dimension([code, code, 'incremental'])
            self.data[code] = 0
        self.data[code] += 1

        for tag in code.split('_'):
            try:
                chart_key = SQUID_CODES[tag]
            except KeyError:
                continue
            dimension_id = '_'.join(['code_detailed', tag])
            if dimension_id not in self.data:
                self.charts[chart_key].add_dimension([dimension_id, tag, 'incremental'])
                self.data[dimension_id] = 0
            self.data[dimension_id] += 1


def get_timings(timings, time):
    """
    :param timings: dict with 'minimum', 'maximum', 'summary' and 'count' keys
    :param time: time value to accumulate
    :return:
    """
    if timings['minimum'] is None:
        timings['minimum'] = time
    if time > timings['maximum']:
        timings['maximum'] = time
    elif time < timings['minimum']:
        timings['minimum'] = time
    timings['summary'] += time
    timings['count'] += 1
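
# Example (illustrative only): feeding times 3, 1, 2 into one timings dict
# yields minimum=1, maximum=3, summary=6, count=3; get_data then reports
# avg as summary / count = 2.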


def get_hist(index, buckets, time):
    """
    :param index: histogram index (Ex. [10, 50, 100, 150, ...])
    :param buckets: histogram buckets
    :param time: time
    :return: None
    """
    for i in range(len(index) - 1, -1, -1):
        if time <= index[i]:
            buckets[i] += 1
        else:
            break
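
# Example (illustrative only): with index [10, 50, 100, maxint] and time=30,
# the loop walks right to left incrementing the maxint, 100 and 50 buckets,
# then breaks at 10 - i.e. cumulative "le" buckets, Prometheus-style.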


def address_not_in_pool(pool, address, pool_size):
    """
    :param pool: list of ip addresses
    :param address: ip address
    :param pool_size: current pool size
    :return: True if address not in pool. False otherwise.
    """
    index = bisect.bisect_left(pool, address)
    if index < pool_size:
        if pool[index] == address:
            return False
        bisect.insort_left(pool, address)
        return True
    bisect.insort_left(pool, address)
    return True
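
# The pool is kept sorted, so each lookup is an O(log n) bisect plus an O(n)
# list insert for new addresses - cheaper than rescanning the whole pool for
# every log line.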


def find_regex_return(match_dict=None, msg='Generic error message'):
    """
    :param match_dict: dict: re.search.groupdict() or None
    :param msg: str: error description
    :return: tuple:
    """
    return match_dict, msg


def check_patterns(string, dimension_regex_dict):
    """
    :param string: str:
    :param dimension_regex_dict: dict: ex. {'dim1': '<pattern1>', 'dim2': '<pattern2>'}
    :return: list of named tuples or None:
    We need to make sure all patterns are valid regular expressions
    """
    if not hasattr(dimension_regex_dict, 'keys'):
        return None

    result = list()

    def valid_pattern(pattern):
        """
        :param pattern: str
        :return: re.compile(pattern) or False
        """
        if not isinstance(pattern, str):
            return False
        try:
            return re.compile(pattern)
        except re.error:
            return False

    def func_search(pattern):
        def closure(v):
            return pattern.search(v)

        return closure

    for dimension, regex in dimension_regex_dict.items():
        valid = valid_pattern(regex)
        # check the compiled pattern, not the valid_pattern function itself
        if isinstance(dimension, str) and valid:
            func = func_search(valid)
            result.append(NAMED_PATTERN(description='_'.join([string, dimension]),
                                        func=func))
    return result or None
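
# Example (illustrative only):
#   check_patterns('url_pattern', {'static': r'/static/.+'})
# returns [PATTERN(description='url_pattern_static', func=<pattern.search>)],
# which is how the 'categories' option becomes requests_per_url dimensions.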


def filter_data(raw_data, pre_filter):
    """
    :param raw_data: list of log lines
    :param pre_filter: list of named tuples or None (see check_patterns)
    :return: iterable of lines that pass the include/exclude filters
    """
    if not pre_filter:
        return raw_data
    filtered = raw_data
    for elem in pre_filter:
        if elem.description == 'filter_include':
            filtered = filter(elem.func, filtered)
        elif elem.description == 'filter_exclude':
            filtered = filterfalse(elem.func, filtered)
    return filtered
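
# Example (sketch, python.d job option):
#   filter:
#     include: 'GET'
#     exclude: 'healthcheck'
# yields pre_filter entries named 'filter_include' and 'filter_exclude', so
# only lines matching 'GET' and not matching 'healthcheck' are counted.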