# Scanners.py
  1. # cython: auto_pickle=False
  2. #=======================================================================
  3. #
  4. # Python Lexical Analyser
  5. #
  6. #
  7. # Scanning an input stream
  8. #
  9. #=======================================================================
from __future__ import absolute_import

import cython
# Pre-declare module-level names as generic objects for Cython compilation.
cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)

from . import Errors
from .Regexps import BOL, EOL, EOF

# Unique sentinel returned by dict.get() misses in the state machine;
# distinct from every legitimate transition value (including None).
NOT_FOUND = object()
class Scanner(object):
    """
    A Scanner is used to read tokens from a stream of characters
    using the token set specified by a Plex.Lexicon.

    Constructor:

      Scanner(lexicon, stream, name = '')

        See the docstring of the __init__ method for details.

    Methods:

      See the docstrings of the individual methods for more
      information.

      read() --> (value, text)
        Reads the next lexical token from the stream.

      position() --> (name, line, col)
        Returns the position of the last token read using the
        read() method.

      begin(state_name)
        Causes scanner to change state.

      produce(value [, text])
        Causes return of a token value to the caller of the
        Scanner.
    """

    #  lexicon = None          # Lexicon
    #  stream = None           # file-like object
    #  name = ''
    #  buffer = ''
    #  buf_start_pos = 0       # position in input of start of buffer
    #  next_pos = 0            # position in input of next char to read
    #  cur_pos = 0             # position in input of current char
    #  cur_line = 1            # line number of current char
    #  cur_line_start = 0      # position in input of start of current line
    #  start_pos = 0           # position in input of start of token
    #  start_line = 0          # line number of start of token
    #  start_col = 0           # position in line of start of token
    #  text = None             # text of last token read
    #  initial_state = None    # Node
    #  state_name = ''         # Name of initial state
    #  queue = None            # list of tokens to be returned
    #  trace = 0

    def __init__(self, lexicon, stream, name='', initial_pos=None):
        """
        Scanner(lexicon, stream, name = '')

          |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
          to be recognised.

          |stream| can be a file object or anything which implements a
          compatible read() method.

          |name| is optional, and may be the name of the file being
          scanned or any other identifying string.
        """
        self.trace = 0

        self.buffer = u''
        self.buf_start_pos = 0
        self.next_pos = 0
        self.cur_pos = 0
        self.cur_line = 1
        self.start_pos = 0
        self.start_line = 0
        self.start_col = 0
        self.text = None
        self.state_name = None

        self.lexicon = lexicon
        self.stream = stream
        self.name = name
        self.queue = []
        self.initial_state = None
        self.begin('')
        self.next_pos = 0
        self.cur_pos = 0
        self.cur_line_start = 0
        self.cur_char = BOL
        self.input_state = 1
        if initial_pos is not None:
            # Negating the initial column makes start_col come out right for
            # the first token: start_col = cur_pos - cur_line_start
            #                            = 0 - (-initial_pos[2]) = initial_pos[2]
            # (see scan_a_token).  |initial_pos| is (name, line, col).
            self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]

    def read(self):
        """
        Read the next lexical token from the stream and return a
        tuple (value, text), where |value| is the value associated with
        the token as specified by the Lexicon, and |text| is the actual
        string read from the stream. Returns (None, '') on end of file.
        """
        queue = self.queue
        while not queue:
            self.text, action = self.scan_a_token()
            if action is None:
                # End of file: queue the (None, '') token, then give
                # subclasses a hook via eof().
                self.produce(None)
                self.eof()
            else:
                # Actions may call produce() themselves (possibly several
                # times); only queue the return value if there is one.
                value = action.perform(self, self.text)
                if value is not None:
                    self.produce(value)
        result = queue[0]
        del queue[0]
        return result

    def scan_a_token(self):
        """
        Read the next input sequence recognised by the machine
        and return (text, action). Returns ('', None) on end of
        file.
        """
        self.start_pos = self.cur_pos
        self.start_line = self.cur_line
        self.start_col = self.cur_pos - self.cur_line_start
        action = self.run_machine_inlined()
        if action is not None:
            if self.trace:
                print("Scanner: read: Performing %s %d:%d" % (
                    action, self.start_pos, self.cur_pos))
            text = self.buffer[
                self.start_pos - self.buf_start_pos:
                self.cur_pos - self.buf_start_pos]
            return (text, action)
        else:
            # The machine matched nothing.  If we are sitting on a synthetic
            # EOL marker, step past it and re-check for end of input.
            if self.cur_pos == self.start_pos:
                if self.cur_char is EOL:
                    self.next_char()
                if self.cur_char is None or self.cur_char is EOF:
                    return (u'', None)
            raise Errors.UnrecognizedInput(self, self.state_name)

    def run_machine_inlined(self):
        """
        Inlined version of run_machine for speed.

        Runs the DFA from self.initial_state over the buffered input,
        remembering the most recent accepting state in the b_* variables
        and backing up to it when the machine blocks.  Returns the action
        of the last accepting state reached, or None if none was reached.
        """
        state = self.initial_state
        # Copy scanner position state into locals for speed inside the loop.
        cur_pos = self.cur_pos
        cur_line = self.cur_line
        cur_line_start = self.cur_line_start
        cur_char = self.cur_char
        input_state = self.input_state
        next_pos = self.next_pos
        buffer = self.buffer
        buf_start_pos = self.buf_start_pos
        buf_len = len(buffer)
        # b_* hold a snapshot of the machine at the last accepting state,
        # so we can back up to it when a longer match fails.
        b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
            None, 0, 0, 0, u'', 0, 0
        trace = self.trace
        while 1:
            if trace:  #TRACE#
                print("State %d, %d/%d:%s -->" % (  #TRACE#
                    state['number'], input_state, cur_pos, repr(cur_char)))  #TRACE#
            # Begin inlined self.save_for_backup()
            #action = state.action  #@slow
            action = state['action']  #@fast
            if action is not None:
                b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
                    action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
            # End inlined self.save_for_backup()
            c = cur_char
            #new_state = state.new_state(c)  #@slow
            new_state = state.get(c, NOT_FOUND)  #@fast
            if new_state is NOT_FOUND:  #@fast
                # Fall back to the catch-all transition, but not at end of
                # input (c == u'' is falsy).
                new_state = c and state.get('else')  #@fast
            if new_state:
                if trace:  #TRACE#
                    print("State %d" % new_state['number'])  #TRACE#
                state = new_state
                # Begin inlined: self.next_char()
                # input_state protocol (see also next_char()):
                #   1: normal reading; 2: '\n' seen, EOL was reported;
                #   3: report the '\n' itself, then BOL; 4: out of data,
                #   EOL was reported; 5: after EOF, report u'' forever.
                if input_state == 1:
                    cur_pos = next_pos
                    # Begin inlined: c = self.read_char()
                    buf_index = next_pos - buf_start_pos
                    if buf_index < buf_len:
                        c = buffer[buf_index]
                        next_pos += 1
                    else:
                        # Buffer exhausted: drop everything before the start
                        # of the current token and refill in 4 KiB chunks.
                        discard = self.start_pos - buf_start_pos
                        data = self.stream.read(0x1000)
                        buffer = self.buffer[discard:] + data
                        self.buffer = buffer
                        buf_start_pos += discard
                        self.buf_start_pos = buf_start_pos
                        buf_len = len(buffer)
                        buf_index -= discard
                        if data:
                            c = buffer[buf_index]
                            next_pos += 1
                        else:
                            c = u''
                    # End inlined: c = self.read_char()
                    if c == u'\n':
                        cur_char = EOL
                        input_state = 2
                    elif not c:
                        cur_char = EOL
                        input_state = 4
                    else:
                        cur_char = c
                elif input_state == 2:
                    cur_char = u'\n'
                    input_state = 3
                elif input_state == 3:
                    cur_line += 1
                    cur_line_start = cur_pos = next_pos
                    cur_char = BOL
                    input_state = 1
                elif input_state == 4:
                    cur_char = EOF
                    input_state = 5
                else:  # input_state = 5
                    cur_char = u''
                # End inlined self.next_char()
            else:  # not new_state
                if trace:  #TRACE#
                    print("blocked")  #TRACE#
                # Begin inlined: action = self.back_up()
                if b_action is not None:
                    (action, cur_pos, cur_line, cur_line_start,
                     cur_char, input_state, next_pos) = \
                        (b_action, b_cur_pos, b_cur_line, b_cur_line_start,
                         b_cur_char, b_input_state, b_next_pos)
                else:
                    action = None
                break  # while 1
            # End inlined: action = self.back_up()
        # Write the (possibly backed-up) position state back to the scanner.
        self.cur_pos = cur_pos
        self.cur_line = cur_line
        self.cur_line_start = cur_line_start
        self.cur_char = cur_char
        self.input_state = input_state
        self.next_pos = next_pos
        if trace:  #TRACE#
            if action is not None:  #TRACE#
                print("Doing %s" % action)  #TRACE#
        return action

    def next_char(self):
        # Advance to the next input character, translating physical input
        # into the BOL / '\n' / EOL / EOF marker sequence.  Non-inlined
        # twin of the code inside run_machine_inlined(); the input_state
        # values have the same meaning in both.
        input_state = self.input_state
        if self.trace:
            print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos))
        if input_state == 1:
            self.cur_pos = self.next_pos
            c = self.read_char()
            if c == u'\n':
                self.cur_char = EOL
                self.input_state = 2
            elif not c:
                self.cur_char = EOL
                self.input_state = 4
            else:
                self.cur_char = c
        elif input_state == 2:
            self.cur_char = u'\n'
            self.input_state = 3
        elif input_state == 3:
            self.cur_line += 1
            self.cur_line_start = self.cur_pos = self.next_pos
            self.cur_char = BOL
            self.input_state = 1
        elif input_state == 4:
            self.cur_char = EOF
            self.input_state = 5
        else:  # input_state = 5
            self.cur_char = u''
        if self.trace:
            print("--> [%d] %d %r" % (input_state, self.cur_pos, self.cur_char))

    def position(self):
        """
        Return a tuple (name, line, col) representing the location of
        the last token read using the read() method. |name| is the
        name that was provided to the Scanner constructor; |line|
        is the line number in the stream (1-based); |col| is the
        position within the line of the first character of the token
        (0-based).
        """
        return (self.name, self.start_line, self.start_col)

    def get_position(self):
        """Python accessible wrapper around position(), only for error reporting.
        """
        return self.position()

    def begin(self, state_name):
        """Set the current state of the scanner to the named state."""
        self.initial_state = (
            self.lexicon.get_initial_state(state_name))
        self.state_name = state_name

    def produce(self, value, text=None):
        """
        Called from an action procedure, causes |value| to be returned
        as the token value from read(). If |text| is supplied, it is
        returned in place of the scanned text.

        produce() can be called more than once during a single call to an action
        procedure, in which case the tokens are queued up and returned one
        at a time by subsequent calls to read(), until the queue is empty,
        whereupon scanning resumes.
        """
        if text is None:
            text = self.text
        self.queue.append((value, text))

    def eof(self):
        """
        Override this method if you want something to be done at
        end of file.
        """