PdfParser.py 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043
  1. import calendar
  2. import codecs
  3. import collections
  4. import mmap
  5. import os
  6. import re
  7. import time
  8. import zlib
  9. from ._util import py3
  10. try:
  11. from UserDict import UserDict # Python 2.x
  12. except ImportError:
  13. UserDict = collections.UserDict # Python 3.x
  14. if py3: # Python 3.x
  15. def make_bytes(s):
  16. return s.encode("us-ascii")
  17. else: # Python 2.x
  18. def make_bytes(s): # pragma: no cover
  19. return s # pragma: no cover
  20. # see 7.9.2.2 Text String Type on page 86 and D.3 PDFDocEncoding Character Set
  21. # on page 656
  22. def encode_text(s):
  23. return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
# Byte value => Unicode character overrides used by decode_text().
# Byte values that are not listed here are decoded by decode_text() as
# chr(byte), so only the exceptions to that fallback appear in this table.
PDFDocEncoding = {
    0x16: u"\u0017",
    0x18: u"\u02D8",
    0x19: u"\u02C7",
    0x1A: u"\u02C6",
    0x1B: u"\u02D9",
    0x1C: u"\u02DD",
    0x1D: u"\u02DB",
    0x1E: u"\u02DA",
    0x1F: u"\u02DC",
    0x80: u"\u2022",
    0x81: u"\u2020",
    0x82: u"\u2021",
    0x83: u"\u2026",
    0x84: u"\u2014",
    0x85: u"\u2013",
    0x86: u"\u0192",
    0x87: u"\u2044",
    0x88: u"\u2039",
    0x89: u"\u203A",
    0x8A: u"\u2212",
    0x8B: u"\u2030",
    0x8C: u"\u201E",
    0x8D: u"\u201C",
    0x8E: u"\u201D",
    0x8F: u"\u2018",
    0x90: u"\u2019",
    0x91: u"\u201A",
    0x92: u"\u2122",
    0x93: u"\uFB01",
    0x94: u"\uFB02",
    0x95: u"\u0141",
    0x96: u"\u0152",
    0x97: u"\u0160",
    0x98: u"\u0178",
    0x99: u"\u017D",
    0x9A: u"\u0131",
    0x9B: u"\u0142",
    0x9C: u"\u0153",
    0x9D: u"\u0161",
    0x9E: u"\u017E",
    0xA0: u"\u20AC",
}
  67. def decode_text(b):
  68. if b[: len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
  69. return b[len(codecs.BOM_UTF16_BE) :].decode("utf_16_be")
  70. elif py3: # Python 3.x
  71. return "".join(PDFDocEncoding.get(byte, chr(byte)) for byte in b)
  72. else: # Python 2.x
  73. return u"".join(PDFDocEncoding.get(ord(byte), byte) for byte in b)
  74. class PdfFormatError(RuntimeError):
  75. """An error that probably indicates a syntactic or semantic error in the
  76. PDF file structure"""
  77. pass
  78. def check_format_condition(condition, error_message):
  79. if not condition:
  80. raise PdfFormatError(error_message)
  81. class IndirectReference(
  82. collections.namedtuple("IndirectReferenceTuple", ["object_id", "generation"])
  83. ):
  84. def __str__(self):
  85. return "%s %s R" % self
  86. def __bytes__(self):
  87. return self.__str__().encode("us-ascii")
  88. def __eq__(self, other):
  89. return (
  90. other.__class__ is self.__class__
  91. and other.object_id == self.object_id
  92. and other.generation == self.generation
  93. )
  94. def __ne__(self, other):
  95. return not (self == other)
  96. def __hash__(self):
  97. return hash((self.object_id, self.generation))
  98. class IndirectObjectDef(IndirectReference):
  99. def __str__(self):
  100. return "%s %s obj" % self
  101. class XrefTable:
  102. def __init__(self):
  103. self.existing_entries = {} # object ID => (offset, generation)
  104. self.new_entries = {} # object ID => (offset, generation)
  105. self.deleted_entries = {0: 65536} # object ID => generation
  106. self.reading_finished = False
  107. def __setitem__(self, key, value):
  108. if self.reading_finished:
  109. self.new_entries[key] = value
  110. else:
  111. self.existing_entries[key] = value
  112. if key in self.deleted_entries:
  113. del self.deleted_entries[key]
  114. def __getitem__(self, key):
  115. try:
  116. return self.new_entries[key]
  117. except KeyError:
  118. return self.existing_entries[key]
  119. def __delitem__(self, key):
  120. if key in self.new_entries:
  121. generation = self.new_entries[key][1] + 1
  122. del self.new_entries[key]
  123. self.deleted_entries[key] = generation
  124. elif key in self.existing_entries:
  125. generation = self.existing_entries[key][1] + 1
  126. self.deleted_entries[key] = generation
  127. elif key in self.deleted_entries:
  128. generation = self.deleted_entries[key]
  129. else:
  130. raise IndexError(
  131. "object ID " + str(key) + " cannot be deleted because it doesn't exist"
  132. )
  133. def __contains__(self, key):
  134. return key in self.existing_entries or key in self.new_entries
  135. def __len__(self):
  136. return len(
  137. set(self.existing_entries.keys())
  138. | set(self.new_entries.keys())
  139. | set(self.deleted_entries.keys())
  140. )
  141. def keys(self):
  142. return (
  143. set(self.existing_entries.keys()) - set(self.deleted_entries.keys())
  144. ) | set(self.new_entries.keys())
  145. def write(self, f):
  146. keys = sorted(set(self.new_entries.keys()) | set(self.deleted_entries.keys()))
  147. deleted_keys = sorted(set(self.deleted_entries.keys()))
  148. startxref = f.tell()
  149. f.write(b"xref\n")
  150. while keys:
  151. # find a contiguous sequence of object IDs
  152. prev = None
  153. for index, key in enumerate(keys):
  154. if prev is None or prev + 1 == key:
  155. prev = key
  156. else:
  157. contiguous_keys = keys[:index]
  158. keys = keys[index:]
  159. break
  160. else:
  161. contiguous_keys = keys
  162. keys = None
  163. f.write(make_bytes("%d %d\n" % (contiguous_keys[0], len(contiguous_keys))))
  164. for object_id in contiguous_keys:
  165. if object_id in self.new_entries:
  166. f.write(make_bytes("%010d %05d n \n" % self.new_entries[object_id]))
  167. else:
  168. this_deleted_object_id = deleted_keys.pop(0)
  169. check_format_condition(
  170. object_id == this_deleted_object_id,
  171. "expected the next deleted object ID to be %s, instead found %s"
  172. % (object_id, this_deleted_object_id),
  173. )
  174. try:
  175. next_in_linked_list = deleted_keys[0]
  176. except IndexError:
  177. next_in_linked_list = 0
  178. f.write(
  179. make_bytes(
  180. "%010d %05d f \n"
  181. % (next_in_linked_list, self.deleted_entries[object_id])
  182. )
  183. )
  184. return startxref
  185. class PdfName:
  186. def __init__(self, name):
  187. if isinstance(name, PdfName):
  188. self.name = name.name
  189. elif isinstance(name, bytes):
  190. self.name = name
  191. else:
  192. self.name = name.encode("us-ascii")
  193. def name_as_str(self):
  194. return self.name.decode("us-ascii")
  195. def __eq__(self, other):
  196. return (
  197. isinstance(other, PdfName) and other.name == self.name
  198. ) or other == self.name
  199. def __hash__(self):
  200. return hash(self.name)
  201. def __repr__(self):
  202. return "PdfName(%s)" % repr(self.name)
  203. @classmethod
  204. def from_pdf_stream(cls, data):
  205. return cls(PdfParser.interpret_name(data))
  206. allowed_chars = set(range(33, 127)) - set(ord(c) for c in "#%/()<>[]{}")
  207. def __bytes__(self):
  208. result = bytearray(b"/")
  209. for b in self.name:
  210. if py3: # Python 3.x
  211. if b in self.allowed_chars:
  212. result.append(b)
  213. else:
  214. result.extend(make_bytes("#%02X" % b))
  215. else: # Python 2.x
  216. if ord(b) in self.allowed_chars:
  217. result.append(b)
  218. else:
  219. result.extend(b"#%02X" % ord(b))
  220. return bytes(result)
  221. __str__ = __bytes__
  222. class PdfArray(list):
  223. def __bytes__(self):
  224. return b"[ " + b" ".join(pdf_repr(x) for x in self) + b" ]"
  225. __str__ = __bytes__
  226. class PdfDict(UserDict):
  227. def __setattr__(self, key, value):
  228. if key == "data":
  229. if hasattr(UserDict, "__setattr__"):
  230. UserDict.__setattr__(self, key, value)
  231. else:
  232. self.__dict__[key] = value
  233. else:
  234. self[key.encode("us-ascii")] = value
  235. def __getattr__(self, key):
  236. try:
  237. value = self[key.encode("us-ascii")]
  238. except KeyError:
  239. raise AttributeError(key)
  240. if isinstance(value, bytes):
  241. value = decode_text(value)
  242. if key.endswith("Date"):
  243. if value.startswith("D:"):
  244. value = value[2:]
  245. relationship = "Z"
  246. if len(value) > 17:
  247. relationship = value[14]
  248. offset = int(value[15:17]) * 60
  249. if len(value) > 20:
  250. offset += int(value[18:20])
  251. format = "%Y%m%d%H%M%S"[: len(value) - 2]
  252. value = time.strptime(value[: len(format) + 2], format)
  253. if relationship in ["+", "-"]:
  254. offset *= 60
  255. if relationship == "+":
  256. offset *= -1
  257. value = time.gmtime(calendar.timegm(value) + offset)
  258. return value
  259. def __bytes__(self):
  260. out = bytearray(b"<<")
  261. for key, value in self.items():
  262. if value is None:
  263. continue
  264. value = pdf_repr(value)
  265. out.extend(b"\n")
  266. out.extend(bytes(PdfName(key)))
  267. out.extend(b" ")
  268. out.extend(value)
  269. out.extend(b"\n>>")
  270. return bytes(out)
  271. if not py3:
  272. __str__ = __bytes__
  273. class PdfBinary:
  274. def __init__(self, data):
  275. self.data = data
  276. if py3: # Python 3.x
  277. def __bytes__(self):
  278. return make_bytes("<%s>" % "".join("%02X" % b for b in self.data))
  279. else: # Python 2.x
  280. def __str__(self):
  281. return "<%s>" % "".join("%02X" % ord(b) for b in self.data)
  282. class PdfStream:
  283. def __init__(self, dictionary, buf):
  284. self.dictionary = dictionary
  285. self.buf = buf
  286. def decode(self):
  287. try:
  288. filter = self.dictionary.Filter
  289. except AttributeError:
  290. return self.buf
  291. if filter == b"FlateDecode":
  292. try:
  293. expected_length = self.dictionary.DL
  294. except AttributeError:
  295. expected_length = self.dictionary.Length
  296. return zlib.decompress(self.buf, bufsize=int(expected_length))
  297. else:
  298. raise NotImplementedError(
  299. "stream filter %s unknown/unsupported" % repr(self.dictionary.Filter)
  300. )
  301. def pdf_repr(x):
  302. if x is True:
  303. return b"true"
  304. elif x is False:
  305. return b"false"
  306. elif x is None:
  307. return b"null"
  308. elif isinstance(x, (PdfName, PdfDict, PdfArray, PdfBinary)):
  309. return bytes(x)
  310. elif isinstance(x, int):
  311. return str(x).encode("us-ascii")
  312. elif isinstance(x, time.struct_time):
  313. return b"(D:" + time.strftime("%Y%m%d%H%M%SZ", x).encode("us-ascii") + b")"
  314. elif isinstance(x, dict):
  315. return bytes(PdfDict(x))
  316. elif isinstance(x, list):
  317. return bytes(PdfArray(x))
  318. elif (py3 and isinstance(x, str)) or (
  319. not py3 and isinstance(x, unicode) # noqa: F821
  320. ):
  321. return pdf_repr(encode_text(x))
  322. elif isinstance(x, bytes):
  323. # XXX escape more chars? handle binary garbage
  324. x = x.replace(b"\\", b"\\\\")
  325. x = x.replace(b"(", b"\\(")
  326. x = x.replace(b")", b"\\)")
  327. return b"(" + x + b")"
  328. else:
  329. return bytes(x)
  330. class PdfParser:
  331. """Based on
  332. https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
  333. Supports PDF up to 1.4
  334. """
  335. def __init__(self, filename=None, f=None, buf=None, start_offset=0, mode="rb"):
  336. if buf and f:
  337. raise RuntimeError("specify buf or f or filename, but not both buf and f")
  338. self.filename = filename
  339. self.buf = buf
  340. self.f = f
  341. self.start_offset = start_offset
  342. self.should_close_buf = False
  343. self.should_close_file = False
  344. if filename is not None and f is None:
  345. self.f = f = open(filename, mode)
  346. self.should_close_file = True
  347. if f is not None:
  348. self.buf = buf = self.get_buf_from_file(f)
  349. self.should_close_buf = True
  350. if not filename and hasattr(f, "name"):
  351. self.filename = f.name
  352. self.cached_objects = {}
  353. if buf:
  354. self.read_pdf_info()
  355. else:
  356. self.file_size_total = self.file_size_this = 0
  357. self.root = PdfDict()
  358. self.root_ref = None
  359. self.info = PdfDict()
  360. self.info_ref = None
  361. self.page_tree_root = {}
  362. self.pages = []
  363. self.orig_pages = []
  364. self.pages_ref = None
  365. self.last_xref_section_offset = None
  366. self.trailer_dict = {}
  367. self.xref_table = XrefTable()
  368. self.xref_table.reading_finished = True
  369. if f:
  370. self.seek_end()
  371. def __enter__(self):
  372. return self
  373. def __exit__(self, exc_type, exc_value, traceback):
  374. self.close()
  375. return False # do not suppress exceptions
  376. def start_writing(self):
  377. self.close_buf()
  378. self.seek_end()
  379. def close_buf(self):
  380. try:
  381. self.buf.close()
  382. except AttributeError:
  383. pass
  384. self.buf = None
  385. def close(self):
  386. if self.should_close_buf:
  387. self.close_buf()
  388. if self.f is not None and self.should_close_file:
  389. self.f.close()
  390. self.f = None
  391. def seek_end(self):
  392. self.f.seek(0, os.SEEK_END)
  393. def write_header(self):
  394. self.f.write(b"%PDF-1.4\n")
  395. def write_comment(self, s):
  396. self.f.write(("%% %s\n" % (s,)).encode("utf-8"))
  397. def write_catalog(self):
  398. self.del_root()
  399. self.root_ref = self.next_object_id(self.f.tell())
  400. self.pages_ref = self.next_object_id(0)
  401. self.rewrite_pages()
  402. self.write_obj(self.root_ref, Type=PdfName(b"Catalog"), Pages=self.pages_ref)
  403. self.write_obj(
  404. self.pages_ref,
  405. Type=PdfName(b"Pages"),
  406. Count=len(self.pages),
  407. Kids=self.pages,
  408. )
  409. return self.root_ref
  410. def rewrite_pages(self):
  411. pages_tree_nodes_to_delete = []
  412. for i, page_ref in enumerate(self.orig_pages):
  413. page_info = self.cached_objects[page_ref]
  414. del self.xref_table[page_ref.object_id]
  415. pages_tree_nodes_to_delete.append(page_info[PdfName(b"Parent")])
  416. if page_ref not in self.pages:
  417. # the page has been deleted
  418. continue
  419. # make dict keys into strings for passing to write_page
  420. stringified_page_info = {}
  421. for key, value in page_info.items():
  422. # key should be a PdfName
  423. stringified_page_info[key.name_as_str()] = value
  424. stringified_page_info["Parent"] = self.pages_ref
  425. new_page_ref = self.write_page(None, **stringified_page_info)
  426. for j, cur_page_ref in enumerate(self.pages):
  427. if cur_page_ref == page_ref:
  428. # replace the page reference with the new one
  429. self.pages[j] = new_page_ref
  430. # delete redundant Pages tree nodes from xref table
  431. for pages_tree_node_ref in pages_tree_nodes_to_delete:
  432. while pages_tree_node_ref:
  433. pages_tree_node = self.cached_objects[pages_tree_node_ref]
  434. if pages_tree_node_ref.object_id in self.xref_table:
  435. del self.xref_table[pages_tree_node_ref.object_id]
  436. pages_tree_node_ref = pages_tree_node.get(b"Parent", None)
  437. self.orig_pages = []
  438. def write_xref_and_trailer(self, new_root_ref=None):
  439. if new_root_ref:
  440. self.del_root()
  441. self.root_ref = new_root_ref
  442. if self.info:
  443. self.info_ref = self.write_obj(None, self.info)
  444. start_xref = self.xref_table.write(self.f)
  445. num_entries = len(self.xref_table)
  446. trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
  447. if self.last_xref_section_offset is not None:
  448. trailer_dict[b"Prev"] = self.last_xref_section_offset
  449. if self.info:
  450. trailer_dict[b"Info"] = self.info_ref
  451. self.last_xref_section_offset = start_xref
  452. self.f.write(
  453. b"trailer\n"
  454. + bytes(PdfDict(trailer_dict))
  455. + make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref)
  456. )
  457. def write_page(self, ref, *objs, **dict_obj):
  458. if isinstance(ref, int):
  459. ref = self.pages[ref]
  460. if "Type" not in dict_obj:
  461. dict_obj["Type"] = PdfName(b"Page")
  462. if "Parent" not in dict_obj:
  463. dict_obj["Parent"] = self.pages_ref
  464. return self.write_obj(ref, *objs, **dict_obj)
  465. def write_obj(self, ref, *objs, **dict_obj):
  466. f = self.f
  467. if ref is None:
  468. ref = self.next_object_id(f.tell())
  469. else:
  470. self.xref_table[ref.object_id] = (f.tell(), ref.generation)
  471. f.write(bytes(IndirectObjectDef(*ref)))
  472. stream = dict_obj.pop("stream", None)
  473. if stream is not None:
  474. dict_obj["Length"] = len(stream)
  475. if dict_obj:
  476. f.write(pdf_repr(dict_obj))
  477. for obj in objs:
  478. f.write(pdf_repr(obj))
  479. if stream is not None:
  480. f.write(b"stream\n")
  481. f.write(stream)
  482. f.write(b"\nendstream\n")
  483. f.write(b"endobj\n")
  484. return ref
  485. def del_root(self):
  486. if self.root_ref is None:
  487. return
  488. del self.xref_table[self.root_ref.object_id]
  489. del self.xref_table[self.root[b"Pages"].object_id]
  490. @staticmethod
  491. def get_buf_from_file(f):
  492. if hasattr(f, "getbuffer"):
  493. return f.getbuffer()
  494. elif hasattr(f, "getvalue"):
  495. return f.getvalue()
  496. else:
  497. try:
  498. return mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
  499. except ValueError: # cannot mmap an empty file
  500. return b""
  501. def read_pdf_info(self):
  502. self.file_size_total = len(self.buf)
  503. self.file_size_this = self.file_size_total - self.start_offset
  504. self.read_trailer()
  505. self.root_ref = self.trailer_dict[b"Root"]
  506. self.info_ref = self.trailer_dict.get(b"Info", None)
  507. self.root = PdfDict(self.read_indirect(self.root_ref))
  508. if self.info_ref is None:
  509. self.info = PdfDict()
  510. else:
  511. self.info = PdfDict(self.read_indirect(self.info_ref))
  512. check_format_condition(b"Type" in self.root, "/Type missing in Root")
  513. check_format_condition(
  514. self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog"
  515. )
  516. check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
  517. check_format_condition(
  518. isinstance(self.root[b"Pages"], IndirectReference),
  519. "/Pages in Root is not an indirect reference",
  520. )
  521. self.pages_ref = self.root[b"Pages"]
  522. self.page_tree_root = self.read_indirect(self.pages_ref)
  523. self.pages = self.linearize_page_tree(self.page_tree_root)
  524. # save the original list of page references
  525. # in case the user modifies, adds or deletes some pages
  526. # and we need to rewrite the pages and their list
  527. self.orig_pages = self.pages[:]
  528. def next_object_id(self, offset=None):
  529. try:
  530. # TODO: support reuse of deleted objects
  531. reference = IndirectReference(max(self.xref_table.keys()) + 1, 0)
  532. except ValueError:
  533. reference = IndirectReference(1, 0)
  534. if offset is not None:
  535. self.xref_table[reference.object_id] = (offset, 0)
  536. return reference
    # Building blocks for the byte regular expressions of this class.
    # The octal escapes \000 \011 \012 \014 \015 \040 are NUL, TAB, LF, FF,
    # CR and SPACE.
    delimiter = br"[][()<>{}/%]"
    delimiter_or_ws = br"[][()<>{}/%\000\011\012\014\015\040]"
    whitespace = br"[\000\011\012\014\015\040]"
    whitespace_or_hex = br"[\000\011\012\014\015\0400-9a-fA-F]"
    whitespace_optional = whitespace + b"*"
    whitespace_mandatory = whitespace + b"+"
    newline_only = br"[\r\n]+"
    newline = whitespace_optional + newline_only + whitespace_optional
    # Trailer anchored at the end of the data ("$").
    # group(1): trailer dictionary content up to and including the closing ">>"
    #           (greedy match)
    # group(2): the digits following "startxref"
    re_trailer_end = re.compile(
        whitespace_mandatory
        + br"trailer"
        + whitespace_optional
        + br"\<\<(.*\>\>)"
        + newline
        + br"startxref"
        + newline
        + br"([0-9]+)"
        + newline
        + br"%%EOF"
        + whitespace_optional
        + br"$",
        re.DOTALL,
    )
    # Same groups as re_trailer_end, but with a non-greedy dictionary match
    # and without the end-of-data anchor.
    re_trailer_prev = re.compile(
        whitespace_optional
        + br"trailer"
        + whitespace_optional
        + br"\<\<(.*?\>\>)"
        + newline
        + br"startxref"
        + newline
        + br"([0-9]+)"
        + newline
        + br"%%EOF"
        + whitespace_optional,
        re.DOTALL,
    )
  574. def read_trailer(self):
  575. search_start_offset = len(self.buf) - 16384
  576. if search_start_offset < self.start_offset:
  577. search_start_offset = self.start_offset
  578. m = self.re_trailer_end.search(self.buf, search_start_offset)
  579. check_format_condition(m, "trailer end not found")
  580. # make sure we found the LAST trailer
  581. last_match = m
  582. while m:
  583. last_match = m
  584. m = self.re_trailer_end.search(self.buf, m.start() + 16)
  585. if not m:
  586. m = last_match
  587. trailer_data = m.group(1)
  588. self.last_xref_section_offset = int(m.group(2))
  589. self.trailer_dict = self.interpret_trailer(trailer_data)
  590. self.xref_table = XrefTable()
  591. self.read_xref_table(xref_section_offset=self.last_xref_section_offset)
  592. if b"Prev" in self.trailer_dict:
  593. self.read_prev_trailer(self.trailer_dict[b"Prev"])
  594. def read_prev_trailer(self, xref_section_offset):
  595. trailer_offset = self.read_xref_table(xref_section_offset=xref_section_offset)
  596. m = self.re_trailer_prev.search(
  597. self.buf[trailer_offset : trailer_offset + 16384]
  598. )
  599. check_format_condition(m, "previous trailer not found")
  600. trailer_data = m.group(1)
  601. check_format_condition(
  602. int(m.group(2)) == xref_section_offset,
  603. "xref section offset in previous trailer doesn't match what was expected",
  604. )
  605. trailer_dict = self.interpret_trailer(trailer_data)
  606. if b"Prev" in trailer_dict:
  607. self.read_prev_trailer(trailer_dict[b"Prev"])
    re_whitespace_optional = re.compile(whitespace_optional)
    # A name token: "/" followed by name characters, which must be followed
    # by a delimiter or whitespace.  group(1) is the name without the "/".
    re_name = re.compile(
        whitespace_optional
        + br"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?="
        + delimiter_or_ws
        + br")"
    )
    # "<<" and ">>" with surrounding optional whitespace
    re_dict_start = re.compile(whitespace_optional + br"\<\<")
    re_dict_end = re.compile(whitespace_optional + br"\>\>" + whitespace_optional)
  617. @classmethod
  618. def interpret_trailer(cls, trailer_data):
  619. trailer = {}
  620. offset = 0
  621. while True:
  622. m = cls.re_name.match(trailer_data, offset)
  623. if not m:
  624. m = cls.re_dict_end.match(trailer_data, offset)
  625. check_format_condition(
  626. m and m.end() == len(trailer_data),
  627. "name not found in trailer, remaining data: "
  628. + repr(trailer_data[offset:]),
  629. )
  630. break
  631. key = cls.interpret_name(m.group(1))
  632. value, offset = cls.get_value(trailer_data, m.end())
  633. trailer[key] = value
  634. check_format_condition(
  635. b"Size" in trailer and isinstance(trailer[b"Size"], int),
  636. "/Size not in trailer or not an integer",
  637. )
  638. check_format_condition(
  639. b"Root" in trailer and isinstance(trailer[b"Root"], IndirectReference),
  640. "/Root not in trailer or not an indirect reference",
  641. )
  642. return trailer
    # One chunk of a raw name: group(1) is a run without "#", optionally
    # followed by a "#XX" escape whose two hex digits are group(3).
    re_hashes_in_name = re.compile(br"([^#]*)(#([0-9a-fA-F]{2}))?")
  644. @classmethod
  645. def interpret_name(cls, raw, as_text=False):
  646. name = b""
  647. for m in cls.re_hashes_in_name.finditer(raw):
  648. if m.group(3):
  649. name += m.group(1) + bytearray.fromhex(m.group(3).decode("us-ascii"))
  650. else:
  651. name += m.group(1)
  652. if as_text:
  653. return name.decode("utf-8")
  654. else:
  655. return bytes(name)
    # Token patterns used by get_value().  Each allows leading whitespace;
    # the keyword and number patterns require a following delimiter or
    # whitespace (checked with a lookahead, so it is not consumed).
    re_null = re.compile(whitespace_optional + br"null(?=" + delimiter_or_ws + br")")
    re_true = re.compile(whitespace_optional + br"true(?=" + delimiter_or_ws + br")")
    re_false = re.compile(whitespace_optional + br"false(?=" + delimiter_or_ws + br")")
    # group(1): optionally signed integer
    re_int = re.compile(
        whitespace_optional + br"([-+]?[0-9]+)(?=" + delimiter_or_ws + br")"
    )
    # group(1): optionally signed number containing a decimal point
    re_real = re.compile(
        whitespace_optional
        + br"([-+]?([0-9]+\.[0-9]*|[0-9]*\.[0-9]+))(?="
        + delimiter_or_ws
        + br")"
    )
    re_array_start = re.compile(whitespace_optional + br"\[")
    re_array_end = re.compile(whitespace_optional + br"]")
    # group(1): hex digits and whitespace between "<" and ">"
    re_string_hex = re.compile(
        whitespace_optional + br"\<(" + whitespace_or_hex + br"*)\>"
    )
    # only the opening "(" — the rest is handled by get_literal_string()
    re_string_lit = re.compile(whitespace_optional + br"\(")
    # "<id> <gen> R": group(1) is the object ID, group(2) the generation
    re_indirect_reference = re.compile(
        whitespace_optional
        + br"([-+]?[0-9]+)"
        + whitespace_mandatory
        + br"([-+]?[0-9]+)"
        + whitespace_mandatory
        + br"R(?="
        + delimiter_or_ws
        + br")"
    )
    # "<id> <gen> obj": group(1) is the object ID, group(2) the generation
    re_indirect_def_start = re.compile(
        whitespace_optional
        + br"([-+]?[0-9]+)"
        + whitespace_mandatory
        + br"([-+]?[0-9]+)"
        + whitespace_mandatory
        + br"obj(?="
        + delimiter_or_ws
        + br")"
    )
    re_indirect_def_end = re.compile(
        whitespace_optional + br"endobj(?=" + delimiter_or_ws + br")"
    )
    # zero or more "%..." comment lines; this pattern also matches the empty
    # string
    re_comment = re.compile(
        br"(" + whitespace_optional + br"%[^\r\n]*" + newline + br")*"
    )
    re_stream_start = re.compile(whitespace_optional + br"stream\r?\n")
    re_stream_end = re.compile(
        whitespace_optional + br"endstream(?=" + delimiter_or_ws + br")"
    )
  704. @classmethod
  705. def get_value(cls, data, offset, expect_indirect=None, max_nesting=-1):
  706. if max_nesting == 0:
  707. return None, None
  708. m = cls.re_comment.match(data, offset)
  709. if m:
  710. offset = m.end()
  711. m = cls.re_indirect_def_start.match(data, offset)
  712. if m:
  713. check_format_condition(
  714. int(m.group(1)) > 0,
  715. "indirect object definition: object ID must be greater than 0",
  716. )
  717. check_format_condition(
  718. int(m.group(2)) >= 0,
  719. "indirect object definition: generation must be non-negative",
  720. )
  721. check_format_condition(
  722. expect_indirect is None
  723. or expect_indirect
  724. == IndirectReference(int(m.group(1)), int(m.group(2))),
  725. "indirect object definition different than expected",
  726. )
  727. object, offset = cls.get_value(data, m.end(), max_nesting=max_nesting - 1)
  728. if offset is None:
  729. return object, None
  730. m = cls.re_indirect_def_end.match(data, offset)
  731. check_format_condition(m, "indirect object definition end not found")
  732. return object, m.end()
  733. check_format_condition(
  734. not expect_indirect, "indirect object definition not found"
  735. )
  736. m = cls.re_indirect_reference.match(data, offset)
  737. if m:
  738. check_format_condition(
  739. int(m.group(1)) > 0,
  740. "indirect object reference: object ID must be greater than 0",
  741. )
  742. check_format_condition(
  743. int(m.group(2)) >= 0,
  744. "indirect object reference: generation must be non-negative",
  745. )
  746. return IndirectReference(int(m.group(1)), int(m.group(2))), m.end()
  747. m = cls.re_dict_start.match(data, offset)
  748. if m:
  749. offset = m.end()
  750. result = {}
  751. m = cls.re_dict_end.match(data, offset)
  752. while not m:
  753. key, offset = cls.get_value(data, offset, max_nesting=max_nesting - 1)
  754. if offset is None:
  755. return result, None
  756. value, offset = cls.get_value(data, offset, max_nesting=max_nesting - 1)
  757. result[key] = value
  758. if offset is None:
  759. return result, None
  760. m = cls.re_dict_end.match(data, offset)
  761. offset = m.end()
  762. m = cls.re_stream_start.match(data, offset)
  763. if m:
  764. try:
  765. stream_len = int(result[b"Length"])
  766. except (TypeError, KeyError, ValueError):
  767. raise PdfFormatError(
  768. "bad or missing Length in stream dict (%r)"
  769. % result.get(b"Length", None)
  770. )
  771. stream_data = data[m.end() : m.end() + stream_len]
  772. m = cls.re_stream_end.match(data, m.end() + stream_len)
  773. check_format_condition(m, "stream end not found")
  774. offset = m.end()
  775. result = PdfStream(PdfDict(result), stream_data)
  776. else:
  777. result = PdfDict(result)
  778. return result, offset
  779. m = cls.re_array_start.match(data, offset)
  780. if m:
  781. offset = m.end()
  782. result = []
  783. m = cls.re_array_end.match(data, offset)
  784. while not m:
  785. value, offset = cls.get_value(data, offset, max_nesting=max_nesting - 1)
  786. result.append(value)
  787. if offset is None:
  788. return result, None
  789. m = cls.re_array_end.match(data, offset)
  790. return result, m.end()
  791. m = cls.re_null.match(data, offset)
  792. if m:
  793. return None, m.end()
  794. m = cls.re_true.match(data, offset)
  795. if m:
  796. return True, m.end()
  797. m = cls.re_false.match(data, offset)
  798. if m:
  799. return False, m.end()
  800. m = cls.re_name.match(data, offset)
  801. if m:
  802. return PdfName(cls.interpret_name(m.group(1))), m.end()
  803. m = cls.re_int.match(data, offset)
  804. if m:
  805. return int(m.group(1)), m.end()
  806. m = cls.re_real.match(data, offset)
  807. if m:
  808. # XXX Decimal instead of float???
  809. return float(m.group(1)), m.end()
  810. m = cls.re_string_hex.match(data, offset)
  811. if m:
  812. # filter out whitespace
  813. hex_string = bytearray(
  814. [b for b in m.group(1) if b in b"0123456789abcdefABCDEF"]
  815. )
  816. if len(hex_string) % 2 == 1:
  817. # append a 0 if the length is not even - yes, at the end
  818. hex_string.append(ord(b"0"))
  819. return bytearray.fromhex(hex_string.decode("us-ascii")), m.end()
  820. m = cls.re_string_lit.match(data, offset)
  821. if m:
  822. return cls.get_literal_string(data, m.end())
  823. # return None, offset # fallback (only for debugging)
  824. raise PdfFormatError("unrecognized object: " + repr(data[offset : offset + 32]))
# Tokenizer for the body of a PDF literal string; get_literal_string()
# dispatches on which capture group matched:
#   1: backslash followed by one of  n r t b f ( ) \
#   2: backslash followed by one to three digits (numeric escape)
#   3: backslash followed by an end-of-line marker (group 4 is the marker)
#   5: a bare end-of-line marker (\r\n, \r or \n)
#   6: an unescaped "("
#   7: an unescaped ")"
# NOTE(review): group 2 accepts the digits 8 and 9, but get_literal_string()
# converts the digits with int(..., 8), which raises ValueError for them.
re_lit_str_token = re.compile(
    br"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))"
)
# Maps the character that follows a backslash in a literal string to the
# byte(s) it stands for.  Every entry appears twice: once keyed by a
# one-byte bytes object and once keyed by the integer byte value.
# get_literal_string() looks entries up with m.group(1)[1]; indexing bytes
# on Python 3 yields an int, so the ord(...) keys are the ones hit there.
# The bytes keys are presumably kept for callers/runtimes where indexing
# yields a length-1 string -- TODO confirm before removing them.
escaped_chars = {
    b"n": b"\n",
    b"r": b"\r",
    b"t": b"\t",
    b"b": b"\b",
    b"f": b"\f",
    b"(": b"(",
    b")": b")",
    b"\\": b"\\",
    ord(b"n"): b"\n",
    ord(b"r"): b"\r",
    ord(b"t"): b"\t",
    ord(b"b"): b"\b",
    ord(b"f"): b"\f",
    ord(b"("): b"(",
    ord(b")"): b")",
    ord(b"\\"): b"\\",
}
  846. @classmethod
  847. def get_literal_string(cls, data, offset):
  848. nesting_depth = 0
  849. result = bytearray()
  850. for m in cls.re_lit_str_token.finditer(data, offset):
  851. result.extend(data[offset : m.start()])
  852. if m.group(1):
  853. result.extend(cls.escaped_chars[m.group(1)[1]])
  854. elif m.group(2):
  855. result.append(int(m.group(2)[1:], 8))
  856. elif m.group(3):
  857. pass
  858. elif m.group(5):
  859. result.extend(b"\n")
  860. elif m.group(6):
  861. result.extend(b"(")
  862. nesting_depth += 1
  863. elif m.group(7):
  864. if nesting_depth == 0:
  865. return bytes(result), m.end()
  866. result.extend(b")")
  867. nesting_depth -= 1
  868. offset = m.end()
  869. raise PdfFormatError("unfinished literal string")
# Patterns used by read_xref_table().  The fragments whitespace_optional,
# whitespace_mandatory, newline and newline_only are defined elsewhere in
# this file.

# The "xref" keyword that opens a cross-reference section.
re_xref_section_start = re.compile(whitespace_optional + br"xref" + newline)
# Subsection header: two decimal numbers, read by read_xref_table() as the
# first object ID (group 1) and the number of entries (group 2).
re_xref_subsection_start = re.compile(
    whitespace_optional
    + br"([0-9]+)"
    + whitespace_mandatory
    + br"([0-9]+)"
    + whitespace_optional
    + newline_only
)
# One fixed-width entry: a 10-digit field (group 1, used as the byte
# offset), a 5-digit field (group 2, used as the generation), the flag
# "f" or "n" (group 3; "f" marks a free entry) and a two-byte line ending.
re_xref_entry = re.compile(br"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)")
  880. def read_xref_table(self, xref_section_offset):
  881. subsection_found = False
  882. m = self.re_xref_section_start.match(
  883. self.buf, xref_section_offset + self.start_offset
  884. )
  885. check_format_condition(m, "xref section start not found")
  886. offset = m.end()
  887. while True:
  888. m = self.re_xref_subsection_start.match(self.buf, offset)
  889. if not m:
  890. check_format_condition(
  891. subsection_found, "xref subsection start not found"
  892. )
  893. break
  894. subsection_found = True
  895. offset = m.end()
  896. first_object = int(m.group(1))
  897. num_objects = int(m.group(2))
  898. for i in range(first_object, first_object + num_objects):
  899. m = self.re_xref_entry.match(self.buf, offset)
  900. check_format_condition(m, "xref entry not found")
  901. offset = m.end()
  902. is_free = m.group(3) == b"f"
  903. generation = int(m.group(2))
  904. if not is_free:
  905. new_entry = (int(m.group(1)), generation)
  906. check_format_condition(
  907. i not in self.xref_table or self.xref_table[i] == new_entry,
  908. "xref entry duplicated (and not identical)",
  909. )
  910. self.xref_table[i] = new_entry
  911. return offset
  912. def read_indirect(self, ref, max_nesting=-1):
  913. offset, generation = self.xref_table[ref[0]]
  914. check_format_condition(
  915. generation == ref[1],
  916. "expected to find generation %s for object ID %s in xref table, "
  917. "instead found generation %s at offset %s"
  918. % (ref[1], ref[0], generation, offset),
  919. )
  920. value = self.get_value(
  921. self.buf,
  922. offset + self.start_offset,
  923. expect_indirect=IndirectReference(*ref),
  924. max_nesting=max_nesting,
  925. )[0]
  926. self.cached_objects[ref] = value
  927. return value
  928. def linearize_page_tree(self, node=None):
  929. if node is None:
  930. node = self.page_tree_root
  931. check_format_condition(
  932. node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages"
  933. )
  934. pages = []
  935. for kid in node[b"Kids"]:
  936. kid_object = self.read_indirect(kid)
  937. if kid_object[b"Type"] == b"Page":
  938. pages.append(kid)
  939. else:
  940. pages.extend(self.linearize_page_tree(node=kid_object))
  941. return pages