module.misc.pdf.php 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. <?php
  2. /////////////////////////////////////////////////////////////////
  3. /// getID3() by James Heinrich <info@getid3.org> //
  4. // available at https://github.com/JamesHeinrich/getID3 //
  5. // or https://www.getid3.org //
  6. // or http://getid3.sourceforge.net //
  7. // see readme.txt for more details //
  8. /////////////////////////////////////////////////////////////////
  9. // //
  10. // module.misc.pdf.php //
  11. // module for analyzing PDF files //
  12. // dependencies: NONE //
  13. // ///
  14. /////////////////////////////////////////////////////////////////
  // Stop right away unless the GETID3_INCLUDEPATH constant is defined:
  // prevents path-exposing attacks that access modules directly on public webservers.
  if (defined('GETID3_INCLUDEPATH') === false) {
      exit;
  }
  /**
   * getID3 module for analyzing PDF files.
   *
   * Reads the PDF version from the file header, locates the Cross-Reference Table (XREF)
   * through the "startxref" pointer near the end of the file, and walks the referenced
   * objects looking for the page count. Results are written to $this->getid3->info under
   * the 'pdf' key ('header', 'trailer', 'xref', 'pages') plus 'fileformat'.
   */
  class getid3_pdf extends getid3_handler
  {
      /**
       * When true, the per-object XREF details ('offset', 'generation', 'entry') are kept
       * in the result; when false they are removed once analysis is done.
       *
       * @var bool
       */
      public $returnXREF = false; // return full details of PDF Cross-Reference Table (XREF)

      /**
       * Analyze the file as a PDF.
       *
       * Problems found after the "%PDF-" header has been recognised are reported through
       * error()/warning() but do not change the return value.
       *
       * @return bool true when the "%PDF-x.y" header line was found, false otherwise
       */
      public function Analyze() {
          $info = &$this->getid3->info;

          $this->fseek(0);
          if (!preg_match('#^%PDF-([0-9\\.]+)$#', rtrim($this->fgets()), $matches)) {
              $this->error('Did not find "%PDF" at the beginning of the PDF');
              return false;
          }
          $info['pdf']['header']['version'] = floatval($matches[1]);
          $info['fileformat'] = 'pdf';

          // the PDF Cross-Reference Table (XREF) is located near the end of the file
          // the starting offset is specified in the penultimate section, on the two lines just before "%%EOF"
          // the first line is "startxref", the second line is the byte offset of the XREF.
          // We know the length of "%%EOF" and "startxref", but the offset could be 2-10 bytes,
          // and we're not sure if the line ends are one or two bytes, so we might find "startxref" as little as 18(?) bytes
          // from EOF, but it could be 30 bytes, so we start 40 bytes back just to be safe and do a search for the data we want.
          $this->fseek(-40, SEEK_END);
          if (preg_match('#[\r\n]startxref[ \r\n]+([0-9]+)[ \r\n]+#', $this->fread(40), $matches)) {
              $info['pdf']['trailer']['startxref'] = intval($matches[1]);
              $this->parseXREF($info['pdf']['trailer']['startxref']);
              if (!empty($info['pdf']['xref']['offset'])) {
                  $this->scanForAdditionalXREF();
                  $this->findPageCount();
                  if (!$this->returnXREF) {
                      unset($info['pdf']['xref']['offset'], $info['pdf']['xref']['generation'], $info['pdf']['xref']['entry']);
                  }
              } else {
                  $this->error('Did not find "xref" at offset '.$info['pdf']['trailer']['startxref']);
              }
          } else {
              $this->error('Did not find "startxref" in the last 40 bytes of the PDF');
          }

          $this->warning('PDF parsing incomplete in this version of getID3() ['.$this->getid3->version().']');
          return true;
      }

      /**
       * Brute-force scan for XREF tables that were not reached through "startxref".
       *
       * While the highest known object number exceeds the number of XREF entries counted
       * so far, the file is re-read line by line from the beginning and the first "xref"
       * line at a not-yet-recorded offset is handed to parseXREF(). Scanning ends when the
       * counts line up or the end of the file is reached.
       *
       * @return void
       */
      private function scanForAdditionalXREF() {
          $info = &$this->getid3->info;

          while (!$this->feof() && (max(array_keys($info['pdf']['xref']['offset'])) > $info['pdf']['xref']['count'])) {
              // suspect that there may be another XREF entry somewhere in the file, brute-force scan for it
              // starting at the beginning of the file
              $this->fseek(0);
              while (!$this->feof()) {
                  $XREFoffset = $this->ftell();
                  if (rtrim($this->fgets()) !== 'xref') {
                      continue;
                  }
                  if (empty($info['pdf']['xref']['xref_offsets']) || !in_array($XREFoffset, $info['pdf']['xref']['xref_offsets'])) {
                      $this->parseXREF($XREFoffset);
                      break;
                  }
              }
          }
      }

      /**
       * Walk the objects listed in the XREF data until one that looks like a page-tree
       * node with a "/Count" entry is found, and store that count in $info['pdf']['pages'].
       *
       * The walk stops at the first match, at the first object whose header line is not
       * "<number> <generation> obj", or at the first object that has no "endobj" line.
       *
       * @return void
       */
      private function findPageCount() {
          $info = &$this->getid3->info;

          foreach ($info['pdf']['xref']['offset'] as $objectNumber => $offset) {
              if ($info['pdf']['xref']['entry'][$objectNumber] === 'f') {
                  // "free" object means "deleted", ignore
                  continue;
              }

              $this->fseek($offset);
              $line = rtrim($this->fgets());
              if (!preg_match('#^'.$objectNumber.' ([0-9]+) obj#', $line, $matches)) {
                  $this->error('Unexpected structure "'.$line.'" at offset '.$offset);
                  break;
              }
              if (strlen($line) > strlen($matches[0])) {
                  // object header line not actually on its own line, rewind file pointer to start reading data
                  $this->fseek($offset + strlen($matches[0]));
              }

              $objectData = $this->readObjectData();
              if ($objectData === false) {
                  // truncated or malformed file: without this check the read loop would never terminate
                  $this->error('Did not find "endobj" for object '.$objectNumber.' starting at offset '.$offset);
                  break;
              }

              if (preg_match('#^<<[\r\n\s]*(/Type|/Pages|/Parent [0-9]+ [0-9]+ [A-Z]|/Count [0-9]+|/Kids *\\[[0-9A-Z ]+\\]|[\r\n\s])+[\r\n\s]*>>#', $objectData, $matches)) {
                  if (preg_match('#/Count ([0-9]+)#', $objectData, $matches)) {
                      $info['pdf']['pages'] = (int) $matches[1];
                      // for now this is the only data we're looking for in the PDF, so there is no need to loop
                      // through every object in the file (and a large PDF may contain MANY objects).
                      // It MAY be possible that there are other objects elsewhere in the file that define
                      // additional (or removed?) pages
                      break;
                  }
              }
          }
      }

      /**
       * Collect lines from the current file position up to (not including) the next line
       * that consists solely of "endobj".
       *
       * @return string|false the collected data, or false when the data ran out before "endobj" was seen
       */
      private function readObjectData() {
          $objectData = '';
          while (true) {
              $line = $this->fgets();
              // NOTE(review): assumes fgets() yields false or an empty string once nothing is left to read - confirm against getid3_handler
              if (($line === false) || (($line === '') && $this->feof())) {
                  return false;
              }
              if (rtrim($line) === 'endobj') {
                  return $objectData;
              }
              $objectData .= $line;
          }
      }

      /**
       * Parse the XREF table that starts at the given offset and merge its entries into
       * $info['pdf']['xref'] ('offset', 'generation', 'entry', 'count', 'xref_offsets').
       *
       * Only the subsection that immediately follows the "xref" line is read.
       *
       * @param int $XREFoffset byte offset of the line containing "xref"
       *
       * @return bool true when the table was parsed completely, false otherwise
       */
      private function parseXREF($XREFoffset) {
          $info = &$this->getid3->info;

          $this->fseek($XREFoffset);
          if (rtrim($this->fgets()) !== 'xref') {
              $this->warning('failed to find expected XREF structure at offset '.$XREFoffset);
              return false;
          }
          $info['pdf']['xref']['xref_offsets'][$XREFoffset] = $XREFoffset;

          // subsection header: "<first object number> <number of entries>"
          $subsectionHeader = trim($this->fgets());
          if (!preg_match('#^([0-9]+)\s+([0-9]+)#', $subsectionHeader, $matches)) {
              $this->warning('failed to parse XREF subsection header "'.$subsectionHeader.'" in XREF table at offset '.$XREFoffset);
              return false;
          }
          $firstObjectNumber = (int) $matches[1];
          $XREFcount         = (int) $matches[2];

          // running total across every XREF table parsed so far
          $info['pdf']['xref']['count'] = $XREFcount + (!empty($info['pdf']['xref']['count']) ? $info['pdf']['xref']['count'] : 0);

          for ($i = 0; $i < $XREFcount; $i++) {
              $line = rtrim($this->fgets());
              if (!preg_match('#^([0-9]+) ([0-9]+) ([nf])$#', $line, $matches)) {
                  $this->error('failed to parse XREF entry #'.$i.' in XREF table at offset '.$XREFoffset);
                  return false;
              }
              $objectNumber = $firstObjectNumber + $i;
              $info['pdf']['xref']['offset'][$objectNumber]     = (int) $matches[1];
              $info['pdf']['xref']['generation'][$objectNumber] = (int) $matches[2];
              $info['pdf']['xref']['entry'][$objectNumber]      = $matches[3];
          }

          sort($info['pdf']['xref']['xref_offsets']);
          return true;
      }
  }