| 1 |
lars |
1 |
<?php
|
|
|
2 |
//
|
|
|
3 |
// FPDI - Version 1.4.2
|
|
|
4 |
//
|
|
|
5 |
// Copyright 2004-2011 Setasign - Jan Slabon
|
|
|
6 |
//
|
|
|
7 |
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
8 |
// you may not use this file except in compliance with the License.
|
|
|
9 |
// You may obtain a copy of the License at
|
|
|
10 |
//
|
|
|
11 |
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
12 |
//
|
|
|
13 |
// Unless required by applicable law or agreed to in writing, software
|
|
|
14 |
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
15 |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
16 |
// See the License for the specific language governing permissions and
|
|
|
17 |
// limitations under the License.
|
|
|
18 |
//
|
|
|
19 |
|
|
|
20 |
// Numeric identifiers for the value types produced by the parser.
// Each constant is defined only if no constant of that name exists yet.
defined('PDF_TYPE_NULL') or define('PDF_TYPE_NULL', 0);
defined('PDF_TYPE_NUMERIC') or define('PDF_TYPE_NUMERIC', 1);
defined('PDF_TYPE_TOKEN') or define('PDF_TYPE_TOKEN', 2);
defined('PDF_TYPE_HEX') or define('PDF_TYPE_HEX', 3);
defined('PDF_TYPE_STRING') or define('PDF_TYPE_STRING', 4);
defined('PDF_TYPE_DICTIONARY') or define('PDF_TYPE_DICTIONARY', 5);
defined('PDF_TYPE_ARRAY') or define('PDF_TYPE_ARRAY', 6);
defined('PDF_TYPE_OBJDEC') or define('PDF_TYPE_OBJDEC', 7);
defined('PDF_TYPE_OBJREF') or define('PDF_TYPE_OBJREF', 8);
defined('PDF_TYPE_OBJECT') or define('PDF_TYPE_OBJECT', 9);
defined('PDF_TYPE_STREAM') or define('PDF_TYPE_STREAM', 10);
defined('PDF_TYPE_BOOLEAN') or define('PDF_TYPE_BOOLEAN', 11);
defined('PDF_TYPE_REAL') or define('PDF_TYPE_REAL', 12);
|
|
|
46 |
|
|
|
47 |
require_once('pdf_context.php');
|
|
|
48 |
|
|
|
49 |
if (!class_exists('pdf_parser', false)) {
|
|
|
50 |
|
|
|
51 |
class pdf_parser {
|
|
|
52 |
|
|
|
53 |
/**
|
|
|
54 |
* Filename
|
|
|
55 |
* @var string
|
|
|
56 |
*/
|
|
|
57 |
var $filename;
|
|
|
58 |
|
|
|
59 |
/**
|
|
|
60 |
* File resource
|
|
|
61 |
* @var resource
|
|
|
62 |
*/
|
|
|
63 |
var $f;
|
|
|
64 |
|
|
|
65 |
/**
|
|
|
66 |
* PDF Context
|
|
|
67 |
* @var object pdf_context-Instance
|
|
|
68 |
*/
|
|
|
69 |
var $c;
|
|
|
70 |
|
|
|
71 |
/**
|
|
|
72 |
* xref-Data
|
|
|
73 |
* @var array
|
|
|
74 |
*/
|
|
|
75 |
var $xref;
|
|
|
76 |
|
|
|
77 |
/**
|
|
|
78 |
* root-Object
|
|
|
79 |
* @var array
|
|
|
80 |
*/
|
|
|
81 |
var $root;
|
|
|
82 |
|
|
|
83 |
/**
|
|
|
84 |
* PDF version of the loaded document
|
|
|
85 |
* @var string
|
|
|
86 |
*/
|
|
|
87 |
var $pdfVersion;
|
|
|
88 |
|
|
|
89 |
/**
|
|
|
90 |
* Used for reading encrypted documents and when xref/object streams are in use
|
|
|
91 |
*
|
|
|
92 |
* @var boolean
|
|
|
93 |
*/
|
|
|
94 |
var $readPlain = true;
|
|
|
95 |
|
|
|
96 |
/**
|
|
|
97 |
* Constructor
|
|
|
98 |
*
|
|
|
99 |
* @param string $filename Source-Filename
|
|
|
100 |
*/
|
|
|
101 |
function pdf_parser($filename) {
    // This method carries the actual initialisation so that subclasses which
    // call parent::pdf_parser($filename) keep working unchanged.
    $this->filename = $filename;

    // Open in binary mode; warnings are suppressed because a failure is
    // reported through error() right below.
    $this->f = @fopen($this->filename, 'rb');

    if (!$this->f) {
        $this->error(sprintf('Cannot open %s !', $filename));
    }

    // Reads the version from the file header and stores it in $this->pdfVersion
    $this->getPDFVersion();

    $this->c = new pdf_context($this->f);

    // Read xref-Data
    $this->xref = array();
    $this->pdf_read_xref($this->xref, $this->pdf_find_xref());

    // Check for Encryption
    $this->getEncryption();

    // Read root
    $this->pdf_read_root();
}

/**
 * Constructor entry point for PHP versions that no longer treat a method
 * named like its class as the constructor (PHP 8 and later).
 *
 * Delegates to pdf_parser() so that both "new pdf_parser($filename)" and
 * explicit parent::pdf_parser($filename) calls run the same initialisation.
 *
 * @param string $filename Source-Filename
 */
function __construct($filename) {
    $this->pdf_parser($filename);
}
|
|
|
123 |
|
|
|
124 |
/**
|
|
|
125 |
* Close the opened file
|
|
|
126 |
*/
|
|
|
127 |
function closeFile() {
    // Nothing to do unless a live file resource is still attached.
    if (!isset($this->f) || !is_resource($this->f)) {
        return;
    }

    fclose($this->f);
    // Drop the property so a repeated call takes the early return above.
    unset($this->f);
}
|
|
|
133 |
|
|
|
134 |
/**
|
|
|
135 |
* Print Error and die
|
|
|
136 |
*
|
|
|
137 |
* @param string $msg Error-Message
|
|
|
138 |
*/
|
|
|
139 |
function error($msg) {
    // Prefix the message with an HTML-formatted label, print it and
    // terminate the script.
    $output = '<b>PDF-Parser Error:</b> ' . $msg;
    exit($output);
}
|
|
|
142 |
|
|
|
143 |
/**
|
|
|
144 |
* Check Trailer for Encryption
|
|
|
145 |
*/
|
|
|
146 |
function getEncryption() {
    // An /Encrypt entry in the trailer dictionary marks the document as
    // encrypted, which is reported as an error.
    $isEncrypted = isset($this->xref['trailer'][1]['/Encrypt']);
    if (!$isEncrypted) {
        return;
    }

    $this->error('File is encrypted!');
}
|
|
|
151 |
|
|
|
152 |
/**
|
|
|
153 |
* Find/Return /Root
|
|
|
154 |
*
|
|
|
155 |
* @return array
|
|
|
156 |
*/
|
|
|
157 |
function pdf_find_root() {
    // Check that the /Root entry exists before inspecting its type marker, so
    // a trailer without /Root reaches the error below without raising
    // undefined-index warnings first.
    $hasTypedRoot = isset($this->xref['trailer'][1]['/Root'][0]);

    if (!$hasTypedRoot || $this->xref['trailer'][1]['/Root'][0] != PDF_TYPE_OBJREF) {
        $this->error('Wrong Type of Root-Element! Must be an indirect reference');
    }

    // Only reached with a missing entry if error() returned instead of
    // terminating; null is what the unguarded lookup evaluated to.
    if (!isset($this->xref['trailer'][1]['/Root'])) {
        return null;
    }

    return $this->xref['trailer'][1]['/Root'];
}
|
|
|
164 |
|
|
|
165 |
/**
|
|
|
166 |
* Read the /Root
|
|
|
167 |
*/
|
|
|
168 |
function pdf_read_root() {
    // Locate the /Root reference in the trailer, then resolve it through the
    // parser's main context and keep the result in $this->root.
    $rootReference = $this->pdf_find_root();
    $this->root = $this->pdf_resolve_object($this->c, $rootReference);
}
|
|
|
172 |
|
|
|
173 |
/**
|
|
|
174 |
* Get PDF-Version
|
|
|
175 |
*
|
|
|
176 |
* And reset the PDF Version used in FPDI if needed
|
|
|
177 |
*/
|
|
|
178 |
function getPDFVersion() {
    // The version is searched for in the first 16 bytes of the file.
    fseek($this->f, 0);
    $header = fread($this->f, 16);

    // First "<digit>.<digit>" sequence in the header, if any.
    preg_match('/\d\.\d/', $header, $found);
    if (isset($found[0])) {
        $this->pdfVersion = $found[0];
    }

    // Left untouched (whatever it was before) when no version was matched.
    return $this->pdfVersion;
}
|
|
|
185 |
|
|
|
186 |
/**
|
|
|
187 |
* Find the xref-Table
|
|
|
188 |
*/
|
|
|
189 |
function pdf_find_xref() {
    // Number of bytes inspected at the end of the file.
    $toRead = 1500;

    // Seek $toRead bytes back from the end; when that fails, read from the
    // start of the file instead.
    if (fseek($this->f, -$toRead, SEEK_END) === -1) {
        fseek($this->f, 0);
    }
    $data = fread($this->f, $toRead);

    // Locate the LAST "startxref" keyword in the inspected tail. A missing
    // keyword is reported explicitly rather than relying on a boolean false
    // being coerced to 0 in offset arithmetic.
    $keywordPos = is_string($data) ? strrpos($data, 'startxref') : false;
    if ($keywordPos === false) {
        $this->error('Unable to find pointer to xref table');
        // Only reached if error() returns instead of terminating.
        return 0;
    }

    // Everything after the keyword; the first run of digits is the byte
    // offset of the xref table.
    $data = substr($data, $keywordPos + strlen('startxref'));

    if (!preg_match('/\s*(\d+).*$/s', $data, $matches)) {
        $this->error('Unable to find pointer to xref table');
        // Only reached if error() returns instead of terminating.
        return 0;
    }

    return (int) $matches[1];
}
|
|
|
207 |
|
|
|
208 |
/**
|
|
|
209 |
* Read xref-table
|
|
|
210 |
*
|
|
|
211 |
* @param array $result Array of xref-table
|
|
|
212 |
* @param integer $offset of xref-table
|
|
|
213 |
*/
|
|
|
214 |
function pdf_read_xref(&$result, $offset) {
    // Start up to 20 bytes before the given offset so that documents with a
    // slightly wrong xref pointer can still be read.
    $o_pos = $offset - min(20, $offset);
    fseek($this->f, $o_pos);

    $data = fread($this->f, 100);

    $xrefPos = strrpos($data, 'xref');

    if ($xrefPos === false) {
        // No "xref" keyword near the offset. Read the value located exactly
        // at the offset to choose the more helpful of two error messages.
        fseek($this->f, $offset);
        $c = new pdf_context($this->f);
        $xrefStreamObjDec = $this->pdf_read_value($c);

        if (is_array($xrefStreamObjDec) && isset($xrefStreamObjDec[0]) && $xrefStreamObjDec[0] == PDF_TYPE_OBJDEC) {
            $this->error(sprintf('This document (%s) probably uses a compression technique which is not supported by the free parser shipped with FPDI.', $this->filename));
        } else {
            $this->error('Unable to find xref table.');
        }
    }

    // Only the first table read (the outermost call) records its location.
    if (!isset($result['xref_location'])) {
        $result['xref_location'] = $o_pos + $xrefPos;
        $result['max_object'] = 0;
    }

    $cycles = -1;
    $bytesPerCycle = 100;

    // Set the handle directly after the "xref" keyword.
    fseek($this->f, $o_pos = $o_pos + $xrefPos + 4);
    $data = fread($this->f, $bytesPerCycle);

    // Read further chunks until the "trailer" keyword shows up or the file
    // ends. The search offset lags behind the data read so far, so a keyword
    // split across two chunks is still found.
    while (($trailerPos = strpos($data, 'trailer', max($bytesPerCycle * $cycles++, 0))) === false && !feof($this->f)) {
        $data .= fread($this->f, $bytesPerCycle);
    }

    if ($trailerPos === false) {
        $this->error('Trailer keyword not found after xref table');
    }

    // Keep only the table body (everything before the "trailer" keyword).
    $data = substr($data, 0, $trailerPos);

    // Detect the line ending by checking the first 100 bytes for linebreaks.
    preg_match_all("/(\r\n|\n|\r)/", substr($data, 0, 100), $m);
    $lineEndings = array_unique($m[0]);

    if (count($lineEndings) === 1) {
        // Exactly one kind of line break: a plain explode() is enough. The
        // first match is used so this also works when the sample holds a
        // single line break.
        $lines = explode(reset($lineEndings), $data);
    } else {
        // Mixed line breaks, or none found in the sample: split on any of them.
        $lines = preg_split("/(\r\n|\n|\r)/", $data, -1, PREG_SPLIT_NO_EMPTY);
    }

    // Release the raw data before the (possibly large) line loop.
    unset($data, $lineEndings, $m);

    // Object number the next entry line belongs to.
    $start = 1;

    foreach ($lines as $rawLine) {
        $line = trim($rawLine);
        if (!$line) {
            continue;
        }

        $pieces = explode(' ', $line);
        switch (count($pieces)) {
            case 2:
                // Subsection header: "<first object number> <entry count>"
                $start = (int) $pieces[0];
                $end = $start + (int) $pieces[1];
                if ($end > $result['max_object']) {
                    $result['max_object'] = $end;
                }
                break;
            case 3:
                // Entry: "<offset> <generation> <n|f>"
                if (!isset($result['xref'][$start])) {
                    $result['xref'][$start] = array();
                }

                // Entries that are already present are kept; tables are read
                // newest first, so older tables never overwrite them.
                $gen = (int) $pieces[1];
                if (!array_key_exists($gen, $result['xref'][$start])) {
                    $result['xref'][$start][$gen] = $pieces[2] == 'n' ? (int) $pieces[0] : null;
                }
                $start++;
                break;
            default:
                $this->error('Unexpected data in xref table');
        }
    }

    // Free the parsed lines before a possible recursion into older tables.
    unset($lines, $rawLine, $pieces, $line, $start, $end, $gen);

    // Continue directly after the "trailer" keyword (7 = strlen("trailer")).
    fseek($this->f, $o_pos + $trailerPos + 7);

    $c = new pdf_context($this->f);
    $trailer = $this->pdf_read_value($c);
    unset($c);

    // The first trailer read (the newest one) is the one that is kept.
    if (!isset($result['trailer'])) {
        $result['trailer'] = $trailer;
    }

    // Follow the chain of previous xref tables.
    if (isset($trailer[1]['/Prev'])) {
        $this->pdf_read_xref($result, $trailer[1]['/Prev'][1]);
    }

    unset($trailer);

    return true;
}
|
|
|
323 |
|
|
|
324 |
/**
|
|
|
325 |
* Reads a value
|
|
|
326 |
*
|
|
|
327 |
* @param object $c pdf_context
|
|
|
328 |
* @param string $token a Token
|
|
|
329 |
* @return mixed
|
|
|
330 |
*/
|
|
|
331 |
function pdf_read_value(&$c, $token = null) {
|
|
|
332 |
if (is_null($token)) {
|
|
|
333 |
$token = $this->pdf_read_token($c);
|
|
|
334 |
}
|
|
|
335 |
|
|
|
336 |
if ($token === false) {
|
|
|
337 |
return false;
|
|
|
338 |
}
|
|
|
339 |
|
|
|
340 |
switch ($token) {
|
|
|
341 |
case '<':
|
|
|
342 |
// This is a hex string.
|
|
|
343 |
// Read the value, then the terminator
|
|
|
344 |
|
|
|
345 |
$pos = $c->offset;
|
|
|
346 |
|
|
|
347 |
while(1) {
|
|
|
348 |
|
|
|
349 |
$match = strpos ($c->buffer, '>', $pos);
|
|
|
350 |
|
|
|
351 |
// If you can't find it, try
|
|
|
352 |
// reading more data from the stream
|
|
|
353 |
|
|
|
354 |
if ($match === false) {
|
|
|
355 |
if (!$c->increase_length()) {
|
|
|
356 |
return false;
|
|
|
357 |
} else {
|
|
|
358 |
continue;
|
|
|
359 |
}
|
|
|
360 |
}
|
|
|
361 |
|
|
|
362 |
$result = substr ($c->buffer, $c->offset, $match - $c->offset);
|
|
|
363 |
$c->offset = $match + 1;
|
|
|
364 |
|
|
|
365 |
return array (PDF_TYPE_HEX, $result);
|
|
|
366 |
}
|
|
|
367 |
|
|
|
368 |
break;
|
|
|
369 |
case '<<':
|
|
|
370 |
// This is a dictionary.
|
|
|
371 |
|
|
|
372 |
$result = array();
|
|
|
373 |
|
|
|
374 |
// Recurse into this function until we reach
|
|
|
375 |
// the end of the dictionary.
|
|
|
376 |
while (($key = $this->pdf_read_token($c)) !== '>>') {
|
|
|
377 |
if ($key === false) {
|
|
|
378 |
return false;
|
|
|
379 |
}
|
|
|
380 |
|
|
|
381 |
if (($value = $this->pdf_read_value($c)) === false) {
|
|
|
382 |
return false;
|
|
|
383 |
}
|
|
|
384 |
|
|
|
385 |
// Catch missing value
|
|
|
386 |
if ($value[0] == PDF_TYPE_TOKEN && $value[1] == '>>') {
|
|
|
387 |
$result[$key] = array(PDF_TYPE_NULL);
|
|
|
388 |
break;
|
|
|
389 |
}
|
|
|
390 |
|
|
|
391 |
$result[$key] = $value;
|
|
|
392 |
}
|
|
|
393 |
|
|
|
394 |
return array (PDF_TYPE_DICTIONARY, $result);
|
|
|
395 |
|
|
|
396 |
case '[':
|
|
|
397 |
// This is an array.
|
|
|
398 |
|
|
|
399 |
$result = array();
|
|
|
400 |
|
|
|
401 |
// Recurse into this function until we reach
|
|
|
402 |
// the end of the array.
|
|
|
403 |
while (($token = $this->pdf_read_token($c)) !== ']') {
|
|
|
404 |
if ($token === false) {
|
|
|
405 |
return false;
|
|
|
406 |
}
|
|
|
407 |
|
|
|
408 |
if (($value = $this->pdf_read_value($c, $token)) === false) {
|
|
|
409 |
return false;
|
|
|
410 |
}
|
|
|
411 |
|
|
|
412 |
$result[] = $value;
|
|
|
413 |
}
|
|
|
414 |
|
|
|
415 |
return array (PDF_TYPE_ARRAY, $result);
|
|
|
416 |
|
|
|
417 |
case '(' :
|
|
|
418 |
// This is a string
|
|
|
419 |
$pos = $c->offset;
|
|
|
420 |
|
|
|
421 |
$openBrackets = 1;
|
|
|
422 |
do {
|
|
|
423 |
for (; $openBrackets != 0 && $pos < $c->length; $pos++) {
|
|
|
424 |
switch (ord($c->buffer[$pos])) {
|
|
|
425 |
case 0x28: // '('
|
|
|
426 |
$openBrackets++;
|
|
|
427 |
break;
|
|
|
428 |
case 0x29: // ')'
|
|
|
429 |
$openBrackets--;
|
|
|
430 |
break;
|
|
|
431 |
case 0x5C: // backslash
|
|
|
432 |
$pos++;
|
|
|
433 |
}
|
|
|
434 |
}
|
|
|
435 |
} while($openBrackets != 0 && $c->increase_length());
|
|
|
436 |
|
|
|
437 |
$result = substr($c->buffer, $c->offset, $pos - $c->offset - 1);
|
|
|
438 |
$c->offset = $pos;
|
|
|
439 |
|
|
|
440 |
return array (PDF_TYPE_STRING, $result);
|
|
|
441 |
|
|
|
442 |
case 'stream':
|
|
|
443 |
$o_pos = ftell($c->file)-strlen($c->buffer);
|
|
|
444 |
$o_offset = $c->offset;
|
|
|
445 |
|
|
|
446 |
$c->reset($startpos = $o_pos + $o_offset);
|
|
|
447 |
|
|
|
448 |
$e = 0; // ensure line breaks in front of the stream
|
|
|
449 |
if ($c->buffer[0] == chr(10) || $c->buffer[0] == chr(13))
|
|
|
450 |
$e++;
|
|
|
451 |
if ($c->buffer[1] == chr(10) && $c->buffer[0] != chr(10))
|
|
|
452 |
$e++;
|
|
|
453 |
|
|
|
454 |
if ($this->actual_obj[1][1]['/Length'][0] == PDF_TYPE_OBJREF) {
|
|
|
455 |
$tmp_c = new pdf_context($this->f);
|
|
|
456 |
$tmp_length = $this->pdf_resolve_object($tmp_c, $this->actual_obj[1][1]['/Length']);
|
|
|
457 |
$length = $tmp_length[1][1];
|
|
|
458 |
} else {
|
|
|
459 |
$length = $this->actual_obj[1][1]['/Length'][1];
|
|
|
460 |
}
|
|
|
461 |
|
|
|
462 |
if ($length > 0) {
|
|
|
463 |
$c->reset($startpos + $e,$length);
|
|
|
464 |
$v = $c->buffer;
|
|
|
465 |
} else {
|
|
|
466 |
$v = '';
|
|
|
467 |
}
|
|
|
468 |
$c->reset($startpos + $e + $length + 9); // 9 = strlen("endstream")
|
|
|
469 |
|
|
|
470 |
return array(PDF_TYPE_STREAM, $v);
|
|
|
471 |
|
|
|
472 |
default :
|
|
|
473 |
if (is_numeric ($token)) {
|
|
|
474 |
// A numeric token. Make sure that
|
|
|
475 |
// it is not part of something else.
|
|
|
476 |
if (($tok2 = $this->pdf_read_token ($c)) !== false) {
|
|
|
477 |
if (is_numeric ($tok2)) {
|
|
|
478 |
|
|
|
479 |
// Two numeric tokens in a row.
|
|
|
480 |
// In this case, we're probably in
|
|
|
481 |
// front of either an object reference
|
|
|
482 |
// or an object specification.
|
|
|
483 |
// Determine the case and return the data
|
|
|
484 |
if (($tok3 = $this->pdf_read_token ($c)) !== false) {
|
|
|
485 |
switch ($tok3) {
|
|
|
486 |
case 'obj':
|
|
|
487 |
return array (PDF_TYPE_OBJDEC, (int) $token, (int) $tok2);
|
|
|
488 |
case 'R':
|
|
|
489 |
|