<?php
/**
 * Parser which converts prepared XHTML source into a box tree.
 *
 * The source is parsed into a DOM tree; the BASE element (if present) is
 * applied to the pipeline, CSS is scanned and the tree is converted to boxes.
 */
class ParserXHTML extends Parser {
  /**
   * Parse the XHTML source and build the box tree for it.
   *
   * If the source cannot be parsed, the 'cannot_parse.html' template is sent
   * to the output, the failure is written to the error log and the script
   * is terminated.
   *
   * @param string $html XHTML source to be parsed
   * @param object $pipeline Pipeline object; supplies the base URL stack, the
   *        current CSS object and the output driver
   * @param object|null $media Media description; when null, the media setup
   *        step is skipped
   * @return mixed Reference to the value returned by create_pdf_box()
   */
  function &process($html, &$pipeline, &$media) {
    // Run the XML parser on the XHTML we've prepared
    $dom_tree = TreeBuilder::build($html);

    // Check if the parser returned a valid document
    if (is_null($dom_tree)) {
      readfile(HTML2PS_DIR.'templates/cannot_parse.html');
      error_log(sprintf("Cannot parse document: %s", $pipeline->get_base_url()));
      die("HTML2PS Error");
    }

    /**
     * Detect the base URI for this document.
     *
     * According to the HTML 4.01 p. 12.4.1:
     * User agents must calculate the base URI according to the following
     * precedences (highest priority to lowest):
     *
     * 1. The base URI is set by the BASE element.
     * 2. The base URI is given by meta data discovered during a protocol
     *    interaction, such as an HTTP header (see [RFC2616]).
     * 3. By default, the base URI is that of the current document. Not all
     *    HTML documents have a base URI (e.g., a valid HTML document may
     *    appear in an email and may not be designated by a URI). Such HTML
     *    documents are considered erroneous if they contain relative URIs
     *    and rely on a default base URI.
     */

    /**
     * Check if a BASE element is present; use its first occurrence
     */
    $this->_scan_base($dom_tree, $pipeline);

    /**
     * @todo fall back to the protocol metadata
     */

    /**
     * Parse STYLE / LINK nodes containing CSS references and definitions.
     * This should be done here, as the document body may include a STYLE node
     * (this violates the HTML standard, but often appears on the Web)
     */
    $css =& $pipeline->get_current_css();
    $css->scan_styles($dom_tree, $pipeline);

    if (!is_null($media)) {
      // Setup media size and margins
      $pipeline->get_page_media(1, $media);
      $pipeline->output_driver->update_media($media);
      $pipeline->_setupScales($media);
    }

    $body =& traverse_dom_tree_pdf($dom_tree);
    $box =& create_pdf_box($body, $pipeline);

    return $box;
  }

  /**
   * Recursively look for the first BASE element (in document order) below
   * the given node and make its 'href' the current base URL of the pipeline.
   *
   * @param object $root DOM node to start scanning from
   * @param object $pipeline Pipeline object whose base URL stack is updated
   * @return bool true if a BASE element was found and applied, false otherwise
   */
  function _scan_base(&$root, &$pipeline) {
    switch ($root->node_type()) {
    case XML_ELEMENT_NODE:
      if ($root->tagname() === 'base') {
        /**
         * See HTML 4.01 p 12.4
         * href - this attribute specifies an absolute URI that acts as the
         * base URI for resolving relative URIs.
         *
         * At this moment the pipeline object has the current document URI on
         * the top of the stack; we should replace it with the value of the
         * 'href' attribute of the BASE tag.
         *
         * To handle (possibly) incorrect values, we use the 'guess_url'
         * function; in this case if the 'href' attribute contains an absolute
         * value (as it SHOULD), it will be used; if it is missing or is
         * relative, we'll get a more or less usable value based on the
         * current document URI.
         */
        $new_url = $pipeline->guess_url($root->get_attribute('href'));
        $pipeline->pop_base_url();
        $pipeline->push_base_url($new_url);

        return true;
      }

      // Intentional fall-through: child nodes of any other element
      // are scanned exactly like children of the document node.
    case XML_DOCUMENT_NODE:
      $child = $root->first_child();
      while ($child) {
        // Propagate the hit with an explicit 'true'. A bare 'return' yields
        // null, which the caller one level up treats as "not found"; it would
        // then keep scanning and could let a later BASE element override
        // the first one.
        if ($this->_scan_base($child, $pipeline)) {
          return true;
        }
        $child = $child->next_sibling();
      }

      return false;
    }

    // Any other node type (text, comment, ...) cannot contain a BASE element
    return false;
  }
}
?>