WebSVN – lars-tiefland.php_share – Blame – /html2pdf/html2ps_v2042/public_html/xhtml.utils.inc.php

Revision	Autor	Zeilennr.	Zeile
1	lars	1	`<?php`
		2	`// $Header: /cvsroot/html2ps/xhtml.utils.inc.php,v 1.35 2007/03/15 18:37:36 Konstantin Exp $`
		3
		/**
		 * Rewrites opening tags with the given name into self-closing form ("<br>" => "<br/>").
		 *
		 * The tag name is matched case-insensitively. A tag is rewritten when it has no
		 * attribute text at all, or when its attribute text ends with a character other
		 * than '/' (so tags that are already self-closed are left alone).
		 *
		 * @param string $tag         tag name; interpolated into the pattern as-is
		 * @param string $sample_html HTML text to process
		 * @return string processed HTML text
		 */
		function close_tag($tag, $sample_html) {
		    $unclosed_tag = "!(<{$tag}(\s[^>]*[^/>])?)>!si";
		    $self_closed  = "\\1/>";

		    return preg_replace($unclosed_tag, $self_closed, $sample_html);
		}
		7
		/**
		 * Expands a minimized (value-less) boolean attribute into attr="attr" form,
		 * e.g. <input checked> becomes <input checked="checked">.
		 *
		 * The attribute is only expanded when it is preceded by whitespace inside a tag
		 * and directly followed by whitespace, '>' or '/>'; an attribute which already
		 * has a value (followed by '=') is left untouched.
		 *
		 * @param string $attr attribute name; interpolated into the pattern as-is
		 * @param string $html HTML text to process
		 * @return string processed HTML text
		 */
		function make_attr_value($attr, $html) {
		    // The terminator group is an alternation of three tokens: whitespace, '>' or '/>'.
		    // The pipes must NOT be backslash-escaped, otherwise they are matched literally.
		    return preg_replace(
		        "#(<[^>]*\s){$attr}(\s|>|/>)#si",
		        "\\1{$attr}=\"{$attr}\"\\2",
		        $html
		    );
		}
		11
		12
		/**
		 * Builds a delimiter-less regular expression fragment matching an opening tag.
		 *
		 * The fragment tolerates whitespace between '<' and the tag name and an optional
		 * attribute section. It contains exactly ONE capturing group (the attribute
		 * section); callers in this file rely on that when numbering their own groups.
		 *
		 * @param string $tag tag name; interpolated into the pattern as-is
		 * @return string regular expression fragment (no delimiters, no modifiers)
		 */
		function mk_open_tag_regexp($tag) {
		    return "<\s*{$tag}(\s+[^>]*)?>";
		}
		/**
		 * Builds a delimiter-less regular expression fragment matching a closing tag.
		 *
		 * Whitespace is tolerated between '<', '/', the tag name and '>'.
		 * The fragment contains no capturing groups.
		 *
		 * @param string $tag tag name; interpolated into the pattern as-is
		 * @return string regular expression fragment (no delimiters, no modifiers)
		 */
		function mk_close_tag_regexp($tag) {
		    return "<\s*/\s*{$tag}\s*>";
		}
		15
		/**
		 * Makes sure the document is wrapped into a single <html>...</html> pair and
		 * that nothing precedes the opening tag or follows the closing tag.
		 *
		 * Steps, in order:
		 *  - prepend "<html>" when no opening html tag is present;
		 *  - collapse repeated opening html tags, keeping the content between them;
		 *  - append "</html>" when no closing html tag is present;
		 *  - cut everything before the opening tag and after the closing tag.
		 *
		 * @param string $html source HTML code
		 * @return string normalized HTML code
		 */
		function process_html($html) {
		    $open  = mk_open_tag_regexp("html");
		    $close = mk_close_tag_regexp("html");

		    if (!preg_match("#{$open}#is", $html)) {
		        $html = "<html>".$html;
		    }

		    // If there is more than one <html> tag in the text, keep a single opening tag
		    // plus the content found between the first and the second one.
		    // Group 1 is the attribute section captured inside $open, hence "\\2" below.
		    while (preg_match("#{$open}(.*?){$open}#is", $html)) {
		        $collapsed = preg_replace("#{$open}(.*?){$open}#is", "<html>\\2", $html);
		        if ($collapsed === null) {
		            // PCRE failure (e.g. backtrack limit); keep what we have instead of losing the page
		            break;
		        }
		        $html = $collapsed;
		    }

		    if (!preg_match("#{$close}#is", $html)) {
		        $html = $html."</html>";
		    }

		    // Cut off all data before and after the 'html' tag; unless we do it,
		    // the XML parser will die violently.
		    $cleanup_rules = array(
		        array("#.*({$open})#is",     "\\1"),
		        array("#^.*<html#is",        "<html"),
		        array("#</html\s*>.*$#is",   "</html>")
		    );

		    foreach ($cleanup_rules as $rule) {
		        // PHP 5.2.0 compatibility issue: preg_replace may return NULL on large
		        // files; in that case the previous value is kept.
		        $cleaned = preg_replace($rule[0], $rule[1], $html);
		        if ($cleaned !== null) {
		            $html = $cleaned;
		        }
		    }

		    return $html;
		}
		51
		/**
		 * Makes sure the document contains a complete <head>...</head> section.
		 *
		 * - No opening head tag: everything found between the opening html tag and the
		 *   opening body tag is wrapped into <head>...</head>.
		 * - Opening head tag without a closing one: "</head>" is inserted right before
		 *   the opening body tag, or before the closing html tag when there is no body tag.
		 *
		 * @param string $html source HTML code
		 * @return string updated HTML code
		 */
		function process_head($html) {
		    $head_open  = mk_open_tag_regexp("head");
		    $head_close = mk_close_tag_regexp("head");
		    $html_open  = mk_open_tag_regexp("html");
		    $html_close = mk_close_tag_regexp("html");
		    $body_open  = mk_open_tag_regexp("body");

		    // No <head> at all: build one around the text preceding <body>
		    if (!preg_match("#{$head_open}#is", $html)) {
		        return preg_replace("#({$html_open})(.*)({$body_open})#is", "\\1<head>\\3</head>\\4", $html);
		    }

		    // Both tags present: nothing to do
		    if (preg_match("#{$head_close}#is", $html)) {
		        return $html;
		    }

		    // <head> is never closed: close it in front of <body>, or in front of </html> as a fallback
		    $anchor = preg_match("#{$body_open}#is", $html) ? $body_open : $html_close;

		    return preg_replace("#({$anchor})#is", "</head>\\1", $html);
		}
		70
		/**
		 * Makes sure the document contains a <body>...</body> pair and that no data is
		 * left between </head> and <body> or between </body> and </html>.
		 *
		 * @param string $html source HTML code
		 * @return string updated HTML code
		 */
		function process_body($html) {
		    $body_open  = mk_open_tag_regexp("body");
		    $body_close = mk_close_tag_regexp("body");
		    $html_open  = mk_open_tag_regexp("html");
		    $html_close = mk_close_tag_regexp("html");
		    $head_close = mk_close_tag_regexp("head");

		    // Missing <body>: open it right after </head>, or right after <html> when there is no head
		    if (!preg_match("#{$body_open}#is", $html)) {
		        $after = preg_match("#{$head_close}#is", $html) ? $head_close : $html_open;
		        $html  = preg_replace("#({$after})#is", "\\1<body>", $html);
		    }

		    // Missing </body>: close it right before </html>
		    if (!preg_match("#{$body_close}#is", $html)) {
		        $html = preg_replace("#({$html_close})#is", "</body>\\1", $html);
		    }

		    // Move any data found between </head> and <body> inside the body
		    $html = preg_replace("#({$head_close})(.+)({$body_open})#is", "\\1\\3\\2", $html);

		    // Move any data found between </body> and </html> inside the body
		    $html = preg_replace("#({$body_close})(.+)({$html_close})#is", "\\2\\1\\3", $html);

		    return $html;
		}
		96
		97	`// Hmmm. May be we'll just write SAX parser on PHP? ;-)`
		/**
		 * Balances opening and closing tags so that the result is well-nested.
		 *
		 * The text is scanned tag by tag while a stack of currently open tag names is
		 * maintained (index 0 is the innermost open tag):
		 *  - a closing tag matching the innermost open tag is copied as-is;
		 *  - a closing tag matching a deeper open tag closes all tags opened after it,
		 *    then itself, and - unless it is one of the "no reopen" tags - reopens the
		 *    tags which were closed on the way (reopened tags carry no attributes);
		 *  - such an implicit close is refused when it would close an open 'table';
		 *  - a closing tag with no matching open tag is dropped;
		 *  - tags still open at the end of the text are closed.
		 *
		 * Tag names are compared case-sensitively.
		 * NOTE: text following the last tag of the input is not copied to the result.
		 *
		 * @param string $html source HTML code
		 * @return string HTML code with balanced tags
		 */
		function fix_tags($html) {
		    $result = "";
		    $tag_stack = array();

		    // These corrections simplify the regexp used to parse tags:
		    // remove whitespace before '/' and between '/' and '>' in autoclosing tags
		    $html = preg_replace("#\s*/\s*>#is", "/>", $html);
		    // remove whitespace between '<', '/' and first tag letter in closing tags
		    $html = preg_replace("#<\s*/\s*#is", "</", $html);
		    // remove whitespace between '<' and first tag letter
		    $html = preg_replace("#<\s+#is", "<", $html);

		    // Group 1: text before the tag; group 2: the whole tag;
		    // group 3: autoclosing tag name; group 4: opening tag name; group 5: closing tag name.
		    // preg_match drops trailing unmatched groups, so isset() on groups 5 and 4
		    // tells the three tag kinds apart.
		    $tag_regexp = "#(.*?)(<([a-z\d]+)[^>]*/>|<([a-z\d]+)[^>]*(?<!/)>|</([a-z\d]+)[^>]*>)#is";

		    // Closing one of these tags never reopens the tags closed along with it
		    $no_reopen_tags = array("tr","td","table","marquee","body","html");

		    while (preg_match($tag_regexp, $html, $matches)) {
		        $result .= $matches[1];
		        $html = substr($html, strlen($matches[0]));

		        if (isset($matches[5])) {
		            // Closing tag
		            $tag = $matches[5];
		            $tag_pos = array_search($tag, $tag_stack, true);

		            if ($tag_pos === 0) {
		                // Matched the last opening tag (normal state); just pop it from the stack
		                array_shift($tag_stack);
		                $result .= $matches[2];
		            } elseif ($tag_pos !== false) {
		                // Corresponding opening tag exists somewhere deeper on the stack.
		                // We should never close a 'table' tag this way: the implicit close is
		                // allowed only when no table is open above the tag being closed.
		                $table_pos = array_search('table', $tag_stack, true);
		                $no_critical_tags = ($table_pos === false) || ($table_pos >= $tag_pos);

		                if ($no_critical_tags) {
		                    // Insert closing tags for all tags opened after the matching one
		                    for ($i = 0; $i < $tag_pos; $i++) {
		                        $result .= "</{$tag_stack[$i]}> ";
		                    }

		                    // close current tag and remove it from the stack
		                    $result .= "</{$tag}> ";
		                    array_splice($tag_stack, $tag_pos, 1);

		                    if (!in_array($tag, $no_reopen_tags, true)) {
		                        // reopen "run-off" tags, outermost first; they stay on the stack
		                        for ($i = $tag_pos - 1; $i >= 0; $i--) {
		                            $result .= "<{$tag_stack[$i]}> ";
		                        }
		                    } else {
		                        // "critical" tag: run-off tags stay closed
		                        array_splice($tag_stack, 0, $tag_pos);
		                    }
		                }
		            }
		            // else: no such tag on the stack (or the stack is empty); the closing tag
		            // is dropped simply by not adding it to the result
		        } elseif (isset($matches[4])) {
		            // Opening tag
		            array_unshift($tag_stack, $matches[4]);
		            $result .= $matches[2];
		        } else {
		            // Autoclosing tag; nothing specific to do
		            $result .= $matches[2];
		        }
		    }

		    // Close all tags left
		    while (count($tag_stack) > 0) {
		        $result .= "</".array_shift($tag_stack).">";
		    }

		    return $result;
		}
		179
		/**
		 * Wraps unquoted attribute values into single quotes; attribute values which are
		 * already quoted are not changed.
		 *
		 * An unquoted value is a run of characters other than quotes, CR, LF, space and
		 * '>' that follows "name=" inside a tag and is terminated by CR, LF, space or '>'.
		 * The replacement is repeated until no unquoted value is left, because one pass
		 * handles a single attribute per tag.
		 *
		 * @param string $html source HTML code
		 * @return string HTML code with quoted attribute values
		 */
		function quote_attrs($html) {
		    $has_bare_value = "!(<[^>]*)\s([^=>]+)=([^'\"\r\n >]+)([\r\n >])!si";
		    $bare_value     = "#(<[^>]*)\s([^=>]+)=([^'\"\r\n >]+)([\r\n >])#si";
		    $quoted_value   = "\\1 \\2='\\3'\\4";

		    while (preg_match($has_bare_value, $html)) {
		        $html = preg_replace($bare_value, $quoted_value, $html);
		    }

		    return $html;
		}
		189
		/**
		 * Escapes the markup-significant characters of a single attribute value.
		 *
		 * '<' and '>' are replaced with the &lt; and &gt; entities first; the value is
		 * then passed through process_character_references() and escape_amp(), both
		 * defined elsewhere.
		 * NOTE(review): process_character_references() is called for its effect on the
		 * argument, so it presumably takes $html by reference - confirm against its
		 * definition.
		 *
		 * @param string $html raw attribute value (without the surrounding quotes)
		 * @return string escaped attribute value
		 */
		function escape_attr_value_entities($html) {
		    // Replacing a character with itself would be a no-op; the replacements
		    // must be the entity references
		    $html = str_replace("<", "&lt;", $html);
		    $html = str_replace(">", "&gt;", $html);

		    // Replace all character references by their decimal codes
		    process_character_references($html);
		    $html = escape_amp($html);

		    return $html;
		}
		199
		/**
		 * Updates attribute values: if there are any unescaped <, > or & symbols inside a
		 * quoted attribute value, they are replaced with the corresponding entities
		 * (the escaping itself is delegated to escape_attr_value_entities()).
		 *
		 * The pattern starts with a greedy "^(.*)", so each match finds the LAST quoted
		 * attribute of the text. The text is therefore processed from right to left:
		 * the escaped attribute and everything after it are put aside, and the search
		 * continues in the part preceding the attribute. (Continuing in the part that
		 * follows the match could never find another attribute.)
		 *
		 * The whitespace character preceding each processed attribute is replaced with
		 * a single space.
		 *
		 * @param String $html source HTML code
		 * @return String updated HTML code
		 */
		function escape_attrs_entities($html) {
		    // Regular expression may be described as follows:
		    // ^(.*)           - everything before the tag holding the attribute (greedy)
		    // (<[^>]*)        - something starting with < (i.e. tag name and, probably, some
		    //                   attribute name/value pairs)
		    // \s([^\s=>]+)=   - space after "something", followed by attribute name (which may
		    //                   contain anything except spaces, = and > signs)
		    // (['\"])(...)\4  - quoted attribute value, lazily extended up to the next quote of
		    //                   the same kind. Inside the character class "\4" is NOT a
		    //                   back-reference (it excludes the character with code 4).
		    //                   @todo won't work with escaped quotes inside value.
		    // (.*)$           - the rest of the text
		    $attr_regexp = "#^(.*)(<[^>]*)\s([^\s=>]+)=(['\"])([^\\4]*?)\\4(.*)$#si";

		    // Processed chunks, collected from the rightmost to the leftmost one
		    $chunks = array();

		    while (preg_match($attr_regexp, $html, $matches)) {
		        $new_value = escape_attr_value_entities($matches[5]);

		        $chunks[] = " ".$matches[3]."=".$matches[4].$new_value.$matches[4].$matches[6];

		        // Continue with the text preceding the attribute; it is strictly shorter,
		        // so the loop always terminates
		        $html = $matches[1].$matches[2];
		    }

		    return $html.implode("", array_reverse($chunks));
		}
		224
		/**
		 * Inserts a space after every quoted attribute value which is immediately
		 * followed by a non-whitespace character, e.g.
		 * <a href="x"class="y"> becomes <a href="x" class="y" >.
		 *
		 * Double-quoted values are handled first, then single-quoted ones. Each pass is
		 * repeated until nothing matches, because one pass fixes a single attribute per
		 * tag. Note that the character following the value may be '>' or '/' as well,
		 * so a space is also inserted before the end of the tag.
		 *
		 * @param string $html HTML code; modified in place
		 * @return void
		 */
		function fix_attrs_spaces(&$html) {
		    $dquoted = "#(<[^>]*)\s([^\s=>]+)=\"([^\"]*?)\"([^\s])#si";
		    while (preg_match($dquoted, $html)) {
		        $html = preg_replace($dquoted, "\\1 \\2=\"\\3\" \\4", $html);
		    }

		    $squoted = "#(<[^>]*)\s([^\s=>]+)='([^']*?)'([^\s])#si";
		    while (preg_match($squoted, $html)) {
		        $html = preg_replace($squoted, "\\1 \\2='\\3' \\4", $html);
		    }
		}
		234
		/**
		 * Rebuilds a single tag from its name and a cleaned-up attribute list.
		 *
		 * - The tag is split into prefix ('<'), content and suffix ('>' or '/>').
		 * - The content is split into the tag name and the raw attribute text.
		 * - Attributes are read one by one: name="value", name='value' or name=value
		 *   (value made of word characters). Anything else is treated as junk and is
		 *   skipped up to the next whitespace.
		 * - Attribute names are lower-cased; when an attribute is repeated, the first
		 *   value wins.
		 * - Attributes rejected by HTML2PS_XMLUtils::valid_attribute_name() (defined
		 *   elsewhere) are dropped.
		 * - Values are written in double quotes, or in single quotes when the value
		 *   itself contains a double quote.
		 *
		 * @param string $tag text of one tag, from '<' to '>'
		 * @return string|null rebuilt tag; NULL when $tag contains no '<...>' sequence
		 */
		function fix_attrs_tag($tag) {
		    // Try the autoclosing form first, so that the trailing '/' becomes part of the suffix
		    if (!preg_match("#(<)(.*?)(/\s*>)#is", $tag, $matches) &&
		        !preg_match("#(<)(.*?)(>)#is", $tag, $matches)) {
		        return;
		    }

		    $prefix  = $matches[1];
		    $content = $matches[2];
		    $suffix  = $matches[3];

		    if (preg_match("#^\s*(\w+)\s*(.*)\s*/\s*\$#is", $content, $matches) ||
		        preg_match("#^\s*(\w+)\s*(.*)\$#is", $content, $matches)) {
		        $tagname   = $matches[1];
		        $raw_attrs = isset($matches[2]) ? $matches[2] : "";
		    } else {
		        // A strange tag occurred; just remove everything
		        $tagname   = "";
		        $raw_attrs = "";
		    }

		    $attrs = array();
		    while (!empty($raw_attrs)) {
		        if (preg_match("#^\s*(\w+?)\s*=\s*\"(.*?)\"(.*)$#is", $raw_attrs, $matches) ||
		            preg_match("#^\s*(\w+?)\s*=\s*'(.*?)'(.*)$#is", $raw_attrs, $matches) ||
		            preg_match("#^\s*(\w+?)=(\w+)(.*)$#is", $raw_attrs, $matches)) {
		            // name="value", name='value' or name=value at the beginning of the text
		            $attr = strtolower($matches[1]);

		            // the first occurrence of an attribute wins
		            if (!isset($attrs[$attr])) {
		                $attrs[$attr] = $matches[2];
		            }

		            $raw_attrs = $matches[3];
		        } elseif (preg_match("#^\s*\S+\s+(.*)$#is", $raw_attrs, $matches)) {
		            // Just junk at the beginning; skip till the first space
		            $raw_attrs = $matches[1];
		        } else {
		            $raw_attrs = "";
		        }
		    }

		    $str = "";
		    foreach ($attrs as $key => $value) {
		        // In theory, if garbage has been found inside the attrs section, we could get
		        // an invalid attribute name here; just ignore it in this case
		        if (!HTML2PS_XMLUtils::valid_attribute_name($key)) {
		            continue;
		        }

		        // A value containing a double quote has to be wrapped into single quotes
		        $quote = (strpos($value, '"') !== false) ? "'" : "\"";
		        $str .= " ".$key."=".$quote.$value.$quote;
		    }

		    return $prefix.$tagname.$str.$suffix;
		}
		312
		/**
		 * Passes every tag that does not start with "</" through fix_attrs_tag() and
		 * leaves the text between the tags unchanged.
		 *
		 * A tag is taken to be '<', one character other than '/', and everything up to
		 * the nearest '>'.
		 *
		 * @param string $html source HTML code
		 * @return string HTML code with rebuilt tags
		 */
		function fix_attrs($html) {
		    $result = "";

		    // Group 1: text before the tag (lazy, so the nearest tag is found); group 2: the tag
		    while (preg_match("#^(.*?)(<[^/].*?>)#is", $html, $matches)) {
		        $result .= $matches[1].fix_attrs_tag($matches[2]);
		        // Drop the processed part
		        $html = substr($html, strlen($matches[0]));
		    }

		    return $result.$html;
		}
		323
		/**
		 * Normalizes closing tags: whitespace between "</" and the tag name, and
		 * everything between the tag name and the nearest '>' on the same line,
		 * is removed ("</ div class='x'>" becomes "</div>").
		 *
		 * @param string $html source HTML code
		 * @return string HTML code with normalized closing tags
		 */
		function fix_closing_tags($html) {
		    return preg_replace("#</\s*(\w+).*?>#", "</\\1>", $html);
		}
		327
		/**
		 * Replaces both supported page break markers - "<?page-break>" and
		 * "<!--NewPage-->" - with a <pagebreak/> tag. Matching is case-sensitive.
		 *
		 * @param string $html HTML code; modified in place
		 * @return void
		 */
		function process_pagebreak_commands(&$html) {
		    // Alternation of the two markers; the pipe must not be escaped
		    $html = preg_replace("#<\?page-break>|<!--NewPage-->#", "<pagebreak/>", $html);
		}
		331
		/**
		 * Light-weight preprocessing for input which already is XHTML.
		 *
		 * Apart from process_pagebreak_commands(), lowercase_tags() and
		 * lowercase_closing_tags(), the helpers called here are defined elsewhere.
		 * NOTE(review): the helpers called as plain statements presumably modify $html
		 * by reference - confirm against their definitions.
		 *
		 * @param string $html source XHTML code
		 * @return string preprocessed XHTML code
		 */
		function xhtml2xhtml($html) {
		    process_pagebreak_commands($html);

		    // STYLE tags are taken out and kept aside; they are put back at the very end
		    $saved_styles = process_style($html);

		    // Convert HTML character references to their Unicode analogues
		    process_character_references($html);

		    remove_comments($html);

		    // Convert all tags to lower case
		    $html = lowercase_closing_tags(lowercase_tags($html));

		    // Remove SCRIPT tags, then re-insert the styles saved above
		    $html = process_script($html);

		    return insert_styles($html, $saved_styles);
		}
		356
		/**
		 * Converts "tag soup" HTML into well-formed XHTML by running a fixed pipeline
		 * of clean-up steps. The order of the steps matters and must be preserved.
		 *
		 * Many of the helpers called here are defined elsewhere.
		 * NOTE(review): the helpers called as plain statements presumably modify $html
		 * by reference - confirm against their definitions.
		 *
		 * @param string $html source HTML code
		 * @return string XHTML code
		 */
		function html2xhtml($html) {
		    process_pagebreak_commands($html);

		    // Remove SCRIPT tags from the page being processed, as script content may
		    // mess up the further html-parsing utilities
		    $html = process_script($html);

		    // STYLE tags are removed for the same reason and kept aside;
		    // they are added back at the very end
		    $saved_styles = process_style($html);

		    // Convert HTML character references to their Unicode analogues
		    process_character_references($html);

		    remove_comments($html);

		    // Attribute clean-up
		    fix_attrs_spaces($html);
		    $html = quote_attrs($html);
		    $html = escape_attrs_entities($html);

		    // Tag name normalization
		    $html = lowercase_tags($html);
		    $html = lowercase_closing_tags($html);
		    $html = fix_closing_tags($html);

		    // Tags which are turned into their self-closing form
		    $self_closing_tags = array(
		        "area", "base", "basefont", "br", "col", "embed", "frame",
		        "hr", "img", "input", "isindex", "link", "meta", "param"
		    );
		    foreach ($self_closing_tags as $tag_name) {
		        $html = close_tag($tag_name, $html);
		    }

		    // Attributes which may appear without a value and get name="name" form
		    $boolean_attrs = array(
		        "checked", "compact", "declare", "defer", "disabled", "ismap", "multiple",
		        "nohref", "noresize", "noshade", "nowrap", "readonly", "selected"
		    );
		    foreach ($boolean_attrs as $attr_name) {
		        $html = make_attr_value($attr_name, $html);
		    }

		    // Document skeleton: html, body, head, paragraphs
		    $html = process_html($html);
		    $html = process_body($html);

		    $html = process_head($html);
		    $html = process_p($html);

		    // Escape stray special characters
		    $html = escape_amp($html);
		    $html = escape_lt($html);
		    $html = escape_gt($html);

		    $html = escape_textarea_content($html);

		    // Structure-specific fixes
		    process_tables($html, 0);

		    process_lists($html, 0);
		    process_deflists($html, 0);
		    process_selects($html, 0);

		    // Generic tag balancing and attribute rebuilding
		    $html = fix_tags($html);
		    $html = fix_attrs($html);

		    return insert_styles($html, $saved_styles);
		}
		436
		/**
		 * Escapes '<', '>' and '&' inside the content of every <textarea> element, so
		 * that the content cannot be mistaken for markup.
		 *
		 * Numeric character references are used (&#60; &#62; &#38;). An ampersand which
		 * already starts a numeric reference ("&#...") is left alone; this also keeps
		 * the references produced for '<' and '>' intact.
		 *
		 * @param string $html source HTML code
		 * @return string HTML code with escaped textarea content
		 */
		function escape_textarea_content($html) {
		    // U modifier: all quantifiers are lazy, so each match covers a single textarea.
		    // Group 1: attributes of the opening tag; group 2: the content.
		    preg_match_all(
		        '#<textarea(.*)>(.*)<\s*/\s*textarea\s*>#Uis',
		        $html,
		        $matches,
		        PREG_OFFSET_CAPTURE | PREG_SET_ORDER
		    );

		    // Why cycle from the last to the first match?
		    // It keeps the offsets of unprocessed matches valid,
		    // as escaped content may differ from original content in length.
		    for ($i = count($matches)-1; $i >= 0; $i--) {
		        $match_content = $matches[$i][2][0];
		        $match_offset  = $matches[$i][2][1];

		        $escaped_content = str_replace(
		            array('<', '>'),
		            array('&#60;', '&#62;'),
		            $match_content
		        );
		        // Look-ahead instead of a consumed character: this also handles an ampersand
		        // at the very end of the content and two ampersands in a row
		        $escaped_content = preg_replace('/&(?!#)/', '&#38;', $escaped_content);

		        $html = substr_replace($html, $escaped_content, $match_offset, strlen($match_content));
		    }

		    return $html;
		}
		456
		/**
		 * Lower-cases the names of opening and closing tags.
		 *
		 * Only a name directly followed by whitespace or '>' is handled; attribute
		 * names and values are not touched.
		 *
		 * @param string $html source HTML code
		 * @return string HTML code with lower-cased tag names
		 */
		function lowercase_tags($html) {
		    $tag_name_regexp = "#^(.*?)(</?)([a-zA-z0-9]+)([\s>])#is";
		    $done = "";

		    while (preg_match($tag_name_regexp, $html, $parts)) {
		        // $parts: 1 - text before the tag, 2 - '<' or '</', 3 - tag name, 4 - terminator
		        $done .= $parts[1].$parts[2].strtolower($parts[3]).$parts[4];

		        // Continue with whatever follows the processed part
		        $html = substr($html, strlen($parts[0]));
		    }

		    return $done.$html;
		}
		469
		/**
		 * Lower-cases the names of attribute-less self-closing tags, e.g. <BR/> or
		 * <HR />. (lowercase_tags() does not handle them, as the tag name is followed
		 * by '/' there.)
		 *
		 * @param string $html source HTML code
		 * @return string HTML code with lower-cased self-closing tag names
		 */
		function lowercase_closing_tags($html) {
		    $result = "";

		    // Group 1: text before the tag; 2: '<'; 3: tag name; 4: '/>' with optional whitespace
		    while (preg_match("#^(.*?)(<)([a-zA-z0-9]+)(\s*/\s*>)#is", $html, $matches)) {
		        // Drop extracted part
		        $html = substr($html, strlen($matches[0]));
		        // Move extracted part to the result
		        $result .= $matches[1].$matches[2].strtolower($matches[3]).$matches[4];
		    }

		    return $result.$html;
		}
		482
		483	`?>`

Subversion-Projekte lars-tiefland.php_share

(root)/html2pdf/html2ps_v2042/public_html/xhtml.utils.inc.php – Revision 1