| 1 |
lars |
1 |
<?php
|
|
|
2 |
// $Header: /cvsroot/html2ps/xhtml.entities.inc.php,v 1.11 2006/12/24 14:42:44 Konstantin Exp $
|
|
|
3 |
|
|
|
4 |
/**
 * Rewrite symbolic and hexadecimal character references found in $html as
 * decimal numeric character references (&#NNN;). The string is modified in
 * place; nothing is returned.
 *
 * Entity names and their decimal codes are read from the global
 * $g_html_entities map (entity name => code), which is declared elsewhere.
 *
 * @param string $html Markup to rewrite (passed by reference).
 */
function process_character_references(&$html) {
  global $g_html_entities;

  // Symbolic character references
  foreach ($g_html_entities as $name => $codepoint) {
    $numeric = "&#{$codepoint};";

    // Properly terminated reference: &name;
    $html = str_replace("&{$name};", $numeric, $html);

    // Some pages contain symbolic references without the terminating
    // semicolon. Repair those as well, converting them to the numeric form.
    //
    // Only whitespace or the start of a tag ([\s<]) is accepted as the
    // terminator, so a longer entity name sharing this prefix is never
    // split in the middle.
    $html = preg_replace("/&{$name}([\s<])/", $numeric."\\1", $html);
  }

  // Hexadecimal character references (2 to 4 hex digits)
  //
  // The 'x' marker and the digits may be in either case. A regular
  // expression with the /i modifier is used instead of str_ireplace, which
  // is only available from PHP 5 on.
  $found = array();
  while (preg_match("/&#x([[:xdigit:]]{2,4});/i", $html, $found)) {
    $digits = $found[1];
    $html = preg_replace("/&#x{$digits};/i", "&#".hexdec($digits).";", $html);
  }
}
|
|
|
30 |
|
|
|
31 |
/**
 * Escape every ampersand that does not start a decimal numeric character
 * reference, and terminate numeric references that lack the closing ';'.
 *
 * The original comments state that symbolic references are expected to
 * have been converted to numeric ones before this function is called, so
 * any remaining '&' that is not followed by '#' and a digit is treated as
 * a literal ampersand.
 *
 * @param string $html Markup to process.
 * @return string Markup with stray ampersands written as &#38;.
 */
function escape_amp($html) {
  // Complete all numeric character references not terminated with ';'.
  // The lookahead rejects both a following digit (so the whole number is
  // taken) and a following ';' (so complete references are left alone).
  $html = preg_replace('/&#(\d+)(?![\d;])/', '&#\\1;', $html);

  // Escape every ampersand that is not followed by '#' and a digit. This
  // covers plain ampersands as well as "&#" followed by a non-digit.
  //
  // The replacement must be the numeric reference &#38;: it starts with
  // "&#" plus a digit, so the ampersand it introduces is never matched
  // again. Writing the bare character back here would leave the input
  // unescaped.
  $html = preg_replace('/&(?!#\d)/', '&#38;', $html);

  return $html;
}
|
|
|
48 |
|
|
|
49 |
/**
 * Escape '<' characters that cannot be the start of a tag, writing them as
 * the numeric character reference &#60;.
 *
 * Two situations are handled:
 *  - a '<' whose next character is not '!', '/' or an ASCII letter;
 *  - a '<' that appears after another '<' with no '>' in between (that is,
 *    inside a tag that has not been closed yet).
 *
 * @param string $html Markup to process.
 * @return string Markup with stray '<' characters escaped.
 */
function escape_lt($html) {
  // A '<' followed by something that cannot begin a tag name, a comment or
  // a closing tag. The following character is tested with a lookahead and
  // is not consumed, so runs such as "<<<a>" are handled in a single pass:
  // every '<' in the run is examined on its own.
  $html = preg_replace('#<(?=\s*[^!/a-zA-Z])#', '&#60;', $html);

  // A second '<' inside a still-open tag. Each pass escapes at most one
  // '<' per opening '<', so repeat until nothing is left to fix. The loop
  // terminates because the replacement contains no '<' and every pass
  // therefore removes at least one '<' from the string.
  while (preg_match('#(<[^>]*?)<#', $html)) {
    $html = preg_replace('#(<[^>]*?)<#', '\\1&#60;', $html);
  }

  return $html;
}
|
|
|
68 |
|
|
|
69 |
/**
 * Escape '>' characters that cannot be the end of a tag, writing them as
 * the numeric character reference &#62;.
 *
 * Two situations are handled:
 *  - a '>' preceded (optionally across whitespace) by a character that is
 *    not whitespace, a digit, an ASCII letter, a quote, '/', '=' or '-';
 *    any whitespace between that character and the '>' is dropped;
 *  - a '>' that appears after another '>' with no '<' in between (that is,
 *    outside of any tag).
 *
 * @param string $html Markup to process.
 * @return string Markup with stray '>' characters escaped.
 */
function escape_gt($html) {
  // A '>' that follows a character which cannot end a tag name, an
  // attribute value, a self-closing tag or a comment.
  $html = preg_replace('#([^\s\da-zA-Z\'"/=-])\s*>#', '\\1&#62;', $html);

  // A second '>' with no '<' since the previous one. Each pass escapes at
  // most one '>' per leading '>', so repeat until nothing is left to fix.
  // The loop terminates because the replacement contains no '>' and every
  // pass therefore removes at least one '>' from the string.
  while (preg_match('#(>[^<]*?)>#', $html)) {
    $html = preg_replace('#(>[^<]*?)>#', '\\1&#62;', $html);
  }

  return $html;
}
|
|
|
78 |
|
|
|
79 |
?>
|