Subversion-Projekte lars-tiefland.php_share

Revision

Details | Letzte Änderung | Log anzeigen | RSS feed

Revision Autor Zeilennr. Zeile
1 lars 1
<?php
2
// $Header: /cvsroot/html2ps/xhtml.entities.inc.php,v 1.11 2006/12/24 14:42:44 Konstantin Exp $
3
 
4
/**
 * Rewrite symbolic and hexadecimal character references in $html as decimal
 * numeric character references (&#NNN;). The string is modified in place.
 *
 * The symbolic names and their code points are read from the global
 * $g_html_entities map (entity name => numeric code).
 *
 * @param string $html Markup to process; passed by reference and overwritten.
 * @return void
 */
function process_character_references(&$html) {
  // Process symbolic character references
  global $g_html_entities;
  foreach ($g_html_entities as $entity => $code) {
    // Well-formed reference: &name;
    $html = str_replace("&{$entity};","&#{$code};",$html);

    // Some pages contain symbolic references without the terminating
    // semicolon. Repair those damaged entities by converting them to a
    // numeric character reference as well.
    //
    // [\s<] is used as the entity name terminator so that longer entity names
    // sharing this prefix are not broken up: only references followed by
    // whitespace or by the start of an HTML tag are rewritten.
    $html = preg_replace("/&{$entity}([\s<])/","&#{$code};\\1",$html);
  }

  // Process hexadecimal character references.
  //
  // Accept 1 to 8 hex digits: short forms such as &#xA; and code points above
  // U+FFFF such as &#x1F600; are valid references and must be converted too
  // (a {2,4} quantifier would leave them untouched).
  while (preg_match("/&#x([[:xdigit:]]{1,8});/i", $html, $matches)) {
    // Plain str_replace cannot be used because the 'x' marker and the hex
    // digits may appear in either case; str_ireplace is PHP 5 only, so a
    // case-insensitive regular expression is used for compatibility.
    // Every occurrence of this particular reference is replaced at once, so
    // the loop runs once per distinct reference.
    $html = preg_replace("/&#x".$matches[1].";/i","&#".hexdec($matches[1]).";",$html);
  }
}
30
 
31
/**
 * Escape ampersands that do not start a numeric character reference and
 * terminate numeric references that lack a trailing ';'.
 *
 * The original note on this function says symbolic references are expected
 * to have been converted to numeric ones before it is called.
 *
 * @param string $html Markup to process.
 * @return string Markup after the three passes below, applied in order.
 */
function escape_amp($html) {
  // Ordered list of (pattern, replacement) passes. None of the '&' patterns
  // has a capture group, so the "\\1" in their replacements expands to nothing.
  $passes = array(
    // 1. Ampersand that is not followed by '#'.
    array("/&(?!#)/si", "&#38;\\1"),
    // 2. Numeric reference whose digits are not followed by ';': append it.
    array("/&#(\d+)(?![\d;])/si", "&#\\1;"),
    // 3. Ampersand followed by '#' and a non-digit: not covered by the passes
    //    above and not a numeric reference, so escape it too. The '&#38;'
    //    sequences produced by pass 1 are left alone.
    array("/&(?!#\d)/si", "&#38;\\1"),
  );

  foreach ($passes as $pass) {
    $html = preg_replace($pass[0], $pass[1], $html);
  }

  return $html;
}
48
 
49
/**
 * Replace '<' characters that do not look like the start of a tag with the
 * numeric reference &#60;.
 *
 * @param string $html Markup to process.
 * @return string Markup with the stray '<' characters escaped.
 */
function escape_lt($html) {
  // Pass 1: a '<' followed (after optional whitespace) by a character that is
  // not '!', '/' or an ASCII letter.
  //
  // Repetition is required because each match consumes two characters: the
  // '<' and the one after it. In a run such as "<<<a>" the second '<' is
  // swallowed as the follower of the first and is therefore skipped by the
  // same replacement sweep; it is only picked up on the next iteration.
  for (;;) {
    if (!preg_match("#<(\s*[^!/a-zA-Z])#",$html)) {
      break;
    }
    $html = preg_replace("#<(\s*[^!/a-zA-Z])#si","&#60;\\1",$html);
  }

  // Pass 2: a '<' that appears after an earlier '<' with no '>' in between.
  // The later '<' is the one escaped; repeat until no such pair remains.
  for (;;) {
    if (!preg_match("#(<[^>]*?)<#si",$html)) {
      break;
    }
    $html = preg_replace("#(<[^>]*?)<#si","\\1&#60;",$html);
  }

  return $html;
}
68
 
69
/**
 * Replace '>' characters that do not look like the end of a tag with the
 * numeric reference &#62;.
 *
 * @param string $html Markup to process.
 * @return string Markup with the stray '>' characters escaped.
 */
function escape_gt($html) {
  // A '>' preceded (ignoring whitespace) by a character that is not
  // whitespace, a digit, an ASCII letter, a quote, '/', '=' or '-' is escaped.
  // The optional whitespace is kept inside the capture group so that it is
  // written back unchanged; only the '>' itself is substituted.
  $html = preg_replace("#([^\s\da-zA-Z'\"/=-]\s*)>#si","\\1&#62;",$html);

  // A '>' that appears after an earlier '>' with no '<' in between: escape
  // the later one. Each sweep handles non-overlapping pairs only, so repeat
  // until no such pair remains.
  while (preg_match("#(>[^<]*?)>#si",$html)) {
    $html = preg_replace("#(>[^<]*?)>#si","\\1&#62;",$html);
  }

  return $html;
}
78
 
79
?>