Subversion-Projekte lars-tiefland.php_share

Revision

Details | Letzte Änderung | Log anzeigen | RSS feed

Revision Autor Zeilennr. Zeile
1 lars 1
<?php
2
// $Header: /cvsroot/html2ps/xhtml.utils.inc.php,v 1.35 2007/03/15 18:37:36 Konstantin Exp $
3
 
4
/**
 * Rewrites every unclosed occurrence of the given element as an XML
 * self-closing tag, e.g. <br> becomes <br/> and <img src="a"> becomes
 * <img src="a"/>. Tags that already end with "/>" are left untouched.
 * Matching is case-insensitive.
 *
 * @param String $tag         element name; interpolated verbatim into the pattern
 * @param String $sample_html HTML code to process
 * @return String HTML code with the element self-closed
 */
function close_tag($tag, $sample_html) {
  // Group 1 is the tag without its final '>'. After the name we require either
  // whitespace (start of the attribute section) or the end of the tag, so that
  // "col" does not match "colgroup". The negative lookbehind rejects tags that
  // are already self-closed; unlike a trailing [^/>] class it also accepts a
  // tag with a single whitespace character before '>' (e.g. "<br >").
  return preg_replace("!(<{$tag}(?:\s[^>]*)?(?<!/))>!si", "\\1/>", $sample_html);
}
7
 
8
/**
 * Expands a minimized (value-less) attribute into name="name" form,
 * e.g. <input checked> becomes <input checked="checked">.
 *
 * @param String $attr attribute name; interpolated verbatim into the pattern
 * @param String $html HTML code to process
 * @return String HTML code with the attribute expanded
 */
function make_attr_value($attr, $html) {
  // The bare attribute must be preceded by whitespace inside a tag and be
  // followed by whitespace or by the end of the tag ('>' or '/>')
  $bare_attr_regexp = "#(<[^>]*\s){$attr}(\s|>|/>)#si";
  $expanded_form    = "\\1{$attr}=\"{$attr}\"\\2";

  return preg_replace($bare_attr_regexp, $expanded_form, $html);
}
11
 
12
 
13
/**
 * Builds a delimiter-less regular expression fragment matching the opening
 * tag of the given element, with optional attributes. The fragment contains
 * one capturing group (the attribute section).
 */
function mk_open_tag_regexp($tag) {
  return '<\s*'.$tag.'(\s+[^>]*)?>';
}
14
/**
 * Builds a delimiter-less regular expression fragment matching the closing
 * tag of the given element; whitespace around the slash and the name is
 * tolerated. The fragment contains no capturing groups.
 */
function mk_close_tag_regexp($tag) {
  return '<\s*/\s*'.$tag.'\s*>';
}
15
 
16
/**
 * Makes sure the document is wrapped into exactly one <html>...</html> pair
 * and that nothing precedes the opening tag or follows the closing one.
 *
 * @param String $html source HTML code
 * @return String HTML code starting with an <html> tag and ending with </html>
 */
function process_html($html) {
  $open  = mk_open_tag_regexp("html");
  $close = mk_close_tag_regexp("html");

  // No opening tag at all: add one in front of the document
  if (!preg_match("#{$open}#is", $html)) {
    $html = "<html>".$html;
  }

  /**
   * Check whether there is more than one <html> tag inside the page text.
   * If there is, keep a single (attribute-less) <html> tag followed by the
   * content found between the first and the second tag.
   * Note that $open contains a capturing group of its own, so the content
   * between the two tags is group 2.
   */
  while (preg_match("#{$open}(.*?){$open}#is", $html)) {
    $merged = preg_replace("#{$open}(.*?){$open}#is", "<html>\\2", $html);
    if ($merged === null) {
      // PCRE failure (see below): keep what we have instead of losing the page
      break;
    }
    $html = $merged;
  }

  // No closing tag: append one
  if (!preg_match("#{$close}#is", $html)) {
    $html = $html."</html>";
  }

  // Cut off all data before and after the 'html' tag; unless we do it,
  // the XML parser will die violently.
  //
  // PHP 5.2.0 compatibility issue: preg_replace may return NULL on large
  // files (PCRE limits). Assigning that NULL would silently wipe the whole
  // document, so a failed replacement is skipped and the text left as it is.
  $cleanups = array(
    array("#.*({$open})#is",     "\\1"),
    array("#^.*<html#is",        "<html"),
    array("#</html\s*>.*$#is",   "</html>"),
  );
  foreach ($cleanups as $cleanup) {
    $cleaned = preg_replace($cleanup[0], $cleanup[1], $html);
    if ($cleaned !== null) {
      $html = $cleaned;
    }
  }

  return $html;
}
51
 
52
/**
 * Makes sure the document has a complete <head>...</head> section.
 *
 * - No <head> tag: everything found between the <html> tag and the <body>
 *   tag is wrapped into a new <head>...</head> pair.
 * - <head> without </head>: a </head> is inserted in front of the <body> tag
 *   or, when there is no <body> tag, in front of </html>.
 *
 * @param String $html source HTML code
 * @return String updated HTML code
 */
function process_head($html) {
  $head_open  = mk_open_tag_regexp("head");
  $head_close = mk_close_tag_regexp("head");
  $html_open  = mk_open_tag_regexp("html");
  $html_close = mk_close_tag_regexp("html");
  $body_open  = mk_open_tag_regexp("body");

  if (!preg_match("#{$head_open}#is",$html)) {
    // Each "open tag" regexp carries its own inner capturing group, hence the
    // group numbers: 1 = <html ...>, 3 = content in between, 4 = <body ...>
    return preg_replace("#({$html_open})(.*)({$body_open})#is","\\1<head>\\3</head>\\4",$html);
  }

  if (preg_match("#{$head_close}#is",$html)) {
    // Both tags are present; nothing to fix
    return $html;
  }

  // Only the closing tag is missing; pick the tag it should be put in front of
  $anchor = preg_match("#{$body_open}#is",$html) ? $body_open : $html_close;

  return preg_replace("#({$anchor})#is","</head>\\1",$html);
}
70
 
71
/**
 * Makes sure the document has a <body>...</body> pair and that no content is
 * left between </head> and <body> or between </body> and </html>.
 *
 * @param String $html source HTML code
 * @return String updated HTML code
 */
function process_body($html) {
  $body_open  = mk_open_tag_regexp("body");
  $body_close = mk_close_tag_regexp("body");
  $html_open  = mk_open_tag_regexp("html");
  $html_close = mk_close_tag_regexp("html");
  $head_close = mk_close_tag_regexp("head");

  // Missing <body>: open it right after </head> or, when there is no </head>,
  // right after the <html> tag
  if (!preg_match("#{$body_open}#is",$html)) {
    $anchor = preg_match("#{$head_close}#is",$html) ? $head_close : $html_open;
    $html   = preg_replace("#({$anchor})#is","\\1<body>",$html);
  }

  // Missing </body>: close it right before </html>
  if (!preg_match("#{$body_close}#is",$html)) {
    $html = preg_replace("#({$html_close})#is","</body>\\1",$html);
  }

  // Data found between </head> and <body> is moved just after the <body> tag
  $html = preg_replace("#({$head_close})(.+)({$body_open})#is","\\1\\3\\2",$html);
  // Data found between </body> and </html> is moved just before </body>
  $html = preg_replace("#({$body_close})(.+)({$html_close})#is","\\2\\1\\3",$html);

  return $html;
}
96
 
97
// Hmmm. Maybe we should just write a SAX parser in PHP? ;-)
98
/**
 * Balances the tags of an HTML fragment using a stack of currently open tags.
 *
 * - A closing tag matching the most recently opened tag is passed through.
 * - A closing tag matching a tag deeper in the stack first closes every tag
 *   opened after it; those "run-off" tags are re-opened afterwards (without
 *   their attributes) unless the closed tag is one of tr, td, table, marquee,
 *   body or html. Such a closing tag is ignored when honouring it would
 *   implicitly close a <table>.
 * - A closing tag with no matching opening tag is dropped.
 * - Tags still open at the end of the input are closed.
 *
 * Tag names are compared as-is (case-sensitively).
 *
 * @param String $html source HTML code
 * @return String HTML code with balanced tags
 */
function fix_tags($html) {
  $result = "";
  // Stack of open tag names; index 0 is the most recently opened tag
  $tag_stack = array();
  $no_reopen_tags = array("tr","td","table","marquee","body","html");

  // these corrections simplify the regexp used to parse tags
  // remove whitespaces before '/' and between '/' and '>' in autoclosing tags
  $html = preg_replace("#\s*/\s*>#is","/>",$html);
  // remove whitespaces between '<', '/' and first tag letter in closing tags
  $html = preg_replace("#<\s*/\s*#is","</",$html);
  // remove whitespaces between '<' and first tag letter
  $html = preg_replace("#<\s+#is","<",$html);

  // Group 1: text before the tag; 2: the whole tag; 3: name of an autoclosing
  // tag; 4: name of an opening tag; 5: name of a closing tag. preg_match omits
  // trailing groups which did not take part in the match, so isset() on
  // groups 5 and 4 tells the three kinds apart.
  while (preg_match("#(.*?)(<([a-z\d]+)[^>]*/>|<([a-z\d]+)[^>]*(?<!/)>|</([a-z\d]+)[^>]*>)#is",$html,$matches)) {
    $result .= $matches[1];
    $html = substr($html, strlen($matches[0]));

    if (isset($matches[5])) {
      // Closing tag
      $tag = $matches[5];
      // Strict lookup: position 0 and "not found" (false) must not be mixed up
      $tag_pos = array_search($tag, $tag_stack, true);

      if ($tag_pos === 0) {
        // Matched the last opening tag (normal state);
        // just pop the opening tag from the stack
        array_shift($tag_stack);
        $result .= $matches[2];
      } elseif ($tag_pos !== false) {
        // The opening tag is somewhere deeper in the stack. We should never
        // close a 'table' tag this way, so only proceed when there is no
        // 'table' above the tag being closed.
        $table_pos = array_search('table', $tag_stack, true);

        if ($table_pos === false || $table_pos >= $tag_pos) {
          // Insert closing tags for all tags opened after the one being closed
          for ($i = 0; $i < $tag_pos; $i++) {
            $result .= "</{$tag_stack[$i]}> ";
          }

          // close current tag and remove it from the stack
          $result .= "</{$tag}> ";
          array_splice($tag_stack, $tag_pos, 1);

          if (!in_array($tag, $no_reopen_tags, true)) {
            // Re-open the "run-off" tags, outermost first; they stay on the stack
            for ($i = $tag_pos - 1; $i >= 0; $i--) {
              $result .= "<{$tag_stack[$i]}> ";
            }
          } else {
            // "Critical" tag: the run-off tags stay closed
            array_splice($tag_stack, 0, $tag_pos);
          }
        }
      }
      // Otherwise: no such tag on the stack; the closing tag is dropped simply
      // by not adding it to the result
    } elseif (isset($matches[4])) {
      // Opening tag
      array_unshift($tag_stack, $matches[4]);
      $result .= $matches[2];
    } else {
      // Autoclosing tag; nothing specific to do
      $result .= $matches[2];
    }
  }

  // Keep the text following the last tag (substr may have returned false on
  // older PHP versions, which concatenates as an empty string)
  $result .= $html;

  // Close all tags left
  while (count($tag_stack) > 0) {
    $result .= "</".array_shift($tag_stack).">";
  }

  return $result;
}
179
 
180
/**
 * Wraps unquoted attribute values into single quotes, e.g. <a href=x>
 * becomes <a href='x'>. Attribute values which are already quoted are left
 * unchanged.
 *
 * @param String $html source HTML code
 * @return String HTML code with quoted attribute values
 */
function quote_attrs($html) {
  // An unquoted value is a run of characters other than quotes, space, CR, LF
  // and '>', terminated by one of space, CR, LF or '>'
  $probe     = "!(<[^>]*)\s([^=>]+)=([^'\"\r\n >]+)([\r\n >])!si";
  $rewrite   = "#(<[^>]*)\s([^=>]+)=([^'\"\r\n >]+)([\r\n >])#si";
  $quoted    = "\\1 \\2='\\3'\\4";

  // A single replacement pass handles at most one attribute per tag start
  // (the greedy prefix swallows the others), so repeat until nothing matches
  while (preg_match($probe, $html, $matches)) {
    $html = preg_replace($rewrite, $quoted, $html);
  }

  return $html;
}
189
 
190
/**
 * Escapes the contents of a single attribute value: '<' and '>' are replaced
 * with &lt; and &gt;, then the value is passed through
 * process_character_references() and escape_amp().
 *
 * @param String $html attribute value
 * @return String escaped attribute value
 */
function escape_attr_value_entities($html) {
  $html = str_replace(array("<", ">"), array("&lt;", "&gt;"), $html);

  // Replace all character references by their decimal codes.
  // NOTE(review): the return value is not used, so this helper presumably
  // takes its argument by reference - confirm against its definition.
  process_character_references($html);

  return escape_amp($html);
}
199
 
200
/**
 * Updates attribute values: if there are any unescaped <, > or & symbols
 * inside a quoted attribute value, replaces them with the corresponding
 * entity (see escape_attr_value_entities). Note that & should not be escaped
 * if it is already a part of an entity reference.
 *
 * The whitespace in front of every processed attribute name is normalized to
 * a single space.
 *
 * @param String $html source HTML code
 * @return String updated HTML code
 */
function escape_attrs_entities($html) {
  // Already processed right-hand part of the document
  $tail = "";

  // The regular expression may be described as follows:
  // ^(.*)          - everything before the tag; being greedy, it makes the
  //                  expression match the LAST attribute in the text
  // (<[^>]*)       - something starting with < (i.e. tag name and, probably,
  //                  some attribute name/value pairs)
  // \s([^\s=>]+)=  - space after "something", followed by the attribute name
  //                  (anything except spaces, = and > signs)
  // (['\"])([^\4]*?)\4 - quoted attribute value; inside a character class \4
  //                  is not a back-reference, so in practice the value is just
  //                  the shortest run of characters up to the matching quote
  //                  (@todo won't work with escaped quotes inside the value)
  // (.*)$          - the rest of the text
  //
  // As the last attribute is matched first, the text is processed from right
  // to left: the escaped attribute and everything after it is moved to $tail
  // and the search goes on in the remaining head. (Continuing in the part
  // AFTER the match would never find anything, so only one attribute of the
  // whole document would get escaped.) The head gets shorter with every
  // iteration, so the loop terminates.
  while (preg_match("#^(.*)(<[^>]*)\s([^\s=>]+)=(['\"])([^\\4]*?)\\4(.*)$#si", $html, $matches)) {
    $new_value = escape_attr_value_entities($matches[5]);

    $tail = " ".$matches[3]."=".$matches[4].$new_value.$matches[4].$matches[6].$tail;
    $html = $matches[1].$matches[2];
  }

  return $html.$tail;
}
224
 
225
/**
 * Inserts a space after the closing quote of a quoted attribute value
 * whenever the quote is immediately followed by a non-whitespace character
 * (including the '>' or '/' ending the tag). Double-quoted values are
 * processed first, then single-quoted ones.
 *
 * @param String $html HTML code; modified in place
 */
function fix_attrs_spaces(&$html) {
  // Pairs of (pattern, replacement); one pair per quote style
  $rules = array(
    array("#(<[^>]*)\s([^\s=>]+)=\"([^\"]*?)\"([^\s])#si", "\\1 \\2=\"\\3\" \\4"),
    array("#(<[^>]*)\s([^\s=>]+)='([^']*?)'([^\s])#si",    "\\1 \\2='\\3' \\4"),
  );

  foreach ($rules as $rule) {
    // One pass fixes at most one attribute per tag start; repeat until clean
    while (preg_match($rule[0], $html)) {
      $html = preg_replace($rule[0], $rule[1], $html);
    }
  }
}
234
 
235
/**
 * Rebuilds a single opening or autoclosing tag from its parsed parts: tag
 * name followed by its attributes in name="value" form (name='value' when the
 * value contains a double quote).
 *
 * Attribute names are lowercased; when an attribute occurs more than once
 * the first value wins. Unparseable junk between attributes is skipped and
 * attributes rejected by HTML2PS_XMLUtils::valid_attribute_name are dropped.
 *
 * @param String $tag text containing the tag
 * @return String rebuilt tag; nothing (NULL) when $tag contains no tag at all
 */
function fix_attrs_tag($tag) {
  // Split the tag into '<', content and the closing '/>' or '>'.
  // The autoclosing form is tried first.
  $pieces = null;
  foreach (array("#(<)(.*?)(/\s*>)#is", "#(<)(.*?)(>)#is") as $shape) {
    if (preg_match($shape, $tag, $matches)) {
      $pieces = $matches;
      break;
    }
  }
  if ($pieces === null) {
    return;
  }
  $prefix  = $pieces[1];
  $content = $pieces[2];
  $suffix  = $pieces[3];

  // Split the content into tag name and raw attribute text (a trailing slash
  // is dropped). When neither form matches, a strange tag occurred: just
  // remove everything.
  $tagname   = "";
  $raw_attrs = "";
  foreach (array("#^\s*(\w+)\s*(.*)\s*/\s*\$#is", "#^\s*(\w+)\s*(.*)\$#is") as $shape) {
    if (preg_match($shape, $content, $matches)) {
      $tagname   = $matches[1];
      $raw_attrs = isset($matches[2]) ? $matches[2] : "";
      break;
    }
  }

  // Attribute forms, in order of preference: name="value", name='value' and
  // name=value. Group 1 is the name, 2 the value, 3 the remaining text.
  $attr_shapes = array(
    "#^\s*(\w+?)\s*=\s*\"(.*?)\"(.*)$#is",
    "#^\s*(\w+?)\s*=\s*'(.*?)'(.*)$#is",
    "#^\s*(\w+?)=(\w+)(.*)$#is",
  );

  $attrs = array();
  while (!empty($raw_attrs)) {
    $parsed = false;
    foreach ($attr_shapes as $shape) {
      if (preg_match($shape, $raw_attrs, $matches)) {
        $name = strtolower($matches[1]);
        if (!isset($attrs[$name])) {
          $attrs[$name] = $matches[2];
        }
        $raw_attrs = $matches[3];
        $parsed    = true;
        break;
      }
    }
    if ($parsed) {
      continue;
    }

    if (preg_match("#^\s*\S+\s+(.*)$#is",$raw_attrs,$matches)) {
      // Just junk at the beginning; skip till the first space
      $raw_attrs = $matches[1];
    } else {
      $raw_attrs = "";
    }
  }

  $str = "";
  foreach ($attrs as $key => $value) {
    // In theory, if garbage has been found inside the attrs section, we could
    // get an invalid attribute name here; such attributes are ignored
    if (!HTML2PS_XMLUtils::valid_attribute_name($key)) {
      continue;
    }
    $quote = (strpos($value,'"') !== false) ? "'" : "\"";
    $str  .= " ".$key."=".$quote.$value.$quote;
  }

  return $prefix.$tagname.$str.$suffix;
}
312
 
313
/**
 * Passes every opening or autoclosing tag of the document (anything starting
 * with '<' not followed by '/', up to the nearest '>') through
 * fix_attrs_tag(); the text between the tags is copied unchanged.
 *
 * @param String $html source HTML code
 * @return String HTML code with normalized tag attributes
 */
function fix_attrs($html) {
  $fixed     = "";
  $remaining = $html;

  while (preg_match("#^(.*?)(<[^/].*?>)#is",$remaining,$matches)) {
    list($consumed, $text_before, $raw_tag) = $matches;

    $fixed    .= $text_before.fix_attrs_tag($raw_tag);
    $remaining = substr($remaining, strlen($consumed));
  }

  return $fixed.$remaining;
}
323
 
324
/**
 * Normalizes closing tags: whitespace after "</" and anything following the
 * tag name (spaces, stray attributes, line breaks) is removed, so that
 * "</ div class='x' >" becomes "</div>".
 *
 * @param String $html source HTML code
 * @return String HTML code with normalized closing tags
 */
function fix_closing_tags($html) {
  // [^>]* (rather than .*? without the 's' modifier) also skips line breaks
  // between the tag name and the closing '>'
  return preg_replace("#</\s*(\w+)[^>]*>#","</\\1>",$html);
}
327
 
328
/**
 * Replaces the two supported page break markers (the "page-break" processing
 * instruction and the "NewPage" comment) with a <pagebreak/> tag.
 * Matching is case-sensitive.
 *
 * @param String $html HTML code; modified in place
 */
function process_pagebreak_commands(&$html) {
  $markers = '#<\?page-break>|<!--NewPage-->#';
  $html    = preg_replace($markers, "<pagebreak/>", $html);
}
331
 
332
/**
 * Light-weight preprocessing for input which is already XHTML: handles page
 * break markers, styles, character references, comments and scripts, and
 * lowercases tag names. No tag or attribute repair is attempted.
 *
 * NOTE(review): several helpers below are called without using their return
 * value, so they presumably modify $html by reference - confirm against their
 * definitions.
 *
 * @param String $html source XHTML code
 * @return String preprocessed XHTML code
 */
function xhtml2xhtml($html) {
  process_pagebreak_commands($html);

  // Take the STYLE tags out and keep them aside;
  // they are added back at the very end
  $styles = process_style($html);

  // Convert HTML character references to their Unicode analogues
  process_character_references($html);

  remove_comments($html);

  // Convert all tags to lower case (autoclosing tags are handled by the
  // second function)
  $html = lowercase_closing_tags(lowercase_tags($html));

  // Remove SCRIPT tags
  $html = process_script($html);

  return insert_styles($html, $styles);
}
356
 
357
/**
 * Converts (possibly sloppy) HTML into XHTML-like markup by running the whole
 * repair pipeline: scripts/styles/comments handling, attribute quoting and
 * escaping, tag name lowercasing, closing of empty elements, expansion of
 * minimized attributes, document skeleton repair, entity escaping, structure
 * processing and final tag balancing.
 *
 * NOTE(review): several helpers below are called without using their return
 * value, so they presumably modify $html by reference - confirm against their
 * definitions.
 *
 * @param String $html source HTML code
 * @return String converted code
 */
function html2xhtml($html) {
  process_pagebreak_commands($html);

  // Remove SCRIPT tags from the page being processed, as script content may
  // confuse the further html-parsing utilities
  $html = process_script($html);

  // Remove STYLE tags for the same reason and keep them aside;
  // they are added back at the very end
  $styles = process_style($html);

  // Convert HTML character references to their Unicode analogues
  process_character_references($html);

  remove_comments($html);

  // Attribute clean-up: spacing, quoting, entity escaping
  fix_attrs_spaces($html);
  $html = quote_attrs($html);
  $html = escape_attrs_entities($html);

  $html = lowercase_tags($html);
  $html = lowercase_closing_tags($html);

  $html = fix_closing_tags($html);

  // Elements which have no content: turn <tag ...> into <tag .../>
  $empty_elements = array(
    "area", "base", "basefont", "br", "col", "embed", "frame",
    "hr", "img", "input", "isindex", "link", "meta", "param",
  );
  foreach ($empty_elements as $element) {
    $html = close_tag($element, $html);
  }

  // Minimized attributes: turn 'name' into 'name="name"'
  $minimized_attrs = array(
    "checked", "compact", "declare", "defer", "disabled", "ismap", "multiple",
    "nohref", "noresize", "noshade", "nowrap", "readonly", "selected",
  );
  foreach ($minimized_attrs as $attr) {
    $html = make_attr_value($attr, $html);
  }

  // Document skeleton; note that the body is processed before the head
  $html = process_html($html);
  $html = process_body($html);

  $html = process_head($html);
  $html = process_p($html);

  $html = escape_amp($html);
  $html = escape_lt($html);
  $html = escape_gt($html);

  $html = escape_textarea_content($html);

  process_tables($html,0);

  process_lists($html,0);
  process_deflists($html,0);
  process_selects($html,0);

  $html = fix_tags($html);
  $html = fix_attrs($html);

  return insert_styles($html, $styles);
}
436
 
437
/**
 * Escapes the markup-significant characters found between <textarea ...> and
 * </textarea>: '<' becomes &#60;, '>' becomes &#62; and every '&' which does
 * not start a numeric character reference ("&#...") becomes &#38;.
 *
 * @param String $html source HTML code
 * @return String HTML code with escaped textarea contents
 */
function escape_textarea_content($html) {
  // The U modifier makes both (.*) groups ungreedy: group 1 ends at the first
  // '>' and group 2 (the content) at the nearest closing tag
  preg_match_all('#<textarea(.*)>(.*)<\s*/\s*textarea\s*>#Uis', $html, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER);

  // Why cycle from the last to the first match?
  // It keeps the offsets of not yet processed matches valid,
  // as escaped content may differ from original content in length
  for ($i = count($matches)-1; $i>=0; $i--) {
    $match_content = $matches[$i][2][0];
    $match_offset  = $matches[$i][2][1];
    $match_length  = strlen($match_content);

    // '<' and '>' go first; the references they produce start with "&#" and
    // thus are not touched by the ampersand escaping
    $escaped_content = str_replace('<', '&#60;', $match_content);
    $escaped_content = str_replace('>', '&#62;', $escaped_content);
    // A lookahead (instead of consuming the next character) makes sure that
    // consecutive ampersands ("&&") and an ampersand at the very end of the
    // content are escaped too
    $escaped_content = preg_replace('/&(?!#)/', '&#38;', $escaped_content);

    $html = substr_replace($html, $escaped_content, $match_offset, $match_length);
  }

  return $html;
}
456
 
457
/**
 * Lowercases the names of opening and closing tags, i.e. names following
 * '<' or '</' and followed by whitespace or '>'. Autoclosing tags written
 * without whitespace in front of the slash (<BR/>) are not matched here.
 *
 * NOTE(review): the character class reads "A-z", not "A-Z"; it is kept as is.
 *
 * @param String $html source HTML code
 * @return String HTML code with lowercased tag names
 */
function lowercase_tags($html) {
  $converted = "";
  $remaining = $html;

  while (preg_match("#^(.*?)(</?)([a-zA-z0-9]+)([\s>])#is",$remaining,$matches)) {
    list($consumed, $text_before, $bracket, $name, $terminator) = $matches;

    // Move the extracted part to the result...
    $converted .= $text_before.$bracket.strtolower($name).$terminator;
    // ...and drop it from the text still to be processed
    $remaining = substr($remaining, strlen($consumed));
  }

  return $converted.$remaining;
}
469
 
470
/**
 * Lowercases the names of attribute-less autoclosing tags, i.e. names
 * following '<' and followed by optional whitespace and '/>' (<BR/>, <HR />).
 *
 * NOTE(review): the character class reads "A-z", not "A-Z"; it is kept as is.
 *
 * @param String $html source HTML code
 * @return String HTML code with lowercased autoclosing tag names
 */
function lowercase_closing_tags($html) {
  $converted = "";
  $remaining = $html;

  while (preg_match("#^(.*?)(<)([a-zA-z0-9]+)(\s*/\s*>)#is",$remaining,$matches)) {
    list($consumed, $text_before, $bracket, $name, $terminator) = $matches;

    // Move the extracted part to the result...
    $converted .= $text_before.$bracket.strtolower($name).$terminator;
    // ...and drop it from the text still to be processed
    $remaining = substr($remaining, strlen($consumed));
  }

  return $converted.$remaining;
}
482
 
483
?>