Subversion-Projekte lars-tiefland.prado

Revision

Details | Letzte Änderung | Log anzeigen | RSS feed

Revision Autor Zeilennr. Zeile
1 lars 1
<?php
2
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
3
 
4
/**
5
 * SafeHTML Parser
6
 *
7
 * PHP versions 4 and 5
8
 *
9
 * @category   HTML
10
 * @package    System.Security
11
 * @author     Roman Ivanov <thingol@mail.ru>
12
 * @copyright  2004-2005 Roman Ivanov
13
 * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
14
 * @version    1.3.7
15
 * @link       http://pixel-apes.com/safehtml/
16
 */
17
 
18
 
19
/**
20
 * This package requires HTMLSax3 package
21
 */
22
Prado::using('System.3rdParty.SafeHtml.HTMLSax3');
23
 
24
 
25
/**
26
 *
27
 * SafeHTML Parser
28
 *
29
 * This parser strips down all potentially dangerous content within HTML:
30
 * <ul>
31
 * <li>opening tag without its closing tag</li>
32
 * <li>closing tag without its opening tag</li>
33
 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
34
 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
35
 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
36
 * <li>any of these attributes: on*, data*, dynsrc</li>
37
 * <li>javascript:/vbscript:/about: etc. protocols</li>
38
 * <li>expression/behavior etc. in styles</li>
39
 * <li>any other active content</li>
40
 * </ul>
41
 * It also tries to convert code to XHTML valid, but htmltidy is far better
42
 * solution for this task.
43
 *
44
 * <b>Example:</b>
45
 * <pre>
46
 * $parser = new TSafeHtmlParser();
47
 * $result = $parser->parse($doc);
48
 * </pre>
49
 *
50
 * @category   HTML
51
 * @package    System.Security
52
 * @author     Roman Ivanov <thingol@mail.ru>
53
 * @copyright  1997-2005 Roman Ivanov
54
 * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
55
 * @version    Release: @package_version@
56
 * @link       http://pear.php.net/package/SafeHTML
57
 */
58
class TSafeHtmlParser
{
    /**
     * Sanitized output accumulated during the current parse run.
     *
     * @var string
     * @access private
     */
    private $_xhtml = '';

    /**
     * Number of currently open occurrences of each emitted tag (tag name => count).
     *
     * @var array
     * @access private
     */
    private $_counter = array();

    /**
     * Stack of emitted tags that have not been closed yet.
     *
     * @var array
     * @access private
     */
    private $_stack = array();

    /**
     * Open-counters for tags that are deleted together with their content.
     *
     * @var array
     * @access private
     */
    private $_dcCounter = array();

    /**
     * Stack of unclosed tags that are deleted together with their content.
     * While this stack is non-empty nothing is written to the output.
     *
     * @var array
     * @access private
     */
    private $_dcStack = array();

    /**
     * Current nesting depth of list containers (see $listTags).
     *
     * @var int
     * @access private
     */
    private $_listScope = 0;

    /**
     * For every open <li>, the list nesting depth at which it was opened.
     *
     * @var array
     * @access private
     */
    private $_liStack = array();

    /**
     * Regular expressions built from $blackProtocols in the constructor.
     *
     * @var array
     * @access private
     */
    private $_protoRegexps = array();

    /**
     * Regular expressions built from $cssKeywords in the constructor.
     *
     * @var array
     * @access private
     */
    private $_cssRegexps = array();

    /**
     * List of single tags ("<tag />")
     *
     * @var array
     * @access public
     */
    public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );

    /**
     * List of dangerous tags (such tags will be deleted)
     *
     * @var array
     * @access public
     */
    public $deleteTags = array(
        'applet', 'base',   'basefont', 'bgsound', 'blink',  'body',
        'embed',  'frame',  'frameset', 'head',    'html',   'ilayer',
        'iframe', 'layer',  'link',     'meta',    'object', 'style',
        'title',  'script',
        );

    /**
     * List of dangerous tags (such tags will be deleted, and all content
     * inside these tags will also be removed)
     *
     * @var array
     * @access public
     */
    public $deleteTagsContent = array('script', 'style', 'title', 'xml', );

    /**
     * Type of protocols filtering ('white' or 'black').
     * Any value other than 'black' selects whitelist filtering.
     *
     * @var string
     * @access public
     */
    public $protocolFiltering = 'white';

    /**
     * List of "dangerous" protocols (used for blacklist-filtering).
     * The matching regular expressions are built once, in the constructor.
     *
     * @var array
     * @access public
     */
    public $blackProtocols = array(
        'about',   'chrome',     'data',       'disk',     'hcp',
        'help',    'javascript', 'livescript', 'lynxcgi',  'lynxexec',
        'ms-help', 'ms-its',     'mhtml',      'mocha',    'opera',
        'res',     'resource',   'shell',      'vbscript', 'view-source',
        'vnd.ms.radio',          'wysiwyg',
        );

    /**
     * List of "safe" protocols (used for whitelist-filtering)
     *
     * @var array
     * @access public
     */
    public $whiteProtocols = array(
        'ed2k',   'file', 'ftp',  'gopher', 'http',  'https',
        'irc',    'mailto', 'news', 'nntp', 'telnet', 'webcal',
        'xmpp',   'callto',
        );

    /**
     * List of attributes that can contain protocols
     *
     * @var array
     * @access public
     */
    public $protocolAttributes = array(
        'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
        );

    /**
     * List of dangerous CSS keywords
     *
     * The whole style="" attribute is removed if the parser finds one of
     * these keywords. The matching regular expressions are built once, in
     * the constructor.
     *
     * @var array
     * @access public
     */
    public $cssKeywords = array(
        'absolute', 'behavior',       'behaviour',   'content', 'expression',
        'fixed',    'include-source', 'moz-binding',
        );

    /**
     * List of tags that can have no "closing tag"
     *
     * @var array
     * @access public
     * @deprecated XHTML does not allow such tags
     */
    public $noClose = array();

    /**
     * List of block-level tags that terminate a paragraph
     *
     * An open paragraph is closed when one of these tags is opened
     *
     * @var array
     * @access public
     */
    public $closeParagraph = array(
        'address', 'blockquote', 'center', 'dd',      'dir',       'div',
        'dl',      'dt',         'h1',     'h2',      'h3',        'h4',
        'h5',      'h6',         'hr',     'isindex', 'listing',   'marquee',
        'menu',    'multicol',   'ol',     'p',       'plaintext', 'pre',
        'table',   'ul',         'xmp',
        );

    /**
     * List of table tags; see the table check in _openHandler()
     *
     * @var array
     * @access public
     */
    public $tableTags = array(
        'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead',   'tr',
        );

    /**
     * List of list tags
     *
     * @var array
     * @access public
     */
    public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );

    /**
     * List of dangerous attributes
     *
     * @var array
     * @access public
     */
    public $attributes = array('dynsrc');
    //public $attributes = array('dynsrc', 'id', 'name', ); //id and name are dangerous?

    /**
     * List of allowed "namespaced" attributes
     *
     * @var array
     * @access public
     */
    public $attributesNS = array('xml:lang', );

    /**
     * Builds the protocol and CSS keyword regular expressions from
     * $blackProtocols and $cssKeywords.
     *
     * @access public
     */
    public function __construct()
    {
        // Whitespace / control characters are tolerated before, between and
        // after the letters of a scheme name ("java\tscript:").
        $gap = '[\s\x01-\x1F]*';

        foreach ($this->blackProtocols as $proto) {
            $pattern = '/' . $gap;
            $length = strlen($proto);
            for ($i = 0; $i < $length; $i++) {
                // quote each character so that e.g. "." in "vnd.ms.radio" is literal
                $pattern .= preg_quote($proto[$i], '/') . $gap;
            }
            $this->_protoRegexps[] = $pattern . ':/i';
        }

        foreach ($this->cssKeywords as $css) {
            $this->_cssRegexps[] = '/' . preg_quote($css, '/') . '/i';
        }
    }

    /**
     * Writes the attributes of a tag to the output - called from
     * $this->_openHandler()
     *
     * An attribute is silently dropped when
     *  - its name starts with "on" or "data", or is listed in $attributes;
     *  - its name is not purely alphanumeric and not listed in $attributesNS;
     *  - it is a style attribute matching a CSS keyword or blacklisted protocol;
     *  - it is listed in $protocolAttributes, contains ":" after entity
     *    decoding, and fails the configured protocol filter.
     *
     * @param array $attrs array of attributes $name => $value
     * @return boolean always true
     * @access private
     */
    private function _writeAttrs ($attrs)
    {
        if (!is_array($attrs)) {
            return true;
        }

        foreach ($attrs as $name => $value) {
            $name = strtolower($name);

            if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
                continue;
            }
            if (in_array($name, $this->attributes)) {
                continue;
            }
            if (!preg_match('/^[a-z0-9]+$/i', $name) && !in_array($name, $this->attributesNS)) {
                continue;
            }

            // value-less attribute: use the attribute name as its value
            if ($value === true || $value === null) {
                $value = $name;
            }

            if ($name === 'style') {
                $value = $this->_cleanStyle($value);
            }

            // Decoded copy used only for the security checks below; the
            // attribute itself is written with the undecoded value.
            $decoded = $this->_decodeEntities($value);

            if ($name === 'style') {
                // check both forms so that keywords hidden behind character
                // references ("expre&#115;sion") are detected as well
                if ($this->_matchesAny($this->_cssRegexps, $value)
                    || $this->_matchesAny($this->_cssRegexps, $decoded)
                    || $this->_matchesAny($this->_protoRegexps, $value)
                    || $this->_matchesAny($this->_protoRegexps, $decoded)
                ) {
                    continue;
                }
            }

            if (in_array($name, $this->protocolAttributes) && strpos($decoded, ':') !== false) {
                if ($this->protocolFiltering == 'black') {
                    if ($this->_matchesAny($this->_protoRegexps, $decoded)) {
                        continue;
                    }
                } else {
                    // Everything before the first ":" is treated as the scheme.
                    // It is compared case-insensitively and without surrounding
                    // whitespace so that "HTTP://..." is accepted.
                    $parts = explode(':', $decoded);
                    $scheme = strtolower(trim($parts[0]));
                    $allowed = array_map('strtolower', $this->whiteProtocols);
                    if (!in_array($scheme, $allowed, true)) {
                        continue;
                    }
                }
            }

            $value = str_replace("\"", "&quot;", $value);
            $this->_xhtml .= ' ' . $name . '="' . $value . '"';
        }
        return true;
    }

    /**
     * Normalizes the value of a style attribute before it is inspected:
     * removes backslashes, strips CSS comments (repeatedly, until nothing
     * changes) and makes every "&" an "&amp;".
     *
     * @param string $value raw style attribute value
     * @return string normalized value
     * @access private
     */
    private function _cleanStyle($value)
    {
        // removes insignificant backslashes
        $value = str_replace("\\", '', $value);

        // removes CSS comments; repeated because removing one comment can
        // join the remaining text into a new one
        do {
            $previous = $value;
            $value = preg_replace('!/\*.*?\*/!s', '', $previous);
            if ($value === null) {
                // regex engine failure: fall back to an empty style
                $value = '';
                break;
            }
        } while ($value != $previous);

        // replace all & to &amp; (without double-encoding existing &amp;)
        $value = str_replace('&amp;', '&', $value);
        $value = str_replace('&', '&amp;', $value);

        return $value;
    }

    /**
     * Decodes character references in an attribute value.
     *
     * Numeric references (decimal and hexadecimal, with or without the
     * trailing ";") are reduced to a single byte, code modulo 256. Named
     * references such as "&colon;" or "&Tab;" are decoded as well when the
     * running PHP version provides ENT_HTML5.
     *
     * @param string $value attribute value
     * @return string decoded copy of the value
     * @access private
     */
    private function _decodeEntities($value)
    {
        $decoded = preg_replace_callback('/&#(\d+);?/', array($this, '_decodeDecimalEntity'), (string)$value);
        $decoded = preg_replace_callback('/&#x([0-9a-f]+);?/i', array($this, '_decodeHexEntity'), (string)$decoded);
        $decoded = (string)$decoded;

        if (defined('ENT_HTML5')) {
            $decoded = html_entity_decode($decoded, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        }
        return $decoded;
    }

    /**
     * preg_replace_callback() handler for decimal character references.
     *
     * @param array $match regex match, $match[1] holds the decimal digits
     * @return string single byte with the referenced code modulo 256
     * @access private
     */
    private function _decodeDecimalEntity($match)
    {
        return chr(((int)$match[1]) & 0xFF);
    }

    /**
     * preg_replace_callback() handler for hexadecimal character references.
     *
     * @param array $match regex match, $match[1] holds the hex digits
     * @return string single byte with the referenced code modulo 256
     * @access private
     */
    private function _decodeHexEntity($match)
    {
        // the last two hex digits are the value modulo 256
        return chr(hexdec(substr($match[1], -2)));
    }

    /**
     * Tells whether at least one of the given regular expressions matches.
     *
     * @param array  $patterns list of PCRE patterns
     * @param string $subject  string to test
     * @return boolean
     * @access private
     */
    private function _matchesAny($patterns, $subject)
    {
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $subject)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Opening tag handler - called from HTMLSax
     *
     * Public only because it is registered as a parser callback in parse().
     *
     * @param object $parser HTML Parser
     * @param string $name   tag name
     * @param array  $attrs  tag attributes
     * @return boolean always true
     * @access private
     */
    public function _openHandler(&$parser, $name, $attrs)
    {
        $name = strtolower($name);

        // tags deleted together with their content: remember that we are inside one
        if (in_array($name, $this->deleteTagsContent)) {
            array_push($this->_dcStack, $name);
            $this->_dcCounter[$name] = isset($this->_dcCounter[$name]) ? $this->_dcCounter[$name]+1 : 1;
        }
        if (count($this->_dcStack) != 0) {
            return true;
        }

        if (in_array($name, $this->deleteTags)) {
            return true;
        }

        if (!preg_match("/^[a-z0-9]+$/i", $name)) {
            // things like <user@example.com> or <http://...> are kept as escaped text
            if (preg_match("!(?:\@|://)!i", $name)) {
                $this->_xhtml .= '&lt;' . $name . '&gt;';
            }
            return true;
        }

        if (in_array($name, $this->singleTags)) {
            $this->_xhtml .= '<' . $name;
            $this->_writeAttrs($attrs);
            $this->_xhtml .= ' />';
            return true;
        }

        // TABLES: cannot open table elements when we are not inside table
        // NOTE(review): the counter only exists once a <table> has been opened,
        // so table tags appearing before the first <table> are not removed - confirm
        // whether that is intended.
        if ((isset($this->_counter['table'])) && ($this->_counter['table'] <= 0)
            && (in_array($name, $this->tableTags)))
        {
            return true;
        }

        // PARAGRAPHS: close paragraph when closeParagraph tags opening
        if ((in_array($name, $this->closeParagraph)) && (in_array('p', $this->_stack))) {
            $this->_closeHandler($parser, 'p');
        }

        // LISTS: we should close <li> if <li> of the same level opening
        if ($name == 'li' && count($this->_liStack) &&
            $this->_listScope == $this->_liStack[count($this->_liStack)-1])
        {
            $this->_closeHandler($parser, 'li');
        }

        // LISTS: we want to know on what nesting level of lists we are
        if (in_array($name, $this->listTags)) {
            $this->_listScope++;
        }
        if ($name == 'li') {
            array_push($this->_liStack, $this->_listScope);
        }

        $this->_xhtml .= '<' . $name;
        $this->_writeAttrs($attrs);
        $this->_xhtml .= '>';
        array_push($this->_stack, $name);
        $this->_counter[$name] = isset($this->_counter[$name]) ? $this->_counter[$name]+1 : 1;
        return true;
    }

    /**
     * Closing tag handler - called from HTMLSax
     *
     * Closes the named tag and every tag opened after it that is still
     * open. A closing tag without a matching open tag is ignored.
     * Public only because it is registered as a parser callback in parse().
     *
     * @param object $parser HTML parser
     * @param string $name   tag name
     * @return boolean always true
     * @access private
     */
    public function _closeHandler(&$parser, $name)
    {
        $name = strtolower($name);

        if (isset($this->_dcCounter[$name]) && ($this->_dcCounter[$name] > 0) &&
            (in_array($name, $this->deleteTagsContent)))
        {
            // unwind the "deleted content" stack down to the matching tag;
            // the null check stops the loop if the stack runs empty
            while (null !== ($tag = array_pop($this->_dcStack)) && $tag !== $name) {
                $this->_dcCounter[$tag]--;
            }

            $this->_dcCounter[$name]--;
        }

        if (count($this->_dcStack) != 0) {
            return true;
        }

        if ((isset($this->_counter[$name])) && ($this->_counter[$name] > 0)) {
            while (null !== ($tag = array_pop($this->_stack)) && $tag !== $name) {
                $this->_closeTag($tag);
            }

            $this->_closeTag($name);
        }
        return true;
    }

    /**
     * Writes the closing tag (unless the tag is listed in $noClose) and
     * updates the tag counter and the list bookkeeping.
     *
     * @param string $tag tag name
     * @return boolean always true
     * @access private
     */
    public function _closeTag($tag)
    {
        if (!in_array($tag, $this->noClose)) {
            $this->_xhtml .= '</' . $tag . '>';
        }

        $this->_counter[$tag]--;

        if (in_array($tag, $this->listTags)) {
            $this->_listScope--;
        }

        if ($tag == 'li') {
            array_pop($this->_liStack);
        }
        return true;
    }

    /**
     * Character data handler - called from HTMLSax
     *
     * Appends the data unchanged to the output unless the parser is
     * currently inside a tag whose content is deleted.
     *
     * @param object $parser HTML parser
     * @param string $data   textual data
     * @return boolean always true
     * @access private
     */
    public function _dataHandler(&$parser, $data)
    {
        if (count($this->_dcStack) == 0) {
            $this->_xhtml .= $data;
        }
        return true;
    }

    /**
     * Escape handler - called from HTMLSax
     *
     * Writes nothing, so the data passed to it never reaches the output.
     *
     * @param object $parser HTML parser
     * @param string $data   comments or other type of data
     * @return boolean always true
     * @access private
     */
    public function _escapeHandler(&$parser, $data)
    {
        return true;
    }

    /**
     * Returns the XHTML document
     *
     * Closes every tag that is still open before returning the output.
     *
     * @return string Processed (X)HTML document
     * @access public
     */
    public function getXHTML ()
    {
        // compare against null: a tag name such as "0" is falsy but still valid
        while (null !== ($tag = array_pop($this->_stack))) {
            $this->_closeTag($tag);
        }

        return $this->_xhtml;
    }

    /**
     * Clears current document data
     *
     * Resets the output and all per-document parser state, so that a
     * document left inside an unclosed <script>/<style>/... does not
     * suppress the output of the next document parsed by this instance.
     *
     * @return boolean always true
     * @access public
     */
    public function clear()
    {
        $this->_xhtml = '';
        $this->_counter = array();
        $this->_stack = array();
        $this->_dcCounter = array();
        $this->_dcStack = array();
        $this->_listScope = 0;
        $this->_liStack = array();
        return true;
    }

    /**
     * Main parsing function
     *
     * @param string $doc HTML document for processing
     * @return string Processed (X)HTML document
     * @access public
     */
    public function parse($doc)
    {
        $this->clear();

        // Save all '<' symbols that cannot start a tag
        $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', (string)$doc);

        // Web documents shouldn't contain the \x00 symbol
        $doc = str_replace("\x00", '', $doc);

        // Opera6 bug workaround
        $doc = str_replace("\xC0\xBC", '&lt;', $doc);

        // UTF-7 encoding ASCII decode
        $doc = $this->repackUTF7($doc);

        // Instantiate the parser and register this object's handlers
        $parser = new TSax3();
        $parser->set_object($this);
        $parser->set_element_handler('_openHandler', '_closeHandler');
        $parser->set_data_handler('_dataHandler');
        $parser->set_escape_handler('_escapeHandler');

        $parser->parse($doc);

        return $this->getXHTML();
    }

    /**
     * UTF-7 decoding function
     *
     * Rewrites every "+<base64>-" sequence through repackUTF7Callback().
     *
     * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
     * @return string Decoded document
     * @access private
     */
    private function repackUTF7($str)
    {
        return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
    }

    /**
     * Additional UTF-7 decoding function
     *
     * Base64-decodes the captured sequence, re-encodes the part whose
     * byte pairs do not start with \x00 (see repackUTF7Back()) and then
     * drops the \x00 byte in front of the remaining characters.
     *
     * @param array $str regex match; $str[1] is the base64 payload
     * @return string Recoded string
     * @access private
     */
    private function repackUTF7Callback($str)
    {
        $str = base64_decode($str[1]);
        $str = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $str);
        return preg_replace('/\x00(.)/', '$1', $str);
    }

    /**
     * Additional UTF-7 encoding function
     *
     * Keeps the first captured group as is and wraps the second one back
     * into a "+<base64>-" sequence (without "=" padding).
     *
     * @param array $str regex match with groups 1 and 2
     * @return string Recoded string
     * @access private
     */
    private function repackUTF7Back($str)
    {
        return $str[1].'+'.rtrim(base64_encode($str[2]), '=').'-';
    }
}
663
 
664
/*
665
 * Local variables:
666
 * tab-width: 4
667
 * c-basic-offset: 4
668
 * c-hanging-comment-ender-p: nil
669
 * End:
670
 */
671
 
672
?>