Subversion-Projekte lars-tiefland.prado

Revision

Details | Letzte Änderung | Log anzeigen | RSS feed

Revision Autor Zeilennr. Zeile
1 lars 1
<?php
2
 
3
#
4
# Markdown  -  A text-to-HTML conversion tool for web writers
5
#
6
# Copyright (c) 2004-2005 John Gruber
7
# <http://daringfireball.net/projects/markdown/>
8
#
9
# Copyright (c) 2004-2005 Michel Fortin - PHP Port
10
# <http://www.michelf.com/projects/php-markdown/>
11
#
12
 
13
/**
14
 * PHP5 version of the markdown parser.
15
 * Usage:
16
 * <code>
17
 * $markdown = new MarkdownParser;
18
 * echo $markdown->parse($text);
19
 * </code>
20
 */
21
class MarkdownParser
22
{
23
	/** @var string|null Regex fragment for nested [brackets]; null until initialize() has run. */
	private static $md_nested_brackets;
	/** @var array Map of Markdown special character => MD5 hash of that character. */
	private static $md_escape_table = array();
	/** @var array Same hashes as $md_escape_table, keyed by the backslash-escaped character. */
	private static $md_backslash_escape_table = array();
	/** @var int How many levels of bracket nesting $md_nested_brackets supports. */
	private static $md_nested_brackets_depth = 6;

	/** @var string Suffix closing empty elements such as <hr> and <br>. Change to ">" for HTML output. */
	protected $md_empty_element_suffix = " />";
	/** @var int Tab width, in spaces. */
	protected $md_tab_width = 4;

	/** @var int Current list nesting depth (0 when not inside a list). */
	private $md_list_level = 0;
	/** @var array Link id => URL, collected from link definitions. */
	private $md_urls = array();
	/** @var array Link id => title, collected from link definitions. */
	private $md_titles = array();
	/** @var array Hash => raw HTML block removed from the text. */
	private $md_html_blocks = array();
35
 
36
	/**
	 * Creates a parser. The static lookup tables are built only once:
	 * initialize() is skipped when $md_nested_brackets is already set.
	 */
	public function __construct()
	{
		if (!is_null(self::$md_nested_brackets)) {
			return;
		}
		$this->initialize();
	}
41
 
42
	/**
	 * Fills the static tables used by every parser instance:
	 *  - $md_nested_brackets: regex fragment repeated
	 *    $md_nested_brackets_depth times to match nested brackets;
	 *  - $md_escape_table: each special character mapped to its MD5 hash;
	 *  - $md_backslash_escape_table: the same hashes keyed by "\" + character.
	 */
	private function initialize()
	{
		$depth = self::$md_nested_brackets_depth;
		self::$md_nested_brackets =
			str_repeat('(?>[^\[\]]+|\[', $depth) . str_repeat('\])*', $depth);

		# Characters that carry special meaning in Markdown.
		$specials = array("\\", "`", "*", "_", "{", "}", "[", "]",
						  "(", ")", ">", "#", "+", "-", ".", "!");

		$table = array();
		foreach ($specials as $char) {
			$table[$char] = md5($char);
		}
		self::$md_escape_table = $table;

		# Identical hashes, but looked up by the backslash-escaped form.
		foreach ($table as $char => $hash) {
			self::$md_backslash_escape_table["\\" . $char] = $hash;
		}
	}
72
 
73
	/**
	 * Main entry point: converts Markdown source into HTML.
	 *
	 * The order of the steps below is essential. Per-document state
	 * (URLs, titles, hashed HTML blocks) is cleared first so that several
	 * documents can be parsed with the same instance without conflicts.
	 *
	 * @param string $text Markdown source.
	 * @return string Resulting markup, terminated by a newline.
	 */
	public function parse($text)
	{
		$this->md_urls        = array();
		$this->md_titles      = array();
		$this->md_html_blocks = array();

		# Normalise DOS and Mac line endings to Unix, and make sure the
		# text ends with a couple of newlines.
		$source = str_replace(array("\r\n", "\r"), "\n", $text) . "\n\n";

		# Tabs become spaces.
		$source = $this->_Detab($source);

		# Blank out lines made only of spaces/tabs, so consecutive blank
		# lines can later be matched with a plain /\n+/.
		$source = preg_replace('/^[ \t]+$/m', '', $source);

		# Hash block-level HTML, then pull out link definitions.
		$source = $this->_StripLinkDefinitions($this->_HashHTMLBlocks($source));

		$html = $this->_UnescapeSpecialChars($this->_RunBlockGamut($source));

		return $html . "\n";
	}
117
 
118
 
119
	/**
	 * Removes link definitions of the form
	 *     [id]: url "optional title"
	 * from the text. Each match is handed to
	 * _StripLinkDefinitions_callback(), which records it and returns the
	 * replacement string.
	 *
	 * @param string $text
	 * @return string Text with the definitions stripped.
	 */
	private function _StripLinkDefinitions($text) {
		$max_indent = $this->md_tab_width - 1;

		$definition = '{
							^[ ]{0,'.$max_indent.'}\[(.+)\]:	# id = $1
							  [ \t]*
							  \n?				# maybe *one* newline
							  [ \t]*
							<?(\S+?)>?			# url = $2
							  [ \t]*
							  \n?				# maybe one newline
							  [ \t]*
							(?:
								(?<=\s)			# lookbehind for whitespace
								["(]
								(.+?)			# title = $3
								[")]
								[ \t]*
							)?	# title is optional
							(?:\n+|\Z)
			}xm';

		return preg_replace_callback(
			$definition,
			array($this, '_StripLinkDefinitions_callback'),
			$text);
	}
149
 
150
	/**
	 * Stores one link definition. The id is lower-cased; the URL is run
	 * through _EncodeAmpsAndAngles(); double quotes in the title (when a
	 * title was captured) become &quot;.
	 *
	 * @param array $matches [1] = id, [2] = url, [3] = optional title.
	 * @return string Always '', so the definition disappears from the text.
	 */
	private function _StripLinkDefinitions_callback($matches) {
		$id = strtolower($matches[1]);

		$this->md_urls[$id] = $this->_EncodeAmpsAndAngles($matches[2]);

		if (isset($matches[3])) {
			$this->md_titles[$id] = str_replace('"', '&quot;', $matches[3]);
		}

		return '';
	}
157
 
158
 
159
	/**
	 * Replaces block-level raw HTML with placeholder hashes (see
	 * _HashHTMLBlocks_callback()).
	 *
	 * Only block-level tags (headers, lists, tables, ...) are hashed, because
	 * paragraphs wrapped in non-block-level tags such as anchors, phrase
	 * emphasis and spans must still get <p> tags. The tag list is hard-coded.
	 *
	 * Four patterns are applied, in this order:
	 *  1. nested blocks whose outermost tags start at the left margin and
	 *     whose closing tag sits on a line of its own, e.g.
	 *         <div>
	 *             <div>
	 *             tags for inner block must be indented.
	 *             </div>
	 *         </div>
	 *     This must run before (2), which would otherwise start at the first
	 *     `<div>` and stop at the first `</div>`;
	 *  2. a more liberal match, simply from `\n<tag>` to `</tag>\n`;
	 *  3. a special case for <hr />;
	 *  4. a special case for standalone HTML comments.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _HashHTMLBlocks($text) {
		$less_than_tab = $this->md_tab_width - 1;
		$hasher = array($this, '_HashHTMLBlocks_callback');

		# List (a) is list (b) plus <ins> and <del>.
		$block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'.
						'script|noscript|form|fieldset|iframe|math';
		$block_tags_a = $block_tags_b . '|ins|del';

		$nested_blocks = "{
					(						# save in $1
						^					# start of line  (with /m)
						<($block_tags_a)	# start tag = $2
						\\b					# word break
						(.*\\n)*?			# any number of lines, minimally matching
						</\\2>				# the matching end tag
						[ \\t]*				# trailing spaces/tabs
						(?=\\n+|\\Z)	# followed by a newline or end of document
					)
			}xm";

		$liberal_blocks = "{
					(						# save in $1
						^					# start of line  (with /m)
						<($block_tags_b)	# start tag = $2
						\\b					# word break
						(.*\\n)*?			# any number of lines, minimally matching
						.*</\\2>				# the matching end tag
						[ \\t]*				# trailing spaces/tabs
						(?=\\n+|\\Z)	# followed by a newline or end of document
					)
			}xm";

		$horizontal_rule = '{
					(?:
						(?<=\n\n)		# Starting after a blank line
						|				# or
						\A\n?			# the beginning of the doc
					)
					(						# save in $1
						[ ]{0,'.$less_than_tab.'}
						<(hr)				# start tag = $2
						\b					# word break
						([^<>])*?			#
						/?>					# the matching end tag
						[ \t]*
						(?=\n{2,}|\Z)		# followed by a blank line or end of document
					)
			}x';

		$html_comment = '{
					(?:
						(?<=\n\n)		# Starting after a blank line
						|				# or
						\A\n?			# the beginning of the doc
					)
					(						# save in $1
						[ ]{0,'.$less_than_tab.'}
						(?s:
							<!
							(--.*?--\s*)+
							>
						)
						[ \t]*
						(?=\n{2,}|\Z)		# followed by a blank line or end of document
					)
				}x';

		$passes = array($nested_blocks, $liberal_blocks, $horizontal_rule, $html_comment);
		foreach ($passes as $pattern) {
			$text = preg_replace_callback($pattern, $hasher, $text);
		}

		return $text;
	}
259
	/**
	 * Stores the matched HTML block under its MD5 hash and returns the hash,
	 * surrounded by blank lines, as the text that replaces the block.
	 *
	 * @param array $matches [1] = the HTML block.
	 * @return string
	 */
	private function _HashHTMLBlocks_callback($matches) {
		$block = $matches[1];
		$hash  = md5($block);

		$this->md_html_blocks[$hash] = $block;

		return "\n\n" . $hash . "\n\n";
	}
265
 
266
 
267
	/**
	 * Applies every transformation that produces block-level markup:
	 * headers, horizontal rules, lists, code blocks, block quotes and
	 * finally paragraphs.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _RunBlockGamut($text) {
		$text = $this->_DoHeaders($text);

		# Horizontal rules: three or more *, - or _ on a line of their own.
		$rule_patterns = array(
			'{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}mx',
			'{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}mx',
			'{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}mx',
		);
		$rule_tag = "\n<hr" . $this->md_empty_element_suffix . "\n";
		$text = preg_replace($rule_patterns, $rule_tag, $text);

		$text = $this->_DoBlockQuotes(
			$this->_DoCodeBlocks(
				$this->_DoLists($text)));

		# _HashHTMLBlocks() runs a second time here: this pass hides the
		# markup generated above, so that _FormParagraphs() does not wrap
		# <p> tags around block-level tags.
		return $this->_FormParagraphs($this->_HashHTMLBlocks($text));
	}
295
 
296
 
297
	/**
	 * Applies every transformation that happens *inside* block-level
	 * elements such as paragraphs, headers and list items.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _RunSpanGamut($text) {
		$text = $this->_EscapeSpecialChars($this->_DoCodeSpans($text));

		# Images go before anchors, because ![foo][f] looks like an anchor.
		$text = $this->_DoAnchors($this->_DoImages($text));

		# Auto-links such as <http://example.com/> come after _DoAnchors(),
		# because < and > may delimit the URL of an inline link:
		# [this](<url>).
		$text = $this->_EncodeAmpsAndAngles($this->_DoAutoLinks($text));

		$text = $this->_DoItalicsAndBold($text);

		# Hard breaks: two or more spaces at the end of a line.
		$break = "<br" . $this->md_empty_element_suffix . "\n";

		return preg_replace('/ {2,}\n/', $break, $text);
	}
324
 
325
 
326
	/**
	 * Rebuilds the text from the tokens returned by _TokenizeHTML():
	 *  - inside 'tag' tokens, * and _ are replaced by their hashes from
	 *    $md_escape_table so they cannot be taken for emphasis markers;
	 *  - every other token goes through _EncodeBackslashEscapes().
	 *
	 * @param string $text
	 * @return string
	 */
	private function _EscapeSpecialChars($text) {
		$rebuilt = '';

		foreach ($this->_TokenizeHTML($text) as $token) {
			if ($token[0] == 'tag') {
				$rebuilt .= str_replace(
					array('*', '_'),
					array(self::$md_escape_table['*'], self::$md_escape_table['_']),
					$token[1]);
				continue;
			}

			$rebuilt .= $this->_EncodeBackslashEscapes($token[1]);
		}

		return $rebuilt;
	}
353
 
354
 
355
	/**
	 * Turns Markdown link shortcuts into <a> tags.
	 *
	 * Reference-style links ([link text] [id]) are handled first, then
	 * inline links ([link text](url "optional title")).
	 *
	 * @param string $text
	 * @return string
	 */
	private function _DoAnchors($text) {
		$bracket = self::$md_nested_brackets;

		$reference_link = "{
			(					# wrap whole match in $1
			  \\[
				({$bracket})	# link text = $2
			  \\]

			  [ ]?				# one optional space
			  (?:\\n[ ]*)?		# one optional newline followed by spaces

			  \\[
				(.*?)		# id = $3
			  \\]
			)
			}xs";

		$inline_link = "{
			(				# wrap whole match in $1
			  \\[
				({$bracket})	# link text = $2
			  \\]
			  \\(			# literal paren
				[ \\t]*
				<?(.*?)>?	# href = $3
				[ \\t]*
				(			# $4
				  (['\"])	# quote char = $5
				  (.*?)		# Title = $6
				  \\5		# matching quote
				)?			# title is optional
			  \\)
			)
			}xs";

		$text = preg_replace_callback(
			$reference_link, array($this, '_DoAnchors_reference_callback'), $text);

		return preg_replace_callback(
			$inline_link, array($this, '_DoAnchors_inline_callback'), $text);
	}
403
	/**
	 * Builds the <a> tag for one reference-style link.
	 *
	 * An empty id (as in [this][]) falls back to the lower-cased link text.
	 * When no URL is stored for the id, the matched text is returned
	 * untouched. * and _ in the URL and title are replaced by their hashes
	 * to avoid conflicts with italics/bold.
	 *
	 * @param array $matches [1] = whole match, [2] = link text, [3] = id.
	 * @return string
	 */
	private function _DoAnchors_reference_callback($matches) {
		$link_text = $matches[2];
		$link_id   = strtolower($matches[3]);

		if ($link_id == "") {
			$link_id = strtolower($link_text);
		}

		if (!isset($this->md_urls[$link_id])) {
			return $matches[1];
		}

		$emphasis = array('*', '_');
		$hashes   = array(self::$md_escape_table['*'], self::$md_escape_table['_']);

		$url = str_replace($emphasis, $hashes, $this->md_urls[$link_id]);
		$tag = "<a href=\"$url\"";

		if (isset($this->md_titles[$link_id])) {
			$title = str_replace($emphasis, $hashes, $this->md_titles[$link_id]);
			$tag .= " title=\"$title\"";
		}

		return $tag . ">$link_text</a>";
	}
433
	/**
	 * Builds the <a> tag for one inline link.
	 *
	 * * and _ in the URL and title are replaced by their hashes to avoid
	 * conflicts with italics/bold; double quotes in the title become &quot;.
	 * The title attribute is written only when the pattern captured a title.
	 *
	 * @param array $matches [2] = link text, [3] = url, [6] = optional title.
	 * @return string
	 */
	private function _DoAnchors_inline_callback($matches) {
		$link_text = $matches[2];

		$emphasis = array('*', '_');
		$hashes   = array(self::$md_escape_table['*'], self::$md_escape_table['_']);

		$url = str_replace($emphasis, $hashes, $matches[3]);
		$tag = "<a href=\"$url\"";

		if (isset($matches[6])) {
			$title = str_replace('"', '&quot;', $matches[6]);
			$title = str_replace($emphasis, $hashes, $title);
			$tag .= " title=\"$title\"";
		}

		return $tag . ">$link_text</a>";
	}
456
 
457
 
458
	/**
	 * Turns Markdown image shortcuts into <img> tags.
	 *
	 * Reference-style images (![alt text][id]) are handled first, then
	 * inline images (![alt text](url "optional title")).
	 *
	 * @param string $text
	 * @return string
	 */
	private function _DoImages($text) {
		$brackets = self::$md_nested_brackets;

		$reference_image = '{
			(				# wrap whole match in $1
			  !\[
				('.$brackets.')		# alt text = $2
			  \]

			  [ ]?				# one optional space
			  (?:\n[ ]*)?		# one optional newline followed by spaces

			  \[
				(.*?)		# id = $3
			  \]

			)
			}xs';

		$inline_image = '{
			(				# wrap whole match in $1
			  !\[
				('.$brackets.')		# alt text = $2
			  \]
			  \(			# literal paren
				[ \t]*
				<?(\S+?)>?	# src url = $3
				[ \t]*
				(			# $4
				  ([\'"])	# quote char = $5
				  (.*?)		# title = $6
				  \5		# matching quote
				  [ \t]*
				)?			# title is optional
			  \)
			)
			}xs';

		$text = preg_replace_callback(
			$reference_image, array($this, '_DoImages_reference_callback'), $text);

		return preg_replace_callback(
			$inline_image, array($this, '_DoImages_inline_callback'), $text);
	}
508
	/**
	 * Builds the <img> tag for one reference-style image.
	 *
	 * An empty id (as in ![this][]) falls back to the lower-cased alt text.
	 * When no URL is stored for the id, the matched text is left intact.
	 * * and _ in the URL and title are replaced by their hashes to avoid
	 * conflicts with italics/bold.
	 *
	 * @param array $matches [1] = whole match, [2] = alt text, [3] = id.
	 * @return string
	 */
	private function _DoImages_reference_callback($matches) {
		$alt_text = $matches[2];
		$link_id  = strtolower($matches[3]);

		if ($link_id == "") {
			$link_id = strtolower($alt_text);
		}

		if (!isset($this->md_urls[$link_id])) {
			return $matches[1];
		}

		$alt_text = str_replace('"', '&quot;', $alt_text);

		$emphasis = array('*', '_');
		$hashes   = array(self::$md_escape_table['*'], self::$md_escape_table['_']);

		$url = str_replace($emphasis, $hashes, $this->md_urls[$link_id]);
		$tag = "<img src=\"$url\" alt=\"$alt_text\"";

		if (isset($this->md_titles[$link_id])) {
			$title = str_replace($emphasis, $hashes, $this->md_titles[$link_id]);
			$tag .= " title=\"$title\"";
		}

		return $tag . $this->md_empty_element_suffix;
	}
541
	/**
	 * Builds the <img> tag for one inline image.
	 *
	 * Double quotes in the alt text and title become &quot;; * and _ in the
	 * URL and title are replaced by their hashes to avoid conflicts with
	 * italics/bold. The title attribute is always written: it is empty when
	 * the pattern captured no title.
	 *
	 * @param array $matches [2] = alt text, [3] = url, [6] = optional title.
	 * @return string
	 */
	private function _DoImages_inline_callback($matches) {
		$emphasis = array('*', '_');
		$hashes   = array(self::$md_escape_table['*'], self::$md_escape_table['_']);

		$alt_text = str_replace('"', '&quot;', $matches[2]);
		$url      = str_replace($emphasis, $hashes, $matches[3]);

		$title = isset($matches[6]) ? $matches[6] : '';
		$title = str_replace('"', '&quot;', $title);
		$title = str_replace($emphasis, $hashes, $title);

		return "<img src=\"$url\" alt=\"$alt_text\""
			. " title=\"$title\""
			. $this->md_empty_element_suffix;
	}
567
 
568
 
569
	/**
	 * Converts Markdown headers into <h1>..<h6> elements.
	 *
	 * Setext-style headers:
	 *     Header 1
	 *     ========
	 *
	 *     Header 2
	 *     --------
	 *
	 * atx-style headers:
	 *     # Header 1
	 *     ## Header 2
	 *     ## Header 2 with closing hashes ##
	 *     ...
	 *     ###### Header 6
	 *
	 * The header text is passed through _RunSpanGamut().
	 *
	 * The replacements are done with preg_replace_callback() instead of the
	 * "e" (eval) pattern modifier, which was deprecated in PHP 5.5 and
	 * removed in PHP 7.0. Because the callbacks receive the raw match, the
	 * quote un-slashing that eval-style replacement required is not needed.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _DoHeaders($text) {
		# Setext-style, level 1 first, then level 2 (same order as before).
		$text = preg_replace_callback(
			'{ ^(.+)[ \t]*\n=+[ \t]*\n+ }mx',
			array($this, '_DoHeaders_callback_setext_h1'), $text);
		$text = preg_replace_callback(
			'{ ^(.+)[ \t]*\n-+[ \t]*\n+ }mx',
			array($this, '_DoHeaders_callback_setext_h2'), $text);

		# atx-style.
		$text = preg_replace_callback("{
				^(\\#{1,6})	# $1 = string of #'s
				[ \\t]*
				(.+?)		# $2 = Header text
				[ \\t]*
				\\#*			# optional closing #'s (not counted)
				\\n+
			}xm",
			array($this, '_DoHeaders_callback_atx'), $text);

		return $text;
	}

	/** Builds the <h1> element for a Setext header underlined with '='. */
	private function _DoHeaders_callback_setext_h1($matches) {
		return '<h1>' . $this->_RunSpanGamut($matches[1]) . "</h1>\n\n";
	}

	/** Builds the <h2> element for a Setext header underlined with '-'. */
	private function _DoHeaders_callback_setext_h2($matches) {
		return '<h2>' . $this->_RunSpanGamut($matches[1]) . "</h2>\n\n";
	}

	/** Builds the <hN> element for an atx header; N is the number of leading #'s. */
	private function _DoHeaders_callback_atx($matches) {
		$level = strlen($matches[1]);

		return '<h' . $level . '>' . $this->_RunSpanGamut($matches[2])
			. '</h' . $level . ">\n\n";
	}
604
 
605
 
606
	/**
	 * Forms HTML unordered (bulleted) and ordered (numbered) lists.
	 *
	 * The text is scanned once per marker type, bullets first, then
	 * numbers. A different pattern prefix is used depending on whether
	 * $md_list_level is non-zero; see the extended comment in
	 * _ProcessListItems().
	 *
	 * NOTE(review): the "_top" callback is attached to the branch taken when
	 * $md_list_level is non-zero and "_nested" to the other one, which looks
	 * swapped relative to their names. The pairing is kept exactly as it was.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _DoLists($text) {
		$less_than_tab = $this->md_tab_width - 1;

		# Bullets, then number markers.
		foreach (array('[*+-]', '\d+[.]') as $marker) {
			# Re-usable pattern matching one entire ul or ol list:
			$whole_list = '
				(								# $1 = whole list
				  (								# $2
					[ ]{0,'.$less_than_tab.'}
					('.$marker.')				# $3 = first list item marker
					[ \t]+
				  )
				  (?s:.+?)
				  (								# $4
					  \z
					|
					  \n{2,}
					  (?=\S)
					  (?!						# Negative lookahead for another list item marker
						[ \t]*
						'.$marker.'[ \t]+
					  )
				  )
				)
			'; // mx

			if ($this->md_list_level) {
				$pattern = '{
						^
						'.$whole_list.'
					}mx';
				$callback = '_DoLists_callback_top';
			}
			else {
				$pattern = '{
						(?:(?<=\n\n)|\A\n?)
						'.$whole_list.'
					}mx';
				$callback = '_DoLists_callback_nested';
			}

			$text = preg_replace_callback($pattern, array($this, $callback), $text);
		}

		return $text;
	}
663
	/**
	 * Wraps one matched list in <ul> or <ol>, with the tags placed directly
	 * against the items.
	 *
	 * @param array $matches [1] = whole list, [3] = first list item marker.
	 * @return string
	 */
	private function _DoLists_callback_top($matches) {
		$marker_ul = '[*+-]';
		$marker_ol = '\d+[.]';

		$bulleted    = preg_match('/' . $marker_ul . '/', $matches[3]);
		$list_type   = $bulleted ? "ul" : "ol";
		$item_marker = $bulleted ? $marker_ul : $marker_ol;

		# Double returns become triple returns, so that a paragraph can be
		# made for the last item of the list if necessary.
		$list = preg_replace("/\n{2,}/", "\n\n\n", $matches[1]);

		# Trailing whitespace is trimmed to pull the closing tag up onto the
		# preceding line and get it past the HTML block parser.
		$items = rtrim($this->_ProcessListItems($list, $item_marker));

		return "<$list_type>" . $items . "</$list_type>\n";
	}
687
	/**
	 * Wraps one matched list in <ul> or <ol>, with a newline after the
	 * opening tag.
	 *
	 * @param array $matches [1] = whole list, [3] = first list item marker.
	 * @return string
	 */
	private function _DoLists_callback_nested($matches) {
		$marker_ul = '[*+-]';
		$marker_ol = '\d+[.]';

		$bulleted    = preg_match('/' . $marker_ul . '/', $matches[3]);
		$list_type   = $bulleted ? "ul" : "ol";
		$item_marker = $bulleted ? $marker_ul : $marker_ol;

		# Double returns become triple returns, so that a paragraph can be
		# made for the last item of the list if necessary.
		$list = preg_replace("/\n{2,}/", "\n\n\n", $matches[1]);

		$items = $this->_ProcessListItems($list, $item_marker);

		return "<$list_type>\n" . $items . "</$list_type>\n";
	}
705
 
706
 
707
	/**
	 * Splits the contents of a single ordered or unordered list into
	 * individual list items and converts each one through
	 * _ProcessListItems_callback().
	 *
	 * $md_list_level is incremented on entry and decremented on exit, so it
	 * is non-zero while a list is being processed. Outside a list, text
	 * such as
	 *
	 *     I recommend upgrading to version
	 *     8. Oops, now this line is treated
	 *     as a sub-list.
	 *
	 * should stay a single paragraph even though its second line starts with
	 * a digit-period-space sequence; inside a list (or sub-list) that line
	 * is treated as the start of a sub-list.
	 *
	 * @param string $list_str   Text of the whole list.
	 * @param string $marker_any Regex fragment matching this list's item markers.
	 * @return string
	 */
	private function _ProcessListItems($list_str, $marker_any) {
		$this->md_list_level++;

		# Trim trailing blank lines.
		$trimmed = preg_replace("/\n{2,}\\z/", "\n", $list_str);

		$item_pattern = '{
			(\n)?							# leading line = $1
			(^[ \t]*)						# leading whitespace = $2
			('.$marker_any.') [ \t]+		# list marker = $3
			((?s:.+?)						# list item text   = $4
			(\n{1,2}))
			(?= \n* (\z | \2 ('.$marker_any.') [ \t]+))
			}xm';

		$items = preg_replace_callback(
			$item_pattern, array($this, '_ProcessListItems_callback'), $trimmed);

		$this->md_list_level--;

		return $items;
	}
752
	/**
	 * Converts one list item into an <li> element.
	 *
	 * An item preceded by a blank line, or containing one, is processed as
	 * block-level content. Any other item is checked for sub-lists, loses
	 * its trailing newlines and is processed as span-level content.
	 *
	 * @param array $matches [1] = leading line, [4] = list item text.
	 * @return string
	 */
	private function _ProcessListItems_callback($matches) {
		$body = $this->_Outdent($matches[4]);

		if ($matches[1] || preg_match('/\n{2,}/', $matches[4])) {
			$body = $this->_RunBlockGamut($body);
		}
		else {
			# Recursion for sub-lists.
			$body = $this->_DoLists($body);
			$body = preg_replace('/\n+$/', '', $body);
			$body = $this->_RunSpanGamut($body);
		}

		return "<li>" . $body . "</li>\n";
	}
769
 
770
 
771
	/**
	 * Processes Markdown `<pre><code>` blocks: runs of lines that start
	 * with a tab or a tab-width of spaces.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _DoCodeBlocks($text) {
		$tab_width = $this->md_tab_width;

		$code_block = '{
				(?:\n\n|\A)
				(	            # $1 = the code block -- one or more lines, starting with a space/tab
				  (?:
					(?:[ ]{'.$tab_width.'} | \t)  # Lines must start with a tab or a tab-width of spaces
					.*\n+
				  )+
				)
				((?=^[ ]{0,'.$tab_width.'}\S)|\Z)	# Lookahead for non-space at line-start, or end of doc
			}xm';

		return preg_replace_callback(
			$code_block, array($this, '_DoCodeBlocks_callback'), $text);
	}
789
	/**
	 * Turns one matched code block into a <pre><code> element: the block is
	 * outdented and encoded, then leading newlines and trailing whitespace
	 * are trimmed.
	 *
	 * @param array $matches [1] = the code block.
	 * @return string
	 */
	private function _DoCodeBlocks_callback($matches) {
		$code = $this->_EncodeCode($this->_Outdent($matches[1]));
		$code = preg_replace(array('/\A\n+/', '/\s+\z/'), '', $code);

		return "\n\n<pre><code>" . $code . "\n</code></pre>\n\n";
	}
801
 
802
 
803
	/**
	 * Converts backtick-quoted runs into <code></code> spans.
	 *
	 *  - Several backticks may be used as the delimiter in order to include
	 *    literal backticks in the span, so
	 *        Just type ``foo `bar` baz`` at the prompt.
	 *    becomes
	 *        <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
	 *    There is no arbitrary limit on the number of backticks: to include
	 *    three consecutive backticks, use four as delimiters, and so on.
	 *
	 *  - Spaces may be used to get literal backticks at the edges:
	 *        ... type `` `bar` `` ...
	 *    becomes
	 *        ... type <code>`bar`</code> ...
	 *
	 * @param string $text
	 * @return string
	 */
	private function _DoCodeSpans($text) {
		$code_span = '@
				(?<!\\\)	# Character before opening ` can\'t be a backslash
				(`+)		# $1 = Opening run of `
				(.+?)		# $2 = The code block
				(?<!`)
				\1			# Matching closer
				(?!`)
			@xs';

		return preg_replace_callback(
			$code_span, array($this, '_DoCodeSpans_callback'), $text);
	}
840
	/**
	 * Builds the <code> element for one code span: leading and trailing
	 * spaces/tabs are removed and the content is encoded.
	 *
	 * @param array $matches [2] = the code between the backtick delimiters.
	 * @return string
	 */
	private function _DoCodeSpans_callback($matches) {
		# First the leading, then the trailing whitespace.
		$code = preg_replace(array('/^[ \t]*/', '/[ \t]*$/'), '', $matches[2]);

		return "<code>" . $this->_EncodeCode($code) . "</code>";
	}
847
 
848
 
849
	/**
	 * Encodes/escapes certain characters inside Markdown code runs, where
	 * they are literals and lose their special Markdown meaning.
	 *
	 * @param string $_ Raw code text.
	 * @return string
	 */
	private function _EncodeCode($_) {
		# Ampersands go first (HTML entities are not entities within a code
		# span), followed by the angle brackets.
		$encoded = str_replace(
			array('&',     '<',    '>'),
			array('&amp;', '&lt;', '&gt;'),
			$_);

		# Then the characters that are magic in Markdown.
		$table = self::$md_escape_table;

		return str_replace(array_keys($table), array_values($table), $encoded);
	}
869
 
870
 
871
	/**
	 * Converts ** / __ into <strong> and * / _ into <em>.
	 * <strong> must be handled first.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _DoItalicsAndBold($text) {
		$strong = '{
				(						# $1: Marker
					(?<!\*\*) \*\* |	#     (not preceded by two chars of
					(?<!__)   __		#      the same marker)
				)
				(?=\S) 					# Not followed by whitespace
				(?!\1)					#   or two others marker chars.
				(						# $2: Content
					(?:
						[^*_]+?			# Anthing not em markers.
					|
										# Balence any regular emphasis inside.
						([*_]) (?=\S) .+? (?<=\S) \3	# $3: em char (* or _)
					|
						(?! \1 ) .		# Allow unbalenced * and _.
					)+?
				)
				(?<=\S) \1				# End mark not preceded by whitespace.
			}sx';

		$emphasis = '{ ( (?<!\*)\* | (?<!_)_ ) (?=\S) (?! \1) (.+?) (?<=\S) \1 }sx';

		$text = preg_replace($strong, '<strong>\2</strong>', $text);

		return preg_replace($emphasis, '<em>\2</em>', $text);
	}
900
 
901
 
902
	/**
	 * Converts runs of lines introduced by ">" into <blockquote> elements.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _DoBlockQuotes($text) {
		$quote_block = '/
			  (								# Wrap whole match in $1
				(
				  ^[ \t]*>[ \t]?			# ">" at the start of a line
					.+\n					# rest of the first line
				  (.+\n)*					# subsequent consecutive lines
				  \n*						# blanks
				)+
			  )
			/xm';

		return preg_replace_callback(
			$quote_block, array($this, '_DoBlockQuotes_callback'), $text);
	}
917
	/**
	 * Builds one <blockquote> element.
	 *
	 * One level of ">" quoting is removed and whitespace-only lines are
	 * emptied; the result is processed recursively as block-level content
	 * and every line is indented by two spaces. That indentation is undone
	 * inside <pre> elements by _DoBlockQuotes_callback2().
	 *
	 * @param array $matches [1] = the quoted block.
	 * @return string
	 */
	private function _DoBlockQuotes_callback($matches) {
		$unquoted = preg_replace(
			array('/^[ \t]*>[ \t]?/m', '/^[ \t]+$/m'), '', $matches[1]);

		$inner = $this->_RunBlockGamut($unquoted);	# recurse
		$inner = preg_replace('/^/m', "  ", $inner);

		# The leading spaces just added interfere with <pre> content.
		$inner = preg_replace_callback(
			'{(\s*<pre>.+?</pre>)}sx',
			array($this, '_DoBlockQuotes_callback2'),
			$inner);

		return "<blockquote>\n$inner\n</blockquote>\n\n";
	}
930
	/**
	 * Removes the two leading spaces from every line of a matched <pre>
	 * block.
	 *
	 * @param array $matches [1] = the <pre>...</pre> block.
	 * @return string
	 */
	private function _DoBlockQuotes_callback2($matches) {
		return preg_replace('/^  /m', '', $matches[1]);
	}
935
 
936
 
937
	/**
	 * Wraps paragraphs in <p> tags and restores hashed HTML blocks.
	 *
	 * The text is split on blank lines. A chunk that is not a key of
	 * $md_html_blocks is run through _RunSpanGamut() and wrapped in
	 * <p>...</p>; a chunk that is a key is replaced by the stored block.
	 *
	 * @param string $text String to process with html <p> tags.
	 * @return string
	 */
	private function _FormParagraphs($text) {
		# Strip leading and trailing lines.
		$body = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $text);

		$chunks = preg_split('/\n{2,}/', $body, -1, PREG_SPLIT_NO_EMPTY);

		# First pass: wrap <p> tags.
		foreach ($chunks as $index => $chunk) {
			if (isset($this->md_html_blocks[$chunk])) {
				continue;
			}
			$spanned = $this->_RunSpanGamut($chunk);
			$chunks[$index] = preg_replace('/^([ \t]*)/', '<p>', $spanned) . "</p>";
		}

		# Second pass: unhashify HTML blocks.
		foreach ($chunks as $index => $chunk) {
			if (!isset($this->md_html_blocks[$chunk])) {
				continue;
			}
			$chunks[$index] = $this->md_html_blocks[$chunk];
		}

		return implode("\n\n", $chunks);
	}
970
 
971
 
972
	/**
	 * Smart processing for ampersands and angle brackets that need to be
	 * encoded.
	 *
	 * Ampersands that do not start an entity become &amp; (based on Nat
	 * Irons's Amputator MT plugin, http://bumppo.net/projects/amputator/),
	 * then naked <'s become &lt;.
	 *
	 * @param string $text
	 * @return string
	 */
	private function _EncodeAmpsAndAngles($text) {
		# The patterns are applied in order: ampersands, then naked <'s.
		return preg_replace(
			array('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', '{<(?![a-z/?\$!])}i'),
			array('&amp;',                               '&lt;'),
			$text);
	}
985
 
986
 
987
	/**
	 * Replaces every backslash escape sequence listed in
	 * $md_backslash_escape_table with its hash. The table's first entry is
	 * the escaped backslash, which must be processed first.
	 *
	 * @param string $text
	 * @return string The string after processing the escape sequences.
	 */
	private function _EncodeBackslashEscapes($text) {
		$escapes = self::$md_backslash_escape_table;

		return str_replace(array_keys($escapes), array_values($escapes), $text);
	}
997
 
998
 
999
	private function _DoAutoLinks($text) {
	#
	#	Parameter:  String.
	#	Returns:    The string with <http://...>, <https://...> and <ftp://...>
	#				turned into plain links, and <address@domain.foo>
	#				(optionally prefixed with "mailto:") turned into an
	#				entity-encoded mailto link.
	#
		$text = preg_replace("!<((https?|ftp):[^'\">\\s]+)>!",
							 '<a href="\1">\1</a>', $text);

		# Email addresses: <address@domain.foo>
		# A callback is used instead of the /e (eval) modifier, which was
		# removed in PHP 7 and made this replacement fail.
		$text = preg_replace_callback('{
			<
			(?:mailto:)?
			(
				[-.\w]+
				\@
				[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
			)
			>
			}xi',
			array($this, '_DoAutoLinks_email_callback'),
			$text);

		return $text;
	}
	private function _DoAutoLinks_email_callback($matches) {
	#
	#	Callback for the email pattern in _DoAutoLinks.
	#	Parameter:  Match array; element 1 is the address without brackets.
	#	Returns:    The obfuscated mailto link for that address.
	#
		# The match is passed raw (no addslashes as with /e), so no quote
		# unslashing is needed before unescaping and encoding.
		$address = $this->_UnescapeSpecialChars($matches[1]);
		return $this->_EncodeEmailAddress($address);
	}
1019
 
1020
 
1021
	private function _EncodeEmailAddress($addr) {
	#
	#	Input: an email address, e.g. "foo@example.com"
	#
	#	Output: the email address as a mailto link, with each character
	#		of the address encoded as either a decimal or hex entity, in
	#		the hopes of foiling most address harvesting spam bots. E.g.:
	#
	#	  <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
	#		x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
	#		&#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
	#
	#	Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
	#	mailing list: <http://tinyurl.com/yu7ue>
	#
		# Encode every character of "mailto:<address>" except ':', which is
		# left alone so the mailto: prefix can be spotted later.
		$encoded = preg_replace_callback('/([^\:])/',
										 array($this,'_EncodeEmailAddress_callback'),
										 "mailto:" . $addr);

		# The same encoded string serves as both href and link text.
		$link = "<a href=\"$encoded\">$encoded</a>";

		# strip the mailto: from the visible part
		return preg_replace('/">.+?:/', '">', $link);
	}
1049
	private function _EncodeEmailAddress_callback($matches) {
	#
	#	Callback that obfuscates one character of an email address.
	#	Parameter:  Match array; element 1 is the single character.
	#	Returns:    The character raw, as a hex entity, or as a decimal
	#				entity, chosen at random.
	#
		$ch = $matches[1];
		$roll = rand(0, 100);
		$code = ord($ch);

		# roughly 10% raw, 45% hex, 45% dec
		# '@' *must* be encoded. I insist.
		if ($roll > 90 && $ch != '@') {
			return $ch;
		}

		return ($roll < 45)
			? '&#x'.dechex($code).';'
			: '&#'.$code.';';
	}
1058
 
1059
 
1060
	private function _UnescapeSpecialChars($text) {
	#
	#	Swap back in all the special characters we've hidden.
	#	Parameter:  String.
	#	Returns:    The string with every value of the escape table replaced
	#				by the key it is stored under.
	#
		$table = self::$md_escape_table;
		$hashes = array_values($table);
		$characters = array_keys($table);

		return str_replace($hashes, $characters, $text);
	}
1067
 
1068
 
1069
	# _TokenizeHTML is shared between PHP Markdown and PHP SmartyPants.
	# In this class-based version it is always defined, as a private method.
1071
 
1072
	private function _TokenizeHTML($str) {
	#
	#   Parameter:  String containing HTML markup.
	#   Returns:    An array of the tokens comprising the input
	#               string. Each token is either a tag (possibly with nested
	#               tags contained therein, such as <a href="<MTFoo>">), or a
	#               run of text between tags. Each element of the array is a
	#               two-element array; the first is either 'tag' or 'text';
	#               the second is the actual value. Empty segments produce
	#               no token.
	#
	#   Regular expression derived from the _tokenize() subroutine in
	#   Brad Choate's MTRegex plugin.
	#   <http://www.bradchoate.com/past/mtregex.php>
	#
		$tokens = array();

		$match = '(?s:<!(?:--.*?--\s*)+>)|'.	# comment
				 '(?s:<\?.*?\?>)|'.				# processing instruction
												# regular tags
				 '(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';

		$parts = preg_split("{($match)}", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

		if ($parts === false) {
			# preg_split failed: keep the input as plain text rather than
			# silently dropping it.
			return strlen((string) $str) ? array(array('text', $str)) : $tokens;
		}

		# With PREG_SPLIT_DELIM_CAPTURE the pieces alternate: even offsets are
		# the text between tags, odd offsets are the captured tags.
		foreach ($parts as $offset => $part) {
			# Skip empty text segments (e.g. before a leading tag or after a
			# trailing one) instead of emitting them as bogus 'tag' tokens.
			if ($part === '')
				continue;

			$tokens[] = array($offset % 2 ? 'tag' : 'text', $part);
		}

		return $tokens;
	}
1106
 
1107
	private function _Outdent($text) {
	#
	#	Remove one level of line-leading tabs or spaces.
	#	Parameter:  String.
	#	Returns:    The string with, on every line, either one leading tab or
	#				up to $this->md_tab_width leading spaces removed.
	#
		$one_level = '/^(\t|[ ]{1,' . $this->md_tab_width . '})/m';

		return preg_replace($one_level, "", $text);
	}
1113
 
1114
 
1115
	private function _Detab($text) {
	#
	#	Replace tabs with the appropriate amount of space.
	#	Parameter:  String.
	#	Returns:    The string with each tab expanded to the next multiple of
	#				$this->md_tab_width (measured with strlen, i.e. in bytes).
	#				Every line of the result, including the last one, ends
	#				with a newline.
	#
		$result = "";

		foreach (explode("\n", $text) as $raw_line) {
			# Cut the line at its tab characters; the first piece starts the
			# rebuilt line and each following piece is padded into place.
			$pieces = explode("\t", $raw_line);
			$rebuilt = array_shift($pieces);

			foreach ($pieces as $piece) {
				# Spaces needed to reach the next tab stop.
				$padding = $this->md_tab_width - strlen($rebuilt) % $this->md_tab_width;
				$rebuilt .= str_repeat(" ", $padding) . $piece;
			}

			$result .= $rebuilt . "\n";
		}

		return $result;
	}
1141
 
1142
 
1143
	private function _UnslashQuotes($text) {
	#
	#	This function is useful to remove automatically slashed double quotes
	#	when using preg_replace and evaluating an expression.
	#	Parameter:  String.
	#	Returns:    The string with any slash-double-quote (\") sequence replaced
	#				by a single double quote.
	#
		$slashed_quote = '\"';
		$plain_quote = '"';

		return str_replace($slashed_quote, $plain_quote, $text);
	}
1153
}
1154
 
1155
/*
1156
 
1157
PHP Markdown
1158
============
1159
 
1160
Description
1161
-----------
1162
 
1163
This is a PHP translation of the original Markdown formatter written in
1164
Perl by John Gruber.
1165
 
1166
Markdown is a text-to-HTML filter; it translates an easy-to-read /
1167
easy-to-write structured text format into HTML. Markdown's text format
1168
is most similar to that of plain text email, and supports features such
1169
as headers, *emphasis*, code blocks, blockquotes, and links.
1170
 
1171
Markdown's syntax is designed not as a generic markup language, but
1172
specifically to serve as a front-end to (X)HTML. You can use span-level
1173
HTML tags anywhere in a Markdown document, and you can use block level
1174
HTML tags (like <div> and <table>) as well.
1175
 
1176
For more information about Markdown's syntax, see:
1177
 
1178
<http://daringfireball.net/projects/markdown/>
1179
 
1180
 
1181
Bugs
1182
----
1183
 
1184
To file bug reports please send email to:
1185
 
1186
<michel.fortin@michelf.com>
1187
 
1188
Please include with your report: (1) the example input; (2) the output you
1189
expected; (3) the output Markdown actually produced.
1190
 
1191
 
1192
Version History
1193
---------------
1194
 
1195
See the readme file for detailed release notes for this version.
1196
 
1197
1.0.1c - 9 Dec 2005
1198
 
1199
1.0.1b - 6 Jun 2005
1200
 
1201
1.0.1a - 15 Apr 2005
1202
 
1203
1.0.1 - 16 Dec 2004
1204
 
1205
1.0 - 21 Aug 2004
1206
 
1207
 
1208
Author & Contributors
1209
---------------------
1210
 
1211
Original Perl version by John Gruber
1212
<http://daringfireball.net/>
1213
 
1214
PHP port and other contributions by Michel Fortin
1215
<http://www.michelf.com/>
1216
 
1217
 
1218
Copyright and License
1219
---------------------
1220
 
1221
Copyright (c) 2004-2005 Michel Fortin
1222
<http://www.michelf.com/>
1223
All rights reserved.
1224
 
1225
Copyright (c) 2003-2004 John Gruber
1226
<http://daringfireball.net/>
1227
All rights reserved.
1228
 
1229
Redistribution and use in source and binary forms, with or without
1230
modification, are permitted provided that the following conditions are
1231
met:
1232
 
1233
*	Redistributions of source code must retain the above copyright notice,
1234
	this list of conditions and the following disclaimer.
1235
 
1236
*	Redistributions in binary form must reproduce the above copyright
1237
	notice, this list of conditions and the following disclaimer in the
1238
	documentation and/or other materials provided with the distribution.
1239
 
1240
*	Neither the name "Markdown" nor the names of its contributors may
1241
	be used to endorse or promote products derived from this software
1242
	without specific prior written permission.
1243
 
1244
This software is provided by the copyright holders and contributors "as
1245
is" and any express or implied warranties, including, but not limited
1246
to, the implied warranties of merchantability and fitness for a
1247
particular purpose are disclaimed. In no event shall the copyright owner
1248
or contributors be liable for any direct, indirect, incidental, special,
1249
exemplary, or consequential damages (including, but not limited to,
1250
procurement of substitute goods or services; loss of use, data, or
1251
profits; or business interruption) however caused and on any theory of
1252
liability, whether in contract, strict liability, or tort (including
1253
negligence or otherwise) arising in any way out of the use of this
1254
software, even if advised of the possibility of such damage.
1255
 
1256
*/
1257
?>