| 1 |
lars |
1 |
<?php
|
|
|
2 |
/* vim: set noai expandtab ts=4 st=4 sw=4: */
|
|
|
3 |
|
|
|
4 |
/**
|
|
|
5 |
* Abstract class providing common functions to File_Sitemap related classes.
|
|
|
6 |
*
|
|
|
7 |
* PHP version 5
|
|
|
8 |
*
|
|
|
9 |
* Redistribution and use in source and binary forms, with or without
|
|
|
10 |
* modification, are permitted provided that the following conditions are met:
|
|
|
11 |
*
|
|
|
12 |
* * Redistributions of source code must retain the above copyright notice,
|
|
|
13 |
* this list of conditions and the following disclaimer.
|
|
|
14 |
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
15 |
* this list of conditions and the following disclaimer in the documentation
|
|
|
16 |
* and/or other materials provided with the distribution.
|
|
|
17 |
* * The names of its contributors may not be used to endorse or promote
|
|
|
18 |
* products derived from this software without specific prior written
|
|
|
19 |
* permission.
|
|
|
20 |
*
|
|
|
21 |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
22 |
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
23 |
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
24 |
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
25 |
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
26 |
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
27 |
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
28 |
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
29 |
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
30 |
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
31 |
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
32 |
*
|
|
|
33 |
* @category File
|
|
|
34 |
* @package File_Sitemap
|
|
|
35 |
* @author Charles Brunet <cbrunet@php.net>
|
|
|
36 |
* @license http://www.opensource.org/licenses/bsd-license.html BSD License
|
|
|
37 |
* @version CVS: $Id: Base.php 299129 2010-05-07 22:21:04Z cbrunet $
|
|
|
38 |
* @link http://pear.php.net/package/File_Sitemap
|
|
|
39 |
*/
|
|
|
40 |
|
|
|
41 |
require_once "File/Sitemap/Exception.php";
|
|
|
42 |
|
|
|
43 |
/**
 * Abstract class providing common functions to File_Sitemap related classes.
 *
 * It owns the DOMDocument holding the sitemap, and offers helpers to look up
 * and update entries, to normalise URLs and dates, to load/save the document
 * (optionally gzipped), to ping search engines and to check the listed URLs.
 *
 * @category File
 * @package  File_Sitemap
 * @author   Charles Brunet <cbrunet@php.net>
 * @license  http://www.opensource.org/licenses/bsd-license.html BSD License
 * @version  Release: 0.1.2
 * @link     http://pear.php.net/package/File_Sitemap
 */
abstract class File_Sitemap_Base
{

    /**
     * The internal DOMDocument used by this class
     *
     * @var DOMDocument
     */
    protected $dom;

    /**
     * XML namespace
     */
    const XMLNS = 'http://www.sitemaps.org/schemas/sitemap/0.9';

    /**
     * namespace of XMLSchema-instance
     */
    const XSI = 'http://www.w3.org/2001/XMLSchema-instance';

    /**
     * XML special characters and the entity each one is replaced with.
     *
     * Used by parseURL() to encode, and (flipped) by findLoc() to decode.
     *
     * @var array
     */
    private static $_xmlEntities = array(
        '&'  => '&amp;',
        '\'' => '&apos;',
        '"'  => '&quot;',
        '>'  => '&gt;',
        '<'  => '&lt;',
    );

    /**
     * Constructor. Build an empty XML document, with xmlns and root element.
     *
     * The root element is created in the sitemap namespace and receives an
     * xsi:schemaLocation attribute pairing that namespace with $schema.
     *
     * @param string $root   Name of the root element
     * @param string $schema Location of the schema
     *
     * @return void
     */
    public function __construct($root, $schema)
    {
        $imp = new DomImplementation();

        $this->dom           = $imp->createDocument(self::XMLNS, $root);
        $this->dom->version  = '1.0';
        $this->dom->encoding = 'UTF-8';

        $attr        = $this->dom->createAttributeNS(
            self::XSI,
            'xsi:schemaLocation'
        );
        $attr->value = self::XMLNS.' '.$schema;
        $this->dom->documentElement->appendChild($attr);
    }

    /**
     * Returns the DOMNode element which contains $url, or false if not found.
     *
     * The returned node is the parent of the first matching loc element.
     *
     * @param string $url URL (loc) we are looking for.
     *
     * @return mixed DOMNode | false
     */
    protected function findLoc($url)
    {
        $url = (string) $url;

        // Callers pass the entity-encoded form produced by parseURL(), but
        // the DOM may hand text content back with entities already resolved.
        // Accept either spelling so URLs containing &, ', ", < or > match.
        $decoded = strtr($url, array_flip(self::$_xmlEntities));

        $locs = $this->dom->getElementsByTagNameNS(self::XMLNS, 'loc');
        foreach ($locs as $urlElem) {
            $value = $urlElem->nodeValue;
            if ($value === $url || $value === $decoded) {
                return $urlElem->parentNode;
            }
        }
        return false;
    }

    /**
     * Set to $nodeVal the value of the $nodeName child of $urlNode.
     *
     * If $urlNode doesn't have a $nodeName child, add it. A new lastmod is
     * placed before any changefreq / priority child and a new changefreq
     * before any priority child; anything else is appended at the end.
     *
     * @param DOMNode $urlNode  The parent of the node we want to update.
     * @param string  $nodeName The name of the node we want to update.
     * @param string  $nodeVal  The value we want to put into the node.
     *
     * @return void
     */
    protected function updateNode($urlNode, $nodeName, $nodeVal)
    {
        // names of the siblings a newly created node has to precede
        switch ($nodeName) {
        case 'lastmod':
            $before = array('changefreq', 'priority');
            break;
        case 'changefreq':
            $before = array('priority');
            break;
        default:
            $before = array();
        }

        $beforeNode = null;

        // replace the value of an existing $nodeName child, remembering on
        // the way the first sibling a new node would have to precede
        foreach ($urlNode->childNodes as $child) {
            if ($child->nodeName === $nodeName) {
                $child->nodeValue = $nodeVal;
                return;
            }
            if ($beforeNode === null
                && in_array($child->nodeName, $before, true)
            ) {
                $beforeNode = $child;
            }
        }

        // node not found, we need to create it
        $elem = $this->dom->createElementNS(self::XMLNS, $nodeName, $nodeVal);
        if ($beforeNode === null) {
            $urlNode->appendChild($elem);
        } else {
            $urlNode->insertBefore($elem, $beforeNode);
        }
    }

    /**
     * Used as callback function to preg_replace_callback to urlencode char
     *
     * @param array $char $char[0] will be encoded
     *
     * @return string
     */
    private static function _myUrlEncode($char)
    {
        return rawurlencode($char[0]);
    }

    /**
     * Ensure url contains valid chars and isn't longer than 2048 chars.
     *
     * XML special chars (& ' " > <) are converted to entities, then every
     * remaining char outside the allowed set is replaced by its %nn form.
     *
     * @param string $url The url we want to verify and encode.
     *
     * @return string
     *
     * @throws {@link File_Sitemap_Exception} URL doesn't begin with valid
     * protocol (http, https, ftp) or encoded URL longer than 2048 chars.
     */
    protected function parseURL($url)
    {
        $protocols = array('http', 'https', 'ftp');

        // implode() takes the separator first; the legacy (array, separator)
        // order is rejected by PHP 8.
        $pattern = '/^('.implode('|', $protocols).'):\/\//';
        if (!preg_match($pattern, $url)) {
            throw new File_Sitemap_Exception(
                'URL must begin with a protocol ('.
                implode(', ', $protocols).').',
                File_Sitemap_Exception::PARSE_ERROR
            );
        }

        // encode XML special chars
        $url = strtr($url, self::$_xmlEntities);

        // replace other chars with %nn form; the pattern works on bytes, so
        // each byte of a multi-byte char is percent-encoded on its own
        $url = preg_replace_callback(
            '/[^0-9a-zA-Z_'.
            ':\/?#\[\]@!$&\'()*+,;=%~.-]/',
            array(__CLASS__, '_myUrlEncode'),
            $url
        );

        if (strlen($url) > 2048) {
            throw new File_Sitemap_Exception(
                'URL must not be longer than 2048 chars.',
                File_Sitemap_Exception::PARSE_ERROR
            );
        }

        return $url;
    }

    /**
     * Ensure that $datetime is a valid date time string
     *
     * If $datetime conforms to the spec, it is returned as is.
     * Else we try to decode it using the strtotime function and return it
     * formatted as a complete date, time and timezone offset.
     *
     * @param string $datetime The date (and time) to parse.
     *
     * @return string
     *
     * @see http://www.w3.org/TR/NOTE-datetime
     * @throws {@link File_Sitemap_Exception} Invalid date / time format.
     */
    protected function parseDateTime($datetime)
    {
        // The D modifier keeps "$" from matching before a trailing newline,
        // which would otherwise be returned as part of a "valid" value.
        $valid = preg_match(
            '/^\d{4}(-\d{2}(-\d{2}(T\d{2}:\d{2}(:\d{2}(\.\d+)?)?'.
            '([+-]\d{2}:\d{2}|Z))?)?)?$/D',
            $datetime
        );
        if ($valid) {
            return $datetime;
        }

        // Try to convert it
        $timestamp = @strtotime($datetime);
        if ($timestamp === false) {
            throw new File_Sitemap_Exception(
                'unable to parse date time string.',
                File_Sitemap_Exception::PARSE_ERROR
            );
        }

        return date('Y-m-d\TH:i:sP', $timestamp);
    }

    /**
     * Remove DOMNode that contains url $loc from the document.
     *
     * Nothing happens when no entry matches $loc.
     *
     * @param string $loc URL to remove
     *
     * @return void
     *
     * @throws {@link File_Sitemap_Exception} $loc is not a valid URL.
     */
    public function remove($loc)
    {
        $urlNode = $this->findLoc($this->parseURL($loc));
        if ($urlNode === false) {
            return;
        }
        $this->dom->documentElement->removeChild($urlNode);
    }

    /**
     * Load sitemap from file. The file can be gzipped or not.
     *
     * A name ending in "gz" is read through the zlib functions; anything
     * else is handed directly to DOMDocument::load().
     *
     * @param string $file Filename (or URL).
     *
     * @return void
     *
     * @throws {@link File_Sitemap_Exception} File read error.
     */
    public function load($file)
    {
        if (substr($file, -2) == 'gz') {
            $gzfile = gzopen($file, 'r');
            if ($gzfile === false) {
                throw new File_Sitemap_Exception(
                    'error opening gziped sitemap file.',
                    File_Sitemap_Exception::FILE_ERROR
                );
            }

            $xml = '';
            while (!gzeof($gzfile)) {
                $chunk = gzread($gzfile, 10000);
                if ($chunk === false) {
                    // stop instead of looping on a stream that cannot be read
                    gzclose($gzfile);
                    throw new File_Sitemap_Exception(
                        'error reading gziped sitemap file.',
                        File_Sitemap_Exception::FILE_ERROR
                    );
                }
                $xml .= $chunk;
            }
            gzclose($gzfile);

            // an empty string is never a valid document; don't pass it on
            $loaded = ($xml !== '') && $this->dom->loadXML($xml);
        } else {
            $loaded = $this->dom->load($file);
        }

        if (!$loaded) {
            throw new File_Sitemap_Exception(
                'error loading sitemap file.',
                File_Sitemap_Exception::FILE_ERROR
            );
        }
    }

    /**
     * Save sitemap to file.
     *
     * When compressing, ".gz" is appended to $file unless already present.
     *
     * @param string  $file         Filename (or URL), including path.
     * @param boolean $compress     gzip the file? Default true.
     * @param boolean $formatOutput Nice format XML. Default false.
     *
     * @return void
     *
     * @throws {@link File_Sitemap_Exception} File write error.
     */
    public function save($file, $compress = true, $formatOutput = false)
    {
        $this->dom->formatOutput = $formatOutput;

        if (!$compress) {
            if ($this->dom->save($file) === false) {
                throw new File_Sitemap_Exception(
                    'error saving sitemap file.',
                    File_Sitemap_Exception::FILE_ERROR
                );
            }
            return;
        }

        if (substr($file, -3) != '.gz') {
            $file .= '.gz';
        }

        // serialise first, so a failure here doesn't truncate the target
        $xml = $this->dom->saveXML();
        if ($xml === false) {
            throw new File_Sitemap_Exception(
                'error saving gziped sitemap file.',
                File_Sitemap_Exception::FILE_ERROR
            );
        }

        $gzfile = gzopen($file, 'w9');
        if ($gzfile === false) {
            throw new File_Sitemap_Exception(
                'error saving gziped sitemap file.',
                File_Sitemap_Exception::FILE_ERROR
            );
        }
        $written = gzwrite($gzfile, $xml);
        gzclose($gzfile);

        // gzwrite() reports the number of (uncompressed) bytes it accepted
        if ($written === false || $written < strlen($xml)) {
            throw new File_Sitemap_Exception(
                'error saving gziped sitemap file.',
                File_Sitemap_Exception::FILE_ERROR
            );
        }
    }

    /**
     * Notify $site that a sitemap was updated at $url
     *
     * $url is requested first to make sure it is reachable; then each ping
     * site is requested with a "sitemap" query parameter set to $url.
     *
     * @param string $url  URL of the sitemap file (must be valid)
     * @param mixed  $site string | array. URL (or array of URL) of the search
     *                     engine ping site
     *
     * @return void
     *
     * @throws {@link File_Sitemap_Exception} Sitemap file not reachable
     * or ping site error.
     */
    public function notify($url,
        $site = 'http://www.google.com/webmasters/sitemaps/ping'
    ) {
        $this->_includeHTTPRequest();

        // check that $url exists
        $req = new HTTP_Request('');
        $req->setURL($url);
        $code = $this->_requestCode($req);

        if ($code != 200) {
            throw new File_Sitemap_Exception(
                'Cannot reach sitemap file. Error: '.$code,
                File_Sitemap_Exception::ERROR + $code
            );
        }

        // Ping the web search engine
        if (!is_array($site)) {
            $site = array($site);
        }

        $req->setMethod(HTTP_REQUEST_METHOD_GET);
        foreach ($site as $s) {
            $req->setURL($s);
            $req->addQueryString('sitemap', $url);
            $code = $this->_requestCode($req);

            if ($code != 200) {
                throw new File_Sitemap_Exception(
                    'Cannot reach '.$s.'. Error: '.$code,
                    File_Sitemap_Exception::ERROR + $code
                );
            }
        }
    }

    /**
     * Test that all url in sitemap are valid URL
     *
     * Every loc of the document is requested once.
     *
     * @param array &$results An array that will contain result codes.
     * key is the url, value is the response code (200, 302, 404, etc.),
     * or false when the request itself failed.
     *
     * @return boolean true if all URLs reached
     */
    public function test(&$results = array())
    {
        $this->_includeHTTPRequest();

        $req   = new HTTP_Request('');
        $allok = true;

        $urllist = $this->dom->getElementsByTagNameNS(self::XMLNS, 'loc');
        foreach ($urllist as $urlnode) {
            $url = html_entity_decode($urlnode->nodeValue);
            $req->setURL($url);
            $code = $this->_requestCode($req);

            $results[$url] = $code;
            // a request that produced no response did not reach the URL
            if ($code === false || $code >= 400) {
                $allok = false;
            }
        }
        return $allok;
    }

    /**
     * Validate the sitemap document against an XML Schema
     *
     * Be warned that it will issue some warnings if it doesn't validate.
     *
     * @param string $schema URL of the validating schema.
     *
     * @return boolean
     */
    public function validate($schema)
    {
        return $this->dom->schemaValidate($schema);
    }

    /**
     * Send $req and return the status code of its response.
     *
     * A transport failure is reported as false rather than by reading the
     * response code, which might still be the one of an earlier request
     * made with the same object.
     *
     * @param HTTP_Request $req Request to send, already configured.
     *
     * @return mixed response code | false if the request failed
     */
    private function _requestCode($req)
    {
        $result = $req->sendRequest();
        // instanceof is safe even if the PEAR_Error class is not loaded
        if ($result instanceof PEAR_Error) {
            return false;
        }
        return $req->getResponseCode();
    }

    /**
     * Check for HTTP_Request package and include it
     *
     * The check is done once; later calls return immediately.
     *
     * @return void
     *
     * @throws {@link File_Sitemap_Exception} HTTP_Request is not available.
     */
    private function _includeHTTPRequest()
    {
        static $included = false;

        if ($included) {
            return;
        }

        // optional dependency: a missing file is reported by the check below
        @include_once 'HTTP/Request.php';

        if (!class_exists('HTTP_Request')) {
            throw new File_Sitemap_Exception(
                'HTTP_Request class not found.',
                File_Sitemap_Exception::ERROR
            );
        }

        $included = true;
    }
}
|
|
|
456 |
|
|
|
457 |
?>
|