Blame | Letzte Änderung | Log anzeigen | RSS feed
<?php declare(strict_types=1);namespace PHPHtmlParser;use PHPHtmlParser\Dom\AbstractNode;use PHPHtmlParser\Dom\Collection;use PHPHtmlParser\Dom\HtmlNode;use PHPHtmlParser\Dom\TextNode;use PHPHtmlParser\Exceptions\ChildNotFoundException;use PHPHtmlParser\Exceptions\CircularException;use PHPHtmlParser\Exceptions\CurlException;use PHPHtmlParser\Exceptions\NotLoadedException;use PHPHtmlParser\Exceptions\ParentNotFoundException;use PHPHtmlParser\Exceptions\StrictException;use PHPHtmlParser\Exceptions\UnknownChildTypeException;use PHPHtmlParser\Exceptions\LogicalException;use stringEncode\Encode;/*** Class Dom** @package PHPHtmlParser*/class Dom{/*** The charset we would like the output to be in.** @var string*/protected $defaultCharset = 'UTF-8';/*** Contains the root node of this dom tree.** @var HtmlNode*/public $root;/*** The raw version of the document string.** @var string*/protected $raw;/*** The document string.** @var Content*/protected $content = null;/*** The original file size of the document.** @var int*/protected $rawSize;/*** The size of the document after it is cleaned.** @var int*/protected $size;/*** A global options array to be used by all load calls.** @var array*/protected $globalOptions = [];/*** A persistent option object to be used for all options in the* parsing of the file.** @var Options*/protected $options;/*** A list of tags which will always be self closing** @var array*/protected $selfClosing = ['area','base','basefont','br','col','embed','hr','img','input','keygen','link','meta','param','source','spacer','track','wbr'];/*** A list of tags where there should be no /> at the end (html5 style)** @var array*/protected $noSlash = [];/*** Returns the inner html of the root node.** @return string* @throws ChildNotFoundException* @throws UnknownChildTypeException*/public function __toString(): string{return $this->root->innerHtml();}/*** A simple wrapper around the root node.** @param string $name* @return mixed*/public function __get($name){return $this->root->$name;}/*** Attempts to load the dom from any resource, string, file, or URL.* @param string $str* @param array $options* @return Dom* @throws ChildNotFoundException* @throws CircularException* @throws CurlException* @throws StrictException*/public function load(string $str, array $options = []): Dom{AbstractNode::resetCount();// check if it's a fileif (strpos($str, "\n") === false && is_file($str)) {return $this->loadFromFile($str, $options);}// check if it's a urlif (preg_match("/^https?:\/\//i", $str)) {return $this->loadFromUrl($str, $options);}return $this->loadStr($str, $options);}/*** Loads the dom from a document file/url* @param string $file* @param array $options* @return Dom* @throws ChildNotFoundException* @throws CircularException* @throws StrictException* @throws LogicalException*/public function loadFromFile(string $file, array $options = []): Dom{$content = file_get_contents($file);if ($content === false) {throw new LogicalException('file_get_contents failed and returned false when trying to read "'.$file.'".');}return $this->loadStr($content, $options);}/*** Use a curl interface implementation to attempt to load* the content from a url.* @param string $url* @param array $options* @param CurlInterface|null $curl* @return Dom* @throws ChildNotFoundException* @throws CircularException* @throws CurlException* @throws StrictException*/public function loadFromUrl(string $url, array $options = [], CurlInterface $curl = null): Dom{if (is_null($curl)) {// use the default curl interface$curl = new Curl;}$content = $curl->get($url, $options);return $this->loadStr($content, $options);}/*** Parsers the html of the given string. Used for load(), loadFromFile(),* and loadFromUrl().* @param string $str* @param array $option* @return Dom* @throws ChildNotFoundException* @throws CircularException* @throws StrictException*/public function loadStr(string $str, array $option = []): Dom{$this->options = new Options;$this->options->setOptions($this->globalOptions)->setOptions($option);$this->rawSize = strlen($str);$this->raw = $str;$html = $this->clean($str);$this->size = strlen($str);$this->content = new Content($html);$this->parse();$this->detectCharset();return $this;}/*** Sets a global options array to be used by all load calls.** @param array $options* @return Dom* @chainable*/public function setOptions(array $options): Dom{$this->globalOptions = $options;return $this;}/*** Find elements by css selector on the root node.* @param string $selector* @param int|null $nth* @return mixed|Collection|null* @throws ChildNotFoundException* @throws NotLoadedException*/public function find(string $selector, int $nth = null){$this->isLoaded();$depthFirstSearch = $this->options->get('depthFirstSearch');if (is_bool($depthFirstSearch)) {$result = $this->root->find($selector, $nth, $depthFirstSearch);} else {$result = $this->root->find($selector, $nth);}return $result;}/*** Find element by Id on the root node* @param int $id* @return bool|AbstractNode* @throws ChildNotFoundException* @throws NotLoadedException* @throws ParentNotFoundException*/public function findById(int $id){$this->isLoaded();return $this->root->findById($id);}/*** Adds the tag (or tags in an array) to the list of tags that will always* be self closing.** @param string|array $tag* @return Dom* @chainable*/public function addSelfClosingTag($tag): Dom{if ( ! is_array($tag)) {$tag = [$tag];}foreach ($tag as $value) {$this->selfClosing[] = $value;}return $this;}/*** Removes the tag (or tags in an array) from the list of tags that will* always be self closing.** @param string|array $tag* @return Dom* @chainable*/public function removeSelfClosingTag($tag): Dom{if ( ! is_array($tag)) {$tag = [$tag];}$this->selfClosing = array_diff($this->selfClosing, $tag);return $this;}/*** Sets the list of self closing tags to empty.** @return Dom* @chainable*/public function clearSelfClosingTags(): Dom{$this->selfClosing = [];return $this;}/*** Adds a tag to the list of self closing tags that should not have a trailing slash** @param $tag* @return Dom* @chainable*/public function addNoSlashTag($tag): Dom{if ( ! is_array($tag)) {$tag = [$tag];}foreach ($tag as $value) {$this->noSlash[] = $value;}return $this;}/*** Removes a tag from the list of no-slash tags.** @param $tag* @return Dom* @chainable*/public function removeNoSlashTag($tag): Dom{if ( ! is_array($tag)) {$tag = [$tag];}$this->noSlash = array_diff($this->noSlash, $tag);return $this;}/*** Empties the list of no-slash tags.** @return Dom* @chainable*/public function clearNoSlashTags(): Dom{$this->noSlash = [];return $this;}/*** Simple wrapper function that returns the first child.* @return AbstractNode* @throws ChildNotFoundException* @throws NotLoadedException*/public function firstChild(): AbstractNode{$this->isLoaded();return $this->root->firstChild();}/*** Simple wrapper function that returns the last child.* @return AbstractNode* @throws ChildNotFoundException* @throws NotLoadedException*/public function lastChild(): AbstractNode{$this->isLoaded();return $this->root->lastChild();}/*** Simple wrapper function that returns count of child elements** @return int* @throws NotLoadedException*/public function countChildren(): int{$this->isLoaded();return $this->root->countChildren();}/*** Get array of children** @return array* @throws NotLoadedException*/public function getChildren(): array{$this->isLoaded();return $this->root->getChildren();}/*** Check if node have children nodes** @return bool* @throws NotLoadedException*/public function hasChildren(): bool{$this->isLoaded();return $this->root->hasChildren();}/*** Simple wrapper function that returns an element by the* id.* @param $id* @return mixed|Collection|null* @throws ChildNotFoundException* @throws NotLoadedException*/public function getElementById($id){$this->isLoaded();return $this->find('#'.$id, 0);}/*** Simple wrapper function that returns all elements by* tag name.* @param string $name* @return mixed|Collection|null* @throws ChildNotFoundException* @throws NotLoadedException*/public function getElementsByTag(string $name){$this->isLoaded();return $this->find($name);}/*** Simple wrapper function that returns all elements by* class name.* @param string $class* @return mixed|Collection|null* @throws ChildNotFoundException* @throws NotLoadedException*/public function getElementsByClass(string $class){$this->isLoaded();return $this->find('.'.$class);}/*** Checks if the load methods have been called.** @throws NotLoadedException*/protected function isLoaded(): void{if (is_null($this->content)) {throw new NotLoadedException('Content is not loaded!');}}/*** Cleans the html of any none-html information.** @param string $str* @return string*/protected function clean(string $str): string{if ($this->options->get('cleanupInput') != true) {// skip entire cleanup stepreturn $str;}$is_gzip = 0 === mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, "US-ASCII");if ($is_gzip) {$str = gzdecode($str);if ($str === false) {throw new LogicalException('gzdecode returned false. Error when trying to decode the string.');}}// remove white space before closing tags$str = mb_eregi_replace("'\s+>", "'>", $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean single quotes.');}$str = mb_eregi_replace('"\s+>', '">', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean double quotes.');}// clean out the \n\r$replace = ' ';if ($this->options->get('preserveLineBreaks')) {$replace = ' ';}$str = str_replace(["\r\n", "\r", "\n"], $replace, $str);if ($str === false) {throw new LogicalException('str_replace returned false instead of a string. Error when attempting to clean input string.');}// strip the doctype$str = mb_eregi_replace("<!doctype(.*?)>", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip the doctype.');}// strip out comments$str = mb_eregi_replace("<!--(.*?)-->", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip comments.');}// strip out cdata$str = mb_eregi_replace("<!\[CDATA\[(.*?)\]\]>", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out cdata.');}// strip out <script> tagsif ($this->options->get('removeScripts')) {$str = mb_eregi_replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to remove scripts 1.');}$str = mb_eregi_replace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to remove scripts 2.');}}// strip out <style> tagsif ($this->options->get('removeStyles')) {$str = mb_eregi_replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out style tags 1.');}$str = mb_eregi_replace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out style tags 2.');}}// strip out server side scriptsif ($this->options->get('serverSideScripts')) {$str = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out service side scripts.');}}// strip smarty scriptsif ($this->options->get('removeSmartyScripts')) {$str = mb_eregi_replace("(\{\w)(.*?)(\})", '', $str);if ($str === false) {throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to remove smarty scripts.');}}return $str;}/*** Attempts to parse the html in content.** @return void* @throws ChildNotFoundException* @throws CircularException* @throws StrictException*/protected function parse(): void{// add the root node$this->root = new HtmlNode('root');$this->root->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);$activeNode = $this->root;while ( ! is_null($activeNode)) {$str = $this->content->copyUntil('<');if ($str == '') {$info = $this->parseTag();if ( ! $info['status']) {// we are done here$activeNode = null;continue;}// check if it was a closing tagif ($info['closing']) {$foundOpeningTag = true;$originalNode = $activeNode;while ($activeNode->getTag()->name() != $info['tag']) {$activeNode = $activeNode->getParent();if (is_null($activeNode)) {// we could not find opening tag$activeNode = $originalNode;$foundOpeningTag = false;break;}}if ($foundOpeningTag) {$activeNode = $activeNode->getParent();}continue;}if ( ! isset($info['node'])) {continue;}/** @var AbstractNode $node */$node = $info['node'];$activeNode->addChild($node);// check if node is self closingif ( ! $node->getTag()->isSelfClosing()) {$activeNode = $node;}} else if ($this->options->whitespaceTextNode ||trim($str) != '') {// we found text we care about$textNode = new TextNode($str, $this->options->removeDoubleSpace);$textNode->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);$activeNode->addChild($textNode);}}}/*** Attempt to parse a tag out of the content.** @return array* @throws StrictException*/protected function parseTag(): array{$return = ['status' => false,'closing' => false,'node' => null,];if ($this->content->char() != '<') {// we are not at the beginning of a tagreturn $return;}// check if this is a closing tagif ($this->content->fastForward(1)->char() == '/') {// end tag$tag = $this->content->fastForward(1)->copyByToken('slash', true);// move to end of tag$this->content->copyUntil('>');$this->content->fastForward(1);// check if this closing tag counts$tag = strtolower($tag);if (in_array($tag, $this->selfClosing, true)) {$return['status'] = true;return $return;} else {$return['status'] = true;$return['closing'] = true;$return['tag'] = strtolower($tag);}return $return;}$tag = strtolower($this->content->copyByToken('slash', true));if (trim($tag) == ''){// no tag found, invalid < foundreturn $return;}$node = new HtmlNode($tag);$node->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);// attributeswhile ($this->content->char() != '>' &&$this->content->char() != '/') {$space = $this->content->skipByToken('blank', true);if (empty($space)) {$this->content->fastForward(1);continue;}$name = $this->content->copyByToken('equal', true);if ($name == '/') {break;}if (empty($name)) {$this->content->skipByToken('blank');continue;}$this->content->skipByToken('blank');if ($this->content->char() == '=') {$attr = [];$this->content->fastForward(1)->skipByToken('blank');switch ($this->content->char()) {case '"':$attr['doubleQuote'] = true;$this->content->fastForward(1);$string = $this->content->copyUntil('"', true);do {$moreString = $this->content->copyUntilUnless('"', '=>');$string .= $moreString;} while ( ! empty($moreString));$attr['value'] = $string;$this->content->fastForward(1);$node->getTag()->$name = $attr;break;case "'":$attr['doubleQuote'] = false;$this->content->fastForward(1);$string = $this->content->copyUntil("'", true);do {$moreString = $this->content->copyUntilUnless("'", '=>');$string .= $moreString;} while ( ! empty($moreString));$attr['value'] = $string;$this->content->fastForward(1);$node->getTag()->$name = $attr;break;default:$attr['doubleQuote'] = true;$attr['value'] = $this->content->copyByToken('attr', true);$node->getTag()->$name = $attr;break;}} else {// no value attributeif ($this->options->strict) {// can't have this in strict html$character = $this->content->getPosition();throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");}$node->getTag()->$name = ['value' => null,'doubleQuote' => true,];if ($this->content->char() != '>') {$this->content->rewind(1);}}}$this->content->skipByToken('blank');$tag = strtolower($tag);if ($this->content->char() == '/') {// self closing tag$node->getTag()->selfClosing();$this->content->fastForward(1);} elseif (in_array($tag, $this->selfClosing, true)) {// Should be a self closing tag, check if we are strictif ($this->options->strict) {$character = $this->content->getPosition();throw new StrictException("Tag '$tag' is not self closing! (character #$character)");}// We force self closing on this tag.$node->getTag()->selfClosing();// Should this tag use a trailing slash?if(in_array($tag, $this->noSlash, true)){$node->getTag()->noTrailingSlash();}}$this->content->fastForward(1);$return['status'] = true;$return['node'] = $node;return $return;}/*** Attempts to detect the charset that the html was sent in.** @return bool* @throws ChildNotFoundException*/protected function detectCharset(): bool{// set the default$encode = new Encode;$encode->from($this->defaultCharset);$encode->to($this->defaultCharset);$enforceEncoding = $this->options->enforceEncoding;if ( ! is_null($enforceEncoding)) {// they want to enforce the given encoding$encode->from($enforceEncoding);$encode->to($enforceEncoding);return false;}/** @var AbstractNode $meta */$meta = $this->root->find('meta[http-equiv=Content-Type]', 0);if (is_null($meta)) {// could not find meta tag$this->root->propagateEncoding($encode);return false;}$content = $meta->getAttribute('content');if (is_null($content)) {// could not find content$this->root->propagateEncoding($encode);return false;}$matches = [];if (preg_match('/charset=(.+)/', $content, $matches)) {$encode->from(trim($matches[1]));$this->root->propagateEncoding($encode);return true;}// no charset found$this->root->propagateEncoding($encode);return false;}}