<?php class Scrapper{ public $url; private $data; private $dataAfter; private $doc; private $xpath; private $ch; function __construct($url){ libxml_use_internal_errors(true); $this->url = $url; $this->data = $this->curl($this->url); $this->doc = new \DOMDocument(); $this->doc->loadHTML($this->data); $this->xpath = new DOMXPath($this->doc); } } public function queryTag($query){ $this->data = $this->xpath->query($query); return $this; } } public function getData($noHTML = false, $removeAttribute = false){ foreach ($this->data as $dataNodes){ if($removeAttribute === true) { $dataNodes->removeAttribute('style'); $dataNodes->removeAttribute('class'); $dataNodes->removeAttribute('id'); } if($noHTML === true){ $this->dataAfter .= $dataNodes->nodeValue; }else{ $this->dataAfter .= $dataNodes->ownerDocument->saveHTML($dataNodes); } } return $this->dataAfter; } private function curl($url){ CURLOPT_RETURNTRANSFER => TRUE, // Setting cURL's option to return the webpage data CURLOPT_FOLLOWLOCATION => TRUE, // Setting cURL to follow 'location' HTTP headers CURLOPT_AUTOREFERER => TRUE, // Automatically set the referer where following 'location' HTTP headers CURLOPT_CONNECTTIMEOUT => 120, // Setting the amount of time (in seconds) before the request times out CURLOPT_TIMEOUT => 120, // Setting the maximum amount of time for cURL to execute queries CURLOPT_MAXREDIRS => 10, // Setting the maximum number of redirections to follow CURLOPT_USERAGENT => "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1a2pre) Gecko/2008073000 Shredder/3.0a2pre ThunderBrowse/3.2.1.8", // Setting the useragent CURLOPT_URL => $this->url, // Setting cURL's URL option with the $url variable passed into the function ); $this->ch = curl_init(); curl_setopt_array($this->ch, $options); $this->data = curl_exec($this->ch); return $this->data; } } function __destruct(){ curl_close($this->ch); } } $class = new \Scrapper('http://www.....'); $pic = $class->queryTag('//div[@id="left"]//img[@class="pic"]/@src')->getData(); $title = $class->queryTag('//div[@id="left"]//h2')->getData(true); $text = $class->queryTag('//div[@id="left"]/p | //center')->getData(false, true);
Po wywołaniu tej klasy, przypisuję do każdej zmiennej szukanej wartości - zdjęcie, tytuł i treść.
Niestety tytuł zawiera również ciąg URL obrazka, natomiast tekst zawiera dodatkowo obrazek oraz tytuł. Gdzie robię błąd? Jak to oddzielić?
Jednocześnie proszę o sugestię co mogę poprawić w samej klasie.