Hi, I have a problem with a PHP script that is supposed to harvest search-result URLs. What I want is to save into a database all of the records returned by a site:domain search,
i.e. this one: http://www.google.pl/#sclient=psy&hl=p...d516b91cedd4b71 .

I found the following script on a website:
config.php
<?php

$use_proxy = 'no'; // use either yes or no

if ($use_proxy == "yes") {

    // Please test the proxy and make sure it works before using it in the config fields below

    $proxy_ip   = '124.153.75.31:80'; // use format ip:port ex. 202.106.121.134:80 - get more from http://www.samair.ru/proxy/time-01.htm
    $proxy_user = 'user:pass'; // use format user:pass - some proxies don't need user/pass so in such case make it $proxy_user = '';
}

?>


url_harvester.php

<?php

include("config.php");

// Check if the form has been submitted
if (isset($_POST['submit'])) {

    ini_set("max_execution_time", 0); set_time_limit(0); // no time-outs!

    // This will allow you to view errors in the browser
    // Note: set display_errors to 0 in production
    // ini_set('display_errors',1);

    // Report all PHP errors (notices, errors, warnings, etc.)
    // error_reporting(E_ALL);

    $engine = $_POST["engine"];

    if ($engine == "yahoo") {

        $query = urlencode($_REQUEST['query']);

        // Substitute this application ID with your own application ID provided by Yahoo!.
        $appID = "7mOnTDvV34GdHdNm9XPcb6Ms_lbhz8hKyylyUJVY8pva..UnfTCTaw31kRoAQ1vi";

        $start = 1;

        while ($start < 902) {
            // URI used for making the REST call. Each web service uses a unique URL.
            $request = "http://search.yahooapis.com/WebSearchService/V1/webSearch?query=$query&appid=$appID&output=xml&start=$start&results=100";
            // echo "$request<br><br>";

            //$response = file_get_contents($request);

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $request);
            curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
            curl_setopt($ch, CURLOPT_HEADER, 1);
            if ($use_proxy == "yes") {
                curl_setopt($ch, CURLOPT_PROXY, $proxy_ip);
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy_user);
            }
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 10);

            $response = curl_exec($ch);
            curl_close($ch);

            // Confirm that the request was transmitted to the Yahoo! Search Service
            if (!$response) {
                die('Request to Yahoo! Search Service failed and no response was returned.');
            }

            // echo $response;
            // Get the XML from the response, bypassing the header
            if (!($xml = strstr($response, '<?xml'))) {
                $xml = null;
            }
            // Create a SimpleXML object from the XML response
            $simple_xml = simplexml_load_string($xml);

            // Traverse the XML tree and print the URL of each result
            foreach ($simple_xml->Result as $result) {
                $output = "{$result->Url}<br>";
                echo $output;
            }
            $start = $start + 100;
        } // loop end
        echo "<br><br>Results finished or limit of 1000 results reached...<br>";

    } // if yahoo

    if ($engine == "google_blog") {
        $query = $_REQUEST['query'];
        //echo "Query 1: $query<br>";
        $query = str_replace(" ", "+", $query);
        //echo "Query 2: $query<br>";
        $query = stripslashes($query);
        echo "Query 3: $query<br>";

        $num = 0;

        $start = 0;

        do {

            $request = 'http://blogsearch.google.com/blogsearch_feeds?hl=en&q=' . $query . '&ie=utf-8&num=100&start=' . $start . '&output=rss';

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $request);
            curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
            curl_setopt($ch, CURLOPT_HEADER, 1);
            if ($use_proxy == "yes") {
                curl_setopt($ch, CURLOPT_PROXY, $proxy_ip);
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy_user);
            }
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 10);

            $response = curl_exec($ch);
            curl_close($ch);

            // Confirm that the request was transmitted to Google Blog Search
            if (!$response) {
                die('Request to Google Blog Search failed and no response was returned.');
            }

            // Loop through the feed and pull out the URLs
            $xml = new SimpleXMLElement($response);

            // Stop once the feed has no more items instead of looping forever
            if (count($xml->channel->item) == 0) {
                break;
            }

            foreach ($xml->channel->item as $item) {

                // Add 1 to our counter so the list has numbers next to the URLs
                $num = $num + 1;
                $link = $item->link;

                echo "$link <br>";

            }

            sleep(rand(10, 20));
            $start = $start + 100;

        } while ($start < 25270000000);

        echo "<br><br>Results number: $num<br>";

    } // if google blog

    if ($engine == "google_site") {

        $query = urlencode($_REQUEST['query']);
        $num = 0;

        $start = 0;

        do {

            // num is capped at 100 results per page by Google
            $request = "http://www.google.com/search?hl=en&start=$start&num=100&q=$query";

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $request);
            curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
            curl_setopt($ch, CURLOPT_HEADER, 1);
            if ($use_proxy == "yes") {
                curl_setopt($ch, CURLOPT_PROXY, $proxy_ip);
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy_user);
            }
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 10);

            $response = curl_exec($ch);
            curl_close($ch);

            if (!$response) {
                die('Request to Google Search failed and no response was returned.');
            }

            // Extract result links from the HTML (fragile: depends on Google's markup)
            preg_match_all("(<h3 class=r><a href=\"(.*)\".*>(.*)</a></h3>)siU",
                $response, $matches);

            for ($i = 0; $i < count($matches[1]); $i++) {
                $matches[1][$i] = strip_tags($matches[1][$i]);
                $resultlink = $matches[1][$i];
                echo "$resultlink<br>";
                $num = $num + 1;
            }
            sleep(rand(10, 20));
            $start = $start + 100;
        } while ($start < 1000);

        echo "<br><br>Results number: $num<br>";

    } // if google site

} // if submit

else {
?>

<br />
<br />
<fieldset>
<legend>
<label for="query">Enter your search query. Results are powered by Yahoo/Google, with a maximum of 1000 results per Yahoo/Google terms.</label>
</legend>
<form method="post">
<label for="query">Query: </label><br />
<input type="text" size="150" id="query" name="query" /><br /><br />
Search Engine: <select name="engine">
<option value="yahoo">Yahoo Site Search</option>
<option value="google_site">Google Site Search</option>
<option value="google_blog">Google Blog Search</option>
</select>
&nbsp;&nbsp;&nbsp; <input type="submit" name="submit" value="Submit Query" />
</form>
</fieldset>
<br /><br />
<center>Powered by <a title="Scripteen Free URL Harvester" href="http://www.scripteen.com">Scripteen Free URL Harvester</a></center>
<?php } ?>


The problem is this:
A) How do I make it save the links/records into a given SQL database? (See the first sketch below.)
B) How do I add a check for whether the IP address has been banned? If it has, the script should grab a new one from a txt file; if that one gets banned too, the next one, and so on. (See the second sketch below.)
C) Will this script cope with saving all of the links?
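
For A), I was thinking of something along these lines. This is only a sketch: the database name, table, columns and credentials are all made up, and $resultlink here just stands in for the value produced inside the result loops of url_harvester.php (where the script currently only does echo "$resultlink<br>";).

<?php
// Sketch only - assumes a MySQL database "harvester" with a table created
// roughly like this (all names are placeholders):
//   CREATE TABLE harvested_urls (
//       id INT AUTO_INCREMENT PRIMARY KEY,
//       url VARCHAR(2048) NOT NULL,
//       engine VARCHAR(32) NOT NULL,
//       found_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
//   );
$pdo = new PDO('mysql:host=localhost;dbname=harvester;charset=utf8', 'db_user', 'db_pass');
$pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

// Prepare the INSERT once, before the harvesting loops start
$insert = $pdo->prepare('INSERT INTO harvested_urls (url, engine) VALUES (?, ?)');

// Then, inside each result loop, call it next to (or instead of) the echo:
$resultlink = 'http://example.com/page.html'; // in the real script this comes from the loop
$insert->execute(array($resultlink, 'google_site'));
?>

Is that roughly the right way to do it, or would mysqli be a better fit here?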
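
For B), something like this is roughly what I imagine. Again only a sketch: it assumes a proxies.txt file with one ip:port per line, and it treats an HTTP 403/429 status or a captcha/"sorry" page as a ban - I don't know whether that check is good enough in practice.

<?php
// Sketch of proxy rotation - the file name and function names are my own invention.
// proxies.txt is assumed to contain one proxy per line in ip:port format.

function load_proxies($file = 'proxies.txt') {
    // Read the file, trim whitespace and drop empty lines
    return array_values(array_filter(array_map('trim', file($file))));
}

function fetch_with_proxy($url, $proxy) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0');
    if ($proxy !== null) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
    }
    $body = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    return array($code, $body);
}

function looks_banned($code, $body) {
    // Heuristic only: a 403/429 status or Google's "sorry" captcha page
    return $code == 403 || $code == 429 || stripos($body, '/sorry/') !== false;
}

// Try each proxy in turn until one returns a response that doesn't look banned
function fetch_rotating($url, array $proxies) {
    foreach ($proxies as $proxy) {
        list($code, $body) = fetch_with_proxy($url, $proxy);
        if ($body !== false && !looks_banned($code, $body)) {
            return $body; // this proxy still works
        }
        // banned or dead proxy - fall through and try the next one
    }
    return false; // every proxy on the list failed
}

// The curl block in url_harvester.php would then become something like:
// $response = fetch_rotating($request, load_proxies());
?>

Would that be the right approach, or is there a standard way to detect the ban?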

Bumping this - I'm short on time.
Can anyone help? I'd be grateful.

Bump - can anyone help?