Hi, I have a problem with a PHP script that is supposed to harvest search-result URLs. What I want is to save into a database all of the records returned by a site:domain search,
i.e. this one: http://www.google.pl/#sclient=psy&hl=p...d516b91cedd4b71 .

I found the following script on a website:
config.php
<?php

$use_proxy = 'no'; // use either yes or no

if ($use_proxy == "yes") {

    // Please test the proxy and make sure it works before using it in the config fields below

    $proxy_ip   = '124.153.75.31:80'; // use format ip:port ex. 202.106.121.134:80 - get more from http://www.samair.ru/proxy/time-01.htm
    $proxy_user = 'user:pass'; // use format user:pass - some proxies don't need user/pass so in such case make it $proxy_user = '';
}

?>


url_harvester.php

<?php

include("config.php");

// Check if the form has been submitted
if (isset($_POST['submit'])) {

    ini_set("max_execution_time", 0); set_time_limit(0); // no time-outs!

    // This will allow you to view errors in the browser
    // Note: set display_errors to 0 in production
    // ini_set('display_errors',1);

    // Report all PHP errors (notices, errors, warnings, etc.)
    // error_reporting(E_ALL);

    $engine = $_POST["engine"];

    if ($engine == "yahoo") {

        $query = urlencode($_REQUEST['query']);

        // Substitute this application ID with your own application ID provided by Yahoo!.
        $appID = "7mOnTDvV34GdHdNm9XPcb6Ms_lbhz8hKyylyUJVY8pva..UnfTCTaw31kRoAQ1vi";

        $start = 1;

        while ($start < 902) {
            // URI used for making the REST call. Each web service uses a unique URL.
            $request = "http://search.yahooapis.com/WebSearchService/V1/webSearch?query=$query&appid=$appID&output=xml&start=$start&results=100";
            // echo "$request<br><br>";

            //$response = file_get_contents($request);

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $request);
            curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
            curl_setopt($ch, CURLOPT_HEADER, 1);
            if ($use_proxy == "yes") {
                curl_setopt($ch, CURLOPT_PROXY, $proxy_ip);
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy_user);
            }
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 10);

            $response = curl_exec($ch);
            curl_close($ch);

            // Confirm that the request was transmitted to the Yahoo! Search Service
            if (!$response) {
                die('Request to Yahoo! Search Service failed and no response was returned.');
            }

            // echo $response;
            // Get the XML from the response, bypassing the header
            if (!($xml = strstr($response, '<?xml'))) {
                $xml = null;
            }
            // Create a SimpleXML object from the XML response
            $simple_xml = simplexml_load_string($xml);

            // Traverse the XML tree and print the URL of each result
            foreach ($simple_xml->Result as $result) {
                $output = "{$result->Url}<br>";
                echo $output;
            }
            $start = $start + 100;
        } // loop end
        echo "<br><br>Results finished or limit of 1000 results reached...<br>";

    } // if yahoo

    if ($engine == "google_blog") {
        $query = $_REQUEST['query'];
        //echo "Query 1: $query<br>";
        $query = str_replace(" ", "+", $query);
        //echo "Query 2: $query<br>";
        $query = stripslashes($query);
        echo "Query 3: $query<br>";

        $num = 0;

        $start = 0;

        do {

            $request = 'http://blogsearch.google.com/blogsearch_feeds?hl=en&q=' . $query . '&ie=utf-8&num=100&start=' . $start . '&output=rss';

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $request);
            curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
            curl_setopt($ch, CURLOPT_HEADER, 1);
            if ($use_proxy == "yes") {
                curl_setopt($ch, CURLOPT_PROXY, $proxy_ip);
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy_user);
            }
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 10);

            $response = curl_exec($ch);
            curl_close($ch);

            // Confirm that the request was transmitted to Google Blog Search
            if (!$response) {
                die('Request to Google Blog Search failed and no response was returned.');
            }

            // Loop through the feed and pull out the URLs
            $xml = new SimpleXMLElement($response);

            // Stop once the feed has no more items instead of looping forever
            if (count($xml->channel->item) == 0) {
                break;
            }

            foreach ($xml->channel->item as $item) {

                // Add 1 to our counter so the list has numbers next to the URLs
                $num = $num + 1;
                $link = $item->link;

                echo "$link <br>";

            }

            sleep(rand(10, 20));
            $start = $start + 100;

        } while ($start < 25270000000);

        echo "<br><br>Results number: $num<br>";

    } // if google blog

    if ($engine == "google_site") {

        $query = urlencode($_REQUEST['query']);
        $num = 0;

        $start = 0;

        do {

            // num is capped at 100 results per page by Google
            $request = "http://www.google.com/search?hl=en&start=$start&num=100&q=$query";

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $request);
            curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
            curl_setopt($ch, CURLOPT_HEADER, 1);
            if ($use_proxy == "yes") {
                curl_setopt($ch, CURLOPT_PROXY, $proxy_ip);
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy_user);
            }
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 10);

            $response = curl_exec($ch);
            curl_close($ch);

            if (!$response) {
                die('Request to Google Search failed and no response was returned.');
            }

            // Extract result links from the HTML (fragile: depends on Google's markup)
            preg_match_all("(<h3 class=r><a href=\"(.*)\".*>(.*)</a></h3>)siU",
                $response, $matches);

            for ($i = 0; $i < count($matches[1]); $i++) {
                $matches[1][$i] = strip_tags($matches[1][$i]);
                $resultlink = $matches[1][$i];
                echo "$resultlink<br>";
                $num = $num + 1;
            }
            sleep(rand(10, 20));
            $start = $start + 100;
        } while ($start < 1000);

        echo "<br><br>Results number: $num<br>";

    } // if google site

} // if submit

else {
?>

<br />
<br />
<fieldset>
<legend>
<label for="query">Enter your search query. Results are powered by Yahoo/Google, with a maximum of 1000 results per Yahoo/Google terms.</label>
</legend>
<form method="post">
<label for="query">Query: </label><br />
<input type="text" size="150" id="query" name="query" /><br /><br />
Search Engine: <select name="engine">
<option value="yahoo">Yahoo Site Search</option>
<option value="google_site">Google Site Search</option>
<option value="google_blog">Google Blog Search</option>
</select>
&nbsp;&nbsp;&nbsp; <input type="submit" name="submit" value="Submit Query" />
</form>
</fieldset>
<br /><br />
<center>Powered by <a title="Scripteen Free URL Harvester" href="http://www.scripteen.com">Scripteen Free URL Harvester</a></center>
<?php } ?>


The problem is this:
A) How do I make it save the links/records into a given SQL database? (See the first sketch below.)
B) How do I add a check for whether the IP address has been banned? If it has, the script should grab a new one from a txt file; if that one gets banned too, the next one, and so on. (See the second sketch below.)
C) Will this script cope with saving all of the links?
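
For A), I was thinking of something along these lines. This is only a sketch: the database name, table, columns and credentials are all made up, and $resultlink here just stands in for the value produced inside the result loops of url_harvester.php (where the script currently only does echo "$resultlink<br>";).

<?php
// Sketch only - assumes a MySQL database "harvester" with a table created
// roughly like this (all names are placeholders):
//   CREATE TABLE harvested_urls (
//       id INT AUTO_INCREMENT PRIMARY KEY,
//       url VARCHAR(2048) NOT NULL,
//       engine VARCHAR(32) NOT NULL,
//       found_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
//   );
$pdo = new PDO('mysql:host=localhost;dbname=harvester;charset=utf8', 'db_user', 'db_pass');
$pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

// Prepare the INSERT once, before the harvesting loops start
$insert = $pdo->prepare('INSERT INTO harvested_urls (url, engine) VALUES (?, ?)');

// Then, inside each result loop, call it next to (or instead of) the echo:
$resultlink = 'http://example.com/page.html'; // in the real script this comes from the loop
$insert->execute(array($resultlink, 'google_site'));
?>

Is that roughly the right way to do it, or would mysqli be a better fit here?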
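
For B), something like this is roughly what I imagine. Again only a sketch: it assumes a proxies.txt file with one ip:port per line, and it treats an HTTP 403/429 status or a captcha/"sorry" page as a ban - I don't know whether that check is good enough in practice.

<?php
// Sketch of proxy rotation - the file name and function names are my own invention.
// proxies.txt is assumed to contain one proxy per line in ip:port format.

function load_proxies($file = 'proxies.txt') {
    // Read the file, trim whitespace and drop empty lines
    return array_values(array_filter(array_map('trim', file($file))));
}

function fetch_with_proxy($url, $proxy) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0');
    if ($proxy !== null) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
    }
    $body = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    return array($code, $body);
}

function looks_banned($code, $body) {
    // Heuristic only: a 403/429 status or Google's "sorry" captcha page
    return $code == 403 || $code == 429 || stripos($body, '/sorry/') !== false;
}

// Try each proxy in turn until one returns a response that doesn't look banned
function fetch_rotating($url, array $proxies) {
    foreach ($proxies as $proxy) {
        list($code, $body) = fetch_with_proxy($url, $proxy);
        if ($body !== false && !looks_banned($code, $body)) {
            return $body; // this proxy still works
        }
        // banned or dead proxy - fall through and try the next one
    }
    return false; // every proxy on the list failed
}

// The curl block in url_harvester.php would then become something like:
// $response = fetch_rotating($request, load_proxies());
?>

Would that be the right approach, or is there a standard way to detect the ban?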

Bumping this - I'm short on time.
Can anyone help? I'd be grateful.

Bump - can anyone help?