log = "New http() object instantiated.
\n"; //Seconds to attempt socket connection before giving up. $this->connect_timeout = 30; /* Set the 'dir' property to the directory where you want to store the cached content. I suggest a folder that is not web-accessible. End this value with a "/". */ $this->dir = realpath("./")."/"; //Default to current dir. $this->clean(); return true; } /* fetch() method to get the content. fetch() will use 'ttl' property to determine whether to get the content from the url or the cache. */ function fetch($url="", $ttl=0, $name="", $user="", $pwd="", $verb="GET") { $this->log .= "--------------------------------
fetch() called
\n"; $this->log .= "url: ".$url."
\n"; $this->status = ""; $this->header = ""; $this->body = ""; if (!$url) { $this->log .= "OOPS: You need to pass a URL!
"; return false; } $this->url = $url; $this->ttl = $ttl; $this->name = $name; $need_to_save = false; if ($this->ttl == "0") { if (!$fh = $this->getFromUrl($url, $user, $pwd, $verb)) { return false; } } else { if (strlen(trim($this->name)) == 0) { $this->name = MD5($url); } $this->filename = $this->dir."http_".$this->name; $this->log .= "Filename: ".$this->filename."
"; $this->getFile_ts(); if ($this->ttl == "daily") { if (date('Y-m-d',$this->data_ts) != date('Y-m-d',time())) { $this->log .= "cache has expired
"; if (!$fh = $this->getFromUrl($url, $user, $pwd, $verb)) { return false; } $need_to_save = true; if ($this->getFromUrl()) { return $this->saveToCache(); } } else { if (!$fh = $this->getFromCache()) { return false; } } } else { if ((time() - $this->data_ts) >= $this->ttl) { $this->log .= "cache has expired
"; if (!$fh = $this->getFromUrl($url, $user, $pwd, $verb)) { return false; } $need_to_save = true; } else { if (!$fh = $this->getFromCache()) { return false; } } } } /* Get response header. */ $this->header = fgets($fh, 1024); $this->status = substr($this->header,9,3); while ((trim($line = fgets($fh, 1024)) != "") && (!feof($fh))) { $this->header .= $line; if ($this->status=="401" and strpos($line,"WWW-Authenticate: Basic realm=\"")===0) { fclose($fh); $this->log .= "Could not authenticate
\n"; return FALSE; } } /* Get response body. */ while (!feof($fh)) { $this->body .= fgets($fh, 1024); } fclose($fh); if ($need_to_save) { $this->saveToCache(); } return $this->status; } /* PRIVATE getFromUrl() method to scrape content from url. */ function getFromUrl($url, $user="", $pwd="", $verb="GET") { $this->log .= "getFromUrl() called
"; preg_match("~([a-z]*://)?([^:^/]*)(:([0-9]{1,5}))?(/.*)?~i", $url, $parts); $protocol = $parts[1]; $server = $parts[2]; $port = $parts[4]; $path = $parts[5]; if ($port == "") { if (strtolower($protocol) == "https://") { $port = "443"; } else { $port = "80"; } } if ($path == "") { $path = "/"; } if (!$sock = @fsockopen(((strtolower($protocol) == "https://")?"ssl://":"").$server, $port, $errno, $errstr, $this->connect_timeout)) { $this->log .= "Could not open connection. Error " .$errno.": ".$errstr."
\n"; return false; } $this->headers["Host"] = $server.":".$port; if ($user != "" && $pwd != "") { $this->log .= "Authentication will be attempted
\n"; $this->headers["Authorization"] = "Basic ".base64_encode($user.":".$pwd); } if (count($this->postvars) > 0) { $this->log .= "Variables will be POSTed
\n"; $request = "POST ".$path." HTTP/1.0\r\n"; $post_string = ""; foreach ($this->postvars as $key=>$value) { $post_string .= "&".urlencode($key)."=".urlencode($value); } $post_string = substr($post_string,1); $this->headers["Content-Type"] = "application/x-www-form-urlencoded"; $this->headers["Content-Length"] = strlen($post_string); } elseif (strlen($this->xmlrequest) > 0) { $this->log .= "XML request will be sent
\n"; $request = $verb." ".$path." HTTP/1.0\r\n"; $this->headers["Content-Length"] = strlen($this->xmlrequest); } else { $request = $verb." ".$path." HTTP/1.0\r\n"; } #echo "
request: ".$request; if (fwrite($sock, $request) === FALSE) { fclose($sock); $this->log .= "Error writing request type to socket
\n"; return false; } foreach ($this->headers as $key=>$value) { if (fwrite($sock, $key.": ".$value."\r\n") === FALSE) { fclose($sock); $this->log .= "Error writing headers to socket
\n"; return false; } } if (fwrite($sock, "\r\n") === FALSE) { fclose($sock); $this->log .= "Error writing end-of-line to socket
\n"; return false; } #echo "
post_string: ".$post_string; if (count($this->postvars) > 0) { if (fwrite($sock, $post_string."\r\n") === FALSE) { fclose($sock); $this->log .= "Error writing POST string to socket
\n"; return false; } } elseif (strlen($this->xmlrequest) > 0) { if (fwrite($sock, $this->xmlrequest."\r\n") === FALSE) { fclose($sock); $this->log .= "Error writing xml request string to socket
\n"; return false; } } return $sock; } /* PRIVATE clean() method to reset the instance back to mostly new state. */ function clean() { $this->status = ""; $this->header = ""; $this->body = ""; $this->headers = array(); $this->postvars = array(); /* Try to use user agent of the user making this request. If not available, default to IE6.0 on WinXP, SP1. */ if (isset($_SERVER['HTTP_USER_AGENT'])) { $this->headers["User-Agent"] = $_SERVER['HTTP_USER_AGENT']; } else { $this->headers["User-Agent"] = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)"; } /* Set referrer to the current script since in essence, it is the referring page. */ if (substr($_SERVER['SERVER_PROTOCOL'],0,5) == "HTTPS") { $this->headers["Referer"] = "https://".$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']; } else { $this->headers["Referer"] = "http://".$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']; } } /* PRIVATE getFromCache() method to retrieve content from cache file. */ function getFromCache() { $this->log .= "getFromCache() called
"; //create file pointer if (!$fp=@fopen($this->filename,"r")) { $this->log .= "Could not open ".$this->filename."
"; return false; } return $fp; } /* PRIVATE saveToCache() method to save content to cache file. */ function saveToCache() { $this->log .= "saveToCache() called
"; //create file pointer if (!$fp=@fopen($this->filename,"w")) { $this->log .= "Could not open ".$this->filename."
"; return false; } //write to file if (!@fwrite($fp,$this->header."\r\n".$this->body)) { $this->log .= "Could not write to ".$this->filename."
"; fclose($fp); return false; } //close file pointer fclose($fp); return true; } /* PRIVATE getFile_ts() method to get cache file modified date. */ function getFile_ts() { $this->log .= "getFile_ts() called
"; if (!file_exists($this->filename)) { $this->data_ts = 0; $this->log .= $this->filename." does not exist
"; return false; } $this->data_ts = filemtime($this->filename); return true; } /* Static method table_into_array() Generic function to return data array from HTML table data rawHTML: the page source needle: optional string to start parsing source from needle_within: 0 = needle is BEFORE table, 1 = needle is within table allowed_tags: list of tags to NOT strip from data, e.g. "" */ function table_into_array($rawHTML,$needle="",$needle_within=0,$allowed_tags="") { $upperHTML = strtoupper($rawHTML); $idx = 0; if (strlen($needle) > 0) { $needle = strtoupper($needle); $idx = strpos($upperHTML,$needle); if ($idx === false) { return false; } if ($needle_within == 1) { $cnt = 0; while(($cnt < 100) && (substr($upperHTML,$idx,6) != "",$tmp); if ($tmp2 === false) { return false; } $row = substr($rawHTML,$tmp,$tmp2-$tmp); $pattern = "/||",$tmp); if ($tmp === false) { return false; } $tmp++; $tmp2 = strpos(strtoupper($row),"",$idx)+5; $rowIdx++; /* Now parse the rest of the rows. */ $tmp = strpos($upperHTML,"",$idx); if ($tmp2 === false) { return false; } $table = substr($rawHTML,$tmp,$tmp2-$tmp); while ($tmp = strpos(strtoupper($table),"",$tmp); if ($tmp === false) { return false; } $tmp++; $tmp2 = strpos(strtoupper($row),"")+5); $rowIdx++; } return $aryData; } /* Static method table_into_xml() Generic function to return xml dataset from HTML table data rawHTML: the page source needle: optional string to start parsing source from allowedTags: list of tags to NOT strip from data, e.g. "" */ function table_into_xml($rawHTML,$needle="",$needle_within=0,$allowedTags="") { if (!$aryTable = http::table_into_array($rawHTML,$needle,$needle_within,$allowedTags)) { return false; } $xml = "\n"; $xml .= "\n"; $rowIdx = 0; foreach ($aryTable as $row) { $xml .= "\t\n"; $colIdx = 0; foreach ($row as $col) { $xml .= "\t\t".trim(utf8_encode(htmlspecialchars($col)))."\n"; $colIdx++; } $xml .= "\t\n"; $rowIdx++; } $xml .= "
"; return $xml; } } // end class, start functions function Site1() { $h = new http(); $h->dir = "FOLDER HERE"; // enter server path to the cache directory on YOUR server $c = "URL HERE"; // enter the URL - http:// and all - to home page of the site you're scraping $make_external = "rel=\"external\" "; $url = "FULL URL HERE"; // enter the full http paht to the URL fo the page you're scraping if (!$h->fetch($url, 86400)) { echo "

There is a problem with the http request!

"; echo $h->log; exit(); } $matches = http::table_into_array($h->body, "ITEM 1", 1, "ITEM 2"); // "Item 1" and "Item 2" are the beginning and end tags of the content you're scraping if($matches) { foreach($matches as $m) { $correct_url = str_replace('href="', $make_external . 'href="' . $c, $m); echo $correct_url[0] . "
\n"; } } else { echo "Whoops! We had a problem loading this content. Please try refreshing the page."; } } ?>