index.php
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<title>Web crawling using php</title>
<link rel="stylesheet" type="text/css" href="style.css" />
<script type="text/javascript" src="jquery.min.js"></script>
<script type="text/javascript">
// Wires the "Go" button: posts the entered URL to get_content.php and shows
// whatever HTML comes back inside #demo_output.
$(document).ready(function() {
    $("#submit").click(function() {
        var url = $("#url").val();
        if (url.length > 0) {
            // Show a loading image in the output div until the extracted data arrives.
            $("#demo_output").html(' <img src="loading.gif">');
            $.ajax({
                type: "POST",
                url: "get_content.php",
                // Pass an object so jQuery URL-encodes the value. Concatenating
                // "url=" + url by hand breaks on addresses containing & # + or %.
                data: { url: url },
                success: function(option) {
                    $("#demo_output").html(option);
                },
                // Replace the loading image on failure instead of spinning forever.
                error: function() {
                    $("#demo_output").text("The request failed. Please try again.");
                }
            });
        }
    });
});
</script>
</head>
<body>
<div align="center"><b>Extract image links and hyperlinks from website using php</b>
<br />
<br />
<div class="demo_wrapper">
<div id="demo_input">
Enter Url : <input type="text" name="url" id="url" value="" />
<input type="submit" id="submit" value="Go" />
</div>
<div id="demo_output">
</div>
</div>
</div>
</body>
</html>
Here we just enter the URL and send it to get_content.php, then display the extracted data; that’s all.
get_content.php
<?php
set_time_limit(120);

/**
 * Fetches a document and extracts attribute values from it with regular
 * expressions.
 *
 * Extraction is dispatched by name: get('images') calls _get_images(),
 * get('links') calls _get_links().
 */
class Crawler
{
    /** Raw document text, or FALSE when file_get_contents() failed. */
    protected $markup = "";

    /**
     * @param string $url Location handed straight to getMarkup().
     */
    public function __construct($url)
    {
        $this->markup = $this->getMarkup($url);
    }

    /**
     * Downloads the document.
     *
     * @param string $url
     * @return string|false The document body, or FALSE on failure.
     */
    public function getMarkup($url)
    {
        return file_get_contents($url);
    }

    /**
     * Runs the extractor named "_get_{$type}" if this class defines one.
     *
     * @param string $type Extractor suffix, e.g. "images" or "links".
     * @return array|false|null Captured values, FALSE when nothing matched,
     *                          NULL when the extractor does not exist or no
     *                          markup was loaded.
     */
    public function get($type)
    {
        $method = "_get_{$type}";
        if (method_exists($this, $method)) {
            return $this->$method();
        }
        return null;
    }

    /**
     * Collects the src value of every <img> tag. Values written with single
     * quotes are returned with the quotes still attached.
     */
    protected function _get_images()
    {
        if (!empty($this->markup)) {
            preg_match_all('/<img [^>]*src="?([^ ">]+)"?/i', $this->markup, $images);
            return !empty($images[1]) ? $images[1] : FALSE;
        }
        return null;
    }

    /**
     * Collects the href value of every <a> tag. Values written with single
     * quotes are returned with the quotes still attached.
     */
    protected function _get_links()
    {
        if (!empty($this->markup)) {
            preg_match_all('/<a [^>]*href="?([^ ">]+)"?/i', $this->markup, $links);
            return !empty($links[1]) ? $links[1] : FALSE;
        }
        return null;
    }
} // End of Crawler class

/**
 * Forces the user-supplied address onto the http/https scheme.
 *
 * Anything that does not already start with "http://" or "https://" gets
 * "http://" prepended. Testing only for the letters "http" would let input
 * such as "httpd.conf" reach file_get_contents() as a local path.
 *
 * @param string $url
 * @return string
 */
function crawler_normalise_url($url)
{
    $url = trim($url);
    if (!preg_match('#^https?://#i', $url)) {
        $url = 'http://' . $url;
    }
    return $url;
}

/**
 * Turns a raw captured attribute value into the address it denotes.
 *
 * Strips the single quotes the extraction patterns leave attached, then
 * decodes HTML entities, since attribute values are entity-encoded in markup.
 *
 * @param string $value
 * @return string
 */
function crawler_clean_url($value)
{
    return html_entity_decode(trim($value, "'"), ENT_QUOTES, 'UTF-8');
}

/**
 * Makes a root-relative or protocol-relative link absolute.
 *
 * Only values starting with "/" are touched; everything else is returned
 * unchanged. The link is also returned unchanged when $pageUrl has no host.
 *
 * @param string $link    Cleaned link value.
 * @param string $pageUrl Absolute URL of the page the link was found on.
 * @return string
 */
function crawler_resolve_link($link, $pageUrl)
{
    if ($link === '' || $link[0] !== '/') {
        return $link;
    }

    $parts = parse_url($pageUrl);
    if ($parts === false || empty($parts['host'])) {
        return $link;
    }
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

    // "//host/path" inherits only the scheme of the containing page.
    if (substr($link, 0, 2) === '//') {
        return $scheme . ':' . $link;
    }

    // "/path" is relative to the site root, not to the page's own path.
    $origin = $scheme . '://' . $parts['host'];
    if (isset($parts['port'])) {
        $origin .= ':' . $parts['port'];
    }
    return $origin . $link;
}

/**
 * Prints one striped result table.
 *
 * @param string           $title        Heading cell text (trusted literal).
 * @param array|false|null $items        Raw values from Crawler::get().
 * @param string           $emptyMessage Shown when $items is empty (trusted literal).
 * @param string|null      $pageUrl      When given, values starting with "/"
 *                                       are resolved against this URL.
 * @return void
 */
function crawler_render_table($title, $items, $emptyMessage, $pageUrl = null)
{
    echo "<table cellpadding='5'>";
    echo "<tr><td id='title'>" . $title . "</td></tr>";

    if (!empty($items)) {
        $row = 0;
        foreach ($items as $item) {
            $style = ($row % 2 == 0)
                ? "style='background-color:#cccccc;'"
                : "style='background-color:#eeeeee;'";

            $item = crawler_clean_url($item);
            if ($pageUrl !== null) {
                $item = crawler_resolve_link($item, $pageUrl);
            }

            // The value comes from a third-party page: escape it so it is
            // displayed as text and can never inject markup or script.
            echo "<tr><td " . $style . ">"
                . htmlspecialchars($item, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
                . "</td></tr>";
            $row++;
        }
    } else {
        echo "<tr><td>" . $emptyMessage . "</td></tr>";
    }

    echo "</table>";
}

// Request handler: expects the page address in the POST field "url".
// NOTE(review): this endpoint fetches any http(s) address the client names,
// including hosts on the internal network. Restrict allowed hosts before
// exposing it publicly.
if (isset($_POST['url']) && is_string($_POST['url']) && $_POST['url'] != '') {
    $url = crawler_normalise_url($_POST['url']);

    // Create an object of class Crawler and run both extractors.
    $crawl = new Crawler($url);
    $images = $crawl->get('images');
    $links = $crawl->get('links');

    crawler_render_table('IMAGE LINKS', $images, 'No Image!');
    // Only hyperlinks are resolved against the page address.
    crawler_render_table('HYPERLINKS', $links, 'No Hyperlink!', $url);
}
?>
No comments:
Post a Comment