index.php
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<title>Web crawling using php</title>
<link rel="stylesheet" type="text/css" href="style.css" />
<script type="text/javascript" src="jquery.min.js"></script>
<script type="text/javascript">
// Wire up the "Go" button once the DOM is ready: read the URL typed by the
// user, POST it to get_content.php and show the returned HTML fragment in
// the #demo_output container.
$(document).ready(function()
{
    $("#submit").click(function()
    {
        // Strip surrounding whitespace so that blank input is not submitted.
        var url = String($("#url").val()).replace(/^\s+|\s+$/g, "");
        if(url.length === 0)
        {
            return;
        }

        // Show a loading image in the output area until the extracted data arrives.
        $("#demo_output").html(' <img src="loading.gif">');

        $.ajax
        ({
            type: "POST",
            url: "get_content.php",
            // Pass an object so jQuery URL-encodes the value; a hand-built
            // "url=..." string breaks on URLs containing &, +, # or %.
            data: { url: url },
            success: function(markup)
            {
                $("#demo_output").html(markup);
            },
            // Without this handler the loading image would stay on screen
            // forever when the request fails.
            error: function()
            {
                $("#demo_output").text("Unable to fetch the requested URL.");
            }
        });
    });
});
</script>
</head>
<body>
<div align="center"><b>Extract image links and hyperlinks from website using php</b>
<br />
<br />
<div class="demo_wrapper">
<div id="demo_input">
Enter Url : <input type="text" name="url" id="url" value="" /> <input type="submit" id="submit" value="Go" />
</div>
<div id="demo_output">
</div>
</div>
</div>
</body>
</html>
Here we simply enter the URL, send it to get_content.php, and display the extracted data; that's all.
get_content.php
<?php
// Set the maximum execution time of this script to 120 seconds.
set_time_limit(120);
/**
 * Minimal page crawler: downloads the markup behind a URL once (in the
 * constructor) and extracts attribute values from it on demand.
 *
 * Supported extraction types for get(): "images" (img src values) and
 * "links" (a href values).
 */
class Crawler
{
    /**
     * Raw markup of the crawled page; an empty string when nothing could be read.
     */
    protected $markup = "";

    /**
     * Fetches the markup for $url and keeps it for later extraction.
     *
     * @param string $url Location handed to getMarkup().
     */
    public function __construct($url)
    {
        $this->markup = $this->getMarkup($url);
    }

    /**
     * Reads the content found at $url.
     *
     * The value is passed to file_get_contents() unchanged, so it may be any
     * path or stream wrapper PHP accepts; callers handling untrusted input
     * must validate it first.
     *
     * @param string $url Location to read.
     * @return string The content, or an empty string when the read failed.
     */
    public function getMarkup($url)
    {
        $content = file_get_contents($url);

        // file_get_contents() signals failure with FALSE; normalise it so the
        // markup property always holds a string.
        return ($content === false) ? "" : $content;
    }

    /**
     * Runs the extractor registered for $type (method "_get_{$type}").
     *
     * @param string $type Extraction type, e.g. "images" or "links".
     * @return array|false List of extracted values, or FALSE when the type is
     *                     unknown or nothing was found.
     */
    public function get($type)
    {
        $method = "_get_{$type}";
        if (!method_exists($this, $method))
        {
            return FALSE;
        }
        return $this->$method();
    }

    /**
     * Extracts the src value of every img tag.
     *
     * @return array|false List of src values, or FALSE when there are none.
     */
    protected function _get_images()
    {
        return $this->extractAttribute('img', 'src');
    }

    /**
     * Extracts the href value of every a tag.
     *
     * @return array|false List of href values, or FALSE when there are none.
     */
    protected function _get_links()
    {
        return $this->extractAttribute('a', 'href');
    }

    /**
     * Collects the values of one attribute from all occurrences of one tag.
     *
     * Double-quoted, single-quoted and unquoted values are recognised, and the
     * surrounding quotes are never part of the result. The attribute name must
     * be preceded by whitespace so that e.g. "data-src" is not taken for "src".
     *
     * @param string $tag       Tag name, e.g. "img".
     * @param string $attribute Attribute name, e.g. "src".
     * @return array|false List of attribute values in document order, or FALSE
     *                     when the markup is empty or contains no match.
     */
    protected function extractAttribute($tag, $attribute)
    {
        if (!is_string($this->markup) || $this->markup === '')
        {
            return FALSE;
        }

        // (?| ... ) is a branch-reset group: whichever quoting style matches,
        // the value always lands in capture group 1. The last alternative also
        // tolerates a value whose closing quote is missing.
        $pattern = '/<' . preg_quote($tag, '/') . '\s[^>]*?(?<=\s)' . preg_quote($attribute, '/')
                 . '\s*=\s*(?|"([^">]+)"|\'([^\'>]+)\'|["\']?([^\s"\'>]+))/i';

        // preg_match_all() yields 0 for "no match" and FALSE on error.
        if (!preg_match_all($pattern, $this->markup, $found))
        {
            return FALSE;
        }
        return $found[1];
    }
} // End of Crawler class
/**
 * Removes the single quotes that may surround an extracted attribute value.
 *
 * @param string $value Extracted attribute value.
 * @return string The value without leading/trailing single quotes.
 */
function crawler_strip_quotes($value)
{
    // Check for an empty string first: reading offset 0 of "" raises a warning.
    if ($value !== '' && $value[0] === "'")
    {
        return trim($value, "'");
    }
    return $value;
}

/**
 * Turns a root-relative ("/path") or scheme-relative ("//host/path") link
 * into an absolute URL based on the crawled page URL.
 *
 * @param string $link    Link as found in the page.
 * @param string $pageUrl Absolute URL of the crawled page (with scheme).
 * @return string Absolute URL, or $link unchanged when it does not start
 *                with "/" or the page URL cannot be parsed.
 */
function crawler_resolve_link($link, $pageUrl)
{
    if ($link === '' || $link[0] !== '/')
    {
        return $link;
    }

    $parts = parse_url($pageUrl);
    if ($parts === false || empty($parts['host']))
    {
        return $link;
    }
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

    // "//host/path" only inherits the scheme of the current page.
    if (substr($link, 0, 2) === '//')
    {
        return $scheme . ':' . $link;
    }

    // "/path" is relative to the site root, not to the page that was crawled.
    $origin = $scheme . '://' . $parts['host'];
    if (isset($parts['port']))
    {
        $origin .= ':' . $parts['port'];
    }
    return $origin . $link;
}

/**
 * Prints one result table with alternating row colours.
 *
 * @param string      $title        Heading shown in the first row.
 * @param array|false $items        Values to list; an empty value prints $emptyMessage.
 * @param string      $emptyMessage Text shown when there is nothing to list.
 */
function crawler_render_table($title, $items, $emptyMessage)
{
    echo "<table cellpadding='5'>";
    echo "<tr><td id='title'>" . $title . "</td></tr>";
    if (!empty($items))
    {
        $row = 0;
        foreach ($items as $item)
        {
            $style = ($row % 2 == 0)
                ? "style='background-color:#cccccc;'"
                : "style='background-color:#eeeeee;'";
            // The values come from a remote page and must never be emitted as
            // raw HTML.
            echo "<tr><td " . $style . ">" . htmlspecialchars($item, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "</td></tr>";
            $row++;
        }
    }
    else echo "<tr><td>" . $emptyMessage . "</td></tr>";
    echo "</table>";
}

// Handle the AJAX request: crawl the posted URL and print the image links and
// hyperlinks found on that page as two HTML tables.
if (isset($_POST['url']) && is_string($_POST['url']) && trim($_POST['url']) !== '')
{
    $url = trim($_POST['url']);
    // Require an explicit http:// or https:// scheme and add http:// otherwise.
    // Checking only for a leading "http" would let inputs such as
    // "http/../../etc/passwd" through to file_get_contents() as local paths.
    // NOTE(review): the server still fetches arbitrary hosts on behalf of the
    // client; restrict the allowed targets if internal services are reachable.
    if (!preg_match('#^https?://#i', $url)) $url = 'http://' . $url;

    // Create an object of class Crawler and extract both result lists.
    $crawl = new Crawler($url);
    $images = $crawl->get('images');
    $links = $crawl->get('links');

    $imageRows = array();
    if (!empty($images))
    {
        foreach ($images as $img)
        {
            $imageRows[] = crawler_strip_quotes($img);
        }
    }

    $linkRows = array();
    if (!empty($links))
    {
        foreach ($links as $link)
        {
            // Resolve against the normalised URL (with scheme), not the raw input.
            $linkRows[] = crawler_resolve_link(crawler_strip_quotes($link), $url);
        }
    }

    crawler_render_table('IMAGE LINKS', $imageRows, 'No Image!');
    crawler_render_table('HYPERLINKS', $linkRows, 'No Hyperlink!');
}
?>





No comments:
Post a Comment