Email Crawler Script PHP MySQL

This script crawls a website recursively and collects the email addresses it finds. The class is very easy to call, so you can start crawling emails right away.

Note: I found it somewhere on the internet and am putting it here for my future reference. It may also be useful to people who need this type of script.

emailcrawler.php

<?php
/*
Written by: Aziz S. Hussain 
Email: azizsaleh@gmail.com 
Website: www.azizsaleh.com 
Produced under GPL License 
*/ 
/**
 * Email address scraper based on a URL.
 *
 * Starting from one URL, the scraper downloads the page, stores every email
 * address it finds in the `emaillist` table, queues the page's links in
 * `workingurls`, records the page in `finishedurls`, and then continues with
 * a randomly chosen queued URL until the queue is empty.
 *
 * The class uses the default connection of the legacy mysql extension, which
 * the caller must have opened (mysql_connect) beforehand, and the cURL
 * extension for downloading pages.
 */
class scraper
{
	// URL of the page that is scraped next
	public $startURL;

	// Extensions of links that are NOT followed. Despite its name this is a
	// block list; the name is kept so existing callers keep working.
	public $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv','.avi','.mp3','.flash','.swf');

	// Optional host used to build the Referer header (never set by the class itself)
	public $useURL;

	// Start path (scheme and host), prepended to links that are relative
	public $startPath;

	/**
	 * Set the prefix used for relative links.
	 *
	 * @param string|null $path Explicit prefix such as 'http://example.com'.
	 *                          When omitted, the prefix is derived from the
	 *                          current start URL; if that URL is missing or
	 *                          not absolute, the previous prefix is kept.
	 */
	public function setStartPath($path = NULL)
	{
		if ($path != NULL) {
			$this->startPath = $path;
			return;
		}

		// 'http://host/dir/' splits into 'http:', '', 'host', 'dir', ''
		$temp = explode('/', (string) $this->startURL);
		if (count($temp) >= 3 && substr($temp[0], -1) === ':' && $temp[1] === '' && $temp[2] !== '') {
			$this->startPath = $temp[0].'//'.$temp[2];
		}
	}

	/**
	 * Set the URL the crawl starts from.
	 *
	 * @param string $theURL Absolute URL of the first page.
	 */
	public function startURL($theURL)
	{
		$this->startURL = $theURL;
	}

	/**
	 * Download a URL with cURL.
	 *
	 * @param string $url URL to fetch.
	 * @return string|false Response body, or false when the transfer failed
	 *                      (CURLOPT_FAILONERROR is enabled).
	 */
	public function getContents($url)
	{
		$ch = curl_init(); // initialize curl handle
		curl_setopt($ch, CURLOPT_HEADER, 0);
		curl_setopt($ch, CURLOPT_VERBOSE, 0);
		curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
		curl_setopt($ch, CURLOPT_AUTOREFERER, false);
		curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,7);
		// Only send a Referer when one was configured; an unset $useURL used
		// to produce the meaningless header value 'http://'
		if ($this->useURL != NULL) {
			curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL);
		}
		curl_setopt($ch, CURLOPT_URL,$url); // URL to request
		curl_setopt($ch, CURLOPT_FAILONERROR, 1);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);// allow redirects
		curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); // return into a variable
		curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s
		curl_setopt($ch, CURLOPT_POST, 0); // plain GET request, no POST
		$buffer = curl_exec($ch); // run the whole process
		curl_close($ch);
		return $buffer;
	}

	/**
	 * Crawl until the queue of working URLs is empty.
	 *
	 * When no start URL was set, crawling resumes with a URL taken from the
	 * `workingurls` table. Pages are processed in a loop rather than by
	 * recursion, so the call stack does not grow with the number of pages.
	 */
	public function startScraping()
	{
		if ($this->isBlank($this->startURL)) {
			$this->startURL = $this->takeQueuedURL();
		}
		// Callers may have called setStartPath() before startURL()
		if ($this->isBlank($this->startPath)) {
			$this->setStartPath();
		}

		while (!$this->isBlank($this->startURL)) {
			$this->scrapePage($this->startURL);

			// Get the new page ready
			$this->startURL = $this->takeQueuedURL();
			if (!$this->isBlank($this->startURL)) {
				$this->setStartPath();
			}
		}
	}

	/**
	 * Clean a list of raw href values.
	 *
	 * Drops values of at most one character, in-page anchors, javascript:
	 * links and links whose path ends in a blocked extension; prefixes the
	 * start path to values beginning with '/', '?' or '='. Array keys of the
	 * surviving entries are preserved.
	 *
	 * @param array $linkList Raw href values.
	 * @return array Cleaned links.
	 */
	public function cleanListURLs($linkList)
	{
		foreach ($linkList as $sub => $url) {
			if ($this->isUnwantedLink($url)) {
				unset($linkList[$sub]);
				// Skip the rewrite below, which would otherwise re-add the entry
				continue;
			}

			// If everything is OK and path is relative, add starting path
			$first = substr($url, 0, 1);
			if ($first == '/' || $first == '?' || $first == '=') {
				$linkList[$sub] = $this->startPath.$url;
			}
		}
		return $linkList;
	}

	// Scrape one page: store its emails, mark it finished, queue its links
	private function scrapePage($url)
	{
		// Get page content; a failed download is treated as an empty page
		$pageContent = $this->getContents($url);
		if ($pageContent === false) {
			$pageContent = '';
		}
		echo 'Scraping URL: '.$url.PHP_EOL;

		// Get list of all emails on page
		preg_match_all('/([\w+\.]*\w+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);
		$insertCount = 0;
		foreach (array_unique($results[1]) as $curEmail) {
			// Insert fails (and is not counted) for addresses already stored
			if ($this->insertValue('emaillist', 'emailadd', $curEmail)) {
				$insertCount++;
			}
		}
		echo 'Emails found: '.number_format($insertCount).PHP_EOL;

		// Mark the page done
		$this->insertValue('finishedurls', 'urlname', $url);

		// Queue the links of this page, except pages that were already scraped
		preg_match_all('/href="([^"]+)"/Umis',$pageContent,$results);
		$insertURLCount = 0;
		foreach ($this->cleanListURLs($results[1]) as $curURL) {
			if ($this->isFinished($curURL)) {
				continue;
			}
			if ($this->insertValue('workingurls', 'urlname', $curURL)) {
				$insertURLCount++;
			}
		}
		echo 'URLs found: '.number_format($insertURLCount).PHP_EOL;
	}

	// Remove a random URL from the working queue and return it (NULL if none)
	private function takeQueuedURL()
	{
		$result = mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1");
		if (!$result) {
			return NULL;
		}
		$row = mysql_fetch_assoc($result);
		if (!$row) {
			return NULL;
		}
		mysql_query("DELETE FROM `workingurls` WHERE `urlname`='".mysql_real_escape_string($row['urlname'])."' LIMIT 1");
		return $row['urlname'];
	}

	// Tell whether a URL is already recorded in the finished table
	private function isFinished($url)
	{
		$result = mysql_query("SELECT 1 FROM `finishedurls` WHERE `urlname`='".mysql_real_escape_string($url)."' LIMIT 1");
		return $result && mysql_num_rows($result) > 0;
	}

	// Insert one escaped value into a single-column table; true on success
	private function insertValue($table, $column, $value)
	{
		// Scraped page content is untrusted, so it must be escaped
		$escaped = mysql_real_escape_string($value);
		return (bool) mysql_query("INSERT INTO `$table` (`$column`) VALUES ('$escaped')");
	}

	// Decide whether a raw href value must be dropped
	private function isUnwantedLink($url)
	{
		// Check if only 1 character
		if (strlen($url) <= 1) {
			return true;
		}
		// If URL starts with #, ignore
		if (substr($url, 0, 1) == '#') {
			return true;
		}
		// Check for any javascript
		if (stripos(ltrim($url), 'javascript:') === 0) {
			return true;
		}
		// Check for blocked extensions at the end of the path, ignoring any
		// query string or fragment and the letter case
		$pathPart = strtolower(preg_replace('/[?#].*$/s', '', $url));
		foreach ($this->allowedExtensions as $extension) {
			$length = strlen($extension);
			if ($length > 0 && substr($pathPart, -$length) === strtolower($extension)) {
				return true;
			}
		}
		return false;
	}

	// Tell whether a value is NULL, false or the empty string
	private function isBlank($value)
	{
		return $value === NULL || $value === false || $value === '';
	}
}
?>

database.sql

-- Collected email addresses; the primary key keeps each address unique.
CREATE TABLE IF NOT EXISTS `emaillist` (
  `emailadd` VARCHAR(255) NOT NULL PRIMARY KEY
)
  ENGINE = MyISAM
  DEFAULT CHARACTER SET = utf8
  COMMENT = 'List of all gotten emails';

-- URLs that have been processed; the primary key keeps each URL unique.
CREATE TABLE IF NOT EXISTS `finishedurls` (
  `urlname` VARCHAR(255) NOT NULL PRIMARY KEY
)
  ENGINE = MyISAM
  DEFAULT CHARACTER SET = utf8
  COMMENT = 'List of finished urls';

-- URLs waiting to be processed; the primary key keeps each URL unique.
CREATE TABLE IF NOT EXISTS `workingurls` (
  `urlname` VARCHAR(255) NOT NULL PRIMARY KEY
)
  ENGINE = MyISAM
  DEFAULT CHARACTER SET = utf8
  COMMENT = 'List of current working urls';

start.php

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
	</head>
	<body>
<?php
	// Bootstrap: connect to MySQL, load the crawler class and start crawling.
	// All PHP error output is switched off; database setup failures are
	// reported through die() below.
	error_reporting(0);
	$DB_USER = 'root';
	$DB_PASSWORD = '';
	$DB_HOST = 'localhost';
	$DB_NAME = 'test';

	// Stop at the first failing step so its error message is the one reported
	$error = NULL;
	$dbc = mysql_connect ($DB_HOST, $DB_USER, $DB_PASSWORD);
	if(!$dbc){
		$error = mysql_error();
	} elseif(!mysql_select_db($DB_NAME)){
		$error = mysql_error();
	} elseif(!mysql_query("SET NAMES `utf8`")){
		$error = mysql_error();
	}
	if($error !== NULL){ die($error);}

	include('emailcrawler.php');

	$new = new scraper;
	// The start URL must be set first: an empty start path is extracted from it
	$new->startURL('http://geekiest.net/beautifulmails/');
	// Start Path can be empty, which will be extracted from the start URL
	$new->setStartPath();
	//$new->setStartPath('http://geekiest.net');
	$new->startScraping();
?>
	</body>
</html>

 

&nbsp;