Email Scraper – The Digi Life

This script crawls a website recursively and collects the email addresses it finds. The class is very easy to use: set a start URL and start crawling.

Note: I found this script somewhere on the internet and am posting it here for my own future reference. It may also be useful to anyone who needs this type of script.

emailcrawler.php

<?php
/*
Written by: Aziz S. Hussain 
Email: azizsaleh@gmail.com 
Website: www.azizsaleh.com 
Produced under GPL License 
*/ 
/*
Email address scraper based on a URL.
*/
class scraper
{
	// URL of the page that will be scraped next
	public $startURL;

	// Extensions of links that are NOT crawled (static assets and media).
	// The property name is historical: despite "allowed", this is a block list.
	public $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv','.avi','.mp3','.flash','.swf');

	// Host sent as the HTTP referer (prefixed with http://); no referer is sent while empty
	public $useURL;

	// Scheme and host ("http://example.com") prepended to relative links
	public $startPath;

	/**
	 * Set the prefix used to turn relative links into absolute ones.
	 *
	 * @param string|null $path Explicit prefix; when empty, the prefix is
	 *                          derived from the scheme and host of $startURL.
	 *                          If $startURL has no scheme/host part, the
	 *                          current prefix is left untouched.
	 */
	public function setStartPath($path = NULL){
		if($path != NULL)
		{
			$this->startPath = $path;
			return;
		}

		// "http://host/dir" splits into array('http:', '', 'host', 'dir')
		$temp = explode('/', (string)$this->startURL);
		if(count($temp) >= 3 && $temp[2] !== '')
		{
			$this->startPath = $temp[0].'//'.$temp[2];
		}
	}

	/**
	 * Set the URL where scraping starts.
	 *
	 * @param string $theURL Absolute URL of the first page
	 */
	public function startURL($theURL){
		$this->startURL = $theURL;

		// Callers may have invoked setStartPath() before the URL was known;
		// derive the prefix now so relative links on the first page resolve.
		if($this->startPath == NULL)
		{
			$this->setStartPath();
		}
	}

	/**
	 * Download a URL with cURL.
	 *
	 * @param string $url URL to fetch
	 * @return string|false Response body, or false when the transfer fails
	 *                      (CURLOPT_FAILONERROR also makes HTTP >= 400 a failure)
	 */
	public function getContents($url)
	{
		$ch = curl_init(); // initialize curl handle
		if($ch === false){ return false; }

		curl_setopt($ch, CURLOPT_HEADER, 0);
		curl_setopt($ch, CURLOPT_VERBOSE, 0);
		curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
		curl_setopt($ch, CURLOPT_AUTOREFERER, false);
		curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,7);
		// Only send a referer when one is configured; a bare "http://" is meaningless
		if($this->useURL != NULL)
		{
			curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL);
		}
		curl_setopt($ch, CURLOPT_URL,$url); // set url to fetch
		curl_setopt($ch, CURLOPT_FAILONERROR, 1);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);// allow redirects
		curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); // return into a variable
		curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s
		curl_setopt($ch, CURLOPT_POST, 0); // plain GET request, no POST
		$buffer = curl_exec($ch); // run the whole process
		curl_close($ch);
		return $buffer;
	}

	/**
	 * Crawl pages until the `workingurls` queue is empty.
	 *
	 * Each round scrapes $startURL, then pulls a random queued URL and makes
	 * it the new $startURL. Implemented as a loop (not recursion) so a long
	 * crawl cannot exhaust the call stack. Requires an open mysql_* connection.
	 */
	public function startScraping()
	{
		while($this->startURL != NULL)
		{
			$this->scrapeCurrentPage();

			// Get the new page ready; stop before touching the start path
			// when the queue is exhausted.
			$this->startURL = $this->takeNextURL();
			if($this->startURL == NULL){ return; }
			$this->setStartPath();
		}
	}

	// Scrape $startURL: store its emails, mark it finished, queue its links.
	private function scrapeCurrentPage()
	{
		// A failed download yields false; treat it as an empty page
		$pageContent = (string)$this->getContents($this->startURL);
		echo 'Scraping URL: '.$this->startURL.PHP_EOL;

		// Get list of all emails on page
		preg_match_all('/([\w+\.]*\w+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);
		$insertCount=0;
		foreach(array_unique($results[1]) as $curEmail)
		{
			// The primary key rejects already-known emails, so only new ones count
			$insert = mysql_query("INSERT INTO `emaillist` (`emailadd`) VALUES ('".$this->escape($curEmail)."')");
			if($insert){$insertCount++;}
		}

		echo 'Emails found: '.number_format($insertCount).PHP_EOL;

		// Mark the page done before queueing links so self-links are skipped too
		mysql_query("INSERT INTO `finishedurls` (`urlname`) VALUES ('".$this->escape($this->startURL)."')");

		// Collect the links on this page
		preg_match_all('/href="([^"]+)"/Umis',$pageContent,$results);
		$currentList = $this->cleanListURLs($results[1]);

		$insertURLCount=0;
		foreach(array_unique($currentList) as $curURL)
		{
			// Never re-queue a page that was already scraped
			if($this->isFinished($curURL)){ continue; }
			$insert = mysql_query("INSERT INTO `workingurls` (`urlname`) VALUES ('".$this->escape($curURL)."')");
			if($insert){$insertURLCount++;}
		}

		echo 'URLs found: '.number_format($insertURLCount).PHP_EOL;
	}

	// Remove one random URL from `workingurls` and return it, or NULL if none is left.
	private function takeNextURL()
	{
		$result = mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1");
		if(!$result){ return NULL; }

		$row = mysql_fetch_assoc($result);
		if(!$row){ return NULL; }

		mysql_query("DELETE FROM `workingurls` WHERE `urlname`='".$this->escape($row['urlname'])."' LIMIT 1");
		return $row['urlname'];
	}

	// Tell whether a URL is already recorded in `finishedurls`.
	private function isFinished($url)
	{
		$result = mysql_query("SELECT 1 FROM `finishedurls` WHERE `urlname`='".$this->escape($url)."' LIMIT 1");
		return $result && mysql_num_rows($result) > 0;
	}

	// Escape scraped (untrusted) text for use inside a quoted SQL string.
	private function escape($value)
	{
		return mysql_real_escape_string((string)$value);
	}

	/**
	 * Filter a list of scraped href values and make relative ones absolute.
	 *
	 * Dropped: values of one character or less, fragment links (#...),
	 * javascript:/mailto: pseudo-links and links whose path ends in one of
	 * $allowedExtensions. Values starting with '/', '?' or '=' are prefixed
	 * with $startPath. Array keys of the surviving entries are preserved.
	 *
	 * @param array $linkList Raw href values
	 * @return array Cleaned list
	 */
	public function cleanListURLs($linkList)
	{
		foreach($linkList as $sub => $url)
		{
			if($this->isUnwantedLink($url))
			{
				// Skip the rest: a rejected link must not be re-added below
				unset($linkList[$sub]);
				continue;
			}

			// If everything is OK and path is relative, add starting path
			$first = substr($url,0,1);
			if($first == '/' || $first == '?' || $first == '='){
				$linkList[$sub] = $this->startPath.$url;
			}
		}
		return $linkList;
	}

	// Decide whether a raw href value should be dropped from the crawl.
	private function isUnwantedLink($url)
	{
		// Too short to be a usable link
		if(strlen($url) <= 1){ return true; }
		// Fragment-only link into the same page
		if(substr($url,0,1) == '#'){ return true; }
		// Pseudo-links that cannot be fetched over HTTP
		foreach(array('javascript:','mailto:') as $scheme)
		{
			if(stripos(ltrim($url), $scheme) === 0){ return true; }
		}
		return $this->hasBlockedExtension($url);
	}

	// Check the extension of the URL's path (not the host or query string)
	// against $allowedExtensions, case-insensitively.
	private function hasBlockedExtension($url)
	{
		$path = parse_url($url, PHP_URL_PATH);
		if(!is_string($path) || $path === ''){ return false; }

		$extension = pathinfo($path, PATHINFO_EXTENSION);
		if(!is_string($extension) || $extension === ''){ return false; }

		$blocked = array_map('strtolower', $this->allowedExtensions);
		return in_array('.'.strtolower($extension), $blocked, true);
	}
}
?>

database.sql

-- Collected email addresses; the primary key rejects duplicate addresses.
CREATE TABLE IF NOT EXISTS `emaillist` (
  `emailadd` VARCHAR(255) NOT NULL PRIMARY KEY
)
  ENGINE = MyISAM
  DEFAULT CHARACTER SET = utf8
  COMMENT = 'List of all gotten emails';

-- URLs that have already been scraped; one row per URL.
CREATE TABLE IF NOT EXISTS `finishedurls` (
  `urlname` VARCHAR(255) NOT NULL PRIMARY KEY
)
  ENGINE = MyISAM
  DEFAULT CHARACTER SET = utf8
  COMMENT = 'List of finished urls';

-- Queue of URLs still waiting to be scraped; the primary key prevents
-- the same URL from being queued twice.
CREATE TABLE IF NOT EXISTS `workingurls` (
  `urlname` VARCHAR(255) NOT NULL PRIMARY KEY
)
  ENGINE = MyISAM
  DEFAULT CHARACTER SET = utf8
  COMMENT = 'List of current working urls';

start.php

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
	</head>
	<body>
<?php
	// Driver script: connect to MySQL, then crawl starting from a fixed URL.
	// Notices and warnings are suppressed; fatal setup problems are reported
	// through die() below.
	error_reporting(0);
	$DB_USER = 'root';
	$DB_PASSWORD = '';
	$DB_HOST = 'localhost';
	$DB_NAME = 'test';

	// Stop at the first failing step so the reported error is the real cause
	$dbc = mysql_connect ($DB_HOST, $DB_USER, $DB_PASSWORD);
	if(!$dbc){ die('Database connection failed: '.mysql_error()); }
	if(!mysql_select_db($DB_NAME, $dbc)){ die('Database selection failed: '.mysql_error($dbc)); }
	if(!mysql_query("SET NAMES `utf8`", $dbc)){ die('Setting the connection charset failed: '.mysql_error($dbc)); }

	include('emailcrawler.php');

	$new = new scraper;
	// The start URL must be set first: an empty setStartPath() call derives
	// the start path (scheme + host) from it.
	$new->startURL('http://geekiest.net/beautifulmails/');
	// Start Path can be empty, which will be extracted from the start URL
	$new->setStartPath();
	//$new->setStartPath('http://geekiest.net');
	$new->startScraping();
?>
	</body>
</html>

&nbsp;

Tag: Email Scrapper

Email Crawler Script PHP MySQL