The primary operations of a web link crawler are:
1. Retrieve the seed web page.
2. Extract all valid URLs/links from it.
3. Visit every link extracted in step 2 and repeat steps 2-3 for each page.
4. Stop once the maximum depth is reached.
The following Python script implements these steps as a simple yet effective web crawler. It can be configured with a different seed URL and a different maximum depth. Note that it is written for Python 2, where urlopen lives in urllib and urlparse is a top-level module.
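The URL-cleaning step relies on urlparse, which splits a URL into scheme, host (netloc), path, and query components. A quick illustration of the pieces getCleanURL works with (Python 2, where the function lives in the urlparse module):

from urlparse import urlparse

o = urlparse('http://www.cnn.com/US/?edition=intl')
print o.scheme   # 'http'
print o.netloc   # 'www.cnn.com'
print o.path     # '/US/'
print o.query    # 'edition=intl' -- getCleanURL drops this part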
import re
import urllib
from urlparse import urlparse

depth_max = 1      # maximum depth the crawler goes from the seed website
urls = []          # unique URLs retrieved while crawling
url_visited = []   # URLs of web pages already visited

# Clean a URL: add protocol and host if absent; drop the query section.
def getCleanURL(_cURL, _baseHost):
    try:
        oURL = urlparse(_cURL)
    except Exception:
        return None
    scheme = oURL.scheme if oURL.scheme else 'http'
    host = oURL.netloc if oURL.netloc else _baseHost
    return scheme + '://' + host + oURL.path

def crawl(_baseURL, fh, _depth=0):
    oBase = urlparse(_baseURL)
    baseHost = oBase.netloc
    if _depth > depth_max:          # stop once maximum depth is exceeded
        return
    elif _baseURL in url_visited:   # web page already visited
        return
    try:
        res = urllib.urlopen(_baseURL).read()
        url_visited.append(_baseURL)
    except Exception:               # error visiting the URL/web page
        return
    res = res.replace('\n', '')
    # Extract root-relative links (href="/...") from the page.
    for url in re.findall('''href=["'](/[^"']+)["']''', res, re.I):
        url = getCleanURL(url, baseHost)
        if url is not None and url not in urls:  # check validity and uniqueness
            urls.append(url)
            fh.write(url + '\n')
    for url in urls:                # visit every link collected so far
        crawl(url, fh, _depth + 1)

def main():
    seed_url = 'http://www.cnn.com'
    fh_urls = open('urls.txt', 'w')
    crawl(seed_url, fh_urls)
    fh_urls.close()

if __name__ == "__main__":
    main()
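As written, the script targets Python 2: urllib.urlopen and the urlparse module no longer exist in Python 3. For readers on Python 3, here is a minimal sketch of the equivalent fetch-and-extract calls, using only the standard library (the surrounding crawl logic stays the same):

import re
from urllib.request import urlopen   # Python 3 replacement for urllib.urlopen
from urllib.parse import urlparse    # Python 3 home of urlparse

seed = 'http://www.cnn.com'
res = urlopen(seed).read().decode('utf-8', 'replace')  # bytes -> str
base = urlparse(seed)
for link in re.findall(r'''href=["'](/[^"']+)["']''', res, re.I):
    print(base.scheme + '://' + base.netloc + link)

With depth_max = 1, the crawler fetches the seed page plus every page one link away; raising depth_max grows the crawl (and the runtime) quickly, since every page's links are appended to the shared urls list.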
A partial listing of the urls.txt file produced by the script with http://www.cnn.com as the seed URL is given below:
http://www.cnn.com/tools/search/cnncom.xml
http://www.cnn.com/tools/search/cnncomvideo.xml
http://www.cnn.com/CNN/Programs
http://www.cnn.com/cnn/programs/
http://www.cnn.com/cnni/
http://www.cnn.com/video/
http://www.cnn.com/trends/
http://www.cnn.com/US/
http://www.cnn.com/WORLD/
http://www.cnn.com/POLITICS/
http://www.cnn.com/JUSTICE/
http://www.cnn.com/SHOWBIZ/
http://www.cnn.com/TECH/
http://www.cnn.com/HEALTH/
http://www.cnn.com/LIVING/
http://www.cnn.com/TRAVEL/
http://www.cnn.com/OPINION/
http://www.cnn.com/2014/05/04/us/circus-accident-rhode-island/index.html
http://www.cnn.com/2014/05/03/politics/washington-correspondents-dinner/index.html
http://www.cnn.com/2014/05/04/world/europe/ukraine-crisis/index.html
http://www.cnn.com/2014/05/04/us/clippers-shelly-sterling/index.html
http://www.cnn.com/2014/05/04/us/rocky-top-tennessee/index.html
http://www.cnn.com/2014/05/04/us/condoleeza-rice-rutgers-protest
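Note that the regular expression in the script only matches hrefs that begin with '/', i.e. root-relative links, which is why every URL in the output stays on www.cnn.com. For pages that use absolute links or unusual attribute quoting, regex extraction breaks down; a sketch of a more tolerant extractor built on Python 2's stdlib HTMLParser (an alternative I am suggesting, not part of the original script) could look like this:

from HTMLParser import HTMLParser  # html.parser in Python 3

class LinkExtractor(HTMLParser):
    # Collect the href attribute of every <a> tag, absolute or relative.
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

parser = LinkExtractor()
parser.feed('<a class="nav" href="http://www.cnn.com/US/">US</a>')
print parser.links  # ['http://www.cnn.com/US/']

The extracted links could then be passed through getCleanURL exactly as in the regex version.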