Web Crawler in Python
Written by Abhishek Ghosh
I used the following web crawler to crawl and extract recipes from the website 'chooseindia.com' for a university project. You can customize it according to your needs and use it as you like. The source file can be downloaded here.
#!/usr/bin/python
import datetime
import re
import sys
import time
import urllib
import urlparse
from HTMLParser import HTMLParser
class myParser(HTMLParser):
    """HTML parser that collects links to chooseindia.com recipe pages.

    Feed page source to ``feed()``. Every relative ``<a href>`` is resolved
    against ``baseUrl``; when the result lies under the recipe section of
    the site it is queued once, both for crawling (``instQueue``) and for
    reporting (``recipe_links``). Links that contain a ``:`` (absolute
    URLs, ``mailto:``, ``javascript:`` ...) are reported and not followed.
    """

    # Compiled once. The dots are escaped so they only match a literal '.'.
    RECIPE_URL_RE = re.compile(r'http://www\.chooseindia\.com/recipe')

    def __init__(self, url):
        """Create a parser whose relative links are resolved against *url*."""
        self.baseUrl = url
        # The queues live on the instance; as class attributes they were
        # shared by every parser object.
        self.viewedQueue = []    # kept for compatibility; not used here
        self.instQueue = []      # URLs still waiting to be fetched
        self.recipe_links = []   # recipe URLs found so far
        self._seen = set()       # every URL ever queued (de-duplication)
        # HTMLParser.__init__ calls self.reset(), which creates self.urls.
        HTMLParser.__init__(self)

    def abc(self):
        """Print the fixed string "mohit"."""
        print("mohit")

    def reset(self):
        """Reset the parser state and start with an empty ``urls`` set."""
        self.urls = set()
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        """Queue the target of an ``<a>`` tag if it is a new recipe URL.

        *attrs* is the list of ``(name, value)`` pairs supplied by
        HTMLParser; a valueless attribute has the value ``None``.
        """
        if tag != 'a':
            return

        # Look href up by name: it is not always the first attribute, and
        # an <a> tag may carry no attributes at all.
        href = None
        for name, value in attrs:
            if name == 'href':
                href = value
                break
        if href is None:
            return

        if ':' in href:
            # The link carries a scheme, so it is not relative to the base.
            print("%s %s" % ("\nignored ", href))
            return

        print("the base url is" + self.baseUrl)
        # urljoin copes with leading/trailing slashes; dropping the
        # fragment keeps "page" and "page#section" from counting twice.
        temp = urlparse.urldefrag(urlparse.urljoin(self.baseUrl, href))[0]
        if temp == '':
            return

        if not self.RECIPE_URL_RE.match(temp):
            print("%s %s" % ("\nignored ", temp))
            return

        if temp in self._seen:
            # Already queued once; queuing it again would re-crawl the
            # page and inflate the recipe count.
            return
        self._seen.add(temp)
        self.instQueue.append(temp)
        self.recipe_links.append(temp)
        print("%s %s" % ("\nadding ", temp))

    def get_next_link(self):
        """Pop and return the oldest queued URL, or '' if none is left."""
        if not self.instQueue:
            return ''
        return self.instQueue.pop(0)

    def get_next_recipe(self):
        """Pop and return the oldest recipe URL, or '' if none is left."""
        if not self.recipe_links:
            return ''
        return self.recipe_links.pop(0)
def main(max_pages=2):
    """Crawl from the URL given as the first command-line argument.

    Fetches at most *max_pages* pages, following the links the parser
    queues, then prints the elapsed time and the number of recipe links
    found per second. Exits with a usage message when no start URL is
    given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s <start-url>" % sys.argv[0])

    url = sys.argv[1]
    p = myParser(url)
    n = 0
    startTime = time.time()
    while n < max_pages and url != '':
        print("%s %s" % ("\nChecking ", url))
        try:
            s = urllib.urlopen(url)
            try:
                data = s.read()
            finally:
                # Always release the connection, even if the read fails.
                s.close()
        except IOError as err:
            # One unreachable page should not abort the whole crawl.
            print("%s %s" % ("\nfailed ", err))
        else:
            p.feed(data)
        url = p.get_next_link()
        n = n + 1
    print("\ndone ")

    print("\n Recipes found ... ")
    # Drain the recipe queue to count how many links were collected.
    r = 0
    while p.get_next_recipe() != '':
        r = r + 1

    elapsedTime = time.time() - startTime
    print("\n\n  %d  Hrs %d  Mins %d  Secs elapsed" % (
        int(elapsedTime / 3600),
        int((elapsedTime / 60) % 60),
        int(elapsedTime % 60),
    ))
    # A coarse clock can report zero elapsed time; avoid dividing by it.
    if elapsedTime > 0:
        rate = int(r / elapsedTime)
    else:
        rate = 0
    print("\n Recipes found per second :  %d" % rate)
# Start the crawl only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
















Comments
I want to know which version of Python you used.
waiting for reply