Sunday Feb 05
Move
Display 0 | 5 | 10 | 15 Stories

Software Tutorials

Topics
Top Story

How to Enable / Disable Autoformat in word 2007

How to Enable / Disable Autoformat in word 2007

  The interface of Office 2007 does not offer an intuitive tool to enable or disable the autocorrect...

How to use old MSN Messenger without upgrading

  If you don't like the newer version of MSN Live Messenger and want to use the old MSN Messenger...

Software Tutorials

 

Web Crawler in Python

(0 votes, average: 0 out of 5)





 

I used the following web crawler to crawl and extract recipes from a website 'chooseindia.com', for a university project. You can customize it according to your needs and use it as you like. The source file can be downloaded here.

#!/usr/bin/python

from HTMLParser import HTMLParser
import urllib
import sys
import re

import time
import datetime


class myParser(HTMLParser):
    """HTML parser that collects chooseindia.com recipe links from <a> tags.

    Every anchor's ``href`` is inspected.  Hrefs containing a ``:`` (i.e.
    ones that carry a scheme such as ``http:`` or ``mailto:``) are reported
    as ignored.  All other hrefs are treated as relative, joined onto the
    base URL given to the constructor and, when they match the recipe
    pattern, appended to both ``instQueue`` (links still to be crawled) and
    ``recipe_links`` (links to be reported).

    Attributes:
        baseUrl: URL prefix that relative hrefs are appended to.
        viewedQueue: not used by this class; kept so existing callers that
            reference the attribute keep working.
        instQueue: FIFO list of matching links, drained by get_next_link().
        recipe_links: FIFO list of matching links, drained by
            get_next_recipe().
        urls: set created by reset(); nothing in this class adds to it.
    """

    # Pattern text is unchanged from the original script.  NOTE: the dots
    # are unescaped, so they match any character, not only a literal '.'.
    RECIPE_PATTERN = re.compile('http://www.chooseindia.com/recipe/*.*')

    def __init__(self, url):
        """Create a parser whose relative links are resolved against *url*."""
        self.baseUrl = url
        # Queues are created per instance: as class attributes they were
        # shared by (and leaked between) every parser object.
        self.viewedQueue = []
        self.instQueue = []
        self.recipe_links = []
        # HTMLParser.__init__ calls self.reset(), which creates self.urls.
        HTMLParser.__init__(self)

    def abc(self):
        """Debug helper: print a fixed marker string."""
        print("mohit")

    def reset(self):
        """Reset parser state and start with an empty ``urls`` set."""
        self.urls = set()
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        """Queue the link of an <a> tag if it points at a recipe page.

        Args:
            tag: lower-cased tag name supplied by HTMLParser.
            attrs: list of (name, value) pairs for the tag's attributes.
        """
        if tag != 'a':
            return

        # Look the href up by name: it is not guaranteed to be the first
        # attribute, and an anchor may have no attributes at all.
        href = None
        for name, value in attrs:
            if name == 'href':
                href = value
                break
        if href is None:
            # No href, or a valueless one (<a href>): nothing to follow.
            return

        # print() is called with one pre-built string so the output is the
        # same whether print is a statement (Python 2) or a function.
        if href.find(':') != -1:
            print("\nignored  " + href)
            return

        print("the base url is" + self.baseUrl)
        # Join with exactly one '/' so a trailing slash on the base URL or a
        # missing leading slash on the href cannot produce a malformed link.
        temp = self.baseUrl.rstrip('/') + '/' + href.lstrip('/')
        if self.RECIPE_PATTERN.match(temp):
            self.instQueue.append(temp)
            self.recipe_links.append(temp)
            print("\nadding  " + temp)
        else:
            print("\nignored  " + temp)

    def get_next_link(self):
        """Pop and return the oldest queued link, or '' when none is left."""
        if not self.instQueue:
            return ''
        return self.instQueue.pop(0)

    def get_next_recipe(self):
        """Pop and return the oldest recipe link, or '' when none is left."""
        if not self.recipe_links:
            return ''
        return self.recipe_links.pop(0)

def main(max_pages=2):
    """Crawl from the URL given on the command line and report recipe links.

    The start URL is read from ``sys.argv[1]``.  Each fetched page is fed to
    a ``myParser``; the next URL to fetch is whatever the parser queued
    first.  After crawling, the number of recipe links collected and the
    elapsed time are printed.

    Args:
        max_pages: maximum number of pages to download (default 2, the
            limit the script was originally hard-coded to).

    Exits with status 1 and a usage message when no URL argument is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <start-url>\n" % sys.argv[0])
        sys.exit(1)

    url = sys.argv[1]
    p = myParser(url)
    n = 0
    startTime = time.time()
    # Logical 'and' (not bitwise '&') so the intent is explicit and the
    # second test is skipped once the page limit is reached.
    while n < max_pages and url != '':
        # print() gets one pre-built string so the output is the same
        # whether print is a statement (Python 2) or a function.
        print("\nChecking  " + url)
        s = urllib.urlopen(url)
        try:
            data = s.read()
        finally:
            # Always release the connection, even if the read fails.
            s.close()
        p.feed(data)
        url = p.get_next_link()
        n = n + 1

    print("\ndone ")

    print("\n Recipes found ... ")
    # Drain the recipe queue, counting the entries as they are removed.
    r = 0
    while p.get_next_recipe() != '':
        r = r + 1

    elapsedTime = time.time() - startTime
    print("\n\n  %d  Hrs %d  Mins %d  Secs elapsed" % (
        int(elapsedTime / 3600),
        int((elapsedTime / 60) % 60),
        int(elapsedTime % 60),
    ))
    # A coarse clock can report zero elapsed time; avoid dividing by it.
    rate = int(r / elapsedTime) if elapsedTime > 0 else 0
    print("\n Recipes found per second :  %d" % rate)


#    urllist = p.urls._data.keys()
#    urllist.sort()
#    print '\n'.join(urllist)
# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


 



Add this page to your favorite Social Bookmarking websites
Reddit! Del.icio.us! Google! Live! Facebook! Slashdot! Technorati! StumbleUpon! Spurl! Furl! Yahoo! Squidoo! Ask! DZone! Free Joomla PHP extensions, software, information and tutorials.



Comments

avatar Bumm Alfonz
0
 
 
what about robots.txt? Ignoring this file will make your spider a senseless tool, because you will fall into bad spider traps. I can not believe, that on any university this code was accepted.
B
i
u
Quote
Code
List
List item
URL
Name *
Code   
ChronoComments by Joomla Professional Solutions
Submit Comment
Cancel
avatar Nayya
0
 
 
Hello
i wanna know which version of python you used?
waiting for reply
B
i
u
Quote
Code
List
List item
URL
Name *
Code   
ChronoComments by Joomla Professional Solutions
Submit Comment
Cancel
B
i
u
Quote
Code
List
List item
URL
Name *
Code   
ChronoComments by Joomla Professional Solutions
Submit Comment

Tag Cloud

You may also like to read

Login Form