From 9dd97e5467b078aeb5b776ce74149a2307d820dc Mon Sep 17 00:00:00 2001 From: bnewbold Date: Tue, 3 May 2011 20:24:50 -0400 Subject: added draft jetfunny scraper --- piccast/feeds/management/commands/scrape_feeds.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'piccast') diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py index a023e9f..265efd4 100644 --- a/piccast/feeds/management/commands/scrape_feeds.py +++ b/piccast/feeds/management/commands/scrape_feeds.py @@ -67,7 +67,7 @@ def scrape_pics_from_html(pset, html): print "Found and saved " + str(len(pics)) + " new Pics" ############################################################################### -def scrape_pics_acidcow(pset): +def scrape_pics_bypage(pset): if(type(pset) != PicSet): raise("Type error, expected PicSet") @@ -77,7 +77,12 @@ def scrape_pics_acidcow(pset): print "Scraping from " + pset.source_url + "..." html = urllib.urlopen(pset.source_url) - mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read()) + if(pset.feed.shortname == "acidcow"): + mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read()) + elif(pset.feed.shortname == "jetfunny"): + mset = re.finditer('src=\'(http://jetfunnypictures.com/img/jetfunnypictures.com/\S+)\'', html.read()) + else: + mset = re.finditer('src=[\'\"](http://\S+)[\"\']', html.read()) pics = list() index = 0 @@ -184,7 +189,7 @@ def scrape_feed(feed_shortname): p.description = None p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" - scrape_pics_acidcow(p) + scrape_pics_bypage(p) elif(feed_shortname == u"butdoesitfloat"): # the descriptions here are usually important # TODO: should comment URL instead of link url (which bounces)? @@ -193,11 +198,16 @@ def scrape_feed(feed_shortname): print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_from_html(p, pset['description']) elif(feed_shortname == u"vectortut"): - print pset['description'] p.description = "" p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_from_html(p, pset['description']) + elif(feed_shortname == u"jetfunny"): + if(len(pset['description']) > 0): + p.description = strip_tags(pset['description'].split("

")[0]) + p.save() + print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" + scrape_pics_bypage(p) else: print "ERROR: unknown PicFeed: " + pset['feed'] #parse_page(s['source_url'], httplib.HTTPConnection) -- cgit v1.2.3