From 9f4c07024e685235cb10e7f6e0d9a9090c771532 Mon Sep 17 00:00:00 2001
From: bnewbold
Date: Wed, 11 May 2011 22:49:11 -0400
Subject: WIP: new scraper stuff, should be a branch

---
 piccast/scrapers.py | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 piccast/scrapers.py

diff --git a/piccast/scrapers.py b/piccast/scrapers.py
new file mode 100644
index 0000000..933c54c
--- /dev/null
+++ b/piccast/scrapers.py
@@ -0,0 +1,226 @@
+
+# Pulls in helpers as well as utility libraries
+from scrape_helpers import *
+
+# Pulls in the PicCast model objects (PicFeed, PicSet, and Pic)
+from feeds.models import *
+
+from django.utils.html import strip_tags
+
+"""
+This file contains a set of scraper functions, one for each PicCast feed.
+Helper functions are defined in scrape_helpers.py; when run as a django command
+this file is included from scrape_feeds.py.
+
+The general pattern for each scraper is shown below. Note that neither pics nor
+sets are saved by this function::
+
+    def scrape_examplefeed(PicFeed):
+        sets_data = fetch_data(PicFeed.uri)
+        sets = sets_from_data(sets_data)
+        filter(sets)
+        foreach(sets):
+            set = modifications(set)
+            pic_data = fetch_data(set)
+            pics = pics_from_data(pic_data)
+            filter(pics)
+            set.pics = pics
+            good_sets.append(set)
+        return good_sets
+
+It is difficult to generalize much further because there are several source
+data types, both for feeds of PicSets and for the lists of pictures
+themselves. For example, some of the known patterns which need to be
+accommodated are:
+
+    rss -> sets, foreach inline html -> pics
+    rss -> sets, foreach html -> pics
+    json -> sets, foreach rss -> pics
+    json -> sets, foreach json -> pics
+    html -> sets, foreach html -> pics
+
+To add a new scraper, first add the PicFeed to the local database, setting
+is_active to True. Then create a scrape_<shortname>() method (copy one of
+those below) and modify the parameters and dataflow to suit that PicFeed.
+Test by running a django shell ("./manage.py shell"), importing the scrapers
+("run scrapers.py"), and then running a test for that one feed
+("test_scrapers(name='shortname')"). You'll need to download some test
+RSS/HTML/whatever to test with and put it in ../example_feed_data/.
+
+"""
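+
+# A concrete version of the testing recipe above might look like this
+# sketch (it assumes an ipython-flavored shell, where "%run" is available;
+# from a plain python shell, execfile("scrapers.py") serves the same
+# purpose):
+#
+#   $ ./manage.py shell
+#   In [1]: %run scrapers.py
+#   In [2]: test_scrapers(name='acidcow')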
+
+def save_sets(sets):
+    print "Saving " + str(len(sets)) + " new PicSets..."
+    for s in sets:
+        print " - " + str(s) + " with " + str(len(s.pics)) + " pics"
+        s.save()
+        for p in s.pics:
+            p.set = s
+            p.save()
+
+# ---------------------- Real scrapers go here ---------------------
+def scrape_acidcow(pf, testing = False):
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/acidcow_rss.xml"
+        testing_uri = "../example_feed_data/acidcow_page.html"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    sets = sets_from_rss(sets_data,
+        source_url_field = "guid",
+        )
+    # Skip sets we have already saved, and keep only picture-heavy categories
+    existing_urls = map(lambda s: s.source_url, pf.picset_set.all())
+    sets = filter(lambda s: s.source_url not in existing_urls, sets)
+    sets = filter(lambda s: s.category_name in
+        ("Pics", "Picdump", "Celebs", "Girls", "Cars"), sets)
+    good_sets = []
+    for s in sets:
+        if testing:
+            s.source_url = testing_uri
+        if len(s.description) > 0:
+            s.description = strip_tags(s.description)
+            if s.description.startswith("Similar posts:"):
+                s.description = None
+        full_title = s.title
+        s.title = strip_parens(s.title)
+        s.category = Category.objects.get_or_create(name=s.category_name)[0]
+        print s
+        pic_data = fetch_html(s.source_url)
+        pics = pics_from_html_simple(pic_data,
+            match_src = "http://acidcow.com/pics/",
+            source_url = s.source_url,
+            meaningless_titles = [full_title, ],
+            )
+        #filter(pics,)
+        # Sets with only a couple pics aren't worth keeping
+        if len(pics) < 3:
+            continue
+        s.pics = pics
+        good_sets.append(s)
+    if testing:
+        return good_sets
+    else:
+        save_sets(good_sets)
+
+def scrape_butdoesitfloat(pf, testing = False):
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/butdoesitfloat_rss.xml"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    sets = sets_from_rss(sets_data,
+        source_url_field = "comments",
+        category = Category.objects.get_or_create(name="Art")[0]
+        )
+    existing_urls = map(lambda s: s.source_url, pf.picset_set.all())
+    sets = filter(lambda s: s.source_url not in existing_urls, sets)
+    good_sets = []
+    for s in sets:
+        pic_data = fetch_html("", raw=s.description)
+        if len(s.description) > 0:
+            s.description = s.description.split("
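
scrape_acidcow() above leans on a strip_parens() helper that lives in
scrape_helpers.py and is not part of this patch. As a rough sketch only,
assuming the helper simply drops a trailing parenthesized chunk from a
title, it might look something like::

    import re

    def strip_parens(title):
        # Hypothetical stand-in for the real helper in scrape_helpers.py:
        # strip one trailing "(...)" group, so "Cool Cars (45 pics)"
        # becomes "Cool Cars".
        return re.sub(r"\s*\([^)]*\)\s*$", "", title)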