# Pulls in helpers as well as utility libraries
from scrape_helpers import *
# Pulls in the PicCast model objects (PicFeed, PicSet, and Pic)
from feeds.models import *
from django.utils.html import strip_tags

"""
This file contains a set of scraper functions, one for each PicCast feed.
Helper functions are defined in scrape_helpers.py; when run as a django
command, this file is included from scrape_feeds.py.

The general pattern for each scraper is shown below. Note that neither pics
nor sets are saved by this function::

    def scrape_examplefeed(PicFeed):
        sets_data = fetch_data(PicFeed.uri)
        sets = sets_from_data(sets_data)
        filter(sets)
        foreach(sets):
            set = modifications(set)
            pic_data = fetch_data(set)
            pics = pics_from_data(pic_data)
            filter(pics)
            set.pics = pics
            good_sets.append(set)
        return (good_sets)

It is difficult to generalize much further because there are several source
data types for both the feeds of PicSets and the lists of pictures
themselves. For example, some of the known patterns which need to be
accommodated are:

    rss  -> sets, foreach inline html -> pics
    rss  -> sets, foreach html -> pics
    json -> sets, foreach rss  -> pics
    json -> sets, foreach json -> pics
    html -> sets, foreach html -> pics

To add a new scraper, first add the PicFeed to the local database, setting
is_active to True. Then create a scrape_<shortname>() method (copy one of
those below) and modify the parameters and dataflow to suit that PicFeed.
Test by running a django shell ("./manage.py shell"), importing the
scrapers ("run scrapers.py"), and then running a test for that one feed
("test_scrapers(name='shortname')"). You'll need to download some test
RSS/HTML/whatever to test with and put it in ../example_feed_data/.
"""

def save_sets(sets):
    print "Saving " + str(len(sets)) + " new PicSets..."
    for s in sets:
        print " - " + str(s) + " with " + str(len(s.pics)) + " pics"
        s.save()
        for p in s.pics:
            p.set = s
            p.save()
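# --- Template (not a real scraper) ---
# A minimal sketch of the "rss -> sets, foreach html -> pics" pattern from
# the docstring above, written against the same helpers the real scrapers
# below use (fetch_rss, sets_from_rss, fetch_html, pics_from_html_simple).
# The feed name, example URLs, and match_src value are hypothetical
# placeholders: copy this, rename it for the new PicFeed, and adjust the
# parameters and dataflow to suit.
def scrape_examplefeed(pf, testing = False):
    if testing:
        pf.rssfeed_url = "../example_feed_data/examplefeed_rss.xml"
    sets_data = fetch_rss(pf.rssfeed_url)
    sets = sets_from_rss(sets_data,
        source_url_field = "guid",
        )
    # skip any sets already saved for this feed
    existing_urls = map(lambda s: s.source_url, pf.picset_set.all())
    sets = filter(lambda s: s.source_url not in existing_urls, sets)
    good_sets = []
    for s in sets:
        # category_name comes from the RSS item, as in scrape_acidcow below
        s.category = Category.objects.get_or_create(name=s.category_name)[0]
        pic_data = fetch_html(s.source_url)
        pics = pics_from_html_simple(pic_data,
            match_src = "http://example.com/pics/",  # hypothetical
            source_url = s.source_url,
            meaningless_titles = [s.title, ],
            )
        if(len(pics) < 1):
            continue
        s.pics = pics
        good_sets.append(s)
    if testing:
        return good_sets
    else:
        save_sets(good_sets)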
# ---------------------- Real scrapers go here ---------------------

def scrape_acidcow(pf, testing = False):
    if testing:
        pf.rssfeed_url = "../example_feed_data/acidcow_rss.xml"
        testing_uri = "../example_feed_data/acidcow_page.html"
    sets_data = fetch_rss(pf.rssfeed_url)
    sets = sets_from_rss(sets_data,
        source_url_field = "guid",
        )
    # skip sets that were already scraped, then keep only the interesting
    # categories
    existing_urls = map(lambda s: s.source_url, pf.picset_set.all())
    sets = filter(lambda s: s.source_url not in existing_urls, sets)
    sets = filter(lambda s: s.category_name in
        ("Pics", "Picdump", "Celebs", "Girls", "Cars"), sets)
    good_sets = []
    for s in sets:
        if testing:
            s.source_url = testing_uri
        if(len(s.description) > 0):
            s.description = strip_tags(s.description)
            if(s.description.startswith("Similar posts:")):
                s.description = None
        full_title = s.title
        s.title = strip_parens(s.title)
        s.category = Category.objects.get_or_create(name=s.category_name)[0]
        print s
        pic_data = fetch_html(s.source_url)
        pics = pics_from_html_simple(pic_data,
            match_src = "http://acidcow.com/pics/",
            source_url = s.source_url,
            meaningless_titles = [full_title, ],
            )
        #filter(pics,)
        # sets with only a couple pics aren't worth showing
        if(len(pics) < 3):
            continue
        s.pics = pics
        good_sets.append(s)
    if testing:
        return good_sets
    else:
        save_sets(good_sets)

def scrape_butdoesitfloat(pf, testing = False):
    if testing:
        pf.rssfeed_url = "../example_feed_data/butdoesitfloat_rss.xml"
    sets_data = fetch_rss(pf.rssfeed_url)
    sets = sets_from_rss(sets_data,
        source_url_field = "comments",
        category = Category.objects.get_or_create(name="Art")[0]
        )
    existing_urls = map(lambda s: s.source_url, pf.picset_set.all())
    sets = filter(lambda s: s.source_url not in existing_urls, sets)
    good_sets = []
    for s in sets:
        # for this feed the pics are embedded inline in the RSS description
        pic_data = fetch_html("", raw=s.description)
        if(len(s.description) > 0):
            s.description = s.description.split("