aboutsummaryrefslogtreecommitdiffstats
path: root/piccast/scrapers.py
diff options
context:
space:
mode:
authorbnewbold <bnewbold@robocracy.org>2011-05-11 22:49:11 -0400
committerbnewbold <bnewbold@robocracy.org>2011-05-11 22:49:11 -0400
commit9f4c07024e685235cb10e7f6e0d9a9090c771532 (patch)
tree8af75034f0d7d22d7b761f2cf2b2e9ab741c323a /piccast/scrapers.py
parent5fbd142dddd6b364d6d22a1c027057aa5d9d9e6e (diff)
downloadpiccast-9f4c07024e685235cb10e7f6e0d9a9090c771532.zip
piccast-9f4c07024e685235cb10e7f6e0d9a9090c771532.tar.gz
WIP: new scraper stuff, should be a branch
Diffstat (limited to 'piccast/scrapers.py')
-rw-r--r--piccast/scrapers.py226
1 files changed, 226 insertions, 0 deletions
diff --git a/piccast/scrapers.py b/piccast/scrapers.py
new file mode 100644
index 0000000..933c54c
--- /dev/null
+++ b/piccast/scrapers.py
@@ -0,0 +1,226 @@
+
+# Pulls in helpers as well as utility libraries
+from scrape_helpers import *
+
+# Pulls in the PicCast model objects (PicFeed, PicSet, and Pic)
+from feeds.models import *
+
+from django.utils.html import strip_tags
+
+"""
+This file contains a set of scraper functions, one for each PicCast feed.
+Helper functions are defined in scrape_helpers.py; when run as a django command
+this file is included from scrape_feeds.py.
+
+The general pattern for each scraper is shown below. Note that neither pics nor
+sets are saved by this function::
+
+ def scrape_examplefeed(PicFeed):
+ sets_data = fetch_data(PicFeed.uri)
+ sets = sets_from_data(sets_data)
+ filter(sets)
+ foreach(sets):
+ set = modifications(set)
+ pic_data = fetch_data(set)
+ pics = pics_from_data(pic_data)
+ filter(pics)
+ set.pics = pics
+ good_sets.append(set)
+ return (good_sets)
+
+It is difficult to generalize too much further because there are several source
+data types for both feeds of PicSets and the list of pictures themselves. For
+example, some of the known patterns which need to be accommodated are:
+
+ rss -> sets, foreach inline html -> pics
+ rss -> sets, foreach html -> pics
+ json -> sets, foreach rss -> pics
+ json -> sets, foreach json -> pics
+ html -> sets, foreach html -> pics
+
+To add a new scraper, first add the PicFeed to the local database, setting
+is_active to True. Then create a scrape_<shortname>() method (copy one of the
+below) and modify the parameters and dataflow to suit that PicFeed. Test by
+running a django shell ("./manage.py shell"), import the scrapers ("run
+scrapers.py"), and then run a test for that one feed
+("test_scrapers(name='shortname')"). You'll need to download some test
+RSS/HTML/whatever to test with and put it in ../example_feed_data/.
+
+"""
+
def save_sets(sets):
    """Save each scraped set and then every pic attached to it.

    Args:
        sets: a sized iterable of set objects. Each one must provide a
            ``save()`` method and a ``pics`` sequence whose items also
            provide ``save()``.

    Returns:
        None. A progress line is printed for the batch and for each set.
    """
    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Saving " + str(len(sets)) + " new PicSets...")
    for picset in sets:
        print(" - " + str(picset) + " with " + str(len(picset.pics)) + " pics")
        # The set is saved before its pics; each pic is then pointed at the
        # already-saved set and saved itself.
        picset.save()
        for pic in picset.pics:
            pic.set = picset
            pic.save()
+
+# ---------------------- Real scrapers go here ---------------------
def scrape_acidcow(pf, testing=False):
    """Scrape new picture sets for the acidcow feed.

    Sets are built from the RSS data fetched for ``pf.rssfeed_url``; the pics
    of each set are built from the HTML fetched for the set's ``source_url``.
    A set is dropped when its source URL already appears on one of
    ``pf.picset_set``, when its ``category_name`` is not whitelisted, or when
    fewer than three pics are found for it.

    Args:
        pf: the feed object; ``rssfeed_url`` and ``picset_set`` are read.
        testing: when true, ``pf.rssfeed_url`` and every set's ``source_url``
            are overwritten with local files under ../example_feed_data/ and
            the scraped sets are returned instead of being saved.

    Returns:
        The list of accepted sets (each with ``pics`` attached) when
        ``testing`` is true; otherwise None, after passing them to
        ``save_sets``.
    """
    wanted_categories = ("Pics", "Picdump", "Celebs", "Girls", "Cars")

    if testing:
        pf.rssfeed_url = "../example_feed_data/acidcow_rss.xml"
        testing_uri = "../example_feed_data/acidcow_page.html"

    sets_data = fetch_rss(pf.rssfeed_url)
    sets = sets_from_rss(
        sets_data,
        source_url_field="guid",
    )

    # A set gives constant-time membership tests and, unlike a bare map()
    # result, can be consulted repeatedly on Python 3 as well.
    known_urls = set(existing.source_url for existing in pf.picset_set.all())
    sets = [
        candidate for candidate in sets
        if candidate.source_url not in known_urls
        and candidate.category_name in wanted_categories
    ]

    good_sets = []
    for s in sets:
        if testing:
            s.source_url = testing_uri

        # Truthiness (rather than len()) also tolerates a None description.
        if s.description:
            cleaned = strip_tags(s.description)
            if cleaned.startswith("Similar posts:"):
                cleaned = None
            s.description = cleaned

        # Keep the unmodified title: it is handed to the pic extractor as a
        # "meaningless" title after s.title itself has been rewritten.
        full_title = s.title
        s.title = strip_parens(s.title)

        # NOTE: this runs before the pic-count check below and also in
        # testing mode, so get_or_create() is invoked even for sets that end
        # up being dropped.
        s.category = Category.objects.get_or_create(name=s.category_name)[0]
        print(s)

        pic_data = fetch_html(s.source_url)
        pics = pics_from_html_simple(
            pic_data,
            match_src="http://acidcow.com/pics/",
            source_url=s.source_url,
            meaningless_titles=[full_title],
        )
        if len(pics) < 3:
            continue
        s.pics = pics
        good_sets.append(s)

    if testing:
        return good_sets
    save_sets(good_sets)
+
def scrape_butdoesitfloat(pf, testing=False):
    """Scrape new picture sets for the butdoesitfloat feed.

    Sets are built from the RSS data fetched for ``pf.rssfeed_url``; the pics
    of each set are extracted from the set's own description markup. A set is
    dropped when its source URL already appears on one of ``pf.picset_set``
    or when fewer than three pics are found for it.

    Args:
        pf: the feed object; ``rssfeed_url`` and ``picset_set`` are read.
        testing: when true, ``pf.rssfeed_url`` is overwritten with a local
            file under ../example_feed_data/ and the scraped sets are
            returned instead of being saved.

    Returns:
        The list of accepted sets (each with ``pics`` attached) when
        ``testing`` is true; otherwise None, after passing them to
        ``save_sets``.
    """
    if testing:
        pf.rssfeed_url = "../example_feed_data/butdoesitfloat_rss.xml"

    sets_data = fetch_rss(pf.rssfeed_url)
    sets = sets_from_rss(
        sets_data,
        source_url_field="comments",
        category=Category.objects.get_or_create(name="Art")[0],
    )

    # A set gives constant-time membership tests and, unlike a bare map()
    # result, can be consulted repeatedly on Python 3 as well.
    known_urls = set(existing.source_url for existing in pf.picset_set.all())
    sets = [
        candidate for candidate in sets
        if candidate.source_url not in known_urls
    ]

    good_sets = []
    for s in sets:
        # Parse the description first: the pics are taken from the full
        # markup, which is truncated on the next lines.
        pic_data = fetch_html("", raw=s.description)
        if s.description:
            # Keep only the text that precedes the first image tag.
            s.description = s.description.split("<img")[0]
        pics = pics_from_html_simple(pic_data)
        if len(pics) < 3:
            continue
        s.pics = pics
        good_sets.append(s)

    if testing:
        return good_sets
    save_sets(good_sets)
+
def scrape_nzinzi(pf, testing=False):
    """Scrape new picture sets for the nzinzi feed.

    Sets are built from the RSS data fetched for ``pf.rssfeed_url``; the pics
    of each set are extracted from the set's own description markup, after
    which the description is blanked. A set is dropped when its source URL
    already appears on one of ``pf.picset_set`` or when fewer than three pics
    are found for it.

    Args:
        pf: the feed object; ``rssfeed_url`` and ``picset_set`` are read.
        testing: when true, ``pf.rssfeed_url`` is overwritten with a local
            file under ../example_feed_data/ and the scraped sets are
            returned instead of being saved.

    Returns:
        The list of accepted sets (each with ``pics`` attached) when
        ``testing`` is true; otherwise None, after passing them to
        ``save_sets``.
    """
    if testing:
        pf.rssfeed_url = "../example_feed_data/nzinzi_rss.xml"

    sets_data = fetch_rss(pf.rssfeed_url)
    sets = sets_from_rss(
        sets_data,
        category=Category.objects.get_or_create(name="Art")[0],
        # NOTE(review): how the (format, -10) pair is interpreted is decided
        # by sets_from_rss in scrape_helpers -- confirm there.
        created_format=("%Y-%m-%dT%H:%M:%S", -10),
    )

    # A set gives constant-time membership tests and, unlike a bare map()
    # result, can be consulted repeatedly on Python 3 as well.
    known_urls = set(existing.source_url for existing in pf.picset_set.all())
    sets = [
        candidate for candidate in sets
        if candidate.source_url not in known_urls
    ]

    good_sets = []
    for s in sets:
        # The description markup is the pic source; it is parsed before the
        # stored description is cleared.
        pic_data = fetch_html("", raw=s.description)
        s.description = ""
        pics = pics_from_html_simple(
            pic_data,
            match_src="http://ntamak.free.fr/",
        )
        if len(pics) < 3:
            continue
        s.pics = pics
        good_sets.append(s)

    if testing:
        return good_sets
    save_sets(good_sets)
+
def scrape_vectortut(pf, testing=False):
    """Scrape new picture sets for the vectortut feed.

    Sets are built from the RSS data fetched for ``pf.rssfeed_url``; the pics
    of each set are extracted from the set's own description markup, after
    which the description is blanked. A set is dropped when its source URL
    already appears on one of ``pf.picset_set``, when its ``category_name``
    is not "Inspirational", or when fewer than three pics are found for it.

    Args:
        pf: the feed object; ``rssfeed_url`` and ``picset_set`` are read.
        testing: when true, ``pf.rssfeed_url`` is overwritten with a local
            file under ../example_feed_data/ and the scraped sets are
            returned instead of being saved.

    Returns:
        The list of accepted sets (each with ``pics`` attached) when
        ``testing`` is true; otherwise None, after passing them to
        ``save_sets``.
    """
    wanted_categories = ("Inspirational",)

    if testing:
        pf.rssfeed_url = "../example_feed_data/vectortut_rss.xml"

    sets_data = fetch_rss(pf.rssfeed_url)
    sets = sets_from_rss(sets_data)

    # A set gives constant-time membership tests and, unlike a bare map()
    # result, can be consulted repeatedly on Python 3 as well.
    known_urls = set(existing.source_url for existing in pf.picset_set.all())
    sets = [
        candidate for candidate in sets
        if candidate.source_url not in known_urls
        and candidate.category_name in wanted_categories
    ]

    good_sets = []
    for s in sets:
        # The description markup is the pic source; it is parsed before the
        # stored description is cleared.
        pic_data = fetch_html("", raw=s.description)
        s.description = ""
        # NOTE: this runs before the pic-count check below and also in
        # testing mode, so get_or_create() is invoked even for sets that end
        # up being dropped.
        s.category = Category.objects.get_or_create(name=s.category_name)[0]
        pics = pics_from_html_simple(
            pic_data,
            match_src=".cloudfront.net/",
        )
        if len(pics) < 3:
            continue
        s.pics = pics
        good_sets.append(s)

    if testing:
        return good_sets
    save_sets(good_sets)
+
def scrape_chubbychinese(pf, testing=False):
    """Scrape new picture sets for the chubbychinese feed.

    Sets are built from the RSS data fetched for ``pf.rssfeed_url``; the pics
    of each set are extracted from the set's own description markup. A set is
    dropped when its source URL already appears on one of ``pf.picset_set``,
    when its ``category_name`` is not "Inspirational", or when fewer than
    four pics are found for it.

    Args:
        pf: the feed object; ``rssfeed_url`` and ``picset_set`` are read.
        testing: when true, ``pf.rssfeed_url`` is overwritten with a local
            file under ../example_feed_data/ and the scraped sets are
            returned instead of being saved.

    Returns:
        The list of accepted sets (each with ``pics`` attached) when
        ``testing`` is true; otherwise None, after passing them to
        ``save_sets``.
    """
    # NOTE(review): this whitelist is identical to the one in
    # scrape_vectortut, while the category here is fixed to "Food" -- it
    # looks like a copy/paste leftover that may reject every set. Behaviour
    # is kept as-is; confirm against real feed data before removing it.
    wanted_categories = ("Inspirational",)

    if testing:
        pf.rssfeed_url = "../example_feed_data/chubbychinese_rss.xml"

    sets_data = fetch_rss(pf.rssfeed_url)
    sets = sets_from_rss(
        sets_data,
        category=Category.objects.get_or_create(name="Food")[0],
    )

    # A set gives constant-time membership tests and, unlike a bare map()
    # result, can be consulted repeatedly on Python 3 as well.
    known_urls = set(existing.source_url for existing in pf.picset_set.all())
    sets = [
        candidate for candidate in sets
        if candidate.source_url not in known_urls
        and candidate.category_name in wanted_categories
    ]

    good_sets = []
    for s in sets:
        pic_data = fetch_html("", raw=s.description)
        pics = pics_from_html_simple(
            pic_data,
            match_src="static.flickr.com/",
        )
        # This feed requires at least four pics, one more than the others.
        if len(pics) < 4:
            continue
        s.pics = pics
        good_sets.append(s)

    if testing:
        return good_sets
    save_sets(good_sets)
+
+# ---------------------- Testing routines, not required ---------------------
def test_scrapers(name=None):
    """Run scrapers in testing mode and print a sample of what they found.

    For every PicFeed with ``is_active=True`` the module-level function
    ``scrape_<shortname>`` is looked up and called with ``testing=True``.
    Details of the first returned set and of its first pic are printed.

    Args:
        name: when given (and truthy), only the feed whose ``shortname``
            equals it is tested.

    Returns:
        None. Results and failures are reported on stdout only.
    """
    for pf in PicFeed.objects.filter(is_active=True):
        if name and pf.shortname != name:
            continue
        print("Testing " + pf.shortname + " scrapper =============================")

        # Only a missing scraper is an expected failure here; anything else
        # should surface instead of being swallowed by a bare except.
        try:
            scrape = globals()["scrape_" + pf.shortname]
        except KeyError:
            print("FAILED, no scrape_" + pf.shortname + " found in globals()")
            continue

        found = scrape(pf, testing=True)
        # A scraper may legitimately accept nothing (e.g. everything is
        # filtered out); report that instead of failing on found[0].
        if not found:
            print("FAILED, scrape_" + pf.shortname + " returned no sets")
            continue

        first_set = found[0]
        print(first_set.source_url)
        print(first_set.title)
        print(first_set.category)
        first_pic = first_set.pics[0]
        print(first_pic.original_url)
        print(first_pic.title)
+