From 9f4c07024e685235cb10e7f6e0d9a9090c771532 Mon Sep 17 00:00:00 2001
From: bnewbold
Date: Wed, 11 May 2011 22:49:11 -0400
Subject: WIP: new scraper stuff, should be a branch

---
 piccast/scrape_feeds_new.py |   7 ++
 piccast/scrape_helpers.py   | 155 ++++++++++++++++++++++++++++++
 piccast/scrapers.py         | 226 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 388 insertions(+)
 create mode 100644 piccast/scrape_feeds_new.py
 create mode 100644 piccast/scrape_helpers.py
 create mode 100644 piccast/scrapers.py

diff --git a/piccast/scrape_feeds_new.py b/piccast/scrape_feeds_new.py
new file mode 100644
index 0000000..a28cff2
--- /dev/null
+++ b/piccast/scrape_feeds_new.py
@@ -0,0 +1,7 @@
+
+from scrapers import *
+
+
+# save_set()
+
+
diff --git a/piccast/scrape_helpers.py b/piccast/scrape_helpers.py
new file mode 100644
index 0000000..b5d711b
--- /dev/null
+++ b/piccast/scrape_helpers.py
@@ -0,0 +1,155 @@
+import re
+import feedparser
+import urllib
+from datetime import *
+from BeautifulSoup import BeautifulSoup
+import json
+import sys
+
+# this is a django thing
+from feeds.models import *
+
+# ---------------------------- Little Helpers -------------------------------
+def strip_parens(s):
+    # keep everything before the first "(", eg dropping a trailing "(25 pics)"
+    return s.split("(")[0].strip()
+
+# ---------------------------- Fetchers -------------------------------
+def fetch_rss(uri):
+    """URI can actually be a file, stream, or string as well as a URL."""
+    print "Fetching " + uri
+    ret = feedparser.parse(uri)
+    if ret.has_key("bozo_exception"):
+        raise Exception("problem parsing RSS, uri=" + uri)
+    return ret
+
+def fetch_html(uri, raw=None):
+    if raw:
+        return BeautifulSoup(raw)
+    else:
+        print "Fetching " + uri
+        return BeautifulSoup(urllib.urlopen(uri))
+
+def fetch_json(uri):
+    print "Fetching " + uri
+    return json.load(urllib.urlopen(uri))
+
+def test_fetching(remotes=False):
+    # test fetching local rss
+    local_rss = fetch_rss("../example_feed_data/acidcow_rss.xml")
+    print str(local_rss.feed.description)
+    # test fetching local/inline html
+    local_html = fetch_html("../example_feed_data/acidcow_page.html")
+    print "Local has title: " + str(local_html.html.head.title)
+    inline_html = fetch_html("", raw="""
+        <html><body>
+        <img src="http://example.com/placeholder.jpg" />
+        </body></html>
+        """)
+    print inline_html.first("img")["src"]
+    # test fetching local/remote json
+    local_json = fetch_json("../example_feed_data/piccast_feeds.json")
+    print "Local json: " + local_json["list"][0]["source_url"]
+    if remotes:
+        remote_json = fetch_json("http://piccastapp.com/json/v0/feeds/")
+        print "Remote json: " + remote_json["list"][1]["source_url"]
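+
+# A quick usage sketch for the fetchers above; a hypothetical shell session,
+# not a project-specified workflow, assuming ../example_feed_data/ is present:
+#
+#   >>> from scrape_helpers import *
+#   >>> feed = fetch_rss("../example_feed_data/acidcow_rss.xml")
+#   >>> page = fetch_html("../example_feed_data/acidcow_page.html")
+#   >>> test_fetching()               # local checks only
+#   >>> test_fetching(remotes=True)   # also hits piccastapp.com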
+ """) + print inline_html.first("img")["src"] + # test fetching local/remote json + local_json = fetch_json("../example_feed_data/piccast_feeds.json") + print "Local json: " + local_json["list"][0]["source_url"] + if remotes: + remote_json = fetch_json("http://piccastapp.com/json/v0/feeds/") + print "Remote json: " + remote_json["list"][1]["source_url"] + +# ---------------------------- Data to PicSet ------------------------------- +def sets_from_rss(data, + source_url_field = "link", + created_field = "date", + created_format = ("%a, %d %b %Y %H:%M:%S", -6), + title_field = "title", + category = None, + category_field = "category", + description_field = "description", + data_field = None, + ): + """ + This function takes an RSS feedparser object and returns a list of PicSets. + Feed-dependant parameters can be used to select specific RSS elements to + look in for various fields. The feed content/description can be saved to + the data parameter for each set, which allows this HTML to get pulled out + for processing later in the pipeline. + + A base assumption is that each RSS entry corresponds to a PicSet. + """ + if not type(data) == feedparser.FeedParserDict: + raise Exception("'data' must be a feedparser object") + + sets = [] + for entry in data['entries']: + s = PicSet() + try: + s.source_url = entry[source_url_field] + # created_format[1] is the length of date/time string to chomp off + s.created = \ + datetime.strptime(entry[created_field][:created_format[1]], \ + created_format[0]) + s.title = entry[title_field] + if category: + s.category = category + else: + s.category_name = entry[category_field] + s.description = entry[description_field] + if data_field: + s.data = entry[data_field] + except KeyError as ke: + sys.stderr.write("Missing field while parsing RSS into PicSet: " + str(ke) + " (continuing...)\n") + continue + sets.append(s) + return sets + +# sets_from_html(data, params): TODO +# sets_from_json(data, params): TODO + +# ---------------------------- Data to Pics ------------------------------- + +def pics_from_html_simple(data, + find_images = lambda d: d.findAll("img"), + title_base = "Untitled Pic", + source_url = None, + match_src = None, + meaningless_titles = [], + ): + """ + This function simply looks for well tags, creates a Pic for each and returns + a list of these. + 'data' should be a BeautifulSoup HTML parsing object; use fetch_html(). + """ + pics = [] + index = 1 + for i in find_images(data): + p = Pic() + if match_src and not (i["src"].find(match_src) >= 0): + continue + p.original_url = i["src"] + if i.has_key("width"): + p.width = i["width"] + if i.has_key("height"): + p.height = i["height"] + if source_url: + p.source_url = source_url + + if i.has_key("title"): + p.title = i["title"] + elif i.has_key("alt"): + p.title = i["alt"] + else: + p.title = title_base + " #" + str(index) + if p.title in list(("", " ",)) + list(meaningless_titles): + p.title = title_base + " #" + str(index) + pics.append(p) + index += 1 + return pics + +# pics_from_html(data, params): +# TODO, in a more general/careful fashion. 
+
+# pics_from_html(data, params):
+#   TODO, in a more general/careful fashion; eg, scrape captions
+# pics_from_rss(data, params): TODO
+# pics_from_json(data, params): TODO
+
+# ---------------------------- Generics -------------------------------
+# generic_blogger(PicFeed)
+# generic_flickr(PicFeed)
+# def generic_single_rss(pf):
+
+
diff --git a/piccast/scrapers.py b/piccast/scrapers.py
new file mode 100644
index 0000000..933c54c
--- /dev/null
+++ b/piccast/scrapers.py
@@ -0,0 +1,226 @@
+
+# Pulls in helpers as well as utility libraries
+from scrape_helpers import *
+
+# Pulls in the PicCast model objects (PicFeed, PicSet, and Pic)
+from feeds.models import *
+
+from django.utils.html import strip_tags
+
+"""
+This file contains a set of scraper functions, one for each PicCast feed.
+Helper functions are defined in scrape_helpers.py; when run as a django
+command this file is included from scrape_feeds.py.
+
+The general pattern for each scraper is shown below. Note that neither pics
+nor sets are saved by this function::
+
+    def scrape_examplefeed(PicFeed):
+        sets_data = fetch_data(PicFeed.uri)
+        sets = sets_from_data(sets_data)
+        filter(sets)
+        foreach(sets):
+            set = modifications(set)
+            pic_data = fetch_data(set)
+            pics = pics_from_data(pic_data)
+            filter(pics)
+            set.pics = pics
+            good_sets.append(set)
+        return (good_sets)
+
+It is difficult to generalize much further because there are several source
+data types, both for feeds of PicSets and for the lists of pictures
+themselves. For example, some of the known patterns which need to be
+accommodated are:
+
+    rss  -> sets, foreach inline html -> pics
+    rss  -> sets, foreach html -> pics
+    json -> sets, foreach rss -> pics
+    json -> sets, foreach json -> pics
+    html -> sets, foreach html -> pics
+
+To add a new scraper, first add the PicFeed to the local database, setting
+is_active to True. Then create a scrape_<shortname>() method (copy one of
+the scrapers below) and modify the parameters and dataflow to suit that
+PicFeed. Test by opening a django shell ("./manage.py shell"), importing
+the scrapers ("run scrapers.py"), and then running a test for that one feed
+("test_scrapers(name='shortname')"). You'll need to download some test
+RSS/HTML/whatever to test with and put it in ../example_feed_data/.
+"""
+
+def save_sets(sets):
+    print "Saving " + str(len(sets)) + " new PicSets..."
+    for s in sets:
+        print " - " + str(s) + " with " + str(len(s.pics)) + " pics"
+        s.save()
+        for p in s.pics:
+            p.set = s
+            p.save()
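+
+# A skeleton for a new scraper, following the recipe in the docstring above;
+# "example" and the parameter choices here are placeholders (adapted from
+# scrape_acidcow below), not a real feed:
+#
+#def scrape_example(pf, testing = False):
+#    sets_data = fetch_rss(pf.rssfeed_url)
+#    sets = sets_from_rss(sets_data)
+#    existing_urls = map(lambda s: s.source_url, pf.picset_set.all())
+#    sets = filter(lambda s: s.source_url not in existing_urls, sets)
+#    good_sets = []
+#    for s in sets:
+#        pics = pics_from_html_simple(fetch_html(s.source_url),
+#                                     source_url = s.source_url)
+#        if len(pics) < 3:
+#            continue
+#        s.pics = pics
+#        good_sets.append(s)
+#    if testing:
+#        return good_sets
+#    save_sets(good_sets)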
+
+# ---------------------- Real scrapers go here ---------------------
+def scrape_acidcow(pf, testing = False):
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/acidcow_rss.xml"
+        testing_uri = "../example_feed_data/acidcow_page.html"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    sets = sets_from_rss(sets_data,
+                         source_url_field = "guid",
+                         )
+    existing_urls = map(lambda s: s.source_url, pf.picset_set.all())
+    sets = filter(lambda s: s.source_url not in existing_urls, sets)
+    sets = filter(lambda s: s.category_name in
+                  ("Pics", "Picdump", "Celebs", "Girls", "Cars"), sets)
+    good_sets = []
+    for s in sets:
+        if testing:
+            s.source_url = testing_uri
+        if len(s.description) > 0:
+            s.description = strip_tags(s.description)
+            if s.description.startswith("Similar posts:"):
+                s.description = None
+        full_title = s.title
+        s.title = strip_parens(s.title)
+        s.category = Category.objects.get_or_create(name=s.category_name)[0]
+        print s
+        pic_data = fetch_html(s.source_url)
+        pics = pics_from_html_simple(pic_data,
+                                     match_src = "http://acidcow.com/pics/",
+                                     source_url = s.source_url,
+                                     meaningless_titles = [full_title, ],
+                                     )
+        #filter(pics,)
+        if len(pics) < 3:
+            continue
+        s.pics = pics
+        good_sets.append(s)
+    if testing:
+        return good_sets
+    else:
+        save_sets(good_sets)
+
+def scrape_butdoesitfloat(pf, testing = False):
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/butdoesitfloat_rss.xml"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    sets = sets_from_rss(sets_data,
+                         source_url_field = "comments",
+                         category = Category.objects.get_or_create(name="Art")[0],
+                         )
+    existing_urls = map(lambda s: s.source_url, pf.picset_set.all())
+    sets = filter(lambda s: s.source_url not in existing_urls, sets)
+    good_sets = []
+    for s in sets:
+        pic_data = fetch_html("", raw=s.description)
+        if len(s.description) > 0:
+            s.description = s.description.split("