import re import feedparser import urllib from datetime import * from BeautifulSoup import BeautifulSoup import json import sys # this is a django thing from feeds.models import * # ---------------------------- Little Helpers ------------------------------- def strip_parens(s): return s.split("(")[0] # ---------------------------- Fetchers ------------------------------- def fetch_rss(uri): """URI can actually be a file, stream, or string as well as a URL. """ print "Fetching " + uri ret = feedparser.parse(uri) if ret.has_key("bozo_exception"): raise Exception("problem parsing in RSS? uri=" + uri) return ret def fetch_html(uri, raw=None): if raw: return BeautifulSoup(raw) else: print "Fetching " + uri return BeautifulSoup(urllib.urlopen(uri)) def fetch_json(uri): print "Fetching " + uri return json.load(urllib.urlopen(uri)) def test_fetching(remotes=False): # test fetching local rss local_rss = fetch_rss("../example_feed_data/acidcow_rss.xml") print str(local_rss.feed.description) # test fetching local/inline html local_html = fetch_html("../example_feed_data/acidcow_page.html") print "Local has title: " + str(local_html.html.head.title) inline_html = fetch_html("", raw ="""
""") print inline_html.first("img")["src"] # test fetching local/remote json local_json = fetch_json("../example_feed_data/piccast_feeds.json") print "Local json: " + local_json["list"][0]["source_url"] if remotes: remote_json = fetch_json("http://piccastapp.com/json/v0/feeds/") print "Remote json: " + remote_json["list"][1]["source_url"] # ---------------------------- Data to PicSet ------------------------------- def sets_from_rss(data, source_url_field = "link", created_field = "date", created_format = ("%a, %d %b %Y %H:%M:%S", -6), title_field = "title", category = None, category_field = "category", description_field = "description", data_field = None, ): """ This function takes an RSS feedparser object and returns a list of PicSets. Feed-dependant parameters can be used to select specific RSS elements to look in for various fields. The feed content/description can be saved to the data parameter for each set, which allows this HTML to get pulled out for processing later in the pipeline. A base assumption is that each RSS entry corresponds to a PicSet. """ if not type(data) == feedparser.FeedParserDict: raise Exception("'data' must be a feedparser object") sets = [] for entry in data['entries']: s = PicSet() try: s.source_url = entry[source_url_field] # created_format[1] is the length of date/time string to chomp off s.created = \ datetime.strptime(entry[created_field][:created_format[1]], \ created_format[0]) s.title = entry[title_field] if category: s.category = category else: s.category_name = entry[category_field] s.description = entry[description_field] if data_field: s.data = entry[data_field] except KeyError as ke: sys.stderr.write("Missing field while parsing RSS into PicSet: " + str(ke) + " (continuing...)\n") continue sets.append(s) return sets # sets_from_html(data, params): TODO # sets_from_json(data, params): TODO # ---------------------------- Data to Pics ------------------------------- def pics_from_html_simple(data, find_images = lambda d: d.findAll("img"), title_base = "Untitled Pic", source_url = None, match_src = None, meaningless_titles = [], ): """ This function simply looks for well tags, creates a Pic for each and returns a list of these. 'data' should be a BeautifulSoup HTML parsing object; use fetch_html(). """ pics = [] index = 1 for i in find_images(data): p = Pic() if match_src and not (i["src"].find(match_src) >= 0): continue p.original_url = i["src"] if i.has_key("width"): p.width = i["width"] if i.has_key("height"): p.height = i["height"] if source_url: p.source_url = source_url if i.has_key("title"): p.title = i["title"] elif i.has_key("alt"): p.title = i["alt"] else: p.title = title_base + " #" + str(index) if p.title in list(("", " ",)) + list(meaningless_titles): p.title = title_base + " #" + str(index) pics.append(p) index += 1 return pics # pics_from_html(data, params): # TODO, in a more general/careful fashion. eg, scrape captions # pics_from_rss(data, params): TODO # pics_from_json(data, params): TODO # ---------------------------- Generics ------------------------------- # generic_blogger(PicFeed) # generic_flickr(PicFeed) # def generic_single_rss(pf):