import re
import feedparser
import urllib
from datetime import datetime
from BeautifulSoup import BeautifulSoup
import json
import sys
# Django models (PicSet, Pic) used throughout this module
from feeds.models import *
# ---------------------------- Little Helpers -------------------------------
def strip_parens(s):
    return s.split("(")[0]
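# Example (illustrative input): strip_parens("Acid Picdump (49 pics)")
# returns "Acid Picdump " -- everything from the first "(" on is dropped.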
# ---------------------------- Fetchers -------------------------------
def fetch_rss(uri):
    """URI can actually be a file, stream, or string as well as a URL."""
    print "Fetching " + uri
    ret = feedparser.parse(uri)
    if ret.has_key("bozo_exception"):
        raise Exception("problem parsing RSS feed; uri=" + uri)
    return ret
def fetch_html(uri, raw=None):
    """Parse HTML from a URI, or from an already-fetched string via 'raw'."""
    if raw:
        return BeautifulSoup(raw)
    else:
        print "Fetching " + uri
        return BeautifulSoup(urllib.urlopen(uri))
def fetch_json(uri):
    print "Fetching " + uri
    return json.load(urllib.urlopen(uri))
def test_fetching(remotes=False):
    # test fetching local rss
    local_rss = fetch_rss("../example_feed_data/acidcow_rss.xml")
    print str(local_rss.feed.description)

    # test fetching local/inline html
    local_html = fetch_html("../example_feed_data/acidcow_page.html")
    print "Local has title: " + str(local_html.html.head.title)
    inline_html = fetch_html("", raw="""
        <html><body><img src="http://example.com/placeholder.jpg" /></body></html>
        """)
    print inline_html.first("img")["src"]

    # test fetching local/remote json
    local_json = fetch_json("../example_feed_data/piccast_feeds.json")
    print "Local json: " + local_json["list"][0]["source_url"]
    if remotes:
        remote_json = fetch_json("http://piccastapp.com/json/v0/feeds/")
        print "Remote json: " + remote_json["list"][1]["source_url"]
# ---------------------------- Data to PicSet -------------------------------
def sets_from_rss(data,
                  source_url_field="link",
                  created_field="date",
                  created_format=("%a, %d %b %Y %H:%M:%S", -6),
                  title_field="title",
                  category=None,
                  category_field="category",
                  description_field="description",
                  data_field=None,
                  ):
"""
This function takes an RSS feedparser object and returns a list of PicSets.
Feed-dependant parameters can be used to select specific RSS elements to
look in for various fields. The feed content/description can be saved to
the data parameter for each set, which allows this HTML to get pulled out
for processing later in the pipeline.
A base assumption is that each RSS entry corresponds to a PicSet.
"""
    if not isinstance(data, feedparser.FeedParserDict):
        raise Exception("'data' must be a feedparser object")
    sets = []
    for entry in data['entries']:
        s = PicSet()
        try:
            s.source_url = entry[source_url_field]
            # created_format[1] is a slice index; e.g. -6 chops a trailing
            # " +0000"-style timezone off the date string before strptime'ing
            s.created = datetime.strptime(
                entry[created_field][:created_format[1]], created_format[0])
            s.title = entry[title_field]
            if category:
                s.category = category
            else:
                s.category_name = entry[category_field]
            s.description = entry[description_field]
            if data_field:
                s.data = entry[data_field]
        except KeyError as ke:
            sys.stderr.write("Missing field while parsing RSS into PicSet: "
                             + str(ke) + " (continuing...)\n")
            continue
        sets.append(s)
    return sets
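# A hedged usage sketch, mirroring test_fetching above: the acidcow path is
# the local test fixture, and passing data_field="description" here is an
# illustrative choice, not a known-good config for that feed.
def test_sets_from_rss():
    data = fetch_rss("../example_feed_data/acidcow_rss.xml")
    sets = sets_from_rss(data, data_field="description")
    for s in sets:
        print s.title + " (" + s.source_url + ")"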
# sets_from_html(data, params): TODO
# sets_from_json(data, params): TODO
# ---------------------------- Data to Pics -------------------------------
def pics_from_html_simple(data,
                          find_images=lambda d: d.findAll("img"),
                          title_base="Untitled Pic",
                          source_url=None,
                          match_src=None,
                          meaningless_titles=[],
                          ):
"""
This function simply looks for well
tags, creates a Pic for each and returns
a list of these.
'data' should be a BeautifulSoup HTML parsing object; use fetch_html().
"""
    pics = []
    index = 1
    for i in find_images(data):
        # optionally keep only images whose src contains match_src
        if match_src and match_src not in i["src"]:
            continue
        p = Pic()
        p.original_url = i["src"]
        if i.has_key("width"):
            p.width = i["width"]
        if i.has_key("height"):
            p.height = i["height"]
        if source_url:
            p.source_url = source_url
        # prefer the title attribute, then alt text, then a numbered fallback
        if i.has_key("title"):
            p.title = i["title"]
        elif i.has_key("alt"):
            p.title = i["alt"]
        else:
            p.title = title_base + " #" + str(index)
        # replace empty or known-meaningless titles with the numbered fallback
        if p.title in ["", " "] + list(meaningless_titles):
            p.title = title_base + " #" + str(index)
        pics.append(p)
        index += 1
    return pics
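# A hedged usage sketch, again mirroring test_fetching: the local page is the
# test fixture, while the title_base, match_src, and meaningless_titles values
# are illustrative assumptions about that site's markup.
def test_pics_from_html_simple():
    page = fetch_html("../example_feed_data/acidcow_page.html")
    pics = pics_from_html_simple(page,
                                 title_base="Acid Picdump",
                                 match_src="acidcow.com",
                                 meaningless_titles=["acidcow.com"])
    for p in pics:
        print p.title + ": " + p.original_url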
# pics_from_html(data, params):
# TODO, in a more general/careful fashion. eg, scrape captions
# pics_from_rss(data, params): TODO
# pics_from_json(data, params): TODO
# ---------------------------- Generics -------------------------------
# generic_blogger(PicFeed)
# generic_flickr(PicFeed)
# def generic_single_rss(pf):
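# A hedged sketch of how the pieces above might compose into the single-RSS
# generic stubbed out above: fetch the feed, build PicSets, then re-parse each
# set's stashed HTML for its Pics. example_single_rss is a hypothetical name,
# and pf.rss_uri is an assumed PicFeed field, not a confirmed model attribute.
def example_single_rss(pf):
    data = fetch_rss(pf.rss_uri)  # pf.rss_uri is an assumed field name
    sets = sets_from_rss(data, data_field="description")
    for s in sets:
        # s.data holds the entry HTML saved by sets_from_rss (data_field)
        s.pics = pics_from_html_simple(fetch_html("", raw=s.data),
                                       source_url=s.source_url)
    return sets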