aboutsummaryrefslogtreecommitdiffstats
path: root/piccast
diff options
context:
space:
mode:
authorbnewbold <bnewbold@robocracy.org>2011-05-03 19:56:32 -0400
committerbnewbold <bnewbold@robocracy.org>2011-05-03 19:56:32 -0400
commitd53fcbecdc690bd42372f3ce9c322f849220e225 (patch)
tree86a72e9b2a3fcd5107af11fa12a807258bc3334b /piccast
parenta8cf5584204d77bd3f234fe579844ca84bb3558d (diff)
downloadpiccast-d53fcbecdc690bd42372f3ce9c322f849220e225.zip
piccast-d53fcbecdc690bd42372f3ce9c322f849220e225.tar.gz
basic vectortut scraper added
Diffstat (limited to 'piccast')
-rw-r--r--piccast/feeds/management/commands/scrape_feeds.py25
1 file changed, 20 insertions, 5 deletions
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
index bfa9bfc..a023e9f 100644
--- a/piccast/feeds/management/commands/scrape_feeds.py
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -10,6 +10,8 @@ from feeds.models import *
# see Command definition at the end
+# TODO: need a u.encode('utf-8') somewhere in here...
+
# this mechanism isn't robust yet b/c any small sets get parsed repeatedly
MIN_SET_SIZE = 3 # Need to find at least this many images for each set
@@ -20,7 +22,10 @@ def scrape_pics_from_html(pset, html):
raise("Type error, expected PicSet")
# assumptions: one <img> per line, ordering, etc
- mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
+ if(pset.feed.shortname == "vectortut"):
+ mset = re.finditer('<img src="(http://\S+cloudfront.net\S+)" border=', html)
+ else:
+ mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
pics = list()
index = 0
@@ -38,8 +43,9 @@ def scrape_pics_from_html(pset, html):
#p.caption = ""
p.original_url = m.group(1)
p.source_url = pset.source_url
- p.width = m.group(2)
- p.height = m.group(3)
+ if pset.feed.shortname != "vectortut":
+ p.width = m.group(2)
+ p.height = m.group(3)
pics.append(p)
if(len(pics) < MIN_SET_SIZE):
@@ -148,6 +154,8 @@ def scrape_feed(feed_shortname):
if(feed_shortname == u"acidcow"):
psets = filter(lambda s: s['category'] in (u'Pics', u'Picdump',u'Celebs',u'Girls',u'Cars'), psets)
+ elif(feed_shortname == u"vectortut"):
+ psets = filter(lambda s: s['category'] in (u'Inspirational'), psets)
new_psets = filter(lambda s: \
0 == len(PicSet.objects.filter(source_url=s['source_url'])), \
@@ -162,8 +170,8 @@ def scrape_feed(feed_shortname):
"%a, %d %b %Y %H:%M:%S")
# This oneline strips any paren content
p.title = pset['title'].split(" (")[0]
- if(feed_shortname == u"butdoesitfloat"):
- # Category is a list for this site
+ if(feed_shortname in ("butdoesitfloat", "vectortut")):
+ # Category is ignored for these sites
p.category = Category.objects.get_or_create(name="Art")[0]
else:
p.category = Category.objects.get_or_create(name=pset['category'])[0]
@@ -184,6 +192,12 @@ def scrape_feed(feed_shortname):
p.save()
print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
scrape_pics_from_html(p, pset['description'])
+ elif(feed_shortname == u"vectortut"):
+ print pset['description']
+ p.description = ""
+ p.save()
+ print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+ scrape_pics_from_html(p, pset['description'])
else:
print "ERROR: unknown PicFeed: " + pset['feed']
#parse_page(s['source_url'], httplib.HTTPConnection)
@@ -207,6 +221,7 @@ class Command(BaseCommand):
except Exception as e:
sys.stderr.write("Error scraping " + shortname + ":\n")
sys.stderr.write(str(e) + "\n")
+ #raise e
sys.stdout.flush()
sys.stdout.write('Done scraping feeds.\n')