summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/matched.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-01-24 16:42:59 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-01-24 16:43:33 -0800
commit: cc2aecf897eb80211fae5b57a07d2f98890dee78 (patch)
tree: 7bf3fcf94b1d9d04f89da3bd5da52d37dcb62ab1 /python/fatcat_tools/importers/matched.py
parent: 5d0e1d5c8f33def3d7e48e0cfdbb4286be6fb5fd (diff)
download: fatcat-cc2aecf897eb80211fae5b57a07d2f98890dee78.tar.gz
          fatcat-cc2aecf897eb80211fae5b57a07d2f98890dee78.zip
refactor make_rel_url
Diffstat (limited to 'python/fatcat_tools/importers/matched.py')
-rw-r--r-- python/fatcat_tools/importers/matched.py 17
1 files changed, 3 insertions, 14 deletions
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 2ec6c95d..aca2cc34 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
import sqlite3
import itertools
import fatcat_client
-from .common import EntityImporter, clean
+from .common import EntityImporter, clean, make_rel_url
class MatchedImporter(EntityImporter):
@@ -43,17 +43,6 @@ class MatchedImporter(EntityImporter):
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.default_mime = kwargs.get("default_mime", None)
- def make_url(self, raw):
- rel = self.default_link_rel
- # TODO: this is where we could map specific domains to rel types,
- # and also filter out bad domains, invalid URLs, etc
- if "//archive.org/" in raw or "//arxiv.org/" in raw:
- # TODO: special-case the arxiv.org bulk mirror?
- rel = "repository"
- elif "//web.archive.org/" in raw or "//archive.is/" in raw:
- rel = "webarchive"
- return (rel, raw)
-
def want(self, raw_record):
return True
@@ -80,7 +69,7 @@ class MatchedImporter(EntityImporter):
# parse URLs and CDX
urls = set()
for url in obj.get('url', []):
- url = self.make_url(url)
+ url = make_rel_url(url, default_link_rel=self.default_link_rel)
if url != None:
urls.add(url)
for cdx in obj.get('cdx', []):
@@ -89,7 +78,7 @@ class MatchedImporter(EntityImporter):
cdx['dt'],
original)
urls.add(("webarchive", wayback))
- url = self.make_url(original)
+ url = make_rel_url(original, default_link_rel=self.default_link_rel)
if url != None:
urls.add(url)
urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls]