author     Bryan Newbold <bnewbold@robocracy.org>  2018-11-13 11:32:41 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>  2018-11-13 11:32:41 -0800
commit     279b22e30d9b590838268f5f5acdaa1110ee593a (patch)
tree       c9965a089be1b8ef607573ea9261c0c378c0ab47 /python/fatcat_tools/matched_importer.py
parent     7ebda2e051b51e49544ab75673b19ec5f27d9d45 (diff)
shuffle around fatcat_tools layout
Diffstat (limited to 'python/fatcat_tools/matched_importer.py')
-rw-r--r--  python/fatcat_tools/matched_importer.py | 144
1 file changed, 0 insertions, 144 deletions
diff --git a/python/fatcat_tools/matched_importer.py b/python/fatcat_tools/matched_importer.py
deleted file mode 100644
index 627ab6f1..00000000
--- a/python/fatcat_tools/matched_importer.py
+++ /dev/null
@@ -1,144 +0,0 @@
-
-import sys
-import json
-import sqlite3
-import itertools
-import fatcat_client
-from fatcat_tools.importer_common import FatcatImporter
-
-#row = row.split('\t')
-#assert len(row) == 2
-#sha1 = row[0].replace('sha1:', '')
-#sha1 = base64.b16encode(base64.b32decode(sha1)).lower()
-#print(sha1)
-#dois = [d.lower() for d in json.loads(row[1])]
-
-class FatcatMatchedImporter(FatcatImporter):
- """
- Input format is JSON with keys:
- - dois (list)
- - sha1 (hex)
- - md5 (hex)
- - sha256 (hex)
- - size (int)
- - cdx (list of objects)
- - dt
- - url
- - mimetype
- - urls (list of strings... or objects?)
-
- Future handling/extensions:
- - core_id, wikidata_id, pmcid, pmid: not as lists
- """
-
- def __init__(self, host_url, skip_file_update=False, default_mime=None,
- default_link_rel="web"):
- super().__init__(host_url)
- self.default_mime = default_mime
- self.default_link_rel = default_link_rel
- self.skip_file_update = skip_file_update
-
- def make_url(self, raw):
- rel = self.default_link_rel
- # TODO: this is where we could map specific domains to rel types,
- # and also filter out bad domains, invalid URLs, etc
- if "//archive.org/" in raw or "//arxiv.org/" in raw:
- # TODO: special-case the arxiv.org bulk mirror?
- rel = "repository"
- elif "//web.archive.org/" in raw or "//archive.is/" in raw:
- rel = "webarchive"
- return fatcat_client.FileEntityUrls(url=raw, rel=rel)
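-
- # Rough illustration of the mapping above (hypothetical URLs; anything
- # not special-cased falls through to default_link_rel, "web" by default):
- #   make_url("https://arxiv.org/pdf/1234.5678v1")    -> rel="repository"
- #   make_url("https://web.archive.org/web/2017/...") -> rel="webarchive"
- #   make_url("https://example.org/paper.pdf")        -> rel="web"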
-
- def parse_matched_dict(self, obj):
- sha1 = obj['sha1']
- dois = [d.lower() for d in obj.get('dois', [])]
-
- # lookup sha1, or create new entity
- fe = None
- if not self.skip_file_update:
- try:
- fe = self.api.lookup_file(sha1=sha1)
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
- if fe is None:
- fe = fatcat_client.FileEntity(
- sha1=sha1,
- releases=[],
- urls=[],
- )
-
- # lookup dois
- re_list = set()
- for doi in dois:
- try:
- re = self.api.lookup_release(doi=doi)
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
- re = None
- if re is None:
- print("DOI not found: {}".format(doi))
- else:
- re_list.add(re.ident)
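- # Skip this line entirely if no DOIs resolved, or if the file already
- # points at exactly this set of releases (nothing to update).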
- if len(re_list) == 0:
- return None
- if set(fe.releases) == re_list:
- return None
- re_list.update(fe.releases)
- fe.releases = list(re_list)
-
- # parse URLs and CDX
- existing_urls = [feu.url for feu in fe.urls]
- for url in obj.get('urls', []):
- if url not in existing_urls:
- url = self.make_url(url)
- if url is not None:
- fe.urls.append(url)
- for cdx in obj.get('cdx', []):
- original = cdx['url']
- wayback = "https://web.archive.org/web/{}/{}".format(
- cdx['dt'],
- original)
- if wayback not in existing_urls:
- fe.urls.append(
- fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
- if original not in existing_urls:
- url = self.make_url(original)
- if url is not None:
- fe.urls.append(url)
-
- if obj.get('size') is not None:
- fe.size = int(obj['size'])
- fe.sha256 = obj.get('sha256', fe.sha256)
- fe.md5 = obj.get('md5', fe.md5)
- if obj.get('mimetype') is None:
- if fe.mimetype is None:
- fe.mimetype = self.default_mime
- else:
- fe.mimetype = obj.get('mimetype')
- return fe
-
- def create_row(self, row, editgroup=None):
- obj = json.loads(row)
- fe = self.parse_matched_dict(obj)
- if fe is not None:
- if fe.ident is None:
- self.api.create_file(fe, editgroup=editgroup)
- self.insert_count += 1
- else:
- self.api.update_file(fe.ident, fe, editgroup=editgroup)
- self.update_count += 1
-
- def create_batch(self, batch, editgroup=None):
- """Reads and processes in batches (not API-call-per-line)"""
- objects = [self.parse_matched_dict(json.loads(l))
- for l in batch if l is not None]
- new_objects = [o for o in objects if o is not None and o.ident is None]
- update_objects = [o for o in objects if o is not None and o.ident is not None]
- for obj in update_objects:
- self.api.update_file(obj.ident, obj, editgroup=editgroup)
- if len(new_objects) > 0:
- self.api.create_file_batch(new_objects, autoaccept="true", editgroup=editgroup)
- self.update_count += len(update_objects)
- self.insert_count += len(new_objects)
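
Usage sketch (an illustration, not part of the deleted file): the module
imports itertools but never uses it here; a small driver along these lines
could feed create_batch() in fixed-size chunks. run_batches, chunk_size, and
the host URL below are assumptions for the example, not fatcat API.

    import itertools

    def run_batches(importer, path, chunk_size=50):
        # Group non-empty lines of the input file into fixed-size chunks and
        # hand each chunk to the importer's batch method (one API call each).
        with open(path) as f:
            lines = (line for line in f if line.strip())
            while True:
                batch = list(itertools.islice(lines, chunk_size))
                if not batch:
                    break
                importer.create_batch(batch)

    # e.g. (host URL is an assumption):
    # run_batches(FatcatMatchedImporter("http://localhost:9411/v0"), "matched.json")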