From ad9cd8a98aa5dce24d49b5a9c460a653f5e840a3 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 23 Jan 2019 21:08:36 -0800
Subject: importer bugfixes

---
 python/fatcat_tools/importers/crossref.py        | 6 +++---
 python/fatcat_tools/importers/grobid_metadata.py | 8 +++++---
 python/fatcat_tools/importers/matched.py         | 8 ++++++--
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 75132901..00c719f1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -111,7 +111,7 @@ class CrossrefImporter(EntityImporter):
         return CROSSREF_TYPE_MAP.get(crossref_type)
 
     def map_container_type(self, crossref_type):
-        return CONTAINER_TYPE_MAP.get(release_type)
+        return CONTAINER_TYPE_MAP.get(crossref_type)
 
     def want(self, obj):
         if not obj.get('title'):
@@ -238,7 +238,7 @@ class CrossrefImporter(EntityImporter):
             if rm.get('DOI'):
                 extra['doi'] = rm.get('DOI').lower()
             # TODO: what fields here? CSL citation stuff
-            for k in ('authors', 'editor', 'edition', 'authority', 'version',
+            for k in ('author', 'editor', 'edition', 'authority', 'version',
                     'genre', 'url', 'event', 'issue', 'volume', 'date',
                     'accessed_date', 'issued', 'page', 'medium',
                     'collection_title', 'chapter_number'):
@@ -253,7 +253,7 @@ class CrossrefImporter(EntityImporter):
                 # doing lookups would be a second import pass
                 target_release_id=None,
                 key=key,
-                year=clean(year),
+                year=year,
                 container_name=clean(container_name),
                 title=clean(rm.get('title')),
                 locator=clean(rm.get('first-page')),
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 468b0ede..9d95fe0b 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -32,6 +32,7 @@ class GrobidMetadataImporter(EntityImporter):
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra)
         self.default_link_rel = kwargs.get("default_link_rel", "web")
+        self.longtail_oa = kwargs.get("longtail_oa", False)
 
     def want(self, raw_record):
         return True
@@ -130,12 +131,13 @@ class GrobidMetadataImporter(EntityImporter):
         if obj.get('doi'):
             extra['doi'] = obj['doi']
         if obj['journal'] and obj['journal'].get('name'):
-            extra['container_name'] = obj['journal']['name']
-
-        extra['is_longtail_oa'] = True
+            extra['container_name'] = clean(obj['journal']['name'])
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
 
+        if self.longtail_oa:
+            extra['longtail_oa'] = True
+
         if extra:
             extra = dict(grobid=extra)
         else:
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 123e3530..2ec6c95d 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -120,6 +120,9 @@ class MatchedImporter(EntityImporter):
             if err.status != 404:
                 raise err
 
+        if not existing:
+            return True
+
         fe.release_ids = list(set(fe.release_ids + existing.release_ids))
         if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
             # no new release matches *and* there are already existing URLs
@@ -127,13 +130,14 @@
             return False
 
         # merge the existing into this one and update
-        existing.urls = list(set(fe.urls + existing.urls))
+        existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
+        existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
         existing.release_ids = list(set(fe.release_ids + existing.release_ids))
         existing.mimetype = existing.mimetype or fe.mimetype
         existing.size = existing.size or fe.size
         existing.md5 = existing.md5 or fe.md5
         existing.sha256 = existing.sha256 or fe.sha256
-        self.api.update_file(existing.ident, existing)
+        self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup())
         self.counts['update'] += 1
         return False
 
--
cgit v1.2.3
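
Editor's note, not part of the patch: the matched.py change de-duplicates URLs by
round-tripping through (rel, url) value tuples, presumably because the generated
client model objects are not reliably hashable by value, so set() on them does not
collapse duplicates. Below is a minimal standalone sketch of that pattern; the
FileUrl class is a hypothetical stand-in for fatcat_client.FileEntityUrls.

    # Sketch only: "FileUrl" stands in for the generated model class,
    # which cannot be de-duplicated directly with set().
    class FileUrl:
        def __init__(self, rel, url):
            self.rel = rel
            self.url = url

    def merge_urls(new_urls, existing_urls):
        # Collapse duplicates on the (rel, url) value pair, then rebuild
        # model objects from the surviving pairs.
        pairs = set((u.rel, u.url) for u in new_urls + existing_urls)
        return [FileUrl(rel=rel, url=url) for (rel, url) in sorted(pairs)]

    new = [FileUrl("web", "https://example.com/paper.pdf")]
    old = [FileUrl("web", "https://example.com/paper.pdf"),
           FileUrl("webarchive", "https://web.archive.org/web/1/example")]
    print(len(merge_urls(new, old)))  # 2: the duplicate "web" pair collapsed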