diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 9 | ||||
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 7 |
2 files changed, 15 insertions, 1 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 7c587395..e2157ee5 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -375,6 +375,15 @@ class EntityImporter:
             if u.rel == "social":
                 u.rel = "academicsocial"
 
+        # remove exact URL duplicates, while preserving order, and removing
+        # "later" copies, not "first" copies
+        # this is sensitive to both url.url and url.rel combined!
+        dedupe_urls = []
+        for url_pair in existing.urls:
+            if url_pair not in dedupe_urls:
+                dedupe_urls.append(url_pair)
+        existing.urls = dedupe_urls
+
         # remove URLs which are near-duplicates
         redundant_urls = []
         all_urls = [u.url for u in existing.urls]
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 5bc7a9ff..a6c7409d 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -466,7 +466,12 @@ class PubmedImporter(EntityImporter):
                 self.counts["exists"] += 1
                 return False
 
-        if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
+        if (
+            existing
+            and existing.ext_ids.pmid
+            and (existing.ext_ids.pmcid or not re.ext_ids.pmcid)
+            and (existing.refs or not re.refs)
+        ):
             # TODO: any other reasons to do an update?
             # don't update if it already has PMID
             self.counts["exists"] += 1