summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/common.py9
-rw-r--r--python/fatcat_tools/importers/pubmed.py7
2 files changed, 15 insertions, 1 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 7c587395..e2157ee5 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -375,6 +375,15 @@ class EntityImporter:
if u.rel == "social":
u.rel = "academicsocial"
+ # remove exact URL duplicates, while preserving order, and removing
+ # "later" copies, not "first" copies
+ # this is sensitive to both url.url and url.rel combined!
+ dedupe_urls = []
+ for url_pair in existing.urls:
+ if url_pair not in dedupe_urls:
+ dedupe_urls.append(url_pair)
+ existing.urls = dedupe_urls
+
# remove URLs which are near-duplicates
redundant_urls = []
all_urls = [u.url for u in existing.urls]
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 5bc7a9ff..a6c7409d 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -466,7 +466,12 @@ class PubmedImporter(EntityImporter):
self.counts["exists"] += 1
return False
- if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
+ if (
+ existing
+ and existing.ext_ids.pmid
+ and (existing.ext_ids.pmcid or not re.ext_ids.pmcid)
+ and (existing.refs or not re.refs)
+ ):
# TODO: any other reasons to do an update?
# don't update if it already has PMID
self.counts["exists"] += 1