about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/Pipfile 2
-rw-r--r-- python/Pipfile.lock 33
-rwxr-xr-x python/fatcat_import.py 6
-rw-r--r-- python/fatcat_tools/importers/arabesque.py 2
-rw-r--r-- python/fatcat_tools/importers/chocula.py 3
-rw-r--r-- python/fatcat_tools/importers/common.py 1
-rw-r--r-- python/fatcat_tools/importers/ingest.py 2
-rw-r--r-- python/fatcat_tools/importers/pubmed.py 4
8 files changed, 41 insertions, 12 deletions
diff --git a/python/Pipfile b/python/Pipfile
index 01c1eb3d..1a19a145 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -8,7 +8,7 @@ verify_ssl = true
name = "pypi"
[dev-packages]
-pytest = ">=4,<5.0.0"
+pytest = ">=5,<6.0.0"
pytest-pythonpath = "*"
pytest-pylint = "*"
ipython = "<7.0.0"
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 35125b67..a4408cdd 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -1,7 +1,11 @@
{
"_meta": {
"hash": {
+<<<<<<< HEAD
+ "sha256": "03fc6c65c7bcbf96a5ef90afba8b6a0264a248a67b31ed339f399470b5f3d5fc"
+=======
"sha256": "fb9c3d2307483efe01d9c28a306bad319c84a94a4253d5c7c25bcfe2dad20c5d"
+>>>>>>> martin-datacite-import
},
"pipfile-spec": 6,
"requires": {
@@ -298,6 +302,8 @@
],
"version": "==2.5.0"
},
+<<<<<<< HEAD
+=======
"langdetect": {
"hashes": [
"sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30"
@@ -305,6 +311,7 @@
"index": "pypi",
"version": "==1.0.7"
},
+>>>>>>> martin-datacite-import
"loginpass": {
"hashes": [
"sha256:717c87c1870a7e00547fd9d989aea9b22232b2f48826f552d79c34a47f9618c9",
@@ -617,7 +624,12 @@
},
"wcwidth": {
"hashes": [
+<<<<<<< HEAD
+ "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603",
+ "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8"
+=======
"sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603"
+>>>>>>> martin-datacite-import
],
"version": "==0.1.8"
},
@@ -645,13 +657,6 @@
],
"version": "==2.3.3"
},
- "atomicwrites": {
- "hashes": [
- "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
- "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
- ],
- "version": "==1.3.0"
- },
"attrs": {
"hashes": [
"sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c",
@@ -805,7 +810,6 @@
"sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d",
"sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564"
],
- "markers": "python_version > '2.7'",
"version": "==8.0.2"
},
"packaging": {
@@ -923,11 +927,19 @@
},
"pytest": {
"hashes": [
+<<<<<<< HEAD
+ "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa",
+ "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4"
+ ],
+ "index": "pypi",
+ "version": "==5.3.2"
+=======
"sha256:6192875be8af57b694b7c4904e909680102befcb99e610ef3d9f786952f795aa",
"sha256:f8447ebf8fd3d362868a5d3f43a9df786dfdfe9608843bd9002a2d47a104808f"
],
"index": "pypi",
"version": "==4.6.8"
+>>>>>>> martin-datacite-import
},
"pytest-cov": {
"hashes": [
@@ -1032,7 +1044,12 @@
},
"wcwidth": {
"hashes": [
+<<<<<<< HEAD
+ "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603",
+ "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8"
+=======
"sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603"
+>>>>>>> martin-datacite-import
],
"version": "==0.1.8"
},
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index ea7e12f2..fb8830ca 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -61,7 +61,8 @@ def run_journal_metadata(args):
def run_chocula(args):
fii = ChoculaImporter(args.api,
- edit_batch_size=args.batch_size)
+ edit_batch_size=args.batch_size,
+ do_updates=args.do_updates)
JsonLinePusher(fii, args.json_file).run()
def run_matched(args):
@@ -315,6 +316,9 @@ def main():
sub_chocula.add_argument('json_file',
help="chocula JSON entities file (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
+ sub_chocula.add_argument('--do-updates',
+ action='store_true',
+ help="update pre-existing container entities")
sub_matched = subparsers.add_parser('matched',
help="add file entities matched against existing releases; custom JSON format")
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index acfc2b87..c71b33e9 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -47,6 +47,7 @@ class ArabesqueMatchImporter(EntityImporter):
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter')
if kwargs.get('crawl_id'):
eg_extra['crawl_id'] = kwargs.get('crawl_id')
+ kwargs['do_updates'] = kwargs.get("do_updates", False)
super().__init__(api,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
@@ -56,7 +57,6 @@ class ArabesqueMatchImporter(EntityImporter):
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
self.default_mimetype = kwargs.get("default_mimetype", None)
- self.do_updates = kwargs.get("do_updates", False)
self.require_grobid = require_grobid
if self.require_grobid:
print("Requiring GROBID status == 200")
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index eea50314..375b6051 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -109,6 +109,9 @@ class ChoculaImporter(EntityImporter):
# decide whether to update
do_update = False
+ if not self.do_updates:
+ self.counts['exists'] += 1
+ return False
if not existing.extra:
existing.extra = dict()
if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index be5db8d8..8d103372 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -287,6 +287,7 @@ class EntityImporter:
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
self.api = api
+ self.do_updates = bool(kwargs.get('do_updates', True))
self.bezerk_mode = kwargs.get('bezerk_mode', False)
self.submit_mode = kwargs.get('submit_mode', False)
self.edit_batch_size = kwargs.get('edit_batch_size', 100)
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 33c40eff..16643eb5 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -14,13 +14,13 @@ class IngestFileResultImporter(EntityImporter):
eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool"
eg_extra = kwargs.pop('editgroup_extra', dict())
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
+ kwargs['do_updates'] = kwargs.get("do_updates", False)
super().__init__(api,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
**kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
- self.do_updates = kwargs.get("do_updates", False)
self.require_grobid = require_grobid
if self.require_grobid:
print("Requiring GROBID status == 200")
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3611a299..c32ce34a 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -715,6 +715,10 @@ class PubmedImporter(EntityImporter):
re.ext_ids.doi = None
re.work_id = existing.work_id
+ if existing and not self.do_updates:
+ self.counts['exists'] += 1
+ return False
+
if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
# TODO: any other reasons to do an update?
# don't update if it already has PMID