aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-01-03 13:46:05 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-01-03 13:46:05 +0100
commit 391565cbbc0ba17ffd8c4f5d88d4dfda8a8b323c (patch)
tree a6f8c2c5dd3244e2bfc8a6035f5c351770ea9975
parent 61f0bbfbfdaf41be799fa41c88077806ef913188 (diff)
download fatcat-391565cbbc0ba17ffd8c4f5d88d4dfda8a8b323c.tar.gz
fatcat-391565cbbc0ba17ffd8c4f5d88d4dfda8a8b323c.zip
datacite: remove --lang-detect flag
Estimated time for a single language detection call is on the order of 50 ms.
-rwxr-xr-x python/fatcat_import.py 4
-rw-r--r-- python/fatcat_tools/importers/datacite.py 17
-rw-r--r-- python/tests/files/datacite/datacite_result_04.json 5
-rw-r--r-- python/tests/files/datacite/datacite_result_05.json 5
-rw-r--r-- python/tests/files/datacite/datacite_result_07.json 5
-rw-r--r-- python/tests/files/datacite/datacite_result_08.json 5
-rw-r--r-- python/tests/files/datacite/datacite_result_14.json 5
7 files changed, 21 insertions, 25 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index a17029cc..6b04d547 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -172,7 +172,6 @@ def run_datacite(args):
edit_batch_size=args.batch_size,
bezerk_mode=args.bezerk_mode,
debug=args.debug,
- lang_detect=args.lang_detect,
extid_map_file=args.extid_map_file,
insert_log_file=args.insert_log_file)
if args.kafka_mode:
@@ -474,9 +473,6 @@ def main():
sub_datacite.add_argument('--debug',
action='store_true',
help="write converted JSON to stdout")
- sub_datacite.add_argument('--lang-detect',
- action='store_true',
- help="try to detect language (slow)")
sub_datacite.add_argument('--insert-log-file',
default='',
type=str,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index bd135569..8034a5c1 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -196,7 +196,6 @@ class DataciteImporter(EntityImporter):
api,
issn_map_file,
debug=False,
- lang_detect=False,
insert_log_file=None,
**kwargs):
@@ -225,12 +224,9 @@ class DataciteImporter(EntityImporter):
self.read_issn_map_file(issn_map_file)
self.debug = debug
- self.lang_detect = lang_detect
self.insert_log_file = insert_log_file
- print('datacite with debug={}, lang_detect={}'.format(
- self.debug, self.lang_detect),
- file=sys.stderr)
+ print('datacite with debug={}'.format(self.debug), file=sys.stderr)
def lookup_ext_ids(self, doi):
"""
@@ -537,12 +533,11 @@ class DataciteImporter(EntityImporter):
if len(text) > MAX_ABSTRACT_LENGTH:
text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
lang = None
- if self.lang_detect:
- try:
- lang = langdetect.detect(text)
- except langdetect.lang_detect_exception.LangDetectException as err:
- print('[{}] language detection failed: {}'.format(doi, err),
- file=sys.stderr)
+ try:
+ lang = langdetect.detect(text)
+ except langdetect.lang_detect_exception.LangDetectException as err:
+ print('[{}] language detection failed: {}'.format(doi, err),
+ file=sys.stderr)
abstracts.append(
fatcat_openapi_client.ReleaseAbstract(
mimetype="text/plain",
diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json
index 54b19ef9..94fa1f94 100644
--- a/python/tests/files/datacite/datacite_result_04.json
+++ b/python/tests/files/datacite/datacite_result_04.json
@@ -22,7 +22,8 @@
"abstracts": [
{
"content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps. In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X\u02d9 \u03b5. |KA)| can be embedded in a complex I\u02d9 \u03b5. |K(I)| in such a way that I\u02d9 has the same cohomology as X\u02d9. In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. there exists a functor J : K(A) \u2192 K(I) and a natural transformation [formula omitted] (where E : K(I) \u2192 K(A) is the embedding functor) such that [formula omitted] is injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open. We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A). In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted] In Chapter II we study the natural homomorphism [formula omitted] where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology. In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.",
- "mimetype": "text/plain"
+ "mimetype": "text/plain",
+ "lang": "en"
}
]
-} \ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index a790c26e..ff998c0f 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -524,7 +524,8 @@
"abstracts": [
{
"content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (&lt;0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.",
- "mimetype": "text/plain"
+ "mimetype": "text/plain",
+ "lang": "en"
}
]
-} \ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json
index f572263c..f694ddef 100644
--- a/python/tests/files/datacite/datacite_result_07.json
+++ b/python/tests/files/datacite/datacite_result_07.json
@@ -67,7 +67,8 @@
"abstracts": [
{
"content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.",
- "mimetype": "text/plain"
+ "mimetype": "text/plain",
+ "lang": "en"
}
]
-} \ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index 581ca1eb..cc0e968b 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -47,7 +47,8 @@
"abstracts": [
{
"content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan\u2019s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.",
- "mimetype": "text/plain"
+ "mimetype": "text/plain",
+ "lang": "en"
}
]
-} \ No newline at end of file
+}
diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json
index 94ad000a..4521f891 100644
--- a/python/tests/files/datacite/datacite_result_14.json
+++ b/python/tests/files/datacite/datacite_result_14.json
@@ -104,7 +104,8 @@
"abstracts": [
{
"content": "An entry from the Cambridge Structural Database, the world\u2019s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.",
- "mimetype": "text/plain"
+ "mimetype": "text/plain",
+ "lang": "en"
}
]
-} \ No newline at end of file
+}