-rw-r--r--  extra/sql_dumps/ia_exports_item_readme.md  |  2
-rw-r--r--  fatcat-openapi2.yml  |  2
-rw-r--r--  notes/bulk_edits/2020-03-19_arxiv_pubmed.md  |  57
-rwxr-xr-x  python/fatcat_harvest.py  |  10
-rwxr-xr-x  python/fatcat_import.py  |  34
-rwxr-xr-x  python/fatcat_ingest.py  |  5
-rw-r--r--  python/fatcat_tools/harvest/__init__.py  |  5
-rw-r--r--  python/fatcat_tools/harvest/oaipmh.py  |  19
-rw-r--r--  python/fatcat_tools/harvest/pubmed.py  |  249
-rw-r--r--  python/fatcat_tools/importers/__init__.py  |  2
-rw-r--r--  python/fatcat_tools/importers/common.py  |  65
-rw-r--r--  python/fatcat_tools/importers/crossref.py  |  8
-rw-r--r--  python/fatcat_tools/importers/datacite.py  |  10
-rw-r--r--  python/fatcat_tools/importers/jalc.py  |  12
-rw-r--r--  python/fatcat_tools/importers/pubmed.py  |  27
-rw-r--r--  python/fatcat_tools/transforms/csl.py  |  18
-rw-r--r--  python/fatcat_tools/workers/changelog.py  |  3
-rw-r--r--  python/fatcat_web/entity_helpers.py  |  21
-rw-r--r--  python/fatcat_web/templates/container_lookup.html  |  7
-rw-r--r--  python/fatcat_web/templates/home.html  |  6
-rw-r--r--  python/fatcat_web/templates/release_view.html  |  3
-rw-r--r--  python/tests/files/pubmed_19129924.xml  |  206
-rw-r--r--  python/tests/files/pubmedsample_2019.xml.gz  |  bin 0 -> 218528 bytes
-rw-r--r--  python/tests/files/pubmedsample_no_pmid_2019.xml.gz  |  bin 0 -> 1128 bytes
-rw-r--r--  python/tests/harvest_pubmed.py  |  80
-rw-r--r--  python/tests/import_pubmed.py  |  12
-rw-r--r--  python/tests/transform_csl.py  |  20
-rw-r--r--  python/tests/web_citation_csl.py  |  46
-rw-r--r--  python/tests/web_entity_views.py  |  2
29 files changed, 849 insertions, 82 deletions
diff --git a/extra/sql_dumps/ia_exports_item_readme.md b/extra/sql_dumps/ia_exports_item_readme.md
index b61e82f3..19d15d13 100644
--- a/extra/sql_dumps/ia_exports_item_readme.md
+++ b/extra/sql_dumps/ia_exports_item_readme.md
@@ -1,5 +1,5 @@
-This item contains a complete PostgreSQL SQL database snapshot from https://fatcat.wiki, in binary 'pg_dump tar mode' format.
+This item contains bulk entity exports (JSON schema) from https://fatcat.wiki
With the exception of the 'abstracts' table (for which no aggregate license or copyright claims can be made; downstream users are responsible for their use), all metadata here is licensed CC-0 (public domain release) and may be used for any purpose. Downstream users are strongly encouraged to provide attribution and link here to the snapshot, as well as give credit to upstream sources (including Crossref, ORCID, DOAJ, the ISSN ROAD database, etc).
diff --git a/fatcat-openapi2.yml b/fatcat-openapi2.yml
index 47b9bc3b..88b5f5fb 100644
--- a/fatcat-openapi2.yml
+++ b/fatcat-openapi2.yml
@@ -50,7 +50,7 @@ info:
### Other Nitty Gritties
Cross-origin requests are allowed for the API service, to enable third
- parties to bulid in-browser applications.
+ parties to build in-browser applications.
A metadata search service is available at <https://search.fatcat.wiki> (and
<https://search.qa.fatcat.wiki>). The API is currently the raw
diff --git a/notes/bulk_edits/2020-03-19_arxiv_pubmed.md b/notes/bulk_edits/2020-03-19_arxiv_pubmed.md
new file mode 100644
index 00000000..b2fd29d5
--- /dev/null
+++ b/notes/bulk_edits/2020-03-19_arxiv_pubmed.md
@@ -0,0 +1,57 @@
+
+On 2020-03-20, automated daily harvesting and importing of arxiv and pubmed
+metadata started. In the case of pubmed, updates are enabled, so that recently
+created DOI releases get updated with PMID and extra metadata.
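+
+For reference, a sketch of the corresponding continuous invocations (flags and
+paths here are illustrative, not copied from the deployed configuration):
+
+    # harvest daily update files from FTP, one XML document per Kafka message
+    ./fatcat_harvest.py --continuous pubmed
+
+    # import from the ftp-pubmed Kafka topic, with updates enabled
+    export FATCAT_AUTH_WORKER_PUBMED=...
+    ./fatcat_import.py pubmed --kafka-mode --do-updates - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt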
+
+We also want to do a final backfill of metadata, covering the span from the last
+import up through the first day handled by the continuous harvester.
+
+
+## arxiv
+
+The previous date span was 2019-05-22 through 2019-12-20. This time we should
+do 2019-12-20 through today.
+
+First, do a metha update from the last harvest through today, then grab the new daily files:
+
+ metha-sync -format arXivRaw http://export.arxiv.org/oai2
+
+ mkdir arxiv_20191220_20200319
+ cp 2019-12-2* 2019-12-3* 2020-* arxiv_20191220_20200319/
+ tar cf arxiv_20191220_20200319.tar arxiv_20191220_20200319/
+ gzip arxiv_20191220_20200319.tar
+
+Then copy to fatcat server and run import:
+
+ export FATCAT_AUTH_WORKER_ARXIV=...
+
+ ./fatcat_import.py --batch-size 100 arxiv /srv/fatcat/datasets/arxiv_20191220_20200319/2019-12-31-00000000.xml
+ => Counter({'exists': 1824, 'total': 1001, 'insert': 579, 'skip': 1, 'update': 0})
+
+ fd .xml /srv/fatcat/datasets/arxiv_20191220_20200319/ | parallel -j15 ./fatcat_import.py --batch-size 100 arxiv {}
+
+Ran fairly quickly; only some ~80-90k entities to process.
+
+## PubMed
+
+First, mirror update files from FTP, e.g. via lftp:
+
+ mkdir -p /srv/fatcat/datasets/pubmed_updates
+ lftp -e 'mirror -c /pubmed/updatefiles /srv/fatcat/datasets/pubmed_updates; bye' ftp://ftp.ncbi.nlm.nih.gov
+
+Inspect completed dates from kafka:
+
+ kafkacat -b $KAFKA_BROKER -t fatcat-prod.ftp-pubmed-state -C
+
+Show dates and corresponding files:
+
+ find /srv/fatcat/datasets/pubmed_updates -name "*html" | xargs cat | grep "Created" | sort
+
+For this bulk import, we used files pubmed20n1016.xml.gz (2019-12-16) up to pubmed20n1110.xml.gz (2020-03-06).
+
+To import the corresponding files, run:
+
+ printf "%s\n" /srv/fatcat/datasets/pubmed_updates/pubmed20n{1016..1110}.xml.gz | shuf | \
+ parallel -j16 'gunzip -c {} | ./fatcat_import.py pubmed --do-updates - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt'
+
+The import took 254 minutes; there were 1,715,427 PubmedArticle docs in these update files.
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index efd78685..a45b44f8 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -5,8 +5,8 @@ import argparse
import datetime
import raven
from fatcat_tools.harvest import HarvestCrossrefWorker, HarvestDataciteWorker,\
- HarvestArxivWorker, HarvestPubmedWorker, HarvestDoajArticleWorker,\
- HarvestDoajJournalWorker
+ HarvestArxivWorker, HarvestDoajArticleWorker, HarvestDoajJournalWorker,\
+ PubmedFTPWorker
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()
@@ -42,10 +42,10 @@ def run_arxiv(args):
worker.run(continuous=args.continuous)
def run_pubmed(args):
- worker = HarvestPubmedWorker(
+ worker = PubmedFTPWorker(
kafka_hosts=args.kafka_hosts,
- produce_topic="fatcat-{}.oaipmh-pubmed".format(args.env),
- state_topic="fatcat-{}.oaipmh-pubmed-state".format(args.env),
+ produce_topic="fatcat-{}.ftp-pubmed".format(args.env),
+ state_topic="fatcat-{}.ftp-pubmed-state".format(args.env),
start_date=args.start_date,
end_date=args.end_date)
worker.run(continuous=args.continuous)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index c70cb426..331cf791 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -39,15 +39,16 @@ def run_arxiv(args):
ari = ArxivRawImporter(args.api,
edit_batch_size=args.batch_size)
if args.kafka_mode:
- raise NotImplementedError
- #KafkaBs4XmlPusher(
- # ari,
- # args.kafka_hosts,
- # args.kafka_env,
- # "api-arxiv",
- # "fatcat-{}-import-arxiv".format(args.kafka_env),
- #).run()
+ KafkaBs4XmlPusher(
+ ari,
+ args.kafka_hosts,
+ args.kafka_env,
+ "oaipmh-arxiv",
+ "fatcat-{}-import-arxiv".format(args.kafka_env),
+ ).run()
else:
+ if args.xml_file == sys.stdin:
+ print('note: reading from stdin', file=sys.stderr)
Bs4XmlFilePusher(ari, args.xml_file, "record").run()
def run_pubmed(args):
@@ -57,14 +58,13 @@ def run_pubmed(args):
do_updates=args.do_updates,
lookup_refs=(not args.no_lookup_refs))
if args.kafka_mode:
- raise NotImplementedError
- #KafkaBs4XmlPusher(
- # pi,
- # args.kafka_hosts,
- # args.kafka_env,
- # "api-pubmed",
- # "fatcat-{}import-arxiv".format(args.kafka_env),
- #).run()
+ KafkaBs4XmlPusher(
+ pi,
+ args.kafka_hosts,
+ args.kafka_env,
+ "ftp-pubmed",
+ "fatcat-{}-import-pubmed".format(args.kafka_env),
+ ).run()
else:
Bs4XmlLargeFilePusher(
pi,
@@ -302,6 +302,7 @@ def main():
auth_var="FATCAT_AUTH_WORKER_ARXIV",
)
sub_arxiv.add_argument('xml_file',
+ nargs='?',
help="arXivRaw XML file to import from",
default=sys.stdin, type=argparse.FileType('r'))
sub_arxiv.add_argument('--kafka-mode',
@@ -315,6 +316,7 @@ def main():
auth_var="FATCAT_AUTH_WORKER_PUBMED",
)
sub_pubmed.add_argument('xml_file',
+ nargs='?',
help="Pubmed XML file to import from",
default=sys.stdin, type=argparse.FileType('r'))
sub_pubmed.add_argument('issn_map_file',
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 6c3c8859..c6f27ad3 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -90,6 +90,8 @@ def _run_search_dump(args, search):
)
if not ingest_request:
continue
+ if args.force_recrawl:
+ ingest_request['force_recrawl'] = True
counts['ingest_request'] += 1
if args.dry_run:
continue
@@ -206,6 +208,9 @@ def main():
parser.add_argument('--allow-non-oa',
action='store_true',
help="By default, we limit to OA releases. This removes that filter")
+ parser.add_argument('--force-recrawl',
+ action='store_true',
+ help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl")
subparsers = parser.add_subparsers()
sub_container = subparsers.add_parser('container',
diff --git a/python/fatcat_tools/harvest/__init__.py b/python/fatcat_tools/harvest/__init__.py
index 7d814696..b3757a7d 100644
--- a/python/fatcat_tools/harvest/__init__.py
+++ b/python/fatcat_tools/harvest/__init__.py
@@ -1,5 +1,6 @@
from .harvest_common import HarvestState
from .doi_registrars import HarvestCrossrefWorker, HarvestDataciteWorker
-from .oaipmh import HarvestArxivWorker, HarvestPubmedWorker,\
- HarvestDoajArticleWorker, HarvestDoajJournalWorker
+from .oaipmh import HarvestArxivWorker, HarvestDoajArticleWorker, \
+ HarvestDoajJournalWorker
+from .pubmed import PubmedFTPWorker
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
index 11b5fa0a..c95f3445 100644
--- a/python/fatcat_tools/harvest/oaipmh.py
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -132,25 +132,6 @@ class HarvestArxivWorker(HarvestOaiPmhWorker):
self.name = "arxiv"
-class HarvestPubmedWorker(HarvestOaiPmhWorker):
- """
- Will likely be doing MEDLINE daily batch imports for primary metadata, but
- might also want to run a PMC importer to update fulltext and assign OA
- licenses (when appropriate).
-
- Pubmed refs:
- - https://www.ncbi.nlm.nih.gov/pmc/tools/oai/
- - https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm
- - https://github.com/titipata/pubmed_parser
- """
-
- def __init__(self, **kwargs):
- super().__init__(**kwargs)
- self.endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi"
- self.metadata_prefix = "pmc_fm"
- self.name = "pubmed"
-
-
class HarvestDoajJournalWorker(HarvestOaiPmhWorker):
"""
WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
new file mode 100644
index 00000000..3f31696e
--- /dev/null
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -0,0 +1,249 @@
+"""
+Pubmed harvest via FTP.
+
+Assumptions:
+
+* fixed hostname and directory structure
+* XML files are gzip compressed
+* accompanying HTML files contain correct dates
+"""
+
+import collections
+import gzip
+import io
+import os
+import re
+import shutil
+import sys
+import tempfile
+import time
+import xml.etree.ElementTree as ET
+from ftplib import FTP
+from urllib.parse import urljoin, urlparse
+
+import dateparser
+from bs4 import BeautifulSoup
+from confluent_kafka import KafkaException, Producer
+
+from .harvest_common import HarvestState
+
+
+class PubmedFTPWorker:
+ """
+ Access Pubmed FTP for daily updates.
+
+ * Server directory: ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles
+ * Docs: ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt
+
+ Daily Update Files (02/2020)
+ ----------------------------
+ Each day, NLM produces update files that include new, revised and deleted
+ citations. The first Update file to be loaded after loading the complete
+ set of 2019 MEDLINE/PubMed Baseline files is pubmed20n1016.xml.
+
+ Usually, three files per update, e.g.:
+
+ * ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/pubmed20n1016_stats.html
+ * ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/pubmed20n1016.xml.gz
+ * ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/pubmed20n1016.xml.gz.md5
+
+ Currently (02/2020) the HTML contains the date.
+
+ <html>
+ <head><title></title></head>
+ <body>
+ <h4>Filename: pubmed20n1019.xml -- Created: Wed Dec 18 14:31:09 EST 2019</h4>
+ <table cellspacing="0" cellpadding="0" border="0" width="300">
+ <tr>
+
+ """
+ def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None):
+ self.name = 'Pubmed'
+ self.host = 'ftp.ncbi.nlm.nih.gov'
+ self.produce_topic = produce_topic
+ self.state_topic = state_topic
+ self.kafka_config = {
+ 'bootstrap.servers': kafka_hosts,
+ 'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes
+ }
+ self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks
+ self.state = HarvestState(start_date, end_date)
+ self.state.initialize_from_kafka(self.state_topic, self.kafka_config)
+ self.producer = self._kafka_producer()
+ self.date_file_map = None
+
+ def _kafka_producer(self):
+ def fail_fast(err, msg):
+ if err is not None:
+ print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
+ print("Bailing out...", file=sys.stderr)
+ # TODO: should it be sys.exit(-1)?
+ raise KafkaException(err)
+
+ self._kafka_fail_fast = fail_fast
+
+ producer_conf = self.kafka_config.copy()
+ producer_conf.update({
+ 'delivery.report.only.error': True,
+ 'default.topic.config': {
+ 'request.required.acks': -1, # all brokers must confirm
+ },
+ })
+ return Producer(producer_conf)
+
+ def fetch_date(self, date):
+ """
+ Fetch file for a given date and feed Kafka one article per message. If
+ the fetched XML does not contain a PMID, this method will fail.
+
+ If no date file mapping is found, this will fail.
+ """
+ if self.date_file_map is None:
+ raise ValueError("cannot fetch date without date file mapping")
+
+ date_str = date.strftime('%Y-%m-%d')
+ paths = self.date_file_map.get(date_str)
+ if paths is None:
+ print("WARN: no pubmed update for this date: {} (UTC), available dates were: {}".format(date_str, self.date_file_map), file=sys.stderr)
+ return False
+
+ count = 0
+ for path in paths:
+ # Fetch and decompress file.
+ url = "ftp://{}{}".format(self.host, path)
+ filename = ftpretr(url)
+ with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp:
+ gzf = gzip.open(filename)
+ shutil.copyfileobj(gzf, decomp)
+
+ # Here, blob is the unparsed XML; we peek into it to use PMID as
+ # message key. We need streaming, since some updates would consume
+ # GBs otherwise.
+ # WARNING: parsing foreign XML exposes us to some of the vulnerabilities
+ # listed at https://docs.python.org/3/library/xml.html#xml-vulnerabilities
+ for blob in xmlstream(decomp.name, 'PubmedArticle', encoding='utf-8'):
+ soup = BeautifulSoup(blob, 'xml')
+ pmid = soup.find('PMID')
+ if pmid is None:
+ raise ValueError("no PMID found, please adjust identifier extraction")
+ count += 1
+ if count % 50 == 0:
+ print("... up to {}".format(count), file=sys.stderr)
+ self.producer.produce(self.produce_topic, blob, key=pmid.text, on_delivery=self._kafka_fail_fast)
+
+ self.producer.flush()
+ os.remove(filename)
+ os.remove(decomp.name)
+
+ return True
+
+ def run(self, continuous=False):
+ while True:
+ self.date_file_map = generate_date_file_map(host=self.host)
+ if len(self.date_file_map) == 0:
+ raise ValueError("map from dates to files should not be empty, maybe the HTML changed?")
+
+ current = self.state.next(continuous)
+ if current:
+ print("Fetching citations updated on {} (UTC)".format(current), file=sys.stderr)
+ self.fetch_date(current)
+ self.state.complete(current, kafka_topic=self.state_topic, kafka_config=self.kafka_config)
+ continue
+
+ if continuous:
+ print("Sleeping {} seconds...".format(self.loop_sleep))
+ time.sleep(self.loop_sleep)
+ else:
+ break
+ print("{} FTP ingest caught up".format(self.name))
+
+
+def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
+ """
+ Generate a DefaultDict[string, set] mapping dates to absolute filepaths on
+ the server (mostly we have one file, but sometimes more).
+
+ Example: {"2020-01-02": set(["/pubmed/updatefiles/pubmed20n1016.xml.gz"]), ...}
+ """
+ mapping = collections.defaultdict(set)
+ pattern = re.compile(r'Filename: ([^ ]*.xml) -- Created: ([^<]*)')
+ ftp = FTP(host)
+ ftp.login()
+ filenames = ftp.nlst('/pubmed/updatefiles')
+
+ for name in filenames:
+ if not name.endswith('.html'):
+ continue
+ sio = io.StringIO()
+ ftp.retrlines('RETR {}'.format(name), sio.write)
+ contents = sio.getvalue()
+ match = pattern.search(contents)
+ if match is None:
+ print('pattern miss in {} on: {}, may need to adjust pattern: {}'.format(name, contents, pattern), file=sys.stderr)
+ continue
+ filename, filedate = match.groups() # ('pubmed20n1017.xml', 'Tue Dec 17 15:23:32 EST 2019')
+ date = dateparser.parse(filedate)
+ fullpath = '/pubmed/updatefiles/{}.gz'.format(filename)
+ date_str = date.strftime('%Y-%m-%d')
+ mapping[date_str].add(fullpath)
+ print('added entry for {}: {}'.format(date_str, fullpath), file=sys.stderr)
+
+ print('generated date-file mapping for {} dates'.format(len(mapping)), file=sys.stderr)
+ return mapping
+
+
+def ftpretr(url):
+ """
+ Note: This might move into a generic place in the future.
+
+ Fetch (RETR) a remote file given by its URL (e.g.
+ "ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/pubmed20n1016.xml.gz") to a
+ local temporary file. Returns the name of the local, closed temporary file.
+
+ It is the responsibility of the caller to clean up the temporary file.
+ """
+ parsed = urlparse(url)
+ server, path = parsed.netloc, parsed.path
+ ftp = FTP(server)
+ ftp.login()
+ with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
+ print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr)
+ ftp.retrbinary('RETR %s' % path, f.write)
+ ftp.close()
+ return f.name
+
+
+def xmlstream(filename, tag, encoding='utf-8'):
+ """
+ Note: This might move into a generic place in the future.
+
+ Given a path to an XML file and a tag name (without namespace), stream
+ through the XML and yield elements denoted by tag as string.
+
+ for snippet in xmlstream("sample.xml", "sometag"):
+ print(len(snippet))
+
+ Known vulnerabilities: https://docs.python.org/3/library/xml.html#xml-vulnerabilities
+ """
+ def strip_ns(tag):
+ if not '}' in tag:
+ return tag
+ return tag.split('}')[1]
+
+ # https://stackoverflow.com/a/13261805, http://effbot.org/elementtree/iterparse.htm
+ context = iter(ET.iterparse(filename, events=(
+ 'start',
+ 'end',
+ )))
+ try:
+ _, root = next(context)
+ except StopIteration:
+ return
+
+ for event, elem in context:
+ if not strip_ns(elem.tag) == tag or event == 'start':
+ continue
+
+ yield ET.tostring(elem, encoding=encoding)
+ root.clear()
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 10557ef8..c26446fd 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
"""
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, KafkaBs4XmlPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
from .datacite import DataciteImporter
from .jalc import JalcImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 694ef359..da611ecb 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -730,6 +730,71 @@ class Bs4XmlFileListPusher(RecordPusher):
print(counts)
return counts
+class KafkaBs4XmlPusher(RecordPusher):
+ """
+ Fetch XML for an article from Kafka, parse via Bs4.
+ """
+ def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
+ self.importer = importer
+ self.consumer = make_kafka_consumer(
+ kafka_hosts,
+ kafka_env,
+ topic_suffix,
+ group,
+ kafka_namespace=kwargs.get('kafka_namespace', 'fatcat')
+ )
+ self.poll_interval = kwargs.get('poll_interval', 5.0)
+ self.consume_batch_size = kwargs.get('consume_batch_size', 25)
+
+ def run(self):
+ count = 0
+ last_push = datetime.datetime.now()
+ while True:
+ # Note: this is batch-oriented, because underlying importer is
+ # often batch-oriented, but this doesn't confirm that entire batch
+ # has been pushed to fatcat before committing the offset. Eg, consider the
+ # case where there is one update and thousands of creates;
+ # update would be lingering in importer, and if importer crashed
+ # never created.
+ # This is partially mitigated for the worker case by flushing any
+ # outstanding editgroups every 5 minutes, but there is still that
+ # window when editgroups might be hanging (unsubmitted).
+ batch = self.consumer.consume(
+ num_messages=self.consume_batch_size,
+ timeout=self.poll_interval)
+ print("... got {} kafka messages ({}sec poll interval)".format(
+ len(batch), self.poll_interval))
+ if not batch:
+ if datetime.datetime.now() - last_push > datetime.timedelta(minutes=5):
+ # it has been some time, so flush any current editgroup
+ self.importer.finish()
+ last_push = datetime.datetime.now()
+ #print("Flushed any partial import batch: {}".format(self.importer.counts))
+ continue
+ # first check errors on entire batch...
+ for msg in batch:
+ if msg.error():
+ raise KafkaException(msg.error())
+ # ... then process
+ for msg in batch:
+ soup = BeautifulSoup(msg.value().decode('utf-8'), "xml")
+ self.importer.push_record(soup)
+ soup.decompose()
+ count += 1
+ if count % 500 == 0:
+ print("Import counts: {}".format(self.importer.counts))
+ last_push = datetime.datetime.now()
+ for msg in batch:
+ # locally store offsets of processed messages; will be
+ # auto-commited by librdkafka from this "stored" value
+ self.consumer.store_offsets(message=msg)
+
+ # TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
+ # commit the current batch if it has been lingering
+ counts = self.importer.finish()
+ print(counts)
+ self.consumer.close()
+ return counts
class KafkaJsonPusher(RecordPusher):
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 18703a1a..9617299c 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -163,6 +163,14 @@ class CrossrefImporter(EntityImporter):
self.counts['skip-blank-title'] += 1
return False
+ # these are pre-registered DOIs before the actual record is ready
+ # title is a list of titles
+ if obj.get('title')[0].strip().lower() in [
+ "OUP accepted manuscript".lower(),
+ ]:
+ self.counts['skip-stub-title'] += 1
+ return False
+
# do most of these checks in-line below
return True
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 9250fc5e..81f00876 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -222,6 +222,7 @@ class DataciteImporter(EntityImporter):
self.read_issn_map_file(issn_map_file)
self.debug = debug
self.insert_log_file = insert_log_file
+ self.this_year = datetime.datetime.now().year
print('datacite with debug={}'.format(self.debug), file=sys.stderr)
@@ -311,6 +312,12 @@ class DataciteImporter(EntityImporter):
release_date, release_month, release_year = parse_datacite_dates(
attributes.get('dates', []))
+ # block bogus far-future years/dates
+ if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ release_date = None
+ release_month = None
+ release_year = None
+
# Some records do not use the "dates" field (e.g. micropub), but:
# "attributes.published" or "attributes.publicationYear"
if not any((release_date, release_month, release_year)):
@@ -714,7 +721,8 @@ class DataciteImporter(EntityImporter):
name_scheme = nid.get('nameIdentifierScheme', '') or ''
if not name_scheme.lower() == "orcid":
continue
- orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '')
+ orcid = nid.get('nameIdentifier') or ''
+ orcid = orcid.replace('https://orcid.org/', '')
if not orcid:
continue
creator_id = self.lookup_orcid(orcid)
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index a0e0086b..351a20a3 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -209,10 +209,14 @@ class JalcImporter(EntityImporter):
release_year = int(date)
pages = None
- if record.startingPage:
- pages = record.startingPage.string
- if record.endingPage:
- pages = "{}-{}".format(pages, record.endingPage.string)
+ if record.startingPage and record.startingPage.string.strip():
+ pages = record.startingPage.string.strip()
+ if record.endingPage and record.endingPage.string.strip():
+ pages = "{}-{}".format(pages, record.endingPage.string.strip())
+ # double check to prevent "-" as pages
+ if pages and pages.strip() == '-':
+ pages = None
+
volume = None
if record.volume:
volume = record.volume.string
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index c32ce34a..3ecf5ef4 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -616,7 +616,10 @@ class PubmedImporter(EntityImporter):
### References
refs = []
if pubmed.ReferenceList:
- for ref in pubmed.ReferenceList.find_all('Reference'):
+ # note that Reference always exists within a ReferenceList, but
+ # that there may be multiple ReferenceList (eg, sometimes one per
+ # Reference)
+ for ref in pubmed.find_all('Reference'):
ref_extra = dict()
ref_doi = ref.find("ArticleId", IdType="doi")
if ref_doi:
@@ -729,8 +732,29 @@ class PubmedImporter(EntityImporter):
existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
+
+ existing.container_id = existing.container_id or re.container_id
existing.refs = existing.refs or re.refs
+ existing.abstracts = existing.abstracts or re.abstracts
existing.extra['pubmed'] = re.extra['pubmed']
+
+ # fix stub titles
+ if existing.title in [
+ "OUP accepted manuscript",
+ ]:
+ existing.title = re.title
+
+ existing.original_title = existing.original_title or re.original_title
+ existing.release_type = existing.release_type or re.release_type
+ existing.release_stage = existing.release_stage or re.release_stage
+ existing.release_date = existing.release_date or re.release_date
+ existing.release_year = existing.release_year or re.release_year
+ existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
+ existing.volume = existing.volume or re.volume
+ existing.issue = existing.issue or re.issue
+ existing.pages = existing.pages or re.pages
+ existing.language = existing.language or re.language
+
# update subtitle in-place first
if not existing.subtitle and existing.extra.get('subtitle'):
subtitle = existing.extra.pop('subtitle')
@@ -740,6 +764,7 @@ class PubmedImporter(EntityImporter):
existing.subtitle = subtitle
if not existing.subtitle:
existing.subtitle = re.subtitle
+
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
self.counts['update'] += 1
diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py
index 7ab94cac..832ad6aa 100644
--- a/python/fatcat_tools/transforms/csl.py
+++ b/python/fatcat_tools/transforms/csl.py
@@ -37,8 +37,9 @@ def release_to_csl(entity):
# Default to "local" (publication-specific) metadata; fall back to
# creator-level
family = contrib.surname or contrib.creator.surname or (contrib.raw_name and contrib.raw_name.split()[-1])
- if not contrib.raw_name:
- raise ValueError("CSL requires some surname (family name)")
+ if not family:
+ # CSL requires some surname (family name)
+ continue
c = dict(
family=family,
given=contrib.given_name or contrib.creator.given_name,
@@ -49,22 +50,27 @@ def release_to_csl(entity):
#static-ordering
literal=contrib.raw_name or contrib.creator.display_name,
#parse-names,
- role=contrib.role,
+ # role must be defined; default to author
+ role=contrib.role or 'author',
)
else:
family = contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1])
- if not contrib.raw_name:
- raise ValueError("CSL requires some surname (family name)")
+ if not family:
+ # CSL requires some surname (family name)
+ continue
c = dict(
family=family,
given=contrib.given_name,
literal=contrib.raw_name,
- role=contrib.role,
+ # role must be defined; default to author
+ role=contrib.role or 'author',
)
for k in list(c.keys()):
if not c[k]:
c.pop(k)
contribs.append(c)
+ if not contribs:
+ raise ValueError("citeproc requires at least one author with a surname")
abstract = None
if entity.abstracts:
abstract = entity.abstracts[0].content
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index b84d5e70..5783bbfc 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -107,6 +107,9 @@ class EntityUpdatesWorker(FatcatWorker):
"10.1101/",
# researchgate
"10.13140/",
+ # the lancet (often hybrid OA)
+ "10.1016/s0140-6736",
+ "10.1016/s2213-2600",
]
def want_live_ingest(self, release, ingest_request):
diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py
index 520bb832..4d13da43 100644
--- a/python/fatcat_web/entity_helpers.py
+++ b/python/fatcat_web/entity_helpers.py
@@ -1,6 +1,6 @@
from flask import abort
-from fatcat_openapi_client.rest import ApiException
+from fatcat_openapi_client.rest import ApiException, ApiValueError
from fatcat_tools.transforms import *
from fatcat_web import app, api
from fatcat_web.search import get_elastic_container_stats, get_elastic_container_random_releases
@@ -74,8 +74,13 @@ def enrich_release_entity(entity):
ref.extra['unstructured'] = strip_extlink_xml(ref.extra['unstructured'])
# author list to display; ensure it's sorted by index (any authors with
# index=None go to end of list)
- authors = [c for c in entity.contribs if c.role in ('author', None)]
+ authors = [c for c in entity.contribs if
+ c.role in ('author', None) and
+ (c.surname or c.raw_name or (c.creator and c.creator.surname))
+ ]
entity._authors = sorted(authors, key=lambda c: (c.index == None and 99999999) or c.index)
+ # need authors, title for citeproc to work
+ entity._can_citeproc = bool(entity._authors) and bool(entity.title)
if entity.abstracts:
# hack to show plain text instead of latex abstracts
if 'latex' in entity.abstracts[0].mimetype:
@@ -118,6 +123,8 @@ def generic_get_entity(entity_type, ident):
raise NotImplementedError
except ApiException as ae:
abort(ae.status)
+ except ApiValueError:
+ abort(400)
def generic_get_entity_revision(entity_type, revision_id):
try:
@@ -139,6 +146,8 @@ def generic_get_entity_revision(entity_type, revision_id):
raise NotImplementedError
except ApiException as ae:
abort(ae.status)
+ except ApiValueError:
+ abort(400)
def generic_get_editgroup_entity(editgroup, entity_type, ident):
if entity_type == 'container':
@@ -167,6 +176,12 @@ def generic_get_editgroup_entity(editgroup, entity_type, ident):
# couldn't find relevant edit in this editgroup
abort(404)
- entity = generic_get_entity_revision(entity_type, revision_id)
+ try:
+ entity = generic_get_entity_revision(entity_type, revision_id)
+ except ApiException as ae:
+ abort(ae.status)
+ except ApiValueError:
+ abort(400)
+
entity.ident = ident
return entity, edit
diff --git a/python/fatcat_web/templates/container_lookup.html b/python/fatcat_web/templates/container_lookup.html
index e6fb860c..798e5587 100644
--- a/python/fatcat_web/templates/container_lookup.html
+++ b/python/fatcat_web/templates/container_lookup.html
@@ -20,10 +20,9 @@ search for an existing record missing that identifier, or create a new container
entity.
{% if lookup_key == "issnl" %}
<p>You can check if it is a registered ISSN-L by visiting:
-<b><a href="https://portal.issn.org/{{ lookup_value
-}}">https://portal.issn.org/{{ lookup_value }}</a></b>. If this is a valid
-electronic or print ISSN, but not the ISSN-L for the container, you need to use
-the indicated "linking" ISSN.
+<b><a href="https://portal.issn.org/api/search?search[]=MUST=allissnbis={{ lookup_value }}">https://portal.issn.org/api/search?search[]=MUST=allissnbis={{ lookup_value }}</a></b>.
+If this is a valid electronic or print ISSN, but not the ISSN-L for the
+container, you need to use the indicated "linking" ISSN.
{% elif lookup_key == "wikidata_qid" %}
<p>You can check if it is a real Wikidata entity by visiting:
<b><a href="https://www.wikidata.org/wiki/{{ lookup_value }}">https://www.wikidata.org/wiki/{{ lookup_value }}</a></b>
diff --git a/python/fatcat_web/templates/home.html b/python/fatcat_web/templates/home.html
index 0039e3a7..698230d3 100644
--- a/python/fatcat_web/templates/home.html
+++ b/python/fatcat_web/templates/home.html
@@ -35,17 +35,17 @@
<div class="row">
<div class="four wide mobile three wide center aligned column">
<a href="/stats" style="color: black;">
- <h4>96,947,165<br>Papers</h4>
+ <h4>106,283,000<br>Papers</h4>
</a>
</div>
<div class="four wide mobile three wide center aligned column">
<a href="/stats" style="color: black;">
- <h4>18,117,429<br>Fulltext</h4>
+ <h4>23,036,825<br>Fulltext</h4>
</a>
</div>
<div class="four wide mobile three wide center aligned column">
<a href="/stats" style="color: black;">
- <h4>140,085<br>Journals</h4>
+ <h4>148,757<br>Journals</h4>
</a>
</div>
</div>
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 961b4759..d7c4e76e 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -388,8 +388,7 @@ accessible version.
<br>grouping other versions (eg, pre-print) and variants of this release
</div>
-{# this restriction, for CSL-JSON generation, rules out almost everything #}
-{% if release.contribs and release.contribs[0].creator_id %}
+{% if release._can_citeproc %}
<div class="ui segment attached accordion">
<div class="title" style="padding: 0px;">
<i class="dropdown icon"></i><b>Cite This Release</b>
diff --git a/python/tests/files/pubmed_19129924.xml b/python/tests/files/pubmed_19129924.xml
new file mode 100644
index 00000000..a8ff0bcd
--- /dev/null
+++ b/python/tests/files/pubmed_19129924.xml
@@ -0,0 +1,206 @@
+<PubmedArticle>
+ <MedlineCitation Owner="NLM" Status="PubMed-not-MEDLINE">
+ <PMID Version="1">19129924</PMID>
+ <DateCompleted>
+ <Year>2011</Year>
+ <Month>07</Month>
+ <Day>14</Day>
+ </DateCompleted>
+ <DateRevised>
+ <Year>2020</Year>
+ <Month>03</Month>
+ <Day>06</Day>
+ </DateRevised>
+ <Article PubModel="Electronic-eCollection">
+ <Journal>
+ <ISSN IssnType="Electronic">1662-5196</ISSN>
+ <JournalIssue CitedMedium="Internet">
+ <Volume>2</Volume>
+ <PubDate>
+ <Year>2008</Year>
+ </PubDate>
+ </JournalIssue>
+ <Title>Frontiers in neuroinformatics</Title>
+ <ISOAbbreviation>Front Neuroinform</ISOAbbreviation>
+ </Journal>
+ <ArticleTitle>PyMOOSE: Interoperable Scripting in Python for MOOSE.</ArticleTitle>
+ <Pagination>
+ <MedlinePgn>6</MedlinePgn>
+ </Pagination>
+ <ELocationID EIdType="doi" ValidYN="Y">10.3389/neuro.11.006.2008</ELocationID>
+ <Abstract>
+ <AbstractText>Python is emerging as a common scripting language for simulators. This opens up many possibilities for interoperability in the form of analysis, interfaces, and communications between simulators. We report the integration of Python scripting with the Multi-scale Object Oriented Simulation Environment (MOOSE). MOOSE is a general-purpose simulation system for compartmental neuronal models and for models of signaling pathways based on chemical kinetics. We show how the Python-scripting version of MOOSE, PyMOOSE, combines the power of a compiled simulator with the versatility and ease of use of Python. We illustrate this by using Python numerical libraries to analyze MOOSE output online, and by developing a GUI in Python/Qt for a MOOSE simulation. Finally, we build and run a composite neuronal/signaling model that uses both the NEURON and MOOSE numerical engines, and Python as a bridge between the two. Thus PyMOOSE has a high degree of interoperability with analysis routines, with graphical toolkits, and with other simulators.</AbstractText>
+ </Abstract>
+ <AuthorList CompleteYN="Y">
+ <Author ValidYN="Y">
+ <LastName>Ray</LastName>
+ <ForeName>Subhasis</ForeName>
+ <Initials>S</Initials>
+ <AffiliationInfo>
+ <Affiliation>National Centre for Biological Sciences Bangalore, India.</Affiliation>
+ </AffiliationInfo>
+ </Author>
+ <Author ValidYN="Y">
+ <LastName>Bhalla</LastName>
+ <ForeName>Upinder S</ForeName>
+ <Initials>US</Initials>
+ </Author>
+ </AuthorList>
+ <Language>eng</Language>
+ <PublicationTypeList>
+ <PublicationType UI="D016428">Journal Article</PublicationType>
+ </PublicationTypeList>
+ <ArticleDate DateType="Electronic">
+ <Year>2008</Year>
+ <Month>12</Month>
+ <Day>19</Day>
+ </ArticleDate>
+ </Article>
+ <MedlineJournalInfo>
+ <Country>Switzerland</Country>
+ <MedlineTA>Front Neuroinform</MedlineTA>
+ <NlmUniqueID>101477957</NlmUniqueID>
+ <ISSNLinking>1662-5196</ISSNLinking>
+ </MedlineJournalInfo>
+ <KeywordList Owner="NOTNLM">
+ <Keyword MajorTopicYN="N">GENESIS</Keyword>
+ <Keyword MajorTopicYN="N">MOOSE</Keyword>
+ <Keyword MajorTopicYN="N">NEURON</Keyword>
+ <Keyword MajorTopicYN="N">Python</Keyword>
+ <Keyword MajorTopicYN="N">compartmental models</Keyword>
+ <Keyword MajorTopicYN="N">multi-scale models</Keyword>
+ <Keyword MajorTopicYN="N">simulators</Keyword>
+ <Keyword MajorTopicYN="N">systems biology</Keyword>
+ </KeywordList>
+ </MedlineCitation>
+ <PubmedData>
+ <History>
+ <PubMedPubDate PubStatus="received">
+ <Year>2008</Year>
+ <Month>09</Month>
+ <Day>15</Day>
+ </PubMedPubDate>
+ <PubMedPubDate PubStatus="accepted">
+ <Year>2008</Year>
+ <Month>11</Month>
+ <Day>01</Day>
+ </PubMedPubDate>
+ <PubMedPubDate PubStatus="entrez">
+ <Year>2009</Year>
+ <Month>1</Month>
+ <Day>9</Day>
+ <Hour>9</Hour>
+ <Minute>0</Minute>
+ </PubMedPubDate>
+ <PubMedPubDate PubStatus="pubmed">
+ <Year>2009</Year>
+ <Month>1</Month>
+ <Day>9</Day>
+ <Hour>9</Hour>
+ <Minute>0</Minute>
+ </PubMedPubDate>
+ <PubMedPubDate PubStatus="medline">
+ <Year>2009</Year>
+ <Month>1</Month>
+ <Day>9</Day>
+ <Hour>9</Hour>
+ <Minute>1</Minute>
+ </PubMedPubDate>
+ </History>
+ <PublicationStatus>epublish</PublicationStatus>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">19129924</ArticleId>
+ <ArticleId IdType="doi">10.3389/neuro.11.006.2008</ArticleId>
+ <ArticleId IdType="pmc">PMC2614320</ArticleId>
+ </ArticleIdList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Eur J Neurosci. 2004 Nov;20(10):2671-80</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">15548210</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Science. 2002 Aug 9;297(5583):1018-23</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">12169734</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Philos Trans R Soc Lond B Biol Sci. 2001 Aug 29;356(1412):1209-28</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">11545699</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Biol Cybern. 1985;53(1):41-56</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">3841014</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Neuroinformatics. 2007 Summer;5(2):96-104</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">17873371</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Science. 1999 Jan 15;283(5400):381-7</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">9888852</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Neuroinformatics. 2007 Summer;5(2):127-38</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">17873374</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Nat Biotechnol. 2005 Dec;23(12):1509-15</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">16333295</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Biol Cybern. 1985;53(1):27-40</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">3841013</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>J Neurophysiol. 1995 Mar;73(3):1157-68</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">7608762</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ <ReferenceList>
+ <Reference>
+ <Citation>Bioinformatics. 2003 Mar 1;19(4):524-31</Citation>
+ <ArticleIdList>
+ <ArticleId IdType="pubmed">12611808</ArticleId>
+ </ArticleIdList>
+ </Reference>
+ </ReferenceList>
+ </PubmedData>
+ </PubmedArticle>
+
diff --git a/python/tests/files/pubmedsample_2019.xml.gz b/python/tests/files/pubmedsample_2019.xml.gz
new file mode 100644
index 00000000..bafad833
--- /dev/null
+++ b/python/tests/files/pubmedsample_2019.xml.gz
Binary files differ
diff --git a/python/tests/files/pubmedsample_no_pmid_2019.xml.gz b/python/tests/files/pubmedsample_no_pmid_2019.xml.gz
new file mode 100644
index 00000000..8785a06d
--- /dev/null
+++ b/python/tests/files/pubmedsample_no_pmid_2019.xml.gz
Binary files differ
diff --git a/python/tests/harvest_pubmed.py b/python/tests/harvest_pubmed.py
new file mode 100644
index 00000000..f8db46b6
--- /dev/null
+++ b/python/tests/harvest_pubmed.py
@@ -0,0 +1,80 @@
+"""
+Test pubmed FTP harvest.
+"""
+
+import datetime
+import json
+import os
+
+import pytest
+
+from fatcat_tools.harvest import *
+from fatcat_tools.harvest.pubmed import generate_date_file_map
+
+
+def test_pubmed_harvest_date(mocker):
+
+ # mock out the harvest state object so it doesn't try to actually connect
+ # to Kafka
+ mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
+
+ # Mock a file fetched from FTP; it should contain some 'PubmedArticle' elements.
+ # $ zcat tests/files/pubmedsample_2019.xml.gz | grep -c '<PubmedArticle>'
+ # 176
+ file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_2019.xml.gz')
+ ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr')
+ ftpretr.return_value = file_to_retrieve
+
+ test_date = '2020-02-20'
+
+ # We'll need one entry in the date_file_map.
+ generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map')
+ generate_date_file_map.return_value = {test_date: set(['dummy'])}
+
+ # For cleanup.
+ os.remove = mocker.Mock()
+
+ harvester = PubmedFTPWorker(
+ kafka_hosts="dummy",
+ produce_topic="dummy-produce-topic",
+ state_topic="dummy-state-topic",
+ )
+
+ harvester.producer = mocker.Mock()
+ harvester.date_file_map = generate_date_file_map()
+ # Since we mock out the FTP fetch, the concrete date does not matter here.
+ harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d'))
+
+ # check that the expected number of article objects were published to the
+ # (mock) kafka topic
+ assert harvester.producer.produce.call_count == 176
+ assert harvester.producer.flush.call_count == 1
+ assert os.remove.call_count == 2
+
+def test_pubmed_harvest_date_no_pmid(mocker):
+ # mock out the harvest state object so it doesn't try to actually connect
+ # to Kafka
+ mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
+
+ file_to_retrieve = os.path.join(os.path.dirname(__file__), 'files/pubmedsample_no_pmid_2019.xml.gz')
+ ftpretr = mocker.patch('fatcat_tools.harvest.pubmed.ftpretr')
+ ftpretr.return_value = file_to_retrieve
+
+ test_date = '2020-02-20'
+
+ # We'll need one entry in the date_file_map.
+ generate_date_file_map = mocker.patch('fatcat_tools.harvest.pubmed.generate_date_file_map')
+ generate_date_file_map.return_value = {test_date: set(['dummy'])}
+
+ harvester = PubmedFTPWorker(
+ kafka_hosts="dummy",
+ produce_topic="dummy-produce-topic",
+ state_topic="dummy-state-topic",
+ )
+
+ harvester.producer = mocker.Mock()
+
+ # The file has no PMID, so it is not importable.
+ with pytest.raises(ValueError):
+ harvester.fetch_date(datetime.datetime.strptime(test_date, '%Y-%m-%d'))
+
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 49609f75..f57aa273 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -118,6 +118,7 @@ def test_pubmed_xml_parse(pubmed_importer):
assert r2.refs[0].extra['unstructured'] == "Microbiology. 2009 Jun;155(Pt 6):1840-6"
assert r2.refs[0].extra['pmid'] == "19383690"
+ assert len(r2.refs) > 1
def test_pubmed_xml_dates(pubmed_importer):
with open('tests/files/pubmed_31393839.xml', 'r') as f:
@@ -126,3 +127,14 @@ def test_pubmed_xml_dates(pubmed_importer):
assert r1.release_year == 2019
+def test_pubmed_xml_parse_refs(pubmed_importer):
+ """
+ Tests the case of multiple nested ReferenceList/Reference objects, instead
+ of a single ReferenceList with multiple Reference
+ """
+ with open('tests/files/pubmed_19129924.xml', 'r') as f:
+ soup = BeautifulSoup(f, "xml")
+ r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0])
+
+ assert len(r1.refs) > 1
+
diff --git a/python/tests/transform_csl.py b/python/tests/transform_csl.py
index 6f29cba7..15c64ce5 100644
--- a/python/tests/transform_csl.py
+++ b/python/tests/transform_csl.py
@@ -12,22 +12,22 @@ def test_csl_crossref(crossref_importer):
# not a single line
raw = json.loads(f.read())
r = crossref_importer.parse_record(raw)
- # this work has some null contrib names; these should cause errors
- with pytest.raises(ValueError):
- release_to_csl(r)
- with pytest.raises(ValueError):
- csl = release_to_csl(r)
- citeproc_csl(csl, 'csl-json')
- # set with dummy so we can run other tests
- for c in r.contribs:
- if not c.raw_name:
- c.raw_name = "dummy"
csl = release_to_csl(r)
citeproc_csl(csl, 'csl-json')
citeproc_csl(csl, 'bibtex')
citeproc_csl(csl, 'harvard1')
citeproc_csl(csl, 'harvard1', html=True)
+ # check that with no author surnames, can't run
+ for c in r.contribs:
+ c.raw_name = None
+ c.surname = None
+ with pytest.raises(ValueError):
+ release_to_csl(r)
+ with pytest.raises(ValueError):
+ csl = release_to_csl(r)
+ citeproc_csl(csl, 'csl-json')
+
def test_csl_pubmed(crossref_importer):
with open('tests/files/example_releases_pubmed19n0972.json', 'r') as f:
# multiple single lines
diff --git a/python/tests/web_citation_csl.py b/python/tests/web_citation_csl.py
index 3279ebea..e016b2d9 100644
--- a/python/tests/web_citation_csl.py
+++ b/python/tests/web_citation_csl.py
@@ -6,7 +6,7 @@ from fatcat_openapi_client.rest import ApiException
from fixtures import *
-def test_release_bibtex(app):
+def test_release_bibtex(app, api):
# "realistic" demo entity
rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam')
@@ -17,6 +17,8 @@ def test_release_bibtex(app):
assert b'@article{' in rv.data
rv = app.get('/release/ccccccccccccccccccccccccca.bib')
assert rv.status_code == 404
+ rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=bibtex')
+ assert rv.status_code == 200
rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=csl-json')
assert rv.status_code == 200
# could also rv.get_json() here
@@ -25,10 +27,48 @@ def test_release_bibtex(app):
assert rv.status_code == 200
assert rv.data.decode('utf-8').startswith('Ioannidis, John. “Why Most Published Research Findings Are False”. 2.8 (2005)')
- # "dummy" demo entity
+ # "dummy" demo entity; very minimal metadata
rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai')
assert rv.status_code == 200
+ assert b'BibTeX' in rv.data
+ rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai.bib')
+ assert rv.status_code == 200
+ rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/citeproc?style=modern-language-association')
+ assert rv.status_code == 200
+ rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/citeproc?style=csl-json')
+ assert rv.status_code == 200
+
+ # create a release which cannot have citeproc run on it (no authors)
+ eg = quick_eg(api)
+ r1 = ReleaseEntity(
+ title="some title",
+ ext_ids=ReleaseExtIds(),
+ )
+ r1edit = api.create_release(eg.editgroup_id, r1)
+ api.accept_editgroup(eg.editgroup_id)
+
+ rv = app.get('/release/{}'.format(r1edit.ident))
+ assert rv.status_code == 200
assert not b'BibTeX' in rv.data
with pytest.raises(ValueError):
- rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai.bib')
+ rv = app.get('/release/{}.bib'.format(r1edit.ident))
+
+ # create a release which can have citeproc run on it (has an author with a surname)
+ eg = quick_eg(api)
+ r2 = ReleaseEntity(
+ title="some title again",
+ contribs=[
+ ReleaseContrib(
+ given_name="Paul",
+ surname="Otlet"),
+ ],
+ ext_ids=ReleaseExtIds(),
+ )
+ r2edit = api.create_release(eg.editgroup_id, r2)
+ api.accept_editgroup(eg.editgroup_id)
+ rv = app.get('/release/{}'.format(r2edit.ident))
+ assert rv.status_code == 200
+ assert b'BibTeX' in rv.data
+ rv = app.get('/release/{}.bib'.format(r2edit.ident))
+ assert rv.status_code == 200
diff --git a/python/tests/web_entity_views.py b/python/tests/web_entity_views.py
index cc4c498f..a3f0f897 100644
--- a/python/tests/web_entity_views.py
+++ b/python/tests/web_entity_views.py
@@ -42,6 +42,8 @@ def test_entity_basics(app):
assert rv.status_code == 200
rv = app.get('/{}/rev/{}'.format(entity_type, revision))
assert rv.status_code == 200
+ rv = app.get('/{}/rev/{}_something'.format(entity_type, revision))
+ assert rv.status_code == 400
rv = app.get('/{}/rev/{}/metadata'.format(entity_type, revision))
assert rv.status_code == 200
print('/editgroup/aaaaaaaaaaaabo53aaaaaaaaaq/{}/{}'.format(entity_type, ident))