author     Bryan Newbold <bnewbold@archive.org>  2021-10-21 19:59:56 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2021-10-21 19:59:58 -0700
commit     c82dbcaaa89d99cbe482eeb2d8ffbce28201fd14 (patch)
tree       019481f865b283833bbbf9cddb97af9b9717b9bb
parent     45deea74f80d1e8deed6076f2a93d711d16a3a83 (diff)
add examples to README, and test those examples in CI
These tests don't run as part of 'make test' by default because they do live fetches against the internet.
-rw-r--r--  .gitlab-ci.yml |   2
-rw-r--r--  Makefile       |  13
-rw-r--r--  Pipfile        |   2
-rw-r--r--  README.md      | 111
4 files changed, 120 insertions(+), 8 deletions(-)
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5ea0e58..9217767 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -13,5 +13,5 @@ variables:
test_python:
script:
- make dep
- - make test
+ - make test-readme
- make lint
diff --git a/Makefile b/Makefile
index 1eb93a5..7d5229a 100644
--- a/Makefile
+++ b/Makefile
@@ -20,14 +20,17 @@ lint: ## Run lints (eg, flake8, mypy)
.PHONY: fmt
fmt: ## Run code formating on all source code
- pipenv run isort --atomic grobid_tei_xml/*
- pipenv run yapf -p -i -r grobid_tei_xml/*
- pipenv run yapf -p -i -r tests
+ pipenv run isort --atomic grobid_tei_xml/*.py
+ pipenv run yapf -p -i -r grobid_tei_xml tests
.PHONY: test
test: ## Run all tests and lints
- pipenv run python -m pytest
+ pipenv run python -m pytest -vv
+
+.PHONY: test-readme
+test-readme: ## Test codeblocks in the README (includes live web requests)
+ pipenv run python -m pytest --codeblocks
.PHONY: coverage
coverage: ## Run all tests with coverage
- pipenv run pytest --cov --cov-report=term --cov-report=html
+ pipenv run python -m pytest --cov --cov-report=term --cov-report=html
diff --git a/Pipfile b/Pipfile
index cf666f7..3fd334e 100644
--- a/Pipfile
+++ b/Pipfile
@@ -14,9 +14,11 @@ flake8-annotations = "*"
mypy = "*"
pytest = "*"
pytest-cov = "*"
+pytest-codeblocks = "*"
typing_extensions = "*"
yapf = "*"
isort = "*"
+requests = "*"
[requires]
python_version = "3.8"
diff --git a/README.md b/README.md
index ca91cfe..0700757 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,111 @@
-grobid-tei-xml: Python parser and transforms for GROBID-flavor TEI-XML
-======================================================================
+`grobid_tei_xml`: Python parser and transforms for GROBID-flavor TEI-XML
+========================================================================
+This is a simple Python library for parsing the TEI-XML structured documents
+returned by [GROBID](https://github.com/kermitt2/grobid), a machine learning
+tool for extracting text and bibliographic metadata from research article PDFs.
+
+TEI-XML is a standard format, and there are other libraries to parse entire
+documents and work with annotated text. This library is focused specifically on
+extracting "header" metadata from a document (eg, title, authors, journal name,
+volume, issue), content in flattened text form (full abstract and body text as
+single strings, for things like search indexing), and structured citation
+metadata.
+
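+To give a flavor of the parsed object, here is a minimal sketch using the small
+test file shipped in this repository; it only touches fields that also appear
+in the fuller examples below:
+
+```python
+import grobid_tei_xml
+
+with open("./tests/files/small.xml", "r") as xml_file:
+    doc = grobid_tei_xml.parse_document_xml(xml_file.read())
+
+# structured "header" (bibliographic) metadata
+print(doc.header.title)
+print([a.name for a in doc.header.authors])
+
+# flattened text content, as a single string
+print(doc.abstract)
+
+# structured citation metadata
+print(len(doc.citations))
+```
+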
+`grobid_tei_xml` works with Python 3, using only the standard library. It does
+not talk to the GROBID HTTP API or read files off disk on its own, but see
+examples below.
+
+In the near future, it should be possible to install `grobid_tei_xml` from
+[pypi.org](https://pypi.org) using `pip`.
+
+
+## Usage Examples
+
+Read an XML file from disk, parse it, and print to stdout as JSON:
+
+```python
+import json
+import grobid_tei_xml
+
+xml_path = "./tests/files/small.xml"
+
+with open(xml_path, 'r') as xml_file:
+    doc = grobid_tei_xml.parse_document_xml(xml_file.read())
+
+print(json.dumps(doc.to_dict(), indent=2))
+```
+
+Use `requests` to download a PDF from the web, submit to GROBID (via HTTP API),
+parse the TEI-XML response with `grobid_tei_xml`, and print some metadata
+fields:
+
+```python
+import requests
+import grobid_tei_xml
+
+pdf_resp = requests.get("https://arxiv.org/pdf/1802.01168v3")
+pdf_resp.raise_for_status()
+
+grobid_resp = requests.post(
+    "https://cloud.science-miner.com/grobid/api/processFulltextDocument",
+    files={
+        'input': pdf_resp.content,
+        'consolidateCitations': 0,
+        'includeRawCitations': 1,
+    },
+    timeout=60.0,
+)
+grobid_resp.raise_for_status()
+
+doc = grobid_tei_xml.parse_document_xml(grobid_resp.text)
+
+print("title: " + doc.header.title)
+print("authors: " + ", ".join([a.name for a in doc.header.authors]))
+print("doi: " + str(doc.header.doi))
+print("citation count: " + str(len(doc.citations)))
+print("abstract: " + doc.abstract)
+```
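+
+The TEI-XML response itself can also be saved so the PDF does not need to be
+re-processed by GROBID later. A small sketch, continuing the example above
+(`grobid_resp` and the `grobid_tei_xml` import come from that example; the
+output filename is arbitrary):
+
+```python
+# persist the TEI-XML response for later re-use
+with open("document.tei.xml", "w") as f:
+    f.write(grobid_resp.text)
+
+# later: re-parse from disk without contacting GROBID again
+with open("document.tei.xml", "r") as f:
+    doc = grobid_tei_xml.parse_document_xml(f.read())
+```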
+
+Use `requests` to submit a "raw" citation string to GROBID for extraction,
+parse the response with `grobid_tei_xml`, and print the structured output to
+stdout:
+
+```python
+import requests
+import grobid_tei_xml
+
+raw_citation = "Kvenvolden K.A. and Field M.E. 1981. Thermogenic hydrocarbons in unconsolidated sediment of Eel River Basin, offshore northern California. AAPG Bulletin 65:1642-1646"
+
+grobid_resp = requests.post(
+    "https://cloud.science-miner.com/grobid/api/processCitation",
+    data={
+        'citations': raw_citation,
+        'consolidateCitations': 0,
+    },
+    timeout=10.0,
+)
+grobid_resp.raise_for_status()
+
+citation = grobid_tei_xml.parse_citations_xml(grobid_resp.text)[0]
+print(citation)
+```
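+
+For dictionary/JSON output here as well, the sketch below assumes the parsed
+citation object provides the same `to_dict()` helper used on whole documents in
+the first example; if not, the `print(citation)` call above already shows the
+parsed fields:
+
+```python
+import json
+
+# `citation` comes from the example above; `to_dict()` is assumed to mirror
+# the document-level method shown earlier
+print(json.dumps(citation.to_dict(), indent=2))
+```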
+
+## See Also
+
+[`grobid_client_python`](https://github.com/kermitt2/grobid_client_python):
+Python client and CLI tool for making requests to GROBID via HTTP API. Returns
+TEI-XML; could be used with this library (`grobid_tei_xml`) for parsing into
+Python objects or, eg, JSON.
+
+[GROBID Documentation](https://grobid.readthedocs.io/en/latest/)
+
+[delb](https://github.com/funkyfuture/delb): a more flexible/powerful interface
+to TEI-XML documents. It would be a better tool for working with structured
+text (body, abstract, etc).
+
+["Parsing TEI XML documents with
+Python"](https://komax.github.io/blog/text/python/xml/parsing_tei_xml_python/)
+(2019): blog post about basic parsing of GROBID TEI-XML files into Pandas
+DataFrames