component ingest support for dataverse files (individual)

author: Bryan Newbold <bnewbold@archive.org> 2021-10-06 18:20:24 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit: bff5da3971aa3ad458da048926da5c35252f1fb9 (patch)
tree: 8700cf7b09458bee43fd7870caf006b729d05120
parent: f98f6226097ac34cf8a57ee09a4feea9171addfe (diff)
download: sandcrawler-bff5da3971aa3ad458da048926da5c35252f1fb9.tar.gz
sandcrawler-bff5da3971aa3ad458da048926da5c35252f1fb9.zip
2 files changed, 31 insertions, 13 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 23bf136..93c7269 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -268,6 +268,14 @@ COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "Active figure download link (zookeys)",
         "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
     },
+    {
+        "in_doc_url": "/file.xhtml?persistentId",
+        "in_fulltext_url": "/access/datafile/",
+        "selector": "div.form-group code",
+        "use_body": True,
+        "technique": "Dataverse 'download URL'",
+        "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
+    },
 ]
 
 # This is a database of matching patterns. Most of these discovered by hand,
@@ -667,23 +675,28 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
         elem = doc.css_first(pattern['selector'])
         if not elem:
             continue
+        val = None
         if 'attr' in pattern:
             val = elem.attrs.get(pattern['attr'])
-            if not val:
+        elif pattern.get('use_body'):
+            val = elem.text()
+            if not '://' in val:
                 continue
-            val = urllib.parse.urljoin(doc_url, val)
-            assert val
-            if 'in_fulltext_url' in pattern:
-                if not pattern['in_fulltext_url'] in val:
-                    continue
-            for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
-                if skip_pattern in val.lower():
-                    continue
-            if url_fuzzy_equal(doc_url, val):
-                # don't link to self, unless no other options
-                self_doc_url = (val, pattern.get('technique', 'unknown'))
+        if not val:
+            continue
+        val = urllib.parse.urljoin(doc_url, val)
+        assert val
+        if 'in_fulltext_url' in pattern:
+            if not pattern['in_fulltext_url'] in val:
                 continue
-            return (val, pattern.get('technique', 'unknown'))
+        for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
+            if skip_pattern in val.lower():
+                continue
+        if url_fuzzy_equal(doc_url, val):
+            # don't link to self, unless no other options
+            self_doc_url = (val, pattern.get('technique', 'unknown'))
+            continue
+        return (val, pattern.get('technique', 'unknown'))
     if self_doc_url:
         print(f"  WARN: returning fulltext URL pointing to self", file=sys.stderr)
         return self_doc_url
@@ -694,6 +707,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
     meta: Any = dict()
     head = doc.css_first("head")
     if not head:
+        print(f"WARN: empty <head>? {doc_url}", file=sys.stderr)
         return None
 
     for field, patterns in HEAD_META_PATTERNS.items():
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 305a5d1..ce38e13 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -194,6 +194,10 @@ class IngestFileWorker(SandcrawlerWorker):
             "video/mpeg",
             "text/plain",
             "text/csv",
+            "text/x-r-source",              # dataverse
+            "text/tab-separated-values",    # dataverse
+            "text/x-rst",                   # dataverse
+            "application/x-rlang-transport", # dataverse
             "application/json",
             "application/xml",
             "application/pdf",
author	Bryan Newbold <bnewbold@archive.org>	2021-10-06 18:20:24 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-15 18:15:29 -0700
commit	bff5da3971aa3ad458da048926da5c35252f1fb9 (patch)
tree	8700cf7b09458bee43fd7870caf006b729d05120
parent	f98f6226097ac34cf8a57ee09a4feea9171addfe (diff)
download	sandcrawler-bff5da3971aa3ad458da048926da5c35252f1fb9.tar.gz sandcrawler-bff5da3971aa3ad458da048926da5c35252f1fb9.zip