aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-16 18:40:26 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-16 18:40:26 -0700
commitf1f7842dd2ed110e9958f56d79ec504ae5d2bcd6 (patch)
treef9404c524531c60ce96859ac851842d0b940783f /fatcat_scholar
parentf5cb3637ccc0914a027a452a214c845deb36494b (diff)
downloadfatcat-scholar-f1f7842dd2ed110e9958f56d79ec504ae5d2bcd6.tar.gz
fatcat-scholar-f1f7842dd2ed110e9958f56d79ec504ae5d2bcd6.zip
crude djvu XML parsing
Diffstat (limited to 'fatcat_scholar')
-rw-r--r--fatcat_scholar/djvu.py49
1 files changed, 49 insertions, 0 deletions
diff --git a/fatcat_scholar/djvu.py b/fatcat_scholar/djvu.py
new file mode 100644
index 0000000..b4a0774
--- /dev/null
+++ b/fatcat_scholar/djvu.py
@@ -0,0 +1,49 @@
+
+from io import StringIO
+from typing import List, Dict, Tuple, Optional, Any, Sequence
+import xml.etree.ElementTree as ET
+
def djvu_extract_leaf_texts(blob: StringIO, only_leaves: Optional[List[int]] = None) -> Dict[int, str]:
    """
    Extract per-leaf (page) plain text from an in-memory "djvu XML" document.

    Note: the input is not an actual djvu file, just the IA XML file type.
    The document is parsed incrementally. Every ``OBJECT`` element is treated
    as one leaf, and its leaf number is taken from the trailing
    ``_<number>.djvu`` part of the ``usemap`` attribute (e.g.
    ``ERIC_ED441501_0002.djvu`` is leaf 2).

    Args:
        blob: file-like object positioned at the start of the XML document.
        only_leaves: if given, only these leaf numbers are extracted. An
            empty list selects nothing. ``None`` selects every leaf.

    Returns:
        Mapping of leaf number to page text. Words within a paragraph are
        joined with single spaces and paragraphs are joined with newlines.
        Leaves without any text are omitted.

    Raises:
        xml.etree.ElementTree.ParseError: if malformed XML is encountered
            before parsing stops.

    When ``only_leaves`` is non-empty, parsing stops at the first leaf whose
    number exceeds ``max(only_leaves)``. This assumes leaves appear in
    ascending order in the document; anything after that point is not read.
    """
    # Build the lookup set once instead of scanning a list for every page.
    wanted = None if only_leaves is None else set(only_leaves)
    max_leaf = max(wanted) if wanted else None

    leaf_text: Dict[int, str] = {}
    # Only "end" events are needed: an element's text is complete by then.
    for _event, element in ET.iterparse(blob, events=("end",)):
        if element.tag != "OBJECT":
            continue

        # <OBJECT data="file://localhost//tmp/derive/ERIC_ED441501//ERIC_ED441501.djvu" height="6545" type="image/x.djvu" usemap="ERIC_ED441501_0002.djvu" width="5048">
        leaf_num = None
        usemap = element.get("usemap")
        if usemap:
            try:
                leaf_num = int(usemap.replace(".djvu", "").split("_")[-1])
            except ValueError:
                # usemap does not end in a leaf number; skip this object
                leaf_num = None

        if leaf_num is not None:
            # The early exit must come before the membership test: a leaf
            # past the maximum is never in `wanted`, so checking membership
            # first would make this branch unreachable.
            if max_leaf is not None and leaf_num > max_leaf:
                break
            if wanted is None or leaf_num in wanted:
                paragraph_texts = []
                for paragraph in element.iter("PARAGRAPH"):
                    tokens = [t.strip() for t in paragraph.itertext()]
                    p_text = " ".join(t for t in tokens if t)
                    if p_text:
                        paragraph_texts.append(p_text)
                page_text = "\n".join(paragraph_texts)
                if page_text:
                    leaf_text[leaf_num] = page_text

        # Release the finished page subtree whether or not it was used, so
        # memory stays bounded on large documents.
        element.clear()
    return leaf_text