about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-27 15:27:23 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-27 15:42:22 -0700
commit ae851f3f205b741dbc826c3197cdd3cc9bde8802 (patch)
tree 905f7fdab06f0aacef4664a50dc5e27d90720ba1
parent 12a51fd28ca64338fca040ab7c470a70bf7a2a1b (diff)
download sandcrawler-ae851f3f205b741dbc826c3197cdd3cc9bde8802.tar.gz
sandcrawler-ae851f3f205b741dbc826c3197cdd3cc9bde8802.zip
start HTML metadata extraction code
-rw-r--r-- python/sandcrawler/html_metadata.py 230
-rw-r--r-- python/tests/files/dlib_05vanhyning.html 350
-rw-r--r-- python/tests/files/first_monday_ojs3_landingpage.html 616
-rw-r--r-- python/tests/files/genders_g58_fairlie.html 146
-rw-r--r-- python/tests/files/nature_article.html 1379
-rw-r--r-- python/tests/test_html_metadata.py 137
6 files changed, 2858 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
new file mode 100644
index 0000000..71715c2
--- /dev/null
+++ b/python/sandcrawler/html_metadata.py
@@ -0,0 +1,230 @@
+
+import datetime
+from typing import List, Optional, Any
+
+import dateparser
+from selectolax.parser import HTMLParser
+import pydantic
+
+
+# Map of single-valued metadata keys to CSS selectors for <meta> tags.
+#
+# For each key the selectors are tried in the order listed and the first one
+# that yields a non-empty 'content' attribute wins, so more specific or more
+# trusted vocabularies are listed first.
+HEAD_META_PATTERNS: Any = {
+    "title": [
+        "meta[name='citation_title']",
+        "meta[name='bepress_citation_title']",
+        "meta[name='eprints.title']",
+        "meta[name='prism.title']",
+        "meta[name='dc.title']",
+    ],
+    "subtitle": [
+        "meta[name='prism.subtitle']",
+    ],
+    "doi": [
+        "meta[name='citation_doi']",
+        "meta[name='prism.doi']",
+        "meta[name='DOI']",
+        "meta[id='DOI']",
+        "meta[name='dc.identifier.doi']",
+    ],
+    "abstract": [
+        "meta[name='citation_abstract']",
+        "meta[name='dc.description']",
+        "meta[name='og:description']",
+        # Open Graph tags are normally carried in the 'property' attribute
+        # (as with og:site_name below); keep the 'name' variant first so
+        # pages that matched before still resolve to the same tag
+        "meta[property='og:description']",
+    ],
+    "container_name": [
+        "meta[name='citation_journal_title']",
+        "meta[name='citation_conference_title']",
+        "meta[name='prism.publicationName']",
+        "meta[name='dc.relation.ispartof']",
+        "meta[name='dc.source']",
+        "meta[property='og:site_name']",
+    ],
+    "container_abbrev": [
+        "meta[name='citation_journal_abbrev']",
+    ],
+    "raw_date": [
+        "meta[name='citation_publication_date']",
+        "meta[name='citation_date']",
+        "meta[name='prism.publicationDate']",
+        "meta[itemprop='datePublished']",
+        "meta[name='dc.date.created']",
+        "meta[name='dc.issued']",
+        "meta[name='dc.date']",
+    ],
+    "release_year": [
+        "meta[itemprop='prism:copyrightYear']",
+    ],
+    "first_page": [
+        "meta[name='citation_firstpage']",
+        "meta[name='prism.startingPage']",
+        "meta[name='dc.citation.spage']",
+    ],
+    "last_page": [
+        "meta[name='citation_lastpage']",
+        "meta[name='prism.endingPage']",
+        "meta[name='dc.citation.epage']",
+    ],
+    "issue": [
+        "meta[name='citation_issue']",
+        "meta[name='prism.issueIdentifier']",
+        "meta[name='dc.citation.issue']",
+    ],
+    "volume": [
+        "meta[name='citation_volume']",
+        "meta[name='prism.volume']",
+        "meta[name='dc.citation.volume']",
+    ],
+    "number": [
+        "meta[name='citation_technical_report_number']",
+        "meta[name='citation_number']",
+        "meta[name='prism.number']",
+    ],
+    "container_issn": [
+        "meta[name='citation_issn']",
+        "meta[name='prism.issn']",
+        "meta[name='prism.eIssn']",
+        "meta[name='dc.source.issn']",
+    ],
+    "isbn": [
+        "meta[name='citation_isbn']",
+        "meta[name='prism.isbn']",
+    ],
+    "publisher": [
+        "meta[name='citation_publisher']",
+        "meta[name='dc.publisher']",
+    ],
+    "raw_release_type": [
+        "meta[name='citation_article_type']",
+        "meta[name='prism.contentType']",
+        "meta[name='dc.type']",
+    ],
+    "lang": [
+        "meta[name='citation_language']",
+        "meta[name='dc.language']",
+    ],
+    "html_fulltext_url": [
+        "meta[name='citation_fulltext_html_url']",
+    ],
+    # no known selectors yet; the key is kept so the field is enumerated
+    "xml_fulltext_url": [
+    ],
+    "pdf_fulltext_url": [
+        "meta[name='citation_pdf_url']",
+    ],
+}
+
+# Metadata keys whose <meta> tag may repeat within a page. Each key maps to
+# an ordered list of CSS selectors; all tags matched by a selector are
+# gathered into a list for that key.
+HEAD_META_LIST_PATTERNS: Any = dict(
+    contrib_names=[
+        "meta[name='citation_author']",
+        "meta[name='dc.creator']",
+        "meta[name='dc.contributor']",
+    ],
+    # TODO: citation_author_institution
+    raw_references=["meta[name='citation_reference']"],
+    raw_identifiers=["meta[name='dc.identifier']"],
+)
+
+# Raw "type" strings (compared after lower-casing and stripping) that all
+# denote a journal article.
+RELEASE_TYPE_MAP = {
+    raw_type: "article-journal"
+    for raw_type in ("research article", "text.serial.journal")
+}
+
+
+class BiblioMetadata(pydantic.BaseModel):
+    """
+    Bibliographic metadata for a single work, as returned by
+    html_extract_biblio().
+
+    Every field is optional and defaults to None. The defaults are spelled
+    out explicitly so that constructing the model from a partial dict does
+    not rely on a pydantic version implicitly defaulting Optional fields.
+    """
+    title: Optional[str] = None
+    subtitle: Optional[str] = None
+    contrib_names: Optional[List[str]] = None
+    release_date: Optional[datetime.date] = None
+    release_year: Optional[int] = None
+    release_type: Optional[str] = None
+    release_stage: Optional[str] = None
+    withdrawn_status: Optional[str] = None
+    lang: Optional[str] = None
+    country_code: Optional[str] = None
+    volume: Optional[str] = None
+    issue: Optional[str] = None
+    number: Optional[str] = None
+    pages: Optional[str] = None
+    first_page: Optional[str] = None
+    last_page: Optional[str] = None
+    license: Optional[str] = None
+    publisher: Optional[str] = None
+    container_name: Optional[str] = None
+    container_abbrev: Optional[str] = None
+    container_issn: Optional[str] = None
+    container_type: Optional[str] = None
+    raw_references: Optional[List[str]] = None
+
+    # external identifiers
+    doi: Optional[str] = None
+    pmid: Optional[str] = None
+    pmcid: Optional[str] = None
+    isbn13: Optional[str] = None
+    publisher_ident: Optional[str] = None
+    oai_id: Optional[str] = None
+
+    # abstract text and fulltext locations
+    abstract: Optional[str] = None
+    pdf_fulltext_url: Optional[str] = None
+    html_fulltext_url: Optional[str] = None
+    xml_fulltext_url: Optional[str] = None
+
+
+def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
+    """
+    Extracts bibliographic metadata from <meta> tags in the <head> of a
+    parsed HTML document.
+
+    Single-valued fields take the 'content' of the first selector in
+    HEAD_META_PATTERNS that yields a non-empty value. List-valued fields
+    collect the 'content' of every tag matched by the first selector in
+    HEAD_META_LIST_PATTERNS that matches anything.
+
+    Returns None if the document has no <head> element, otherwise a
+    BiblioMetadata built from whatever fields were found.
+
+    TODO:
+    - meta dc.identifier: parse DOI
+    """
+
+    meta: Any = dict()
+    head = doc.css_first("head")
+    if not head:
+        return None
+
+    for field, patterns in HEAD_META_PATTERNS.items():
+        for pattern in patterns:
+            val = head.css_first(pattern)
+            if not val:
+                continue
+            # a matched tag is not guaranteed to carry a 'content' attribute;
+            # check before indexing so one odd tag does not abort extraction
+            if 'content' in val.attrs and val.attrs['content']:
+                meta[field] = val.attrs['content']
+                break
+
+    for field, patterns in HEAD_META_LIST_PATTERNS.items():
+        for pattern in patterns:
+            val_list = head.css(pattern)
+            if not val_list:
+                continue
+            for val in val_list:
+                if 'content' in val.attrs and val.attrs['content']:
+                    meta.setdefault(field, []).append(val.attrs['content'])
+            # only the first selector that matched any tag is used
+            break
+
+    # TODO: replace with clean_doi() et al
+    if meta.get('doi') and meta['doi'].startswith('doi:'):
+        meta['doi'] = meta['doi'][4:]
+
+    # generic identifiers only fill in fields that a more specific tag did
+    # not already provide
+    raw_identifiers = meta.pop('raw_identifiers', [])
+    for ident in raw_identifiers:
+        if ident.startswith('doi:10.'):
+            if 'doi' not in meta:
+                meta['doi'] = ident.replace('doi:', '')
+        elif ident.startswith('10.') and '/' in ident:
+            if 'doi' not in meta:
+                meta['doi'] = ident
+        elif ident.startswith('isbn:'):
+            # NOTE(review): BiblioMetadata declares 'isbn13' but no 'isbn'
+            # field, so this value may not survive model construction --
+            # confirm the intended target field
+            if 'isbn' not in meta:
+                meta['isbn'] = ident.replace('isbn:', '')
+
+    raw_date = meta.pop('raw_date', None)
+    if raw_date:
+        # dateparser.parse() returns None for strings it cannot interpret;
+        # leave release_date unset in that case instead of raising
+        parsed_date = dateparser.parse(raw_date)
+        if parsed_date:
+            meta['release_date'] = parsed_date.date()
+
+    raw_release_type = meta.pop('raw_release_type', None)
+    if raw_release_type:
+        release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip())
+        if release_type:
+            meta['release_type'] = release_type
+
+    return BiblioMetadata(**meta)
diff --git a/python/tests/files/dlib_05vanhyning.html b/python/tests/files/dlib_05vanhyning.html
new file mode 100644
index 0000000..dbe3ef7
--- /dev/null
+++ b/python/tests/files/dlib_05vanhyning.html
@@ -0,0 +1,350 @@
+<!DOCTYPE html>
+<html lang="en" itemscope itemtype="http://schema.org/Article">
+<head>
+<script type="text/javascript" src="/js/ga.js"></script>
+<style type="text/css">
+
+.topLeft { border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftThick { border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftRight {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.topLeftRightThick {border-top: 2px solid #000000;
+ border-left: 1px solid #000000;
+ border-right: 1px solid #000000;
+ vertical-align: text-top;
+ }
+
+.topLeftBottom {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+.all {border-top: 1px solid #000000;
+ border-left: 1px solid #000000;
+ border-bottom: 1px solid #000000;
+ border-right: 1px solid #000000;
+ padding: 10px;
+ vertical-align: text-top;
+ }
+
+table.plain {border-collapse: separate;
+ border-spacing: 0px;
+ margin-left: auto;
+ margin-right: auto;
+ }
+td.plain {padding: 6px;
+ vertical-align: text-top;
+ }
+
+table.author {border-collapse: separate;
+ border-spacing: 6px;
+ }
+td.authors {padding: 6px;
+ }
+
+li:not(:last-child) {
+ margin-bottom: .5em;
+ }
+
+div.center {margin-left: auto; margin-right: auto;
+ }
+
+</style>
+<meta charset="utf-8" />
+<meta id="DOI" content="10.1045/may2017-vanhyning" />
+<meta itemprop="datePublished" content="2017-05-15" />
+<meta id="description" content="D-Lib Magazine Article" />
+<meta id="keywords" content="Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS" />
+<link href="../../../style/style1.css" rel="stylesheet" type="text/css" />
+
+<title>Transforming Libraries and Archives through Crowdsourcing</title>
+</head>
+
+<body>
+<form action="/cgi-bin/search.cgi" method="get">
+
+<div style="height:2px;background:#2b538e"></div>
+<div style="height:4px;background:#4078b1"></div>
+
+<div style="height:30px;background:#4078b1">
+
+<span style="color: #ffffff; font-size: 12px; float: right; margin-right: 10px;">Search D-Lib:
+<input type="text" id="words" value="" size="25" />
+<input type="submit" id="search" value="Go!" />
+<input type="hidden" id="config" value="htdig" />
+<input type="hidden" id="restrict" value="" />
+<input type="hidden" id="exclude" value="" />
+</span>
+</div>
+
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:1px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:1px;background:#2b538e"></div>
+<div style="height:92px;background:#4078b1"><img width="450" height="90" alt="D-Lib-blocks5" src="../../../img2/D-Lib-blocks5.gif">
+</div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#4078b1"></div>
+<div style="height:1px;background:#abc0d6"></div>
+<div style="height:2px;background:#e04c1e"></div>
+<div style="height:24px;background:#eda443"><img src="../../../img2/magazine5.gif" alt="The Magazine of Digital Library Research" width="830" height="24" /></div>
+<div style="height:1px;background:#e04c1e"></div>
+<div style="height:28px;background:#2b538e">
+<div id="navtable">
+<table>
+<tr><td class="navtext"><img src="../../../img2/transparent.gif" alt="" width="20" height="20" /><a href="../../../dlib.html">HOME</a>&nbsp;|&nbsp;<a href="../../../about.html">ABOUT D-LIB</a>&nbsp;|&nbsp;<a href="../../../contents.html" class="navtext">CURRENT ISSUE</a>&nbsp;|&nbsp;<a href="../../../back.html">ARCHIVE</a>&nbsp;|&nbsp;<a href="../../../author-index.html">INDEXES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/groups.html">CALENDAR</a>&nbsp;|&nbsp;<a href="../../author-guidelines.html">AUTHOR GUIDELINES</a>&nbsp;|&nbsp;<a href="http://www.dlib.org/mailman/listinfo/dlib-subscribers">SUBSCRIBE</a>&nbsp;|&nbsp;<a href="../../letters.html">CONTACT D-LIB</a></td></tr></table></div></div>
+<div style="height:4px;background:#2b538e"></div>
+<div style="height:1px;background:#e04c1e"></div>
+
+<div style="padding-left: 2.5em; padding-top: 1em;">
+
+<h3 class="blue-space">D-Lib Magazine</h3>
+<p class="blue">May/June 2017<br />
+Volume 23, Number 5/6<br />
+<a href="../05contents.html">Table of Contents</a>
+</p>
+
+<div class="divider-full">&nbsp;</div>
+
+<h3 class="blue-space">Transforming Libraries and Archives through Crowdsourcing</h3>
+
+<p class="blue">Victoria Van Hyning, University of Oxford, Zooniverse<br />
+victoria [at] zooniverse.org<br /><br />
+
+Samantha Blickhan, The Adler Planetarium, Zooniverse<br />
+samantha [at] zooniverse.org<br /><br />
+
+Laura Trouille, The Adler Planetarium, Zooniverse<br />
+trouille [at] zooniverse.org<br /><br />
+
+Chris Lintott, University of Oxford, Zooniverse<br />
+chris [at] zooniverse.org</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p><a href="https://doi.org/10.1045/may2017-vanhyning" class="nolinka">https://doi.org/10.1045/may2017-vanhyning</a></p>
+
+<div class="divider-full">&nbsp;</div>
+ <!-- Abstract or TOC goes here -->
+
+<h3 class="blue">Abstract</h3>
+
+<p class="blue">This article will showcase the aims and research goals of the project entitled "Transforming Libraries and Archives through Crowdsourcing", recipient of a 2016 Institute for Museum and Library Services grant. This grant will be used to fund the creation of four bespoke text and audio transcription projects which will be hosted on the Zooniverse, the world-leading research crowdsourcing platform. These transcription projects, while supporting the research of four separate institutions, will also function as a means to expand and enhance the Zooniverse platform to better support galleries, libraries, archives and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing.</p>
+
+<p class="blue">Keywords: Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS</p>
+
+<!-- Article goes next -->
+
+<div class="divider-full">&nbsp;</div>
+<h3>1 Overview<span style="vertical-align: super;"><a href="#n6">1</a></span></h3>
+
+<p>As libraries, museums, and other cultural repositories digitize their collections and place them online, the challenges of transforming these materials into useful and searchable sources of information are becoming increasingly apparent. While OCR and handwriting recognition technology have opened up some print and manuscript corpora, and image and voice recognition software are improving daily, there are still many tasks that require human intervention. For these, volunteer crowdsourcing is a viable and vibrant solution.</p>
+
+<p>The <a href="https://www.zooniverse.org/">Zooniverse</a> is the world-leading research crowdsourcing platform, hosting over 50 active projects and over 100 projects total since its inception in 2007. The projects cover diverse subject areas from astronomy to zoology, engage over 1.5 million registered volunteers, and have produced data used in more than a hundred peer-reviewed articles.<span style="vertical-align: super;"><a href="#n1">2</a></span> The Zooniverse also hosts the <a href="https://www.zooniverse.org/lab">Project Builder</a>, a free platform through which anyone can build their own project. The Zooniverse grew from a single project developed at the University of Oxford in 2007, and is now developed and managed by a team based in Oxford and at the Adler Planetarium in Chicago and the University of Minnesota (see <a href="https://www.zooniverse.org/about/team">Zooniverse Team</a> for a more complete list).</p>
+
+<p>In late 2016, the Institute for Museum and Library Services awarded a National Leadership Grant titled "Transforming Libraries and Archives through Crowdsourcing (LG-71-16-0028-16)" to the Adler Planetarium and its collaborators to support the work of the Zooniverse. Through this grant-funded effort, the Zooniverse will further expand and enhance its platform to better support galleries, libraries, archives, and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.1 What Can Crowdsourcing Offer GLAMs?</h4>
+
+<p>In 2010, author and professor Clay Shirky delivered a rousing <a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">TED</a> talk in which he used the phrase "cognitive surplus" to describe the one trillion hours of leisure time humans collectively accumulate each year (a great deal of which is spent watching television), which could be harnessed to advance human knowledge through civic engagement. He concluded that: "free cultures get what they celebrate. [...If we] celebrate and support and reward the people trying to use cognitive surplus to create civic value [...] we'll be able to change society".[<a href="#1">1</a>] One way that GLAMs can harness this cognitive surplus is through web-based crowdsourcing. What Shirky was describing was a type of "social machine", which Tim Berners-Lee defined as "new form[s] of social processes" emergent from the Web, and involving both human and machine components.[<a href="#2">2</a>] </p>
+
+<p>Academic crowdsourcing invites members of the public to work with specialists to conduct research: for example, to transcribe documents or add metadata to a collection of images, video or audio clips. This data is used in real science, social science, or humanities investigations and should, ideally, lead to publication. Crowdsourcing within GLAMs may not always be oriented around a specific research question or publication, but around making collections more accessible for future research and usability. GLAM crowdsourcing can be the seedbed of future scholarly research.</p>
+
+<p>GLAMs have been engaging volunteers with their collections for well over a century, usually by inviting select individuals into an institution and training them to do work that cannot be done by staff due to time or money constraints. On-site volunteers often build up valuable knowledge and skills and contribute a great deal to their chosen institutions, but training and supervising them also poses challenges. There is a limit to how many volunteers can be trained, supported on site, and indeed attracted and retained in the first place. Online volunteering, enabled by crowdsourcing platforms such as Zooniverse.org, offer an alternative or complementary form of engagement that has many benefits. Online projects can reach a wider range of individuals, including those who are less able-bodied or geographically remote from the institution in which they want to volunteer and/or unable to travel. Such projects require less training and time commitment from volunteers and typically attract a larger number of participants than on-site programs. They also enable GLAMs to open up rare collections to the public without concern for their material safety and security.<span style="vertical-align: super;"><a href="#n2">3</a></span></p>
+
+<p>While crowdsourcing projects have proliferated in the last decade, few offer easy to use, open source, and free platforms on which GLAM academics and amateur users can rely. The Zooniverse has the infrastructure, community, and technical expertise to intervene at this critical stage. </p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>1.2 How Does The Zooniverse Work?</h4>
+
+<p>All bespoke Zooniverse projects, including those built on the free Project Builder, have a few core components. Each image, audio or video file (data point) in each project is independently assessed by multiple individuals, whose responses are then aggregated using a variety of algorithms to determine what is in a given image. The amount of required responses for a task to be considered "complete" varies, depending on the project. With relatively quick tasks, such as animal identification in Snapshot Serengeti, upwards of 70 people will see each image. In tasks that require more time, such as transcription projects like <a href="https://www.shakespearesworld.org/#!/">Shakespeare's World</a> and <a href="https://anno.tate.org.uk/#!/">AnnoTate</a>, at least three people transcribe each line on each page. If enough people transcribe the same line and our algorithms deem the line to be completed to a good enough standard, these are greyed out, while outstanding lines are available to future site visitors. This approach was designed along the same principles that underpin all other Zooniverse projects, in which it is assumed that volunteers should work independently on tasks, in order that no one individual should have undue influence over others in the crowd. In the current IMLS project, however, we will test whether allowing volunteers to transcribe and work collaboratively ultimately creates better data and/or better user experiences. We will be able to compare datasets from AnnoTate and Shakespeare's World with text transcription datasets from the two new bespoke text transcription projects and, hopefully, with datasets generated at other institutions that have online crowdsourcing projects. Zooniverse is in a unique position in being able to gather these two very different kinds of data and compare them in order to determine the best outcomes. These findings will ultimately drive our design of free tools on the Project Builder.
+
+<p>In addition to participating in the classification task, users have the opportunity to communicate with other volunteers through an active, object-oriented discussion forum, called "Talk", associated with each project. Here volunteers can ask questions, interact with researchers and fellow volunteers, create their own "collections", and use hashtags to group together posts or images of interest. An example of the latter is <a href="https://talk.sciencegossip.org/#/search?tags%5Bfemale%5D=true">#female</a> from the <a href="https://www.sciencegossip.org/">Science Gossip</a> project, which indicates female authors, illustrators and printers contributing to the main scientific journals in the nineteenth century (visit the <a href="https://talk.sciencegossip.org/#/boards/BSC0000004/discussions/DSC00004s8">Science Gossip Talk</a> board to view the discussion around this tag). These interactions provide a rich set of experiences that allow users to personally experience the community in which they are participating, beyond simply providing classifications. Additionally, the collections allow volunteers to create their own research focal points within existing projects. During the process of transcribing, users can save images that contain content that is pertinent to their research interests by adding them to a public collection. They can then use the Talk forum to publicize their search, allowing other users to add images to that collection as well. In this way, the volunteer base can be mobilized to help other volunteers with minimal effort required.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>2 IMLS Funded Effort: Approach and Focus</h3>
+
+<p>Through the IMLS grant, the Zooniverse will engage in a research and development program to identify and implement crowdsourcing best practices in the arenas of text and audio transcription for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read. Though to date the majority of Zooniverse projects have been based in STEM fields rather than in the humanities, several text transcription projects have already been hosted on the site. For example, the first Zooniverse humanities project was <a href="https://www.ancientlives.org/">Ancient Lives</a>, which invited volunteers to transcribe ancient papyri one letter at a time using a clickable keyboard on their screen: volunteers did not have to be fluent in ancient Greek, they only needed to character match. Over 250,000 volunteers participated in the project, and made more than 1.5 million transcriptions between 2011 and 2014.[<a href="#6">3</a>] Furthermore, the computational pipeline used to convert individual identified letters into consensus-based transcriptions will benefit future classification projects attempting consensus letter or line sequence identifications.[<a href="#7">4</a>]</p>
+
+<p>By 2018 we will build four bespoke projects, two projects for text transcription and two projects for audio transcription, identified through open calls, in order to test, iterate, and research the efficacy of new and existing approaches (including within current Zooniverse and other projects) in these arenas. We will also develop the foundation for a GLAM-friendly data pipeline to export data from a Zooniverse project into GLAM collections. These functionalities are among those most frequently requested by GLAM institutions. We will work closely with four different GLAM institutions to build these bespoke crowdsourcing projects and functionalities. The text transcription open call closed in February 2017, with thirty-one submissions. The audio transcription open call will occur in fall 2017 (see <a href="http://zooniverse.org/get-involved/call-for-projects">Call for Projects</a>).</p>
+
+<p>From the lessons learned in building these bespoke projects, we will explore adding new tools and functionality to the Project Builder, which is freely available to any institution or user who wishes to lead a project. It is a flexible, powerful, and easy-to-use resource for building crowdsourcing projects, with a wide range of potential applications for GLAM collections, including text transcription. A basic text transcription tool is currently available, but will be refined through this grant effort. The Zooniverse has previously used this model of building bespoke projects in order to learn which tools are most useful, before implementing these tools in the Project Builder. We recognize that volunteers' time is precious, and are therefore unwilling to waste it with tools that are not proven to extract data in an efficient, high quality, and useful form. We will also draw on lessons learned from previous experiences supporting transcription projects through Zooniverse and other platforms. For example, <a href="https://www.operationwardiary.org/">Operation War Diary</a> which launched in 2014 to commemorate the outbreak of the First World War, is a partnership between the National Archives (UK), the Imperial War Museum, and the Zooniverse, which invites users to tag and transcribe dates, times, places, and names found in British WWI field diaries. Historian Richard Grayson has used the data to penetrate more deeply than ever before into records of soldiers' daily lives on the front.[<a href="#8">5</a>] All of the Operation War Diary metadata will eventually be integrated into the National Archive catalogues. The process of integrating new metadata into an existing catalogue can be complicated, raising an important question for any GLAM specialist seeking to harness crowdsourcing at their institution. 
For instance, it is essential to ensure, before starting a project, that the current content management system (CMS) supports the storage of additional metadata, such as large amounts of free-text. If not, it then becomes necessary to use an external resource to make available the results from the crowdsourcing project. Zooniverse can and will do more to facilitate GLAMs and research groups to use and store their data.</p>
+
+<p>Over the course of the IMLS project, we will also address the following research questions:</p>
+
+<p class="indentLeft">Q1: How can crowdsourcing be deployed in the arenas of text and audio transcription and metadata extraction for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read? What methods produce the best data and make for the best user experience?</p>
+
+<p class="indentLeft">Q2: Does the current Zooniverse methodology of multiple independent transcribers and aggregation render better results than allowing volunteers to see previous transcriptions by others or indeed collaborate to create a single transcription? How does each methodology impact the quality of data, as well as depth of analysis and participation?</p>
+
+<p class="indentLeft">Q3: How can we extend our crowdsourcing expertise to more GLAM professionals and learn from them, in turn, how to adjust the Zooniverse platform to best meet their research and curatorial needs?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.1 Addressing Q1 (Crowdsourcing for GLAM)</h4>
+
+<p>Only a platform like the Zooniverse can systematically address a question such as Q1: the community that has developed within the platform is made up of volunteers who move across projects, allowing us to trace the impact of differences between projects on the same volunteers. Zooniverse also has the infrastructure to implement A/B split experiments within a single project. This allows us to develop projects incorporating different practices which are specifically aimed at understanding different methodologies. Through the bespoke text and audio transcription projects, we will expand on the lessons learned through current Zooniverse text transcription projects, including Ancient Lives, AnnoTate, Old Weather, Measuring the ANZACs, Shakespeare's World, Science Gossip, Decoding the Civil War, Orchid Observers and Operation War Diary, as well as from external text transcription projects including <a href="http://blogs.ucl.ac.uk/transcribe-bentham/">Transcribe Bentham</a>, <a href="http://fromthepage.com/">FromthePage</a>, and <a href="http://scripto.org/">Scripto</a>. </p>
+
+<p>In the bespoke projects created through the IMLS grant, the features optimizing volunteer engagement and retention will include: </p>
+
+<ul>
+ <li><i>Volunteer choice:</i> volunteers choose which document to transcribe and can transcribe as little as a single line or as much as an entire document. We have found through AnnoTate and Shakespeare's World that allowing users to transcribe smaller fragments of text (without being required to complete an entire page) mitigates against forced or uncertain readings. We hypothesize and plan to fully test whether allowing microtasking helps to retain volunteers, giving them the chance to build up their skills and not make forced readings. </li>
+
+ <li><i>Keeping the task simple:</i> in Shakespeare's World and AnnoTate, volunteers drop points at the start and end of individual lines of text (not grammatical sentences) and transcribe the text contained between these two points. They do not use XML markup itself, which has proven to be a major repellent to participants in other text transcription crowdsourcing projects.<span style="vertical-align: super;"><a href="#n3">4</a></span> Instead, volunteers highlight words within the transcribed line and choose among different features (e.g., insertion, deletion, expansion, etc.). We propose to use these tagged words in each line to create simple TEI markup on the back-end, for output into commonly used CMSs such as Drupal and Omeka.</li>
+
+ <li><i>Narrowing the content focus to support sense-making:</i> In Shakespeare's World, the first release (or "chapter") consists of recipes and letters, with more genres to follow. This type of structured approach will be applied to the bespoke projects, as this supports creation of narratives within diverse collections, which in turn enables subject experts to more easily foster, and volunteers to contribute to, discussions in Talk.</li>
+</ul>
+
+<p>Features optimizing best practice in regard to data production and management will include:</p>
+
+<ul>
+ <li><i>Reliable, Scalable, Open Source Code Infrastructure:</i> The foundation for the Zooniverse platform that includes the Project Builder is an application written in Ruby on Rails which supports a powerful Application Programming Interface (API). The API serves subjects &#151; images, video or audio &#151; for classification by volunteers via a workflow defined by the project, and receives and records these classifications into a database. The frontend Javascript web software presents user interfaces to volunteers and supports the Project Builder. All Zooniverse code is open source and available through <a href="github.com/zooniverse">Github</a>.</li>
+
+ <li><i>Data Ingestion into Zooniverse:</i> In the current Project Builder, research teams can upload batches of 500 to 1000 subjects (images, videos, or audio clips) at a time by simply dragging and dropping the files. For larger collections and for bespoke projects, typically the research team provides a hard drive and the Zooniverse team uploads the subjects to the API. Through the projects proposed here, we will create a system to better support direct ingestion of large subject sets through a user-friendly web interface, adding functionality to the foundation we already have in place within the Project Builder.</li>
+
+ <li><i>Useful Output for Curation:</i> The Smithsonian Transcription Center is regularly cited as being successful in regard to their output being easily ingestible by CMSs.[<a href="#9">6</a>] Current Zooniverse transcription projects are not set up with this functionality. Currently, through our Project Builder for image annotation/marking projects, research teams can download the raw classification results (i.e. all classifications by all volunteers) as well as automatically-generated aggregated results that include confidence measures on consensus. Through this IMLS-funded effort, we will work with Meghan Ferriter of the Smithsonian Transcription Center, who is on our board of advisors, to design data outputs for full text transcription and full audio transcription that are suitable for ingestion into different GLAM CMSs. A key aspect of this effort is to continue exploring best practices and approaches for transcription aggregation and confidence metrics, building on our efforts with AnnoTate, Shakespeare's World, etc.</li>
+</ul>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.2 Addressing Research Q2 (Independent vs. Collaborative Transcription)</h4>
+
+<p>Through the two bespoke text transcription projects, we will investigate the impact on transcription quality and volunteer experience when volunteers transcribe in isolation versus with knowledge of how others have transcribed the same document. </p>
+
+<p>In terms of measuring impact on transcription quality, we will compare the rate of accuracy for individuals who transcribe in isolation on projects such as AnnoTate and Shakespeare's World versus individuals who see previous transcriptions. We will also compare the rate of accuracy in aggregated results for lines transcribed only by those working in isolation versus for lines in which all but the first transcriber sees previous transcriptions. In order to measure impact on volunteer experience, we will analyze the user behavior statistics we gather, e.g., number of transcriptions completed in a given session, length of session, number of sessions overall, sentiment analysis of discussion forum comments, etc.</p>
+
+<p>There are numerous open questions in this experiment: Does knowledge of other individuals' or collective transcriptions lead individuals down the wrong path? Is transcription more or less accurate if people work in isolation or with an awareness of other people's work? Does making transcriptions visible increase retention as a result of highlighting that an individual's effort is part of a broader community effort or have the opposite effect? What environment best promotes skills acquisition, i.e. improved paleography?</p>
+
+<div class="divider-dot">&nbsp;</div>
+<h4>2.3 Addressing Research Q3 (Feedback/Training)</h4>
+
+<p>We will provide numerous opportunities for input and feedback from and training for the GLAM community, specifically by working closely with our advisory board and four GLAM project partners throughout. In 2018 we will host feedback sessions at GLAM conferences and summer schools targeting GLAM institutions with collections for which text transcription, audio transcription, or image annotation/marking are of interest (we will include image annotation/marking because those tools are already included via the Project Builder). This will allow for input from a broader set of institutions on our decisions and approach for building new functionality into the Project Builder. In 2018&#151;2019 we will host training workshops for GLAM professionals in using the Project Builder to build their own crowdsourcing projects, incorporate the results into their databases and research, and sustain and nurture their online volunteer communities.</p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>3 Future Steps: Community Engagement, Output &amp; How to Get Involved</h3>
+
+<p>The IMLS-Funded Project "Transforming Libraries and Archives through Crowdsourcing" is still in its beginning stages. Currently, we are in the process of selecting the first two bespoke crowdsourcing text transcription projects to be built and incorporated into the Zooniverse platform. The detail of our research questions will evolve alongside these new transcription projects, and during the research and development process we will use conference presentations and feedback sessions to gather input which can then guide the overall project design. The open call for the two bespoke audio transcription projects will occur in the fall of 2017. At this point, the bespoke text transcriptions will be in beta review, allowing us to take advantage of lessons learned through that first round of new projects. We believe that this self-reflexive method will simultaneously benefit our ongoing project while offering new tools and ideas to the larger GLAM and academic community.</p>
+
+<p>We anticipate this proposed effort will produce two peer-reviewed publications. One article will focus on the methodology for creating, processing, and evaluating the data produced by the new projects. The second will focus on the results of our research exploring the impact of individual versus collaborative text transcription. We also note that all Zooniverse <a href="https://github.com/zooniverse">code</a> is freely available under a liberal open source license which serves as an additional or parallel form of publication.</p>
+
+<p>GLAM organizations keen to develop their own crowdsourcing projects should explore the available documentation on <a href="https://www.zooniverse.org/lab-how-to">how to build a project</a> and <a href="https://www.zooniverse.org/lab-best-practices/great-project">best practices for the design, launch and long term phases of a project</a>. While building a project is easy and requires relatively little technical support from Zooniverse or your institution, make sure you have the time to work with your resulting data, and time to support your online volunteer community. Advertising the project's existence should be a long-term task, to avoid a plateau or potential drop-off of user participation. For example, Shakespeare's World received a bump in the number of daily classifications after an article was published in The New Yorker in January of 2017, over a year after the project's launch date.[<a href="#10">7</a>] However, it does not suffice to merely advertise the existence of a project; researchers need to engage with their users on a regular basis.<span style="vertical-align: super;"><a href="#n5">5</a></span> Zooniverse's Talk platform, social media such as blogging, Twitter, Instagram, and indeed in-person or on-site events all provide important channels for engaging current or potential volunteers with your collections. We believe that GLAM organizations, with their long history of volunteer engagement, have many of the skills to work effectively with online volunteers, and will benefit in new ways through cooperation with the crowd.</p>
+
+<p>In conclusion, while this project is specifically focused on text and audio transcription, it is our hope that the results, including the new Project Builder tools and GLAM data pipeline, will ultimately be used across a variety of disciplines and domains. We hope to facilitate future partnerships between GLAM institutions and volunteer communities around the world, thus extending the aims and outcomes of the National Digital Platform funded through this generous IMLS grant into an international digital platform that will benefit many individuals and institutions. </p>
+
+<div class="divider-full">&nbsp;</div>
+<h3>Notes</h3>
+
+<table style="width:90%">
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n6">1</a></td>
+<td style="padding-top: .5em;">Part of this article appeared previously as a blog post for CILIP, The Library and Information Association. Material is reproduced by express permission of CILIP.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n1">2</a></td>
+<td style="padding-top: .5em;">For a partial list of publications, please visit <a href="https://www.zooniverse.org/about/publications">https://www.zooniverse.org/about/publications</a>. </td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n2">3</a></td>
+<td style="padding-top: .5em;">Further discussion of the use of crowdsourcing in GLAM contexts can be found in Melissa Terras, "Crowdsourcing in the Digital Humanities", in <i>A New Companion to Digital Humanities</i>, eds. Susan Schreibman, Ray Siemens, and John Unsworth (John Wiley &amp; Sons, 2016), 420-438, particularly in the section entitled "The Growth of Crowdsourcing in Cultural and Heritage Applications" (pp. 423-28). See also <i>Crowdsourcing Our Cultural Heritage</i>, ed. Mia Ridge (Ashgate, 2014).</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n3">4</a></td>
+<td style="padding-top: .5em;">Causer and Terras, "Many Hands Make Light Work", p. 81: "It would be fair to say that for volunteers, the XML mark-up complicates participation, and it has undoubtedly dissuaded many from participating more fully, or at all." For opinions from the volunteers about the process, the authors additionally refer the reader to Causer and Valerie Wallace, "<a href="http://www.digitalhumanities.org/dhq/vol/6/2/000125/000125.html">Building a Volunteer Community: Results and Findings from Transcribe Bentham</a>", <i>Digital Humanities Quarterly</i> 6.2 (2012).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: super;"><a id="n5">5</a></td>
+<td style="padding-top: .5em;">Or, as Zephyr Frank, <i>et al</i>. put it: "Paid advertising can generate large numbers of clicks on a website. It cannot, however, produce good metadata or newly uploaded material that is relevant to the scholarly questions posed by academic researchers." "<a href="https://github.com/cestastanford/crowdsourcing/raw/master/files/Mellon%20White%20Paper.pdf">Crowdsourcing for Humanities Research</a>" (2016) Project White Paper. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>References</h3>
+
+<table style="width:90%">
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="1">[1]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Clay Shirky, "<a href="https://www.ted.com/talks/clay_shirky_how_cognitive_surplus_will_change_the_world">How Cognitive Surplus Will Change the World</a>", June 2010.</td>
+</tr>
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="2">[2]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Tim Berners-Lee with Mark Fischetti, <i>Weaving the Web: The Original Design and Ultimate Destiny of the World Wide Web by its Inventor</i> (San Francisco: Harper, 1999).</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="6">[3]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">"P.Oxy 5156, Plutarch Moralia 660C, 661B-C (Quaestiones Convivales IV PR., 1.2)", in <i>The Oxyrhynchus Papyri</i>, R.-L. Chang <i>et al</i>., eds, vol. 78 (London, Egypt Exploration Society, 2012), 97-98. </td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="7">[4]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Alex C. Williams <i>et al.</i>, "A Computational Pipeline for Crowdsourced Transcriptions of Ancient Greek Papyrus Fragments", in <i>IEEE International Conference on Big Data</i>, October 2014. <a href="https://doi.org/10.1109/BigData.2014.7004460">https://doi.org/10.1109/BigData.2014.7004460</a></td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="8">[5]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Richard Grayson, "A Life in the Trenches? The Use of Operation War Diary and Crowdsourcing Methods to Provide an Understanding of the British Army's Day-to-Day Life on the Western Front", <i>British Journal for Military History,</i> 2.2 (2016), 160-85.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="9">[6]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Katie Mika, "<a href="http://library.mcz.harvard.edu/blog/transcription-tools-survey-katie-mika-ndsr-resident">Transcription Tools: a survey by Katie Mika, NDSR Resident</a>", Harvard University, Ernst Mayr Library Blog.</td>
+</tr>
+
+<tr>
+<td style="padding-bottom: 12px; vertical-align: top;"><a id="10">[7]</a></td>
+<td style="padding-bottom: 12px; vertical-align: top;">Roberta Kwok, "<a href="http://www.newyorker.com/tech/elements/crowdsourcing-for-shakespeare">Crowdsourcing For Shakespeare</a>", <i>The New Yorker</i>, 16 Jan. 2017. </td>
+</tr>
+</table>
+
+<div class="divider-white">&nbsp;</div>
+<div class="divider-full">&nbsp;</div>
+<h3>About the Authors</h3>
+
+<p class="blue"><b>Victoria Van Hyning</b> is a Junior Research Fellow at Pembroke College, and a British Academy Postdoctoral Fellow. Her current project, 'Court to Convent: Early Modern English Catholic Women's Autobiography', will reveal how Catholic women articulated selfhood in the period when it was illegal to practice Catholicism, 1535 to 1829. She is also the Humanities PI of Zooniverse.org, the world leading academic crowdsourcing organization. Her projects include <a href="https://www.sciencegossip.org">Science Gossip</a>, <a href="http://www.shakespearesworld.org">Shakespeare's World</a> and <a href="https://anno.tate.org.uk">AnnoTate</a>.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Samantha Blickhan</b> is the IMLS Postdoctoral Fellow in the Department of Citizen Science at the Adler Planetarium, working on transcription projects for the Zooniverse. She received her Ph.D. in Musicology from Royal Holloway, University of London, with a thesis on the palaeography of British song notation in the 12th and 13th centuries. Her research interests include music and perception, and their relationships with writing systems, technology and pedagogy.</p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue"><b>Laura Trouille</b> is co-Investigator for Zooniverse and Director of Citizen Science at the Adler Planetarium where she leads the Zooniverse web development and Teen Programs teams. While earning her Ph.D. in astronomy in 2010 studying galaxy evolution, she also earned the Center for the Integration of Research, Teaching and Learning's Delta certificate for STEM education research. As a CIERA Postdoctoral Fellow at Northwestern University's CIERA Center for Astrophysics, she continued her research on active galaxies as well as co-led the Computational Thinking in STEM project, bringing computational thinking and modeling curricular materials to high school science and math teachers. </p>
+
+<div class="divider-dot">&nbsp;</div>
+
+<p class="blue">Chris Lintott is a professor of astrophysics at the University of Oxford, where he is also a research fellow at New College. He is the principal investigator for Galaxy Zoo and the Zooniverse, and his own research focuses on novel modes of crowdsourcing for anomaly detection.</p>
+
+<div class="divider-full">&nbsp;</div>
+
+ <!-- Standard Copyright line here -->
+
+<div class="center">
+<p class="footer">Copyright &copy; 2017 Victoria Van Hyning, Samantha Blickhan, Laura Trouille and Chris Lintott</p>
+</div>
+
+<div style="height:1px;background:#2b538e"></div>
+
+</div>
+</form>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/first_monday_ojs3_landingpage.html b/python/tests/files/first_monday_ojs3_landingpage.html
new file mode 100644
index 0000000..2633256
--- /dev/null
+++ b/python/tests/files/first_monday_ojs3_landingpage.html
@@ -0,0 +1,616 @@
+ <!DOCTYPE html>
+<html lang="en-US" xml:lang="en-US">
+<head>
+ <meta charset="utf-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ | First Monday
+ </title>
+
+
+<meta name="generator" content="Open Journal Systems 3.1.2.0">
+<link rel="icon" href="https://firstmonday.org/ojs/public/journals/3/favicon_en_US.gif">
+<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
+<meta name="DC.Coverage" xml:lang="en" content=""/>
+<meta name="DC.Creator.PersonalName" content="Calvin Liang"/>
+<meta name="DC.Creator.PersonalName" content="Jevan Alexander Hutson"/>
+<meta name="DC.Creator.PersonalName" content="Os Keyes"/>
+<meta name="DC.Date.created" scheme="ISO8601" content="2020-09-10"/>
+<meta name="DC.Date.dateSubmitted" scheme="ISO8601" content="2019-09-15"/>
+<meta name="DC.Date.issued" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Date.modified" scheme="ISO8601" content="2020-10-01"/>
+<meta name="DC.Description" xml:lang="en" content="Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."/>
+<meta name="DC.Format" scheme="IMT" content="text/html"/>
+<meta name="DC.Identifier" content="10274"/>
+<meta name="DC.Identifier.DOI" content="10.5210/fm.v25i10.10274"/>
+<meta name="DC.Identifier.URI" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="DC.Language" scheme="ISO639-1" content="en"/>
+<meta name="DC.Rights" content="Copyright (c) 2020 First Monday"/>
+<meta name="DC.Rights" content=""/>
+<meta name="DC.Source" content="First Monday"/>
+<meta name="DC.Source.ISSN" content="1396-0466"/>
+<meta name="DC.Source.URI" content="https://firstmonday.org/ojs/index.php/fm"/>
+<meta name="DC.Subject" xml:lang="en" content="HIV"/>
+<meta name="DC.Subject" xml:lang="en" content="online dating"/>
+<meta name="DC.Subject" xml:lang="en" content="design"/>
+<meta name="DC.Subject" xml:lang="en" content="policy"/>
+<meta name="DC.Subject" xml:lang="en" content="surveillance"/>
+<meta name="DC.Subject" xml:lang="en" content="intimacy"/>
+<meta name="DC.Subject" xml:lang="en" content="social computing"/>
+<meta name="DC.Subject" xml:lang="en" content="social justice"/>
+<meta name="DC.Title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="DC.Type" content="Text.Serial.Journal"/>
+<meta name="DC.Type" xml:lang="en" content="Qualitative; Content analysis"/>
+<meta name="DC.Type.articleType" content="Articles"/>
+<meta name="gs_meta_revision" content="1.1"/>
+<meta name="citation_journal_title" content="First Monday"/>
+<meta name="citation_journal_abbrev" content="1"/>
+<meta name="citation_issn" content="1396-0466"/>
+<meta name="citation_author" content="Calvin Liang"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_author" content="Jevan Alexander Hutson"/>
+<meta name="citation_author_institution" content="University of Washington, School of Law"/>
+<meta name="citation_author" content="Os Keyes"/>
+<meta name="citation_author_institution" content="University of Washington, Department of Human Centered Design &amp; Engineering"/>
+<meta name="citation_title" content="Surveillance, stigma &amp; sociotechnical design for HIV"/>
+<meta name="citation_date" content="2020/09/10"/>
+<meta name="citation_doi" content="10.5210/fm.v25i10.10274"/>
+<meta name="citation_abstract_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274"/>
+<meta name="citation_language" content="en"/>
+<meta name="citation_keywords" xml:lang="en" content="HIV"/>
+<meta name="citation_keywords" xml:lang="en" content="online dating"/>
+<meta name="citation_keywords" xml:lang="en" content="design"/>
+<meta name="citation_keywords" xml:lang="en" content="policy"/>
+<meta name="citation_keywords" xml:lang="en" content="surveillance"/>
+<meta name="citation_keywords" xml:lang="en" content="intimacy"/>
+<meta name="citation_keywords" xml:lang="en" content="social computing"/>
+<meta name="citation_keywords" xml:lang="en" content="social justice"/>
+<meta name="citation_fulltext_html_url" content="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"/>
+<link rel="alternate" type="application/atom+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+<link rel="alternate" type="application/rdf+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+<link rel="alternate" type="application/rss+xml" href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <link rel="stylesheet" href="https://firstmonday.org/ojs/index.php/fm/$$$call$$$/page/page/css?name=stylesheet" type="text/css" /><link rel="stylesheet" href="//fonts.googleapis.com/css?family=Noto+Sans:400,400italic,700,700italic" type="text/css" /><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.css" type="text/css" /><link rel="stylesheet" href="https://firstmonday.org/ojs/public/journals/3/styleSheet.css" type="text/css" />
+</head>
+<body class="pkp_page_article pkp_op_view has_site_logo" dir="ltr">
+
+ <div class="cmp_skip_to_content">
+ <a href="#pkp_content_main">Skip to main content</a>
+ <a href="#pkp_content_nav">Skip to main navigation menu</a>
+ <a href="#pkp_content_footer">Skip to site footer</a>
+ </div>
+ <div class="pkp_structure_page">
+
+ <header class="pkp_structure_head" id="headerNavigationContainer" role="banner">
+ <div class="pkp_head_wrapper">
+
+ <div class="pkp_site_name_wrapper">
+ <div class="pkp_site_name">
+ <a href=" https://firstmonday.org/ojs/index.php/fm/index
+ " class="is_img">
+ <img src="https://firstmonday.org/ojs/public/journals/3/pageHeaderLogoImage_en_US.gif" width="252" height="102" alt="Page Header Logo" />
+ </a>
+ </div>
+ </div>
+
+
+ <nav class="pkp_navigation_primary_row" aria-label="Site Navigation">
+ <div class="pkp_navigation_primary_wrapper">
+ <ul id="navigationPrimary" class="pkp_navigation_primary pkp_nav_list">
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About
+ </a>
+ <ul>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about">
+ About the Journal
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/editorialTeam">
+ Editorial Team
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/privacy">
+ Privacy Statement
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/contact">
+ Contact
+ </a>
+ </li>
+ </ul>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search">
+ Search
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/current">
+ Current
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/announcement">
+ Announcements
+ </a>
+ </li>
+ <li class="">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/submissions">
+ Submissions
+ </a>
+ </li>
+ </ul>
+
+
+
+ <form class="pkp_search" action="https://firstmonday.org/ojs/index.php/fm/search/search" method="post" role="search">
+ <input type="hidden" name="csrfToken" value="671acac3a608346eb0eb4de1f26c7563">
+ <input name="query" value="" type="text" aria-label="Search Query">
+ <button type="submit">
+ Search
+ </button>
+ <div class="search_controls" aria-hidden="true">
+ <a href="https://firstmonday.org/ojs/index.php/fm/search/search" class="headerSearchPrompt search_prompt" aria-hidden="true">
+ Search
+ </a>
+ <a href="#" class="search_cancel headerSearchCancel" aria-hidden="true"></a>
+ <span class="search_loading" aria-hidden="true"></span>
+ </div>
+</form>
+ </div>
+ </nav>
+
+ <nav class="pkp_navigation_user_wrapper" id="navigationUserWrapper" aria-label="User Navigation">
+ <ul id="navigationUser" class="pkp_navigation_user pkp_nav_list">
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/user/register">
+ Register
+ </a>
+ </li>
+ <li class="profile">
+ <a href="https://firstmonday.org/ojs/index.php/fm/login">
+ Login
+ </a>
+ </li>
+ </ul>
+
+ </nav>
+ </div><!-- .pkp_head_wrapper -->
+ </header><!-- .pkp_structure_head -->
+
+ <div class="pkp_structure_content has_sidebar">
+ <div id="pkp_content_main" class="pkp_structure_main" role="main">
+
+<div class="page page_article">
+ <nav class="cmp_breadcrumbs" role="navigation" aria-label="You are here:">
+ <ol>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/index">
+ Home
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/archive">
+ Archives
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ <span class="separator">/</span>
+ </li>
+ <li class="current">
+ Articles
+ </li>
+ </ol>
+</nav>
+
+ <article class="obj_article_details">
+ <h1 class="page_title">
+ Surveillance, stigma &amp; sociotechnical design for HIV
+ </h1>
+
+
+ <div class="row">
+ <div class="main_entry">
+
+ <ul class="item authors">
+ <li>
+ <span class="name">
+ Calvin Liang
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0002-3795-3441" target="_blank">
+ https://orcid.org/0000-0002-3795-3441
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Jevan Alexander Hutson
+ </span>
+ <span class="affiliation">
+ University of Washington, School of Law
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0003-3312-1733" target="_blank">
+ https://orcid.org/0000-0003-3312-1733
+ </a>
+ </span>
+ </li>
+ <li>
+ <span class="name">
+ Os Keyes
+ </span>
+ <span class="affiliation">
+ University of Washington, Department of Human Centered Design &amp; Engineering
+ </span>
+ <span class="orcid">
+
+ <a href="https://orcid.org/0000-0001-5196-609X" target="_blank">
+ https://orcid.org/0000-0001-5196-609X
+ </a>
+ </span>
+ </li>
+ </ul>
+
+ <div class="item doi">
+ <span class="label">
+ DOI:
+ </span>
+ <span class="value">
+ <a href="https://doi.org/10.5210/fm.v25i10.10274">
+ https://doi.org/10.5210/fm.v25i10.10274
+ </a>
+ </span>
+ </div>
+
+ <div class="item keywords">
+ <span class="label">
+ Keywords:
+ </span>
+ <span class="value">
+ HIV, online dating, design, policy, surveillance, intimacy, social computing, social justice </span>
+ </div>
+
+ <div class="item abstract">
+ <h3 class="label">Abstract</h3>
+ <p>Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.</p>
+ </div>
+
+
+
+ <div class="item author_bios">
+ <h3 class="label">
+ Author Biographies
+ </h3>
+ <div class="sub_item">
+ <div class="label">
+ Calvin Liang, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ <p>Calvin Liang is a PhD student in Human-Centered Design and Engineering at The University of Washington. Their research broadly focuses on technology’s role in and out of queerness, health, and queer health.</p>
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Jevan Alexander Hutson, <span class="affiliation">University of Washington, School of Law</span>
+ </div>
+ <div class="value">
+ Jevan Hutson is a third-year law student and Gregoire Fellow at the University of Washington School of Law. He holds an M.P.S. from the Department of Information Science at Cornell University, and a B.A. from the Department of Art History and Visual Studies at Cornell University. He has been published in venues including the Association for Computing Machinery’s conferences on Computer Human Interaction and Computer Supported Cooperative Work and Social Computing
+ </div>
+ </div>
+ <div class="sub_item">
+ <div class="label">
+ Os Keyes, <span class="affiliation">University of Washington, Department of Human Centered Design &amp; Engineering</span>
+ </div>
+ <div class="value">
+ Os Keyes is a PhD student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.
+ </div>
+ </div>
+ </div>
+
+
+ </div><!-- .main_entry -->
+
+ <div class="entry_details">
+
+ <div class="item cover_image">
+ <div class="sub_item">
+ <a href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ <img src="https://firstmonday.org/ojs/public/journals/3/cover_issue_678_en_US.png" alt="“Frank Moore, Digital Divide, 2001 gouache, oil and mixed media on paper 14 3/4 x 24 1/4 inches (36,4 x 61,6 cm) sheet”">
+ </a>
+ </div>
+ </div>
+
+ <div class="item galleys">
+ <ul class="value galleys_links">
+ <li>
+
+
+
+
+<a class="obj_galley_link file" href="https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729">
+
+
+ HTML
+
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="item published">
+ <div class="label">
+ Published
+ </div>
+ <div class="value">
+ 2020-09-10
+ </div>
+ </div>
+
+ <div class="item citation">
+ <div class="sub_item citation_display">
+ <div class="label">
+ How to Cite
+ </div>
+ <div class="value">
+ <div id="citationOutput" role="region" aria-live="polite">
+ <div class="csl-bib-body">
+ <div class="csl-entry">Liang, C., Hutson, J. A., &#38; Keyes, O. (2020). Surveillance, stigma &amp; sociotechnical design for HIV. <i>First Monday</i>, <i>25</i>(10). https://doi.org/10.5210/fm.v25i10.10274</div>
+</div>
+ </div>
+ <div class="citation_formats">
+ <button class="cmp_button citation_formats_button" aria-controls="cslCitationFormats" aria-expanded="false" data-csl-dropdown="true">
+ More Citation Formats
+ </button>
+ <div id="cslCitationFormats" class="citation_formats_list" aria-hidden="true">
+ <ul class="citation_formats_styles">
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acm-sig-proceedings?submissionId=10274&amp;return=json"
+ >
+ ACM
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/acs-nano?submissionId=10274&amp;return=json"
+ >
+ ACS
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/apa?submissionId=10274&amp;return=json"
+ >
+ APA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/associacao-brasileira-de-normas-tecnicas?submissionId=10274&amp;return=json"
+ >
+ ABNT
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/chicago-author-date?submissionId=10274&amp;return=json"
+ >
+ Chicago
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/harvard-cite-them-right?submissionId=10274&amp;return=json"
+ >
+ Harvard
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/ieee?submissionId=10274&amp;return=json"
+ >
+ IEEE
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/modern-language-association?submissionId=10274&amp;return=json"
+ >
+ MLA
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/turabian-fullnote-bibliography?submissionId=10274&amp;return=json"
+ >
+ Turabian
+ </a>
+ </li>
+ <li>
+ <a
+ aria-controls="citationOutput"
+ href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274"
+ data-load-citation
+ data-json-href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/get/vancouver?submissionId=10274&amp;return=json"
+ >
+ Vancouver
+ </a>
+ </li>
+ </ul>
+ <div class="label">
+ Download Citation
+ </div>
+ <ul class="citation_formats_styles">
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/ris?submissionId=10274">
+ <span class="fa fa-download"></span>
+ Endnote/Zotero/Mendeley (RIS)
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/citationstylelanguage/download/bibtex?submissionId=10274">
+ <span class="fa fa-download"></span>
+ BibTeX
+ </a>
+ </li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="item issue">
+ <div class="sub_item">
+ <div class="label">
+ Issue
+ </div>
+ <div class="value">
+ <a class="title" href="https://firstmonday.org/ojs/index.php/fm/issue/view/678">
+ Volume 25, Number 10 - 5 October 2020
+ </a>
+ </div>
+ </div>
+
+ <div class="sub_item">
+ <div class="label">
+ Section
+ </div>
+ <div class="value">
+ Articles
+ </div>
+ </div>
+ </div>
+
+
+ <div class="item copyright">
+ <p>Authors retain copyright to their work published in <em>First Monday</em>. Please see the footer of each article for details.</p>
+ </div>
+
+
+
+ </div><!-- .entry_details -->
+ </div><!-- .row -->
+
+</article>
+
+
+
+</div><!-- .page -->
+
+ </div><!-- pkp_structure_main -->
+
+ <div class="pkp_structure_sidebar left" role="complementary" aria-label="Sidebar">
+ <div class="pkp_block block_developed_by">
+ <div class="content">
+ <a href="http://pkp.sfu.ca/ojs/">
+ Open Journal Systems
+ </a>
+ </div>
+</div>
+<div class="pkp_block block_web_feed">
+ <span class="title">Current Issue</span>
+ <div class="content">
+ <ul>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/atom">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/atom.svg" alt="Atom logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss2">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss20_logo.svg" alt="RSS2 logo">
+ </a>
+ </li>
+ <li>
+ <a href="https://firstmonday.org/ojs/index.php/fm/gateway/plugin/WebFeedGatewayPlugin/rss">
+ <img src="https://firstmonday.org/ojs/lib/pkp/templates/images/rss10_logo.svg" alt="RSS1 logo">
+ </a>
+ </li>
+ </ul>
+ </div>
+</div>
+
+ </div><!-- pkp_sidebar.left -->
+ </div><!-- pkp_structure_content -->
+
+<div id="pkp_content_footer" class="pkp_structure_footer_wrapper" role="contentinfo">
+
+ <div class="pkp_structure_footer">
+
+ <div class="pkp_footer_content">
+ <p>A Great Cities Initiative of the University of Illinois at Chicago&nbsp;<a href="http://library.uic.edu/">University Library</a>.</p>
+<p>©&nbsp;<em>First Monday</em>, 1995-2020. ISSN&nbsp;1396-0466.</p>
+ </div>
+
+ <div class="pkp_brand_footer" role="complementary">
+ <a href="https://firstmonday.org/ojs/index.php/fm/about/aboutThisPublishingSystem">
+ <img alt="About this Publishing System" src="https://firstmonday.org/ojs/templates/images/ojs_brand.png">
+ </a>
+ </div>
+ </div>
+</div><!-- pkp_structure_footer_wrapper -->
+
+</div><!-- pkp_structure_page -->
+
+<script src="//ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js" type="text/javascript"></script><script src="//ajax.googleapis.com/ajax/libs/jqueryui/1.12.0/jquery-ui.min.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/lib/pkp/js/lib/jquery/plugins/jquery.tag-it.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/popper/popper.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/util.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/lib/bootstrap/dropdown.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/themes/default/js/main.js" type="text/javascript"></script><script src="https://firstmonday.org/ojs/plugins/generic/citationStyleLanguage/js/articleCitation.js" type="text/javascript"></script><script type="text/javascript">
+(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+
+ga('create', 'UA-41314203-1', 'auto');
+ga('send', 'pageview');
+</script>
+
+
+</body>
+</html>
diff --git a/python/tests/files/genders_g58_fairlie.html b/python/tests/files/genders_g58_fairlie.html
new file mode 100644
index 0000000..49cada8
--- /dev/null
+++ b/python/tests/files/genders_g58_fairlie.html
@@ -0,0 +1,146 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+<title>Genders OnLine Journal - Genders OnLine Journal - Presenting innovative theories in art, literature, history, music, TV and film.</title>
+<meta name="description" content="Analysis of Hitchcock's Rope (1948) as a critique of heteromasculinity that thematizes queer anguish, orality, and women's relationship to the covert world of homosexual knowledge.">
+<meta name="keywords" content="homosexuality, homophobia, Cold War, the closet, heteromasculinity, queer anguish, anus, suspicion, orality, eating, cannibalism, Catholicism, knowledge, the cinematic cut, cinematic reality, women in Hitchcock, women and gay men, lack, hypocrisy, straight male interlocutor.">
+<style type="text/css">
+<!--
+
+td {
+ font-family: Arial, Helvetica, sans-serif;
+ font-size: 13px;
+}
+
+.Section1 {
+ page:Section1;
+}
+-->
+</style>
+</head>
+<body alink="#000088" background="../image/back.jpg" vlink="#00aa00">
+<p>
+<table width="600">
+ <tbody>
+ <tr>
+ <td valign="top" width="90"><p><img src="../image/indlgo.gif" alt="Genders OnLine Journal" align="bottom" border="0" height="530" width="97"> </p></td>
+ <td align="right" valign="top" width="530"><table width="530">
+ <tbody>
+ <tr>
+ <td valign="top"><p><b><font size="2">Issue 58</font></b>, Fall 2013</p>
+ <p><font size="5"><strong>Reading Maeshowe</strong></font> <br>
+ Recovering the Feminine in a Neolithic Tomb</p>
+<p>By <strong>CHARLOTTE FAIRLIE</strong></p>
+ <p>[1] Cuween, a small Neolithic cairn, perches on top of a hill on the Orkney Mainland. A flashlight waits in a bucket by the door, and visitors crawl on hands and knees, one by one, into the pitch-black interior. After savoring a degree of darkness rare in modern life, they direct beams of light up the tapering walls to marvel at the skill of the stonemasons. It is impossible to resist the impulse to clamber into the chambers and crouch where the bones once lay. Green and smooth, Maeshowe, another Orkney cairn, rises enigmatically from the field where it has stood since around 2700 BC. The designation of this monument and the surrounding Neolithic structures as a UNESCO World Heritage Site (WHS) in 1999 significantly increased tourism to the area (Card et al. 429), so while visitors may still enter Cuween unsupervised, access to the much larger Maeshowe now requires a timed ticket, bought in advance. Throughout the year, thousands of visitors, bending uncomfortably low, shuffle through the tunnel-like passage entry, making the physical journey from light to dark and a more psychological journey from present to past. Exploring any of the Neolithic sites in Orkney is to bridge time, to feel kinship with those who built them.</p>
+ <p>[2] Without doubt, a major reason Maeshowe attracts so many people is its symbiotic relationship with its environment. Most famously, at sundown during the December solstice, the winter sun lines up with the door of the tomb, shines down the passage, and focuses its rays on the stone wall within. Interest in this phenomenon, the moment when the light stabs the darkness, is so high that Historic Scotland provides web-cam coverage, but Maeshowe fascinates others besides tourists and solstice celebrants. Whether they are vacation visitors, archaeologists, anthropologists, or poets, explorers experience the sites differently, applying their own intellectual tools and imagining Neolithic lives from their respective points of view. Leslie Riddoch has written that these are &ldquo;Stone Age marvels which inspire and astonish,&rdquo; and Simon W. Hall expresses the experiences of many when he refers to &ldquo;the profound impact of entering a tomb&rdquo; (160). They imply that to enter a cairn is to become one with it, to undergo a transformation. Maeshowe, which can now be experienced only under the regimented conditions required by the Historic Scotland guides, clearly retains extraordinary power to inspire. Indeed, this ancient mound has attracted a great deal of literary attention from both noted and obscure writers. Considering these cumulative interpretations, rather than relying solely on the work of archaeologists, opens up a more comprehensive, textured, and, indeed, gendered understanding of ancient history and our commonality with Neolithic peoples.</p>
+ <p> [3] George Mackay Brown, Kathleen Jamie, Myra Schneider, and Dilys Rose are four of the more prominent authors for whom Maeshowe has proven inspirational. They have experienced the tomb through a doubly imaginative process: first by reading it as they would read a poem and then by expressing that interpretation in writing. While Brown was an Orcadian, living most of his life alongside the Neolithic sites, Jamie, Schneider, and Rose, all of whom have Scottish roots, experience Maeshowe as tourists, drawn across the Pentland Firth to enter the passage and travel into the darkness. Significantly, all three of these more contemporary writers are women. Hall, in his valuable survey, <u>The History of Orkney Literature</u>, contrasts the use of the prehistoric by female Scottish writers with that of their male counterparts, stating that it is less political, that women authors take &ldquo;the opportunity to reestablish the place&mdash;and, significantly, the inner lives of women in the prehistoric or early historical northern landscape&rdquo; (162-163). I would argue, however, that their work also engages the public world to a greater extent and is more ideological than this statement implies. Jamie&rsquo;s, Schneider&rsquo;s, and Rose&rsquo;s experiences in Maeshowe lead to readings of the monument that build on the archaeological interpretations, allowing us to consider the possibility of ancient gender power struggles and raising our awareness of the deep roots of masculine dominance.</p>
+ <p>[4] Archaeologist Colin Richards, who has written extensively about The Heart of Neolithic Orkney WHS, describes how visiting cairns must also have affected prehistoric visitors: &ldquo;the journey will be one of consequence.&rdquo; Moving from the light of day to the dark mysteries of a tomb&rsquo;s interior &ldquo;is a passage from the profane to the sacred.&rdquo; As such, &ldquo;it will involve transformation&rdquo; (&ldquo;Doorways&rdquo; 70-71). However, the nature of the transformation is mysterious. Referring to single-chambered structures divided into stalls, he continues, &ldquo;If the Orkney-Cromarty &lsquo;chambered&rsquo; tombs are principally conceived as a series of doorways, the question arises: where are they leading? To what goal?&rdquo; (71). In discussing the relationship between buildings and the people who used them thousands of years ago, Richards considers the figurative significance of doors. In doing so, he treats the tombs as if they were literary texts with debatable meaning, having previously pointed out that &ldquo;the architecture of a chambered tomb relied on analogy and metaphor for its understanding and interpretation&rdquo; (&ldquo;Doorways&rdquo; 67). Rather than merely being repositories for bones, the tombs, Richards asserts, were &ldquo;built to be experienced visually, physically and imaginatively,&rdquo; an experience which may well result in some kind of &ldquo;revelation&rdquo; (&ldquo;Doorways&rdquo; 69, 70, 76). Since he argues that buildings carry metaphoric meaning, open to imaginative interpretation, it is entirely appropriate that, when explaining this, Richards also changes to the historical present tense. His grammatical shift emphasizes that like <u>Beowulf</u>, <u>Hamlet</u>, or <u>Moby Dick</u>, tombs such as Maeshowe transcend time and are open to new readings, whether by trained archaeologists, pilgrims, casual visitors, or writers.</p>
+ <p>[5] Robert Crawford draws more explicit parallels between Maeshowe itself and literature in his essay, &ldquo;Maes Howe Sappho.&rdquo; Noting the continuing appeal of the tomb, how today &ldquo;people still treasure&rdquo; the moment that the sun lines up with the passage, he compares the ancient monument to poetry:</p><blockquote>However different we and our family groups, our tribes, have become, we can and do still savor that sense of alignment and attunement and have our own ways of articulating some sort of consonance between ourselves, our intimate groupings, and the universe that surrounds us. Though such patternings may be deconstructed, they seem to emerge from a deep need that recurs across generations, like a persistent internal rhyme, and poetry, this most nuanced way of making with words, is a way in which that need for attunement is repeatedly articulated through language. If prehistoric sites often appear to relate people to the stars and planets, then poems continue that impulse. (61)
+ </blockquote>
+ <p>Ancient tombs, then, prompt us to ponder our place in the universe, our identity as humans, and in that also they resemble literature. According to Kenneth Brophy, Neolithic monuments &ldquo;were and are locations that embodied the biography of the builders, users, spectators, and excavators&rdquo; (10). It follows that if we think of Maeshowe as a text, Brophy&rsquo;s assertion that the monument absorbs the &ldquo;biography&rdquo; of all who have used it or visited it, positions it as an example of intertextuality. Maeshowe has many constantly changing stories to tell to its different readers, and readers will respond differently to its figurative meanings.</p>
+ <p>[6] In a 1977 column for <u>The Orcadian</u> newspaper, George Mackay Brown describes how witnessing the midwinter solstice at Maeshowe affects him: &ldquo;Winter after winter I never cease to wonder at the way primitive man arranged, in hewn stone, such powerful symbolism&rdquo; (&ldquo;Maeshowe at Midwinter&rdquo; 88). Like Richards, Brown is emphasizing the figurative qualities of the structure, which he has further explored in poetry. However, the first of his 1999 &ldquo;Two Maeshowe Poems&rdquo; (often printed as a stand-alone) opens not at the tomb, but with an image of the neighboring stone circle, Brodgar. Perhaps surprising to most readers, this would resonate with archaeologists since current scholarship emphasizes that the sites comprising The Heart of Neolithic Orkney are not self-contained but exist and function in relation to one another and to the surrounding landscape (See &ldquo;Heart of Neolithic Orkney WHS: Setting Project&rdquo; 5). As such, they should not be interpreted as discrete entities. It is fitting, then, that Brown&rsquo;s poem moves seamlessly through a series of images that integrate Brodgar&rsquo;s &ldquo;light and darkness&rdquo; with Maeshowe&rsquo;s &ldquo;flowers [and] stone&rdquo; (a reference to the runic graffiti carved by Vikings inside the tomb) and &ldquo;skulls&rdquo; (Lines 1, 9, 11). The first word of the poem, &ldquo;Circle,&rdquo; is semantically echoed in the initial word of each ensuing stanza, &ldquo;Ring,&rdquo; &ldquo;Wheel,&rdquo; and &ldquo;Round,&rdquo; subtly shifting from the geometrically circular Brodgar to the tumescent mound of Maeshowe and emphasizing the cycle of &ldquo;life and death&rdquo; (7). For this is a poem about regeneration, how &ldquo;Out of those skulls / Breaks the first green shoot, the full ear, then the bread&rdquo; (11-12). 
Throughout, juxtaposed images look for the positive to outweigh the negative: &ldquo;We move in shadows,&rdquo; but &ldquo;Brodgar has burned on the moor a dance of sun&rdquo;; &ldquo;Ring of quern and plough&rdquo; (a quern is a stone for grinding grain) are charged to &ldquo;contain / Our tumults of blood&rdquo;; &ldquo;The stars&rsquo; chaos is caught in a strict rein&rdquo;; the word &ldquo;stone&rdquo; is enveloped by &ldquo;flowers,&rdquo; and &ldquo;beauty and love&rdquo;; similarly, &ldquo;snow&rdquo; is flanked by &ldquo;sun&rdquo; and &ldquo;seed.&rdquo; So darkness becomes light, destructive violence is subservient to the raising and grinding of grain for bread, order makes sense of the universe, the beautiful and the warm temper the hard and the cold, and new life will follow death.</p>
+ <p>[7] Brown&rsquo;s interpretation of these monuments, his use of the architectural circularity and roundness of the Ring of Brodgar and Maeshowe as metaphors for the lifecycle and the possibility of renewal, is shared by archaeologists, who despite its being a burial site, have also associated Maeshowe and its rituals with the agricultural year. Neolithic people were not nomadic but had gradually become settled farmers, living by the routines and rhythms of the seasons, which, according to Richards, constituted &ldquo;an analogy with the human life cycle and past generations&rdquo; (&ldquo;Doorways&rdquo; 65). Time&rsquo;s passage was the organizational framework for survival as well as mortality, and the tombs, he writes, were &ldquo;a metaphorical extension of daily life&rdquo; (&ldquo;Doorways&rdquo; 76). Trevor Garnham, an architect, develops that idea further: &ldquo;Burying bones in the earth was perhaps to seek some metaphoric relationship with the planting of seeds. In its maturity and death, the seed containing the essence of its own renewal served as the inspiration for the hope of life&rsquo;s rebirth in some other form&rdquo; (87). In pairing skeletal remains with seeds as an expression of hope for the future, Garnham&rsquo;s analogy is comparable to the positive final image of Brown&rsquo;s poem, the &ldquo;skulls&rdquo; engendering the &ldquo;green shoots&rdquo; and the &ldquo;bread&rdquo; of life.</p>
+ <p>[8] Brown had written earlier of Maeshowe in his 1996 poem, &ldquo;Maeshowe: Midwinter,&rdquo; choosing then to focus on the solstice. However, the imagery here is not rooted in the agricultural cycle, the earthly world of querns, ploughs, and bread; instead, he connects the pre-Christian tomb to the Christian calendar. The opening phrase, &ldquo;Equinox to Hallowmass,&rdquo; immediately integrates the astronomical with the sacred, giving the season of &ldquo;darkness&rdquo; both physical and spiritual dimensions (1). The religious imagery continues in the second stanza as it evokes &ldquo;St Lucy,&rdquo; whose feast day falls on the shortest day of the year (6). She is portrayed as a weaver whose &ldquo;shuttle&rdquo; creates &ldquo;a dark web&rdquo; that &ldquo;fills the loom&rdquo; (7-9), placing at the centre of the poem a world in which light is completely absent: &ldquo;The blackness is solid as a / stone that locks a tomb. / No star shines there&rdquo; (10-12). To be in such a void, with no guiding star, would seem like a moment of psychological despair, yet just as the days begin to lengthen immediately after the solstice, the poem also brightens. The moment when the sun enters the passage is the &ldquo;true ceremony,&rdquo; suggesting that perhaps the pagan reverence for nature carries particular authenticity. Then &ldquo;the last fleeting solstice flame&rdquo; is &ldquo;caught up,&rdquo; leading to an optimistic note as the children&mdash;the future&mdash;sing with &ldquo;voices like leaves of light&rdquo; (19). Again, the poem ends with an image of rebirth, but its tone is less biological and more cosmological.</p>
+ <p>[9] While Brown&rsquo;s poems use these dual frames of reference in order to explore the themes of regeneration that Maeshowe expresses, the biological and cosmological are not at odds. Garnham defines the cosmos as &ldquo;an all-encompassing world of things and phenomena [. . . .] The essential character of this early form of cosmos bound every aspect of a people&rsquo;s life into reciprocal relationships with the forces that give shape to their world&rdquo; (9). The central argument of his book places Neolithic Orkney in this context. Similarly, reading Brown&rsquo;s two Maeshowe poems together reveals that the &ldquo;green shoot&rdquo; which produces the &ldquo;bread&rdquo; corresponds to the youthful &ldquo;voices like leaves of light.&rdquo; In fact, his insertion of &ldquo;leaves,&rdquo; with its agrarian connotations, into that final line establishes the connection, recognizes that the complex architectural system of domestic houses, burial chambers, and stone circles symbolizes the idea that the activities for which they were designed&mdash;working, eating, loving, sleeping, worshipping, dying, and the possibility of rebirth&mdash;are the web of human existence. The physical bread and the metaphysical song are one.</p>
+ <p>[10] In their respective responses to Maeshowe, Kathleen Jamie, Myra Schneider, and Dilys Rose also address the theme of the cycle of life and death. Jamie&rsquo;s essay, &ldquo;Darkness and Light,&rdquo; describes a quest: she seeks a good, positive darkness because, in the 21st century, it has become impossible &ldquo;to see the real dark for the metaphorical dark . . . the death-dark.&rdquo; Enjoyment of the &ldquo;natural, courteous dark,&rdquo; she has come to believe, has been squeezed out by the Christian belief in a metaphorical darkness that stands for the opposite of salvation (9-10). However, as she is planning this trip, a friend points out that &ldquo;Maes Howe is a metaphor,&rdquo; perhaps exposing a flaw in Jamie&rsquo;s thinking: possibly the natural and metaphorical darknesses are inseparable (10 emphasis added). Although her visit to Maeshowe takes place a couple of days before the solstice, the artificial lights of a surveyor&rsquo;s crew assault her eyes, so she rediscovers no &ldquo;courteous darkness&rdquo; and witnesses &ldquo;no resurrecting beam of sunlight&rdquo; (19). Nevertheless, through Maeshowe, she becomes reconciled to the conventional negative concept of darkness. In terms of &ldquo;wonder&rdquo; similar to Brown&rsquo;s in <u>The Orcadian</u>, she asks, &ldquo;Were they the first people . . . to articulate this metaphor of light and dark, of life and death?&rdquo; and reflects upon its significance:</p><blockquote>For five thousand years we have used darkness as the metaphor of our mortality. We were at the mercy of merciless death, which is darkness. When we died, they sent a beam of midwinter light in among our bones. What a tender, potent gesture. In the Christian era, we were laid in our graves to face the rising sun. We&rsquo;re still mortal, still don&rsquo;t want to die, don&rsquo;t want our loved ones to die. (19-20)
+ </blockquote>
+ <p>Her rejection of a metaphor that she has considered &ldquo;[worn] out&rdquo; and &ldquo;redundant&rdquo; (4, 9) turns out to have been less literary and more personally psychological, for Jamie&rsquo;s visit to the tomb leads to her acceptance of mortality. Whereas previously she has blamed Christianity, she now appreciates that the Christian concept of darkness is part of a continuum of dread traceable back to Neolithic times and forward to our own. The &ldquo;tender, potent gesture&rdquo; of the light penetrating the dark of the tomb, therefore, offers consolation, ameliorating our most profound fears (20).</p>
+ <p>[11] In her poem, &ldquo;Maeshowe,&rdquo; Myra Schneider also describes a guided tour of the cairn, during which the speaker uses the second person singular to address a hypothetical visitor, initially giving the sense that to enter the burial place feels like death as the &ldquo;chill seeps into your body&rdquo; (14). However, this ominous impression is immediately dismissed because &ldquo;a stillness that&rsquo;s other than death inhabits / this place where the undead gather to greet the dead&rdquo; (15-17). The journey through the passage will take &ldquo;you&rdquo; to a place that is not oblivion but, instead, is where the living may consort with their ancestors. Again, the boundary between life and death, which can seem so irrevocable, becomes less absolute and, therefore, less threatening. After the visit is over, its impact will remain, and the speaker imagines her visitor&rsquo;s memories:</p><blockquote>In midwinter you&rsquo;ll visualize the sun piercing the dark that swaddles seeds, see it falling on the aligned entrance, its white shine splitting to burnish the passage wall, flood the ground with gold. (22-26)
+ </blockquote>
+ <p>These images recall Garnham&rsquo;s theory: that the burial of bones is connected metaphorically to the planting of seeds. In the speaker&rsquo;s memory, the dark cradles seeds, the germ of life, rather than bones. Once sunlight enters the tomb, a radiant moment occurs in which the &ldquo;ground&rdquo; will turn &ldquo;gold,&rdquo; like a field of ripe grain. Schneider&rsquo;s poem, like Brown&rsquo;s, affirms the archaeological reading of Maeshowe as a place of renewal, but in this case that renewal goes beyond the promise of the agricultural cycle. An individual will be able to experience, perhaps during times of psychological or spiritual gloom, the moment of glory when the sun is &ldquo;piercing / the dark.&rdquo; There is a Romantic quality to these lines: Maeshowe will stay with Schneider&rsquo;s speaker as those daffodils stay with Wordsworth, &ldquo;to flash upon the inward eye / That is the bliss of solitude,&rdquo; to stimulate the imagination (24). Having herself benefited from the tomb&rsquo;s restorative qualities, the speaker is inspired to spread the word, to share her revelation with &ldquo;you,&rdquo; the reader.</p>
+ <p>[12] Besides the drama of the solstice, another inspirational feature of Maeshowe is the Viking runes carved on the interior walls. Referring to these inscriptions as &ldquo;The first island poems,&rdquo; Brown quotes them emphatically in the second of the paired poems: &ldquo;INGIBIORG IS THE LOVELIEST GIRL / HERMUND WITH A HARD AXE CARVED RUNES&rdquo; (&ldquo;Two&rdquo; 13, 18-19). Many have been struck by the simple humanity of these statements, as well as the paradox inherent in this lusty youthful scrawling being hidden in a tomb. Dilys Rose, in &ldquo;Maeshowe Nipple,&rdquo; for instance, lists the prosaic concerns of the Vikings, portraying them as &ldquo;intrepid&rdquo; but also homesick, missing &ldquo;sweethearts and family&rdquo; (4, 9). At the ends of their respective poems, both Brown and Rose emphasize that Maeshowe was merely a temporary shelter for the Vikings: the &ldquo;young seamen climbed out of Maeshowe, / Their nostrils wide to the salt wind&rdquo;; &ldquo;the dragon boats moved on&rdquo; (Brown &ldquo;Two&rdquo; 23-24; Rose 11). Crawling out of the subterranean tomb and heading for further maritime adventures, the men re-enter the world, extending the overall theme of regeneration. Brown, as we have seen, has already linked the tomb with the life-giving promise of &ldquo;the first green shoot, the full ear, then the bread&rdquo; in the first of these paired poems. Rose, in similar terms, also connects the Viking runes with the reassuring knowledge that there will be a crop next year: over the centuries, &ldquo;their tongue / took root and sprouted from invaded soil / green words for <u>Father</u>, <u>Daughter</u>, <u>Bread</u>&rdquo; (11-13). Here, in the final lines, the Viking vocabulary is fresh and verdant, a harbinger of new human life and the grain that nourishes it. Since runic characters are &ldquo;straight-branched&rdquo; (Rose 4), they resemble rows of rudimentary skeletal stick figures which have been buried in the tomb. 
The bony runes, therefore, have become metaphorical seeds, and Rose&rsquo;s speaker, like Garnham, sees hope in the bone/seed analogy.</p>
+ <p>[13] It is clear, to summarize briefly, that these four creative writers read Maeshowe much as archaeologists and historians of architecture have done, as an expression of hope for the future, particularly in relation to the coming of spring, but also at a more personal level. The texts suggest that to visit these tombs is, as Richards also emphasizes, transformative. Like their ancestors, contemporary visitors are changed, in some manner revitalized, especially if they witness the sun&rsquo;s midwinter alignment, which Brown describes as a &ldquo;pledge of renewal, a cry of resurrection&rdquo; (&ldquo;Maeshowe in Midwinter&rdquo; 88). However, in the work of Jamie, Schneider, and Rose, a further, more political restoration is at work, for all three use images equating Maeshowe with the female body.</p>
+ <p>[14] Kathleen Jamie states early in her essay, &ldquo;We are conceived and carried in the darkness,&rdquo; emphasizing the positive, life-giving qualities of the dark, and inviting the reader to see Maeshowe as a uterus (4). The womb/tomb imagery is developed further when she eroticizes the winter solstice as &ldquo;a complicit kiss,&rdquo; during which &ldquo;the beam of the setting sun shines along the passage, and onto the tomb&rsquo;s back wall&rdquo; (12). When she goes inside the tomb, she expects &ldquo;not utter darkness, but perhaps a wombish red&rdquo;; however, this is denied her because of the lights of the surveyors, one of whom is &ldquo;folded, foetus-like, into the little cell in the back wall&rdquo;: a foetus implanted in the very place where the sunbeam strikes (12, 13). When Jamie leaves, she describes taking &ldquo;the smallest and most challenging of journeys, squeezing down a passageway and out into the world of sound and moving air&rdquo; (17). The tunnel that admits the beam has become a birth canal, so Jamie&rsquo;s transformation is not only her intellectual reassessment of the metaphorical value of darkness; she visualizes her own rebirth in more literal terms too, with Maeshowe cast as the mother.</p>
+ <p>[15] Myra Schneider&rsquo;s &ldquo;Maeshowe&rdquo; also hints that to visit the tomb is to return to the womb when the speaker remarks that although &ldquo;you&rdquo; are part of a tour group, you will realize that you are &ldquo;alone&rdquo; and have &ldquo;never travelled so far back / so far in&rdquo; (8-10). This analogy is made more explicit later in the poem when the sun enters the passage: &ldquo;In that deep chamber / you&rsquo;ll be bathed in red, not the red spilt in hatred&mdash;/the red that&rsquo;s birth, the heart looming with the blood&rdquo; (24-28). In the vision that the speaker evokes for the visitor&rsquo;s memory, therefore, the &ldquo;dark that swaddles seeds&rdquo; not only nurtures and protects the grain that will ripen into crops, but also the fertilized ovum (23). With no dazzling and intrusive surveyors&rsquo; lights, Schneider suggests that it is possible for us to experience the &ldquo;wombish red&rdquo; that was denied Jamie, blood that is the force of life rather than the mark of violence.</p>
+ <p>[16] Dilys Rose&rsquo;s poem, &ldquo;Maeshowe Nipple,&rdquo; on the other hand, in addressing the Viking use of the tomb, acknowledges that violence has taken place. The title, of course, immediately signals that Maeshowe is female, and the opening lines graphically describe the tomb&rsquo;s external anatomy: a &ldquo;breast,&rdquo; with an &ldquo;aureola / sandy-rimmed, the nipple leaking a pale trail / to hidden chambers&rdquo; (1-3). Within, Maeshowe&rsquo;s chambers have been &ldquo;invaded&rdquo; by men who &ldquo;inscribed their conquests&rdquo; and &ldquo;totted up the loot&rdquo; (12, 4, 6). Even though the poem has initially compared the cairn to a breast rather than a womb, this seems like a rape or an assault by men exercising their power and keeping track of their plunder. As human and homesick as the poem presents the young men, it does not forget that their presence in Maeshowe is as uninvited intruders who leave their runic seeds carved into the chamber walls.</p>
+ <p>[17] To make sense of this pattern of imagery, it is helpful to turn to an earlier female author, similarly inspired by her visit to a Neolithic site. Naomi Mitchison wrote <u>Early in Orcadia</u> after a friend took her to another of Orkney&rsquo;s chambered tombs, Isbister, which has no passage entry, because &ldquo;she knew it would waken something in me&rdquo; (8). Set in Neolithic times, the novel follows a family and its descendants as they settle on Orkney, establish homes and villages, and erect the monuments in which they practice their religious rituals. Mitchison depicts the cairns predating the stone circles (both Isbister and Maeshowe are, in fact, thought to have been built before Brodgar) and imaginatively describes the changing beliefs prompting these architectural developments. Tradition holds that pregnant women must visit the tomb in order that the ancestral spirit will be passed to their children (132). One woman, Ba, making this journey, reflects that a &ldquo;few moons&rdquo; have passed since she became pregnant and stopped menstruating. She also knows that a powerful goddess, &ldquo;the big bad Moon Woman had once had an honouring place,&rdquo; had watched over the dead (119). However, the Moon Woman has been supplanted by the sun. The burial place was &ldquo;pulled apart and scattered by the Sun Man and the bulls. After that came the beginning of their own honouring place where the bones lay and where you must go down on your knees before you could get in&rdquo; (119). The later passage cairn, then, is a creation of the masculine sun, the same sun that shines down the passageway at midwinter. Accompanied by bulls, also male, the Sun Man has ravaged the Moon Woman&rsquo;s tomb and designed a new one to suit his own needs. Even so, the burial place is still associated with female fertility. Nervously, Ba enters &ldquo;on her hands and knees . . . 
under and between great stones.&rdquo; Once inside, though, she thinks of the moments before she conceived her child: &ldquo;She was waiting, almost as she had waited in the soft sand behind that rock in the sun-warmed geo a few moons back&rdquo; (130). For Ba, the tomb is not frightening. She recalls not a violent rape, but a loving encounter, and the darkness feels as warm as the &ldquo;geo&rdquo; (an Orcadian word referring to a deep, narrow fissure in a cliff) where she met her lover. Following her memory of the moment of conception, she is &ldquo;push[ed] . . . back, back to the way out, back to the square of light, to the way out into the real world on hands and knees as one must&rdquo; (130). Like Jamie, Ba is compelled to crawl, to battle her way through the passage to be reborn.</p>
+ <p>[18] By the end of <u>Early in Orcadia</u>, the stone circle, with its emphasis on light rather than dark, is becoming the ultimate manifestation of the transfer of power from the Moon Woman to the Sun Man. Its significance is explained by the &ldquo;Great Man,&rdquo; who is &ldquo;painted with sun circles,&rdquo; to Moon Woman after he has summoned her to his presence: &ldquo;The great tall stones . . . were so raised to show the way of the sun, who is our master and our maker&rdquo; (169). Moon Woman, however, is aware of the injustice of this arrangement: &ldquo;They said that the moon was the servant of the sun, to do what he wanted, but that, Moon Woman knew, was not right. In her own mind she unsaid it&rdquo; (170). At first she is jealous and afraid, but the final vision of the novel is hers, and it is, to an extent, a reconciliation of powers:</p><blockquote>If I were to say a few small and easy words to the Great Man, if I were to move myself in a certain way, then we would be sun and moon. Then I would put my fingers onto the colour, onto that knife, onto his eyes, . . . eyes, onto that round, shining sun that hangs over his heart, fingering it so that my fingers would meet his, me going . . . onto all parts of him. He would be mine as the sun is the moon&rsquo;s. (176)
+ </blockquote>
+ <p>She is picturing an intertwining of sun and moon, of masculine and feminine&mdash;a consummation. The partnership is not one of complete equality, though, for she also envisions not that the sun will be the master and the moon the servant, but that he will be hers, that the moon will possess the sun, that her status will be restored.</p>
+ <p>[19] Mitchison&rsquo;s fictional representation of light/sun/man emerging as the object of worship and awe, assuming the rank previously held by dark/moon/woman, is an idea rooted across cultures: &ldquo;A fundamental polarity in many creation myths,&rdquo; according to Trevor Garnham, &ldquo;contrasts the dark, fecund, harbouring earth with the up-drawing sun.&rdquo; (145). He points out, for example, that &ldquo;by the time of the Celtic occupation of Britain, there were well-established beliefs and practices focused on the sun&rdquo; and that in Norse mythology, &ldquo;a male hierarchy supplanted older, matriarchal law&rdquo; (161, 109). Analyzing the archaeological sites within this paradigm, Garnham argues, supports the theory that religious practice fundamentally changed along with the architecture, that &ldquo;ritual activity associated with burial cairns became transferred to stone circles&rdquo; (152).</p>
+ <p>[20] Maeshowe, however, suggests a mid-point in this ritualistic shift because although, like earlier stalled cairns, it is dark and womb-like, its annual climactic moment is when the sun lights up the passage. Garnham sees the Neolithic architecture of Orkney as a progression. The first structures, the houses, were purely domestic; they had a &ldquo;nurturing role&rdquo; (66). The houses at the coastal village site, Scara Brae, therefore, &ldquo;seem to be fundamentally powerful symbols of protection and gathering, echoing that of the pot and the basket&rdquo; (70). Since the manufacture of both pots and baskets was the work of women, Garnham is reading the houses as essentially feminine. They were vessels, their stone walls embanked by earth. Both Garnham and Richards point out that the houses were models for the tombs: the passage graves are structurally similar to the houses at Scara Brae, and both were covered with turf (Garnham 48; Challands, Muir &amp; Richards 242, 245). Cairns of the Maeshowe type, with passage entries, however, were the later forms. The earlier stalled structures, such as Midhowe, on the island of Rousay, did not feature the tunnel entrance.</p>
+ <p>[21] Archaeologists do not agree on the social significance of passage cairns and sun circles, the extent to which their development reveals a move to a more hierarchical society. Challands, Muir, and Richards state, &ldquo;In many ways, everything about the architecture of Maeshowe enforces a notion of separation, division, and restriction&rdquo; (247). Elsewhere, Richards and another co-writer are more guarded. They point out that the tomb resembles House 2 at the nearby Barnhouse settlement, a larger house than any at Scara Brae that was probably &ldquo;highly restricted on the basis of an individual&rsquo;s status, probably additionally defined in terms of age and gender.&rdquo; However, they also warn that there is insufficient archaeological evidence to &ldquo;leap to conclusions about a patriarchal group of &lsquo;elders&rsquo; who used knowledge as a commodity to maintain their power over women and younger men&rdquo; (Muir &amp; Richards 204). Although cautious, they do acknowledge that &ldquo;power and authority,&rdquo; probably based on &ldquo;cosmological beliefs,&rdquo; would have been necessary to build the monuments (199). Leaning not only on physical but also anthropological evidence, Garnham&rsquo;s view, on the other hand, is that the more formal structure <u>does</u> support the idea of hierarchy and that the estimated 100,000 man/hours that would have been necessary to build it point to a more complex social structure that had to extend beyond the local community (128). Furthermore, he writes, the layout of individual chambers &ldquo;can be read as a metaphor of primogeniture&rdquo; (74). Like Richards, Garnham interprets the passage as a symbol of privilege because it was hard to get inside. However, citing Eliade&rsquo;s <u>Patterns in Comparative Religion</u>, he also emphasizes that there is &ldquo;a close connection between solar theology and the elite&rdquo; (163). In this context it seems that &ldquo;allowing access to the sun . . . 
was more important that [sic] allowing access to members of the tribe&rdquo; (131-132).</p>
+ <p>[22] Maeshowe can be seen, then, as expressing a point of tension between earth and sun in which the dark tomb is literally infiltrated by solar rays on one day only. The subsequent building of the Circle of Brodgar elevates the stature of the sun. Fully above ground, the center of its astronomical and religious year occurs not in December, but in June, at the midsummer solstice. Garnham points out that while a smaller circle, the Stones of Stenness, is open to the sun at its &ldquo;point of maximum power,&rdquo; Maeshowe allows the sun inside only when it is &ldquo;at its lowest ebb.&rdquo; Except at midwinter, &ldquo;the tomb is dark, cold, and filled with white bones, echoing the whiteness of the moon&rdquo; (207). Although Stenness actually predates Maeshowe by perhaps 400 years, throwing off the neat chronology of <u>Early in Orcadia</u>, Garnham&rsquo;s interpretation of Maeshowe and the stone circles parallels Mitchison&rsquo;s literary response to the Isbister tomb: compared to earlier cairns, Maeshowe is a more patriarchal development, the passageway allowing the masculine sun to displace the feminine &ldquo;whiteness of the moon,&rdquo; and yet the bones, the metaphorical seeds, still lie dormant; the presence of Moon Woman endures.</p>
+ <p>[23] Although <u>Early in Orcadia</u> ends with Moon Woman&rsquo;s vision of a mingling of sun and moon, of masculine and feminine, there is a note of uncertainty as she asks herself, &ldquo;Should I, then?&rdquo; (176). She does not ask &ldquo;Can I?&rdquo; but &ldquo;Should I?&rdquo; Her question is not whether she is personally capable, but whether it would be wise to challenge the elite power structure in the name of justice. Readers are left without an answer, but since women are still fighting for equality in the institutions of politics and religion, it is reasonable to assume that if Moon Woman did attempt it, she met with a great deal of resistance. It is with this in mind, then, that we can return to the Maeshowe experiences of Jamie, Schneider and Rose. Their visits to the cairn suggest that to see it merely as a symbol of agricultural regeneration or even more broadly of hope, is incomplete. Something more needs to be resurrected, and their use of the female imagery effectively acknowledges and reclaims a feminine narrative for Maeshowe. In Rose&rsquo;s poem, 12th century Vikings may take up residence inside, but 900 years later, the reader is instructed to &ldquo;See,&rdquo; to bear witness to &ldquo;a green breast in a green field,&rdquo; the most nurturing part of a woman&rsquo;s body surrounded by the new growth of spring (1). When Schneider refers to the &ldquo;red that&rsquo;s birth&rdquo; rather than the &ldquo;red spilt in hatred,&rdquo; and describes how the sun will &ldquo;burnish the passage wall, / flood the ground with gold&rdquo; and, similarly, when Jamie refers to the &ldquo;complicit kiss,&rdquo; it is as if Moon Woman&rsquo;s consummation has finally taken place and justice restored.</p>
+ <p>[24] Richards asks where the doors of tombs lead, to what &ldquo;revelation.&rdquo; Indeed, the creative writing of Jamie, Schneider, and Rose transports readers through Maeshowe&rsquo;s entryway towards &ldquo;revelation.&rdquo; Their collective responses help us to recognize the humanity of Neolithic peoples, to appreciate how common experiences connect us to the past. They ask us to consider the roots of sexual discrimination, the possible marginalization of women 5000 years ago. More universally, they honor the memory of displaced matriarchal societies and, thus, prompt us to reflect on the status of women today. While, as Hall points out, male authors of the mid-twentieth-century Scottish Literary Renaissance had a nationalist political agenda, &ldquo;looking for Scotland in Scotland&rsquo;s prehistory&rdquo; (160), these female writers look to the past for a feminist renewal, both personal and political. As such, their interpretations complement and illuminate those of archaeologists. Naomi Mitchison, acknowledging that she may be &ldquo;treading on the toes of archaeologists,&rdquo; points out that their physical &ldquo;evidence may not always offer a clear interpretation, in fact it very seldom does&rdquo; (113). For despite their painstaking sifting (both literal and figurative) of physical evidence, archaeologists must, finally, apply their own imaginations.</p>
+ <p>[25] Archaeologists themselves recognize the uncertainty inherent in drawing conclusions about ancient societies from the surviving fragments of their lives. In reference to the recent discovery of a complex of temples at the Ness of Brodgar, Richards has said, &ldquo;This was a ceremonial centre, and a vast one at that. But the religious beliefs of its builders remain a mystery&rdquo; (qtd. in McKie). In fact, the excavation of this temple complex is prompting a reassessment of the entire Heart of Neolithic Orkney. Tom Muir, of the Orkney Museum, goes so far as to assert that &ldquo;the whole text book of British archaeology for this period will have to be torn up and rewritten from scratch thanks to this place&rdquo; (qtd. in McKie). Even as archaeologists, using sophisticated technology, scrape away the dust of time from this long-buried site, it remains true that &ldquo;Insights can only come from interpretation&rdquo; (Jones and Richards 195). It is in this interpretative arena that science must join forces with the arts and humanities in the search for knowledge, for a fuller understanding.</p>
+ <p>[26] George Mackay Brown has written, &ldquo;People in 2000 AD are essentially the same as the stone-breakers [. . .] of 3000 BC&rdquo; (&ldquo;Brodgar Poems&rdquo; lines 10-12). Knowing where we have come from, fleshing out our understanding of the prehistoric world and, therefore, ourselves, takes the skills and multiple perspectives not only of scientists, archaeologists, architects, and anthropologists, but also essayists, poets, and more. The interdisciplinary synergy involved in comparing archaeological, anthropological, and literary interpretations of Maeshowe sheds light on the shadows of the past, raises questions about the more elusive shadows of Neolithic women, and provides historical context for our understanding of gender relations across time. Like crawling through the passage into the dark and out to the light, the empirical and literary journeys into the mysteries of Maeshowe are indeed transformative, exhuming the bones of the past that we may better nurture the seeds of the future.</p>
+ <p>ACKNOWLEDGEMENTS. Thanks are due to Edward Gale Agran, Stephen Potthoff, and the anonymous reviewers for their time and valued advice. </p>
+ <p align="center">WORKS CITED</p>
+ <p>Bevan, Archie, and Brian Murray. Eds. <u>The Collected Poems of George Mackay Brown</u>. London: John Murray, 2005. Print.</p>
+ <p>Brown, George Mackay. &ldquo;Brodgar Poems (1992).&rdquo; In Bevan and Murray. 308-312. Print.</p>
+ <p>---. &ldquo;Maeshowe: Midwinter.&rdquo; 1996. In Bevan and Murray. 320. Print.</p>
+ <p>---. &ldquo;Maeshowe at Midwinter.&rdquo; 1977. <u>Under Binkie&rsquo;s Brae</u>. Edinburgh: Gordon Wright Publishing, 1979. 87-88. Print.</p>
+ <p>---. &ldquo;Two Maeshowe Poems.&rdquo; 1999. In Bevan and Murray. 420-421. Print.</p>
+ <p>Card, Nick, et al. &ldquo;Bringing a Landscape to Life? Researching and Managing &lsquo;The Heart of Neolithic Orkney&rsquo; World Heritage Site.&rdquo; <u>World Archaeology</u> 39.3 (2007): 417-435. EBSCO <u>Academic Search Complete</u>. Web. 29 Jun. 2011.</p>
+ <p>Challands, Adrian, Tom Muir, and Colin Richards. &ldquo;The Great Passage Grave of Maeshowe.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 229-248. Print.</p>
+ <p>Crawford, Robert. &ldquo;Maes Howe Sappho.&rdquo; <u>Yale Review</u>: 95.1 (2007): 60-65. OhioLINK Electronic Journal Center. Web. 29 Jun. 2011.</p>
+ <p>Garnham, Trevor. <u>Lines on the Landscape, Circles from the Sky: Monuments of Neolithic Orkney</u>. Stroud, Gloucestershire: Tempus, 2004. Print.</p>
+ <p>Hall, Simon W. <u>The History of Orkney Literature</u>. Edinburgh: John Donald/Birlinn Ltd., 2010. Print.</p>
+ <p>&ldquo;Heart of Neolithic Orkney WHS: Setting Project.&rdquo; Historic Scotland. 2008. EBSCO <u>Academic Search Complete</u>. Web. 30 Jun. 2011.</p>
+ <p>Jamie, Kathleen. &ldquo;Darkness and Light.&rdquo; <u>Findings: Essays on the Natural and Unnatural World</u>. Ed. Jamie. St. Paul, MN: Graywolf, 2005. 3-22. Print.</p>
+ <p>McKie, Robin. &ldquo;Neolithic Discovery: Why Orkney is the Centre of Ancient Britain.&rdquo;</p>
+ <p><u>The Guardian / The Observer</u>. 6 Oct. 2012. Web. 16 Mar. 2013.</p>
+ <p>Mitchison, Naomi. <u>Early in Orcadia</u>. Glasgow: Richard Drew, 1987. Print.</p>
+ <p>Jones, Si&acirc;n, and Colin Richards. &ldquo;The Villagers of Barnhouse.&rdquo; <u>Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney</u>. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 195-204. Print.</p>
+ <p>Richards, Colin. &ldquo;Doorways into Another World: The Orkney-Cromarty Chambered Tombs.&rdquo; <u>Vessels for Ancestors: Essays on the Neolithic of Britain and Ireland in Honour of Audrey Henshall</u>. Ed. Niall Sharples and Alison Sheridan. Edinburgh: Edinburgh UP, 1992. 62-76. Print.</p>
+ <p>Riddoch, Lesley. &ldquo;Stone Age Marvels Which Inspire and Astonish: Wonders of Scotland.&rdquo; <u>The Scotsman</u>. 13 Feb. 2006. Web. 30 Jun. 2011.</p>
+ <p>Rose, Dilys. &ldquo;Maes Howe Nipple.&rdquo; <u>Bodywork</u>. Edinburgh: Luath Press, 2007. Print.</p>
+ <p>Schneider, Myra. &ldquo;Maeshowe.&rdquo; <u>Circling the Core</u>. London: Enitharmon Press, 2008. 23-24. Print.</p>
+ <p>Wordsworth, William. &ldquo;I wandered lonely as a cloud.&rdquo; <u>The Norton Anthology of English Literature</u>. Eighth Ed. Ed. Stephen Greenblatt and M.H. Abrams. New York: Norton, 2006. 305-306. Print.</p>
+<p><strong>Contributor's Note</strong></p>
+ <p><strong>CHARLOTTE FAIRLIE</strong> teaches English at Wilmington College, in Wilmington, Ohio. Her published work focuses on Scottish literature and rural life in literature. She is currently co-editing an anthology of poetry relating to scythes and mowing.</p></td>
+ <td valign="top"><center>
+ <a href="../index.html"> <img src="../image/btncu.gif" alt="Current Issue" border="0" height="42" width="79"></a><br>
+ <a href="../download.html" tppabs="http://www.genders.org/download.html"> <img src="../image/btndo.gif" alt="Download" tppabs="http://www.genders.org/image/btndo.gif" align="bottom" border="0" height="42" width="115"></a><br>
+ <a href="../edit.html" tppabs="http://www.genders.org/edit.html"> <img src="../image/btned.gif" alt="Editorial Board" tppabs="http://www.genders.org/image/btned.gif" align="bottom" border="0" height="50" width="80"></a><br>
+ <a href="../guide.html" tppabs="http://www.genders.org/guide.html"> <img src="../image/btngu.gif" alt="Contributor Guidelines" tppabs="http://www.genders.org/image/btngu.gif" align="bottom" border="0" height="42" width="90"></a><br>
+ <a href="../recent.html"> <img src="../image/btnre.gif" alt="Recent Issues" tppabs="http://www.genders.org/image/btnre.gif" align="bottom" border="0" height="41" width="79"></a><br>
+ <a href="../link.html"> <img src="../image/btnli.gif" alt="Links &amp; Books" border="0" height="46" width="97"></a><br>
+ </center></td>
+ </tr>
+ </tbody>
+ </table>
+ <table width="500">
+ <tbody>
+ <tr>
+ <td><p><a href="../download.html">Copyright</a> 2010 Ann Kibbey.
+
+ All Rights Reserved Worldwide.<br>
+ </p>
+ <p> </p>
+ <center>
+ <a href="../download.html"><font size="1">Download</font></a><font size="1"> || <a href="../edit.html">Editorial Board</a> || <a href="../guide.html">Submission
+
+ Guidelines</a> || <a href="../index.html">Current Issue</a> || <a href="../recent.html">Recent Issues</a> || <a href="../link.html">Links
+
+ &amp; Books</a></font>
+ </center></td>
+ </tr>
+ </tbody>
+ </table>
+ <p></p>
+ <p align="right">
+
+ <table width="550">
+ <tbody>
+ <tr>
+ <td width="361"></td>
+ <td width="72"><p><img src="../image/algosmlr.gif" alt="Genders" align="bottom" border="0" height="72" width="72"> </p></td>
+ <td width="101"><b> <font size="1">Genders Journal</font></b> <font size="1"><br>
+ 226 UCB<br>
+ University of Colorado<br>
+ Boulder, CO 80309<br>
+ http://www.Genders.org</font></td>
+ </tr>
+ </tbody>
+ </table>
+ </p>
+ <p align="right"></p></td>
+ </tr>
+ </tbody>
+</table>
+</p>
+<p></p>
+</body>
+</html> \ No newline at end of file
diff --git a/python/tests/files/nature_article.html b/python/tests/files/nature_article.html
new file mode 100644
index 0000000..177da83
--- /dev/null
+++ b/python/tests/files/nature_article.html
@@ -0,0 +1,1379 @@
+
+
+
+
+
+
+
+
+<!DOCTYPE html>
+<html lang="en" class="grade-c">
+<head>
+ <meta charset="utf-8">
+<link rel="dns-prefetch" href="//ajax.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.googleapis.com"/>
+<link rel="dns-prefetch" href="//fonts.gstatic.com"/>
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no">
+
+ <title>More than 100 scientific journals have disappeared from the Internet</title>
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+ <meta property="og:type" content="article"/>
+ <meta property="og:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta property="og:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta property="og:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+ <meta name="twitter:card" content="summary_large_image"/>
+ <meta name="twitter:site" content="@nature"/>
+ <meta name="twitter:title" content="More than 100 scientific journals have disappeared from the Internet"/>
+ <meta name="twitter:description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+ <meta name="twitter:image"
+ content="https://media.nature.com/lw1024/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18365322.jpg"/>
+
+
+ <meta name="journal_id" content="41586"/>
+
+ <meta name="dc.title" content="More than 100 scientific journals have disappeared from the Internet"/>
+
+ <meta name="dc.source" content="Nature 2020"/>
+
+ <meta name="dc.format" content="text/html"/>
+
+ <meta name="dc.publisher" content="Nature Publishing Group"/>
+
+ <meta name="dc.date" content="2020-09-10"/>
+
+ <meta name="dc.type" content="News"/>
+
+ <meta name="dc.language" content="En"/>
+
+ <meta name="dc.copyright" content="2020 Nature"/>
+
+ <meta name="dc.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="dc.description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="prism.publicationName" content="Nature"/>
+
+ <meta name="prism.publicationDate" content="2020-09-10"/>
+
+ <meta name="prism.section" content="News"/>
+
+ <meta name="prism.startingPage" content=""/>
+
+ <meta name="prism.endingPage" content=""/>
+
+ <meta name="prism.copyright" content="2020 Nature"/>
+
+ <meta name="prism.rightsAgent" content="journalpermissions@springernature.com"/>
+
+ <meta name="prism.url" content="https://www.nature.com/articles/d41586-020-02610-z"/>
+
+ <meta name="prism.doi" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="dc.identifier" content="doi:10.1038/d41586-020-02610-z"/>
+
+ <meta name="DOI" content="10.1038/d41586-020-02610-z"/>
+
+ <meta name="description" content="Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."/>
+
+ <meta name="dc.creator" content="Diana Kwon"/>
+
+ <meta name="dc.subject" content="Publishing"/>
+
+
+
+<script>(function(e){var t=e.documentElement,n=e.implementation;t.className='js';if(n&&n.hasFeature('http://www.w3.org/TR/SVG11/feature#Image','1.1')){t.className+=' svg'}})(document)</script>
+<link rel="stylesheet" href="/static/css/mosaic-grade-c.26f07b2f11.css">
+
+<link rel="stylesheet" class="js-ctm" href="/static/css/magazine-mosaic-150.7f46c29843.css" media="only screen, print and (-webkit-min-device-pixel-ratio:0) and (min-color-index:0), (-ms-high-contrast: none), only all and (min--moz-device-pixel-ratio:0) and (min-resolution: 3e1dpcm)">
+
+
+ <style>
+ .c-header--brand-border {
+ border-bottom: 5px solid #000;
+ }
+ </style>
+
+<link rel="apple-touch-icon" sizes="180x180" href=/static/images/favicons/nature/apple-touch-icon.f39cb19454.png>
+<link rel="icon" type="image/png" sizes="32x32" href=/static/images/favicons/nature/favicon-32x32.3fe59ece92.png>
+<link rel="icon" type="image/png" sizes="16x16" href=/static/images/favicons/nature/favicon-16x16.951651ab72.png>
+<link rel="manifest" href=/static/manifest.1a481c42b1.json>
+<link rel="mask-icon" href=/static/images/favicons/nature/safari-pinned-tab.69bff48fe6.svg color="#000000">
+<link rel="shortcut icon" href=/static/images/favicons/nature/favicon.62367f778b.ico>
+<meta name="msapplication-TileColor" content="#000000">
+<meta name="msapplication-config" content=/static/browserconfig.e35b3b052c.xml>
+<meta name="theme-color" content="#000000">
+<meta name="application-name" content="Nature">
+
+<link rel="search" href="http://www.nature.com/search">
+<link rel="search" href="http://www.nature.com/opensearch/opensearch.xml" type="application/opensearchdescription+xml" title="nature.com">
+<link rel="search" href="http://www.nature.com/opensearch/request" type="application/sru+xml" title="nature.com">
+
+ <meta name="WT.cg_s" content="News"/>
+ <meta name="WT.z_cg_type" content="News"/>
+ <meta name="WT.page_categorisation" content="Article page"/>
+ <meta name="WT.z_subject_term" content="Publishing"/>
+
+<meta name="WT.template" content="oscar"/>
+<meta name="WT.cg_n" content="Nature"/>
+<meta name="dc.rights" content="©2020 Macmillan Publishers Limited. All Rights Reserved."/>
+<meta name="WT.z_bandiera_abtest" content="a"/>
+
+ <script data-test="dataLayer">
+ dataLayer = [{"content":{"category":{"contentType":"news","legacy":{"webtrendsPrimaryArticleType":"news","webtrendsSubjectTerms":"publishing","webtrendsContentCategory":null,"webtrendsContentCollection":null,"webtrendsContentGroup":"Nature","webtrendsContentGroupType":null,"webtrendsContentSubGroup":"News"}},"article":{"doi":"10.1038/d41586-020-02610-z"},"attributes":{"cms":"core media","deliveryPlatform":"oscar","copyright":{"open":false,"legacy":{"webtrendsLicenceType":null}}},"contentInfo":{"authors":["Diana Kwon"],"publishedAt":1599696000,"publishedAtString":"2020-09-10","title":"More than 100 scientific journals have disappeared from the Internet","legacy":null,"publishedAtTime":null,"documentType":"aplusplus"},"journal":{"pcode":"nature","title":"nature","volume":null,"issue":null},"authorization":{"status":true},"features":[{"name":"furtherReadingSection","present":false}],"collection":null},"page":{"category":{"pageType":"article"},"attributes":{"template":"magazine mosaic","featureFlags":[{"name":"ab_test_news_feature","active":false}]},"search":null},"privacy":{},"version":"1.0.0","product":null,"session":null,"user":null,"backHalfContent":false}];
+</script>
+
+<script>
+ (function() {
+ function deleteCookie (name, domain) {
+ document.cookie = encodeURIComponent(name) +
+ '=' +
+ ';path=/' +
+ ';domain=' + domain +
+ ';expires=Thu, 01 Jan 1970 00:00:00 GMT';
+ }
+
+ var consentCookieParts = ('; ' + document.cookie).split('; OptanonConsent=');
+
+ if (consentCookieParts.length > 1) {
+ consentCookieParts.shift(); // remove redundant first part from the split array
+
+ // onetrust can set the same cookie multiple times with different domain specificities
+ for (let i=0; i<consentCookieParts.length; i++) {
+ var otCookieGroups = consentCookieParts[i].split('&groups=').pop().split('&').shift();
+
+ if (otCookieGroups.indexOf('C0001') === -1) {
+ deleteCookie('OptanonConsent', 'nature.com');
+ deleteCookie('OptanonAlertBoxClosed', 'nature.com');
+ }
+ }
+ }
+ })();
+</script>
+
+<script>
+ (function(w,d,t) {
+ function cc() {
+ var h = w.location.hostname;
+ if (h.indexOf('preview-www.nature.com') > -1) return;
+
+ var e = d.createElement(t),
+ s = d.getElementsByTagName(t)[0];
+
+ if (h.indexOf('nature.com') > -1) {
+ e.src = 'https://cdn.cookielaw.org/scripttemplates/otSDKStub.js';
+ e.setAttribute('data-domain-script', '83f2c78a-6cbc-4d1a-9088-3f8e8c4c7460');
+ } else {
+ e.src = '/static/js/cookie-consent-bundle.9d49adbc02.js';
+ e.setAttribute('data-consent', h);
+ }
+ s.parentNode.insertBefore(e, s);
+ }
+
+ !!w.google_tag_manager ? cc() : window.addEventListener('gtm_loaded', function() {cc()});
+ })(window,document,'script');
+</script>
+<script>
+ function OptanonWrapper() {
+ window.dataLayer.push({event:'OneTrustGroupsUpdated'});
+ document.activeElement.blur();
+ }
+</script>
+
+
+<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
+ new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
+ j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
+ 'https://www.googletagmanager.com/gtm.js?id='+i+dl;
+
+
+ j.addEventListener('load', function() {
+ var _ge = new CustomEvent('gtm_loaded', { bubbles: true });
+ d.dispatchEvent(_ge);
+ });
+
+ f.parentNode.insertBefore(j,f);
+})(window,document,'script','dataLayer','GTM-NWDMT9Q');</script>
+
+
+
+</head>
+<body>
+
+
+
+<div role="banner" class="position-relative cleared z-index-50 background-white" data-test="top-containers">
+
+
+ <a class="c-skip-link u-hide-print" href="#content">Skip to main content</a>
+
+
+
+
+
+
+
+ <aside class="c-ad c-ad--728x90">
+ <div class="c-ad__inner" data-container-type="banner-advert">
+ <p class="c-ad__label">Advertisement</p>
+
+
+
+ <div id="article-doubleclickad-container">
+ <div id="div-gpt-ad-top-1"
+ class="div-gpt-ad advert leaderboard js-ad text-center hide-print grade-c-hide"
+ data-ad-type="top"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="728x90"
+ data-gpt-targeting="type=article;pos=top;artid=d41586-020-02610-z;doi=10.1038/d41586-020-02610-z;subjmeta=479,648,706;kwrd=Publishing">
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=728x90&amp;c=766965215&amp;t=pos%3Dtop%26type%3Darticle%26artid%3Dd41586-020-02610-z%26doi%3D10.1038/d41586-020-02610-z%26subjmeta%3D479,648,706%26kwrd%3DPublishing"
+ alt="Advertisement"
+ width="728"
+ height="90"></a>
+ </noscript>
+ </div>
+</div>
+
+
+
+
+ </div>
+ </aside>
+
+
+
+
+
+ <div class="c-grade-c-banner u-hide">
+ <div class="c-grade-c-banner__container">
+
+ <p>Thank you for visiting nature.com. You are using a browser version with limited support for CSS. To obtain
+ the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in
+ Internet Explorer). In the meantime, to ensure continued support, we are displaying the site without styles
+ and JavaScript.</p>
+
+ </div>
+ </div>
+
+
+
+
+ <header class="c-header c-header--brand-border" id="header" data-header>
+ <div class="c-header__row-border">
+ <div class="c-header__container">
+ <div class="c-header__layout">
+ <a href="/nature"
+ data-track="click" data-track-action="home" data-track-category="nature-150-split-header" data-track-label="image">
+ <picture class="c-header__logo">
+ <source srcset="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" media="(min-width: 769px)">
+ <img src="//media.springernature.com/full/nature-cms/uploads/product/nature/header-86f1267ea01eccd46b530284be10585e.svg" alt="Nature">
+ </picture>
+ </a>
+ <div class="c-header__layout">
+
+ <div class="c-header__site-navigation c-header__site-navigation--show-at-md"
+ data-test="siteindex-link">
+ <a class="c-header__link" href="https://www.nature.com/siteindex"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open nature research index" data-track-label="link">
+ <span>View all Nature Research journals</span>
+ </a>
+ </div>
+
+ <div class="c-header__site-navigation c-header__site-navigation--border">
+ <a class="c-header__link"
+ href="#search-menu"
+ data-header-expander
+ data-test="search-link" data-track="click" data-track-category="nature-150-split-header" data-track-action="open search tray" data-track-label="button">
+ <span>Search</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M16.48 15.455c.283.282.29.749.007 1.032a.738.738 0 01-1.032-.007l-3.045-3.044a7 7 0 111.026-1.026zM8 14A6 6 0 108 2a6 6 0 000 12z"/></svg>
+ </a>
+ <a href="/nams/svc/myaccount"
+ id="my-account"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="my account" data-track-category="nature-150-split-header" data-track-label="link">
+ <span>My Account</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M10.238 16.905a7.96 7.96 0 003.53-1.48c-.874-2.514-2.065-3.936-3.768-4.319V9.83a3.001 3.001 0 10-2 0v1.277c-1.703.383-2.894 1.805-3.767 4.319A7.96 7.96 0 009 17c.419 0 .832-.032 1.238-.095zm4.342-2.172a8 8 0 10-11.16 0c.757-2.017 1.84-3.608 3.49-4.322a4 4 0 114.182 0c1.649.714 2.731 2.305 3.488 4.322zM9 18A9 9 0 119 0a9 9 0 010 18z" fill="#333" fill-rule="evenodd"/></svg>
+</a>
+<a href="https://idp.nature.com/authorize/natureuser?client_id&#x3D;grover&amp;redirect_uri&#x3D;https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z"
+ id="login-button"
+ style="display: none;"
+ class="c-header__link placeholder"
+ data-test="login-link" data-track="click" data-track-action="login" data-track-category="nature-150-split-header" data-track-label="link">
+ <span>Login</span><svg role="img" aria-hidden="true" focusable="false" height="22" width="22" viewBox="0 0 18 18" xmlns="http://www.w3.org/2000/svg"><path d="M10.238 16.905a7.96 7.96 0 003.53-1.48c-.874-2.514-2.065-3.936-3.768-4.319V9.83a3.001 3.001 0 10-2 0v1.277c-1.703.383-2.894 1.805-3.767 4.319A7.96 7.96 0 009 17c.419 0 .832-.032 1.238-.095zm4.342-2.172a8 8 0 10-11.16 0c.757-2.017 1.84-3.608 3.49-4.322a4 4 0 114.182 0c1.649.714 2.731 2.305 3.488 4.322zM9 18A9 9 0 119 0a9 9 0 010 18z" fill="#333" fill-rule="evenodd"/></svg>
+</a>
+
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+
+ <div class="c-header__container" data-test="c-header__container">
+ <ul class="c-header__menu">
+
+ <li class="c-header__item" data-test="explore-content-button">
+ <a href="#explore"
+ class="c-header__link c-header__link--dropdown"
+ data-header-expander
+ data-test="menu-button"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open explore expander" data-track-label="button">
+ <span>Explore <span class="c-header__show-text">our content</span></span><svg role="img" aria-hidden="true" focusable="false" height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m5.58578644 3-3.29289322-3.29289322c-.39052429-.39052429-.39052429-1.02368927 0-1.41421356s1.02368927-.39052429 1.41421356 0l4 4c.39052429.39052429.39052429 1.02368927 0 1.41421356l-4 4c-.39052429.39052429-1.02368927.39052429-1.41421356 0s-.39052429-1.02368927 0-1.41421356z" transform="matrix(0 1 -1 0 11 3)"/></svg>
+ </a>
+ </li>
+
+ <li class="c-header__item">
+ <a href="#journal-info"
+ class="c-header__link c-header__link--dropdown"
+ data-header-expander
+ data-test="menu-button"
+ data-track="click" data-track-category="nature-150-split-header" data-track-action="open journal information expander" data-track-label="button">
+ <span>Journal info<span class="c-header__show-text">rmation</span></span><svg role="img" aria-hidden="true" focusable="false" height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m5.58578644 3-3.29289322-3.29289322c-.39052429-.39052429-.39052429-1.02368927 0-1.41421356s1.02368927-.39052429 1.41421356 0l4 4c.39052429.39052429.39052429 1.02368927 0 1.41421356l-4 4c-.39052429.39052429-1.02368927.39052429-1.41421356 0s-.39052429-1.02368927 0-1.41421356z" transform="matrix(0 1 -1 0 11 3)"/></svg>
+ </a>
+ </li>
+
+ <li class="c-header__item c-header__item--pipe">
+ <a class="c-header__link"
+ href="https://www.nature.com/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-category="nature-150-split-header"
+ data-track-label="link">
+ <span>Subscribe</span>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+
+ </header>
+
+
+
+
+ <div class="u-mb-16">
+ <div class="u-container">
+ <ol class="c-breadcrumbs">
+ <li class="c-breadcrumbs__item" id="breadcrumb0"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb1"><a class="c-breadcrumbs__link"
+ href="/"
+ itemprop="url"
+ data-track="click" data-track-action="breadcrumb" data-track-category="header" data-track-label="link:nature"><span itemprop="title">nature</span></a><svg class="c-icon c-breadcrumbs__chevron" aria-hidden="true" focusable="false" height="10" viewBox="0 0 10 10" width="10" xmlns="http://www.w3.org/2000/svg"><path d="m5.96738168 4.70639573 2.39518594-2.41447274c.37913917-.38219212.98637524-.38972225 1.35419292-.01894278.37750606.38054586.37784436.99719163-.00013556 1.37821513l-4.03074001 4.06319683c-.37758093.38062133-.98937525.38100976-1.367372-.00003075l-4.03091981-4.06337806c-.37759778-.38063832-.38381821-.99150444-.01600053-1.3622839.37750607-.38054587.98772445-.38240057 1.37006824.00302197l2.39538588 2.4146743.96295325.98624457z" fill="#666" fill-rule="evenodd" transform="matrix(0 -1 1 0 0 10)"/></svg></li><li class="c-breadcrumbs__item" id="breadcrumb1"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb2"><a class="c-breadcrumbs__link"
+ href="/nature/articles?type&#x3D;news"
+ itemprop="url"
+ data-track="click" data-track-action="breadcrumb" data-track-category="header" data-track-label="link:news"><span itemprop="title">news</span></a><svg class="c-icon c-breadcrumbs__chevron" aria-hidden="true" focusable="false" height="10" viewBox="0 0 10 10" width="10" xmlns="http://www.w3.org/2000/svg"><path d="m5.96738168 4.70639573 2.39518594-2.41447274c.37913917-.38219212.98637524-.38972225 1.35419292-.01894278.37750606.38054586.37784436.99719163-.00013556 1.37821513l-4.03074001 4.06319683c-.37758093.38062133-.98937525.38100976-1.367372-.00003075l-4.03091981-4.06337806c-.37759778-.38063832-.38381821-.99150444-.01600053-1.3622839.37750607-.38054587.98772445-.38240057 1.37006824.00302197l2.39538588 2.4146743.96295325.98624457z" fill="#666" fill-rule="evenodd" transform="matrix(0 -1 1 0 0 10)"/></svg></li><li class="c-breadcrumbs__item" id="breadcrumb2"
+ itemscope="itemscope" itemtype="http://data-vocabulary.org/Breadcrumb" itemref="breadcrumb3"><span itemprop="title">article</span></li>
+ </ol>
+ </div>
+ </div>
+
+
+
+
+
+
+</div>
+
+
+ <div id="content" class="article-page position-relative z-index-1">
+ <section class="container highlight-container article-page--news container-with-gap">
+ <article class="article-item article-item--open" itemscope="" itemtype="http://schema.org/NewsArticle"
+ data-track-component="news">
+ <div class="container cleared container-type-article" data-container-type="article" itemprop="articleBody">
+ <div class="content position-relative cleared clear mq1200-padded" data-component="article-container"
+ role="main">
+ <header class="article-item__header clear cleared pull--both">
+ <div class="article__type">NEWS
+ <div class="ml10 article__date">
+ <time itemprop="datePublished">10 September 2020</time>
+ </div>
+ </div>
+
+ <div class="clear cleared"></div>
+ <h1 class="article-item__title serif" itemprop="headline">More than 100 scientific journals have disappeared from the Internet</h1>
+
+ <div class="article-item__teaser-text serif">
+ Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk.
+ </div>
+ </header>
+
+ <div class="clear cleared"></div>
+
+ <div class="bordered-container clear cleared pull--both">
+ <div id="author-affiliations" class="tab-group text14" role="tablist" data-test="author-affiliations" data-tab-group>
+ <div class="cleared">
+
+ <div id="author-affiliation-news-0" class="tab-box js-box-wrapper">
+ <h3 id="author-affiliation-news-0-head" data-track="click" data-track-label="view author info" class="sans-serif strong tab tab-skin ma0" role="tab"
+ aria-controls="author-affiliation-news-0-content" data-tooltip="Show author information">
+ Diana Kwon
+ </h3>
+ <div id="author-affiliation-news-0-content" class="tab-content pin-right grid grid-12 last"
+ role="tabpanel">
+ <div class="pa10" aria-labelledby="author-affiliation-news-0-head">
+ <div class="clear cleared">
+
+
+ <div class="align-left">
+ <h4 class="sans-serif">Search for this author in:</h4>
+ <ul class="ma0 clean-list">
+ <li class="strong"><a href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd&#x3D;search&amp;term&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Pub Med" >Pub Med</a></li>
+
+ <li class="strong"><a href="https://www.nature.com/search?order&#x3D;date_desc&amp;q&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Nature.com" >Nature.com</a></li>
+
+ <li class="strong"><a href="https://scholar.google.co.uk/scholar?as_q&#x3D;&amp;btnG&#x3D;Search+Scholar&amp;as_sauthors&#x3D;%22Diana%2BKwon%22" data-track="click" data-track-label="Google Scholar" >Google Scholar</a></li>
+ </ul>
+ </div>
+
+
+
+ </div>
+ </div>
+ </div>
+ </div>
+
+ </div>
+</div>
+
+ </div>
+
+ <div class="clear cleared pull--both">
+ <ul class="social clean-list inline-list hide-print">
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="twitter" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="https://twitter.com/intent/tweet?text=More+than+100+scientific+journals+have+disappeared+from+the+Internet&url=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Twitter</title>
+ <desc>Share on Twitter</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M20.8125,11.4875 C21.42,11.10375 21.8875,10.49625 22.105,9.7725 C21.5375,10.1275 20.90875,10.385 20.23875,10.5225 C19.70625,9.9225 18.9425,9.545 18.0975,9.545 C16.475,9.545 15.16,10.9325 15.16,12.6425 C15.16,12.885 15.185,13.1225 15.235,13.3475 C12.7975,13.2175 10.63125,11.985 9.1825,10.11 C8.93,10.56875 8.785,11.10125 8.785,11.66875 C8.785,12.74375 9.30375,13.69125 10.09125,14.2475 C9.61125,14.23125 9.1575,14.09 8.76125,13.86 L8.76125,13.8975 C8.76125,15.3975 9.77375,16.65125 11.11875,16.935 C10.87125,17.0075 10.6125,17.04375 10.34375,17.04375 C10.15625,17.04375 9.96875,17.025 9.79125,16.98875 C10.16625,18.22125 11.24875,19.11875 12.535,19.1425 C11.52875,19.97375 10.2625,20.4675 8.885,20.4675 C8.6475,20.4675 8.415,20.455 8.185,20.42625 C9.485,21.30375 11.02875,21.81625 12.6875,21.81625 C18.09,21.81625 21.04375,17.095 21.04375,13.00125 L21.03625,12.60125 C21.61125,12.16375 22.11125,11.6175 22.50125,10.99625 C21.97375,11.2425 21.4075,11.40875 20.81375,11.48375 L20.8125,11.4875 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="facebook" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share on Facebook</title>
+ <desc>Share on Facebook</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15.89625,22.8625 L12.57125,22.8625 L12.57125,15.02125 L10.90875,15.02125 L10.90875,12.31875 L12.57125,12.31875 L12.57125,10.69625 C12.57125,8.4925 13.50875,7.18 16.175,7.18 L18.39375,7.18 L18.39375,9.8825 L17.00625,9.8825 C15.96875,9.8825 15.9,10.26 15.9,10.965 L15.895,12.3175 L18.4075,12.3175 L18.115,15.02 L15.89625,15.02 L15.89625,22.8625 Z"
+ fill-rule="nonzero"></path>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+ <li class="mr10">
+ <a class="icon--inline inline-block" data-track="click" data-track-action="email" data-track-category="social" data-track-label="10.1038/d41586-020-02610-z" href="mailto:?subject=More than 100 scientific journals have disappeared from the Internet&body=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-020-02610-z">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg role="img" focusable="false" viewBox="0 0 30 30" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Share via E-Mail</title>
+ <desc>Share via E-Mail</desc>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <g>
+ <g>
+ <polygon points="0 0 30 0 30 30 0 30"></polygon>
+ <path d="M15,15.3269887 L10.6248577,11.9177869 C10.4236021,11.7609644 10.1299323,11.7927468 9.96892789,11.988775 C9.80792343,12.1848031 9.84055341,12.4708451 10.041809,12.6276676 L14.7012493,16.2584003 C14.8680779,16.3940555 15.1152493,16.4013884 15.2915244,16.2640313 C15.2939898,16.2622325 15.2963784,16.2603294 15.2987507,16.2584003 L19.958191,12.6276676 C20.1594466,12.4708451 20.1920766,12.1848031 20.0310721,11.988775 C19.8700677,11.7927468 19.5763979,11.7609644 19.3751423,11.9177869 L15,15.3269887 Z M9,10 L21,10 C21.5522847,10 22,10.4477153 22,11 L22,19 C22,19.5522847 21.5522847,20 21,20 L9,20 C8.44771525,20 8,19.5522847 8,19 L8,11 C8,10.4477153 8.44771525,10 9,10 Z"></path>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </a>
+ </li>
+</ul>
+
+ </div>
+
+
+
+
+ <div class="align-left">
+
+ <div class="article__body serif cleared">
+ <p>Scholarly journals are supposed to provide a lasting record of science. But over the past two decades, 176 open-access journals — and many of the papers published in them — have disappeared from the Internet, according to an analysis published on 27 August<sup><a href="#ref-CR1" data-track="click" data-action="anchor-link" data-track-label="go to reference" data-track-category="references">1</a></sup>.</p><p>“There shouldn’t really be any decay or loss in scientific publications, particularly those that have been open on the web,” says Mikael Laakso, an information scientist at the Hanken School of Economics in Helsinki, and a co-author of the study, which was posted on the arXiv preprint server. He and his colleagues identified 176 titles whose online presence vanished between 2000 and 2019.</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"><h1 class="recommended__title serif">Investigating journals: The dark side of publishing</h1></a>
+ </aside></p><p>More than half of these journals were in the social sciences and humanities, although life sciences, health sciences, physical sciences and mathematics were also represented. Eighty-eight of the journals were affiliated with a scholarly society or a research institution. The analysis also identified 900 journals that are still online but seem to have stopped publishing papers, so might be vulnerable to vanishing in the near future.</p><p>The study lays out a "compelling case" for the vulnerability of online journals, says Elizabeth Lightfoot, a librarian at Florida International University in Miami.</p><h2>Vanishing journals</h2><p>Journals can disappear from the Internet for a number of reasons, says Laakso. The publisher might stop paying to keep its publication’s webpage afloat, for example, or journals might be hosted on an online platform that belongs to an academic institution and is left behind when the site or server is updated.</p><p>Journals are supposed to be preserved in digital archives when this happens. Services such as the LOCKSS (Lots of Copies Keep Stuff Safe) Program, which was launched by Stanford Libraries in 1999, aim to ensure that publications remain available even when the publisher is no longer around. LOCKSS works by making multiple copies of content that is stored on the servers of participating libraries, who pay an annual fee to have their collections preserved. Similar initiatives, including CLOCKSS, Portico and the Public Knowledge Project’s Preservation Network (PKP PN), have emerged over the past two decades. These vary in cost and coverage: Some work with libraries, others with publishers — services such as PKP PN are free for journals that sign up. Tens of thousands of titles are currently curated in such preservation schemes. But, Laakso says, there are dozens of journals that fall through the cracks.</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"><h1 class="recommended__title serif">Radical open-access plan could spell end to journal subscriptions</h1></a>
+ </aside></p><p>Pinning down whether a journal is truly unavailable online is a challenge, because there is no single database that tracks the activity of open-access journals, says Lisa Matthias, one of the authors of the study and a PhD student at the Free University of Berlin. Databases such as the Directory of Open Access Journals (DOAJ) don’t keep track of journals that no longer publish — and journals that cease publishing or stop maintaining their presence on the web usually do so silently.</p><p>To find out how many journals had vanished, the team manually collected historical data from several lists of titles, including the DOAJ, Ulrichsweb and Scopus. Then they checked to see if any of the titles they identified were listed on the Keepers Registry, which keeps track of journals that are enrolled into digital preservation schemes. Finally, they went to the Internet Archive’s Wayback Machine to access snapshots of now-offline journals’ websites to see when they had last published, and when the content was last available on the Internet. Journals were considered “vanished” if less than 50% of their content was still freely available online (the researchers acknowledge that some journals could exist in print form or behind a paywall).</p><p>The majority of the 176 vanished journals had disappeared within 5 years of becoming inactive — the point at which they stopped publishing papers. Around one-third of them disappeared within one year of the last publication. The researchers used this ‘life cycle’ to estimate that another 900 inactive open-access journals could be at risk of vanishing.</p><h2>Preserving the literature</h2><p>Subscription journals were not included in the study, Laakso says, because paywalls mean that they would have had to have used a different method to collect the data. He adds that because of this and other limitations, the study probably underestimates the number of journals that have disappeared. 
“It’s really hard to pin down when something doesn't absolutely exist, but we tried our best,” Laakso says. “We hope that there will be more refined and automatic ways to detect these in the future.”</p><p>
+ <aside class="recommended pull pull--left sans-serif" data-label="Related">
+ <a href="https://www.nature.com/articles/d41586-019-02038-0" data-track="click" data-track-label="recommended article"><img class="recommended__image" alt="" src="//media.nature.com/w400/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16870448.jpg"><h1 class="recommended__title serif">India culls hundreds more ‘dubious’ journals from government approved list</h1></a>
+ </aside></p><p>Thib Guicherd-Callin, the acting manager of the LOCKSS Program, says it’s not surprising that there are journals that aren't captured by existing preservation services. Although many groups have used the open-source LOCKSS software, efforts to launch digital preservation initiatives are still “woefully underfunded”, he adds. “The desire to preserve these at-risk works is there,” he adds, but few institutions are investing the resources necessary to identify these publications and make sure they’re included in a digital preservation scheme.</p><p>Matthias says that the responsibility for ensuring inactive journals don’t disappear should be shared between publishers, authors, librarians and preservation services. Lightfoot agrees that a coordinated and collaborative effort is necessary. However, she adds, “the twin challenges of what that effort might look like and who would fund it make the pathway forward murky at best”.</p>
+ </div>
+
+ <div class="emphasis">doi: <a href="https://doi.org/10.1038/d41586-020-02610-z">https://doi.org/10.1038/d41586-020-02610-z</a></div>
+ <div class="anchor-link mt40" data-toggle="anchor-links"></div>
+ <div id="references" class="references" data-toggle="anchor-links-section" data-label="References" data-concertina="true">
+ <section aria-labelledby="Bib1"><div class="serif article-section js-article-section cleared clear" id="Bib1-section"><h2 class="js-section-title section-title strong position-relative tighten-line-height background-gray-light pt20 pb6 pl0 pr20 standard-space-below small-space-above mq640-pt10 mq640-pb10 mq640-pl20 mq640-mt0 mq640-ml-20 mq640-mr-20 extend-left" id="Bib1">References</h2><div class="pl20 mq875-pl0 js-collapsible-section" id="Bib1-content"><div data-container-section="references"><ol class="clean-list ma0 standard-space-below indented-list" data-test="references-list"><li class="small-space-below border-gray-medium border-bottom-1 position-relative js-ref-item" itemprop="citation" itemscope="itemscope" itemtype="http://schema.org/Article" data-test="citation"><span class="indented-counter serif h2 tighten-line-height text-right position-absolute grade-c-hide">1.</span><p class="tiny-space-below" id="ref-CR1">Laakso, M., Matthias, L. &amp; Jahn, N. Preprint at <a href="https://arxiv.org/abs/2008.11933">https://arxiv.org/abs/2008.11933</a> (2020).</p><ul class="js-ref-links clean-list cleared strong sans-serif text13 hide-print small-space-below"><li class="pin-right"><ul class="clean-list ma0"></ul></li></ul></li></ol><p class="hide-print text-right"><a href="/articles/d41586-020-02610-z-references.ris" class="text14 sans-serif strong" data-track="click" data-track-action="download citation references" data-track-label="link">Download references</a></p></div></div></div></section>
+ </div>
+
+
+
+
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="inPage box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-inPage-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-inPage">
+ <input id="briefing-box-signup-form-inPage-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-inPage-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-inPage-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-inPage-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-inPage-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+
+
+ </div>
+
+ <aside class="article__aside align-right">
+ <div class="related-content shrink--aside hide-print">
+
+ <h3 class="aside__title sans-serif">Related Articles</h3>
+ <ul class="ma0 clean-list">
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-06178-7" data-track="click"
+ data-track-label="related article (rank:0)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16099234.jpg"
+ alt="Radical open-access plan could spell end to journal subscriptions">
+ </noscript>
+
+ Radical open-access plan could spell end to journal subscriptions
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/investigating-journals-the-dark-side-of-publishing-1.12666" data-track="click"
+ data-track-label="related article (rank:1)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_15541288.jpg"
+ alt="Investigating journals: The dark side of publishing">
+ </noscript>
+
+ Investigating journals: The dark side of publishing
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-020-01066-5" data-track="click"
+ data-track-label="related article (rank:2)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_18030798.jpg"
+ alt="Nature to join open-access Plan S, publisher says">
+ </noscript>
+
+ Nature to join open-access Plan S, publisher says
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07557-w" data-track="click"
+ data-track-label="related article (rank:3)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_16355294.jpg"
+ alt="Funders flesh out details of Europe’s bold open-access plan">
+ </noscript>
+
+ Funders flesh out details of Europe’s bold open-access plan
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/articles/d41586-018-07245-9" data-track="click"
+ data-track-label="related article (rank:4)">
+
+ <img class="figure__image" data-src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ <noscript>
+ <img class="figure__image figure--no-js"
+ src="//media.nature.com/lw100/magazine-assets/d41586-020-02610-z/d41586-020-02610-z_17334214.jpg"
+ alt="AI peer reviewers unleashed to ease publishing grind">
+ </noscript>
+
+ AI peer reviewers unleashed to ease publishing grind
+ </a>
+ </h3>
+ </li>
+
+ <li class="article-item article-item--rc cleared">
+ <h3 class="article-item__title serif">
+ <a href="https://www.nature.com/news/open-access-the-true-cost-of-science-publishing-1.12676" data-track="click"
+ data-track-label="related article (rank:5)">
+
+ The true cost of science publishing
+ </a>
+ </h3>
+ </li>
+
+ </ul>
+ </div>
+
+ <div class="article__subjects bordered-container shrink--aside hide-print">
+ <h3 class="aside__title sans-serif">Subjects</h3>
+ <ul class="ma0 subject-list cleared clean-list inline-list">
+
+ <li class="subject"><a href="/subjects/publishing" data-track="click"
+ data-track-label="subject (rank:0)">Publishing</a>
+ </li>
+
+ </ul>
+ </div>
+
+
+
+<div id="div-gpt-ad-right-2"
+ class="div-gpt-ad medium-rectangle advert js-ad text-center hide-print grade-c-hide"
+ data-gpt-unitpath="/285/nature.com/article"
+ data-gpt-sizes="300x250"
+ data-gpt-targeting="pos=right;artid=/articles/d41586-020-02610-z;path=/articles/d41586-020-02610-z"
+ data-ad-type="right"
+ >
+ <noscript>
+ <a href="//pubads.g.doubleclick.net/gampad/jump?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z">
+ <img data-test="gpt-advert-fallback-img"
+ src="//pubads.g.doubleclick.net/gampad/ad?iu=/285/nature.com/article&amp;sz=300x250&amp;c=1791348774&amp;t=pos%3Dright%26artid%3D/articles/d41586-020-02610-z"
+ alt="Advertisement"
+ width="300"
+ height="250"/>
+ </a>
+ </noscript>
+</div>
+
+
+ <div class="nature-briefing--sidebar bordered-container shrink--aside hide-print">
+
+
+ <div class="nature-briefing nature-briefing-box mt0 cleared hide-print" data-component-id="nature-briefing-box" data-track="in-view" data-track-action="in-view" data-track-category="nature briefing" data-track-label="sidebar box visible">
+ <div class="nature-briefing-box__header pa20">
+ <h1 class="h2 strong pb10 extra-tight-line-height">Sign up to Nature Briefing</h1>
+ <p class="nature-briefing-box__standfirst mb0 sans-serif tighten-line-height">An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.</p>
+ </div>
+ <form action="/briefing/signup/formfeedback" method="post" class="nature-briefing-box__form pa20" data-location="box" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-box-signup-form-sidebar-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBox-sidebar">
+ <input id="briefing-box-signup-form-sidebar-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBox">
+ <label class="nature-briefing-box__input-label block strong" for="box-sidebar-EmailAddressInput">Email address</label>
+ <input class="nature-briefing-box__input-input block border-all-1 equalize-line-height pa10 mb10 box-sizing grid-12" type="email" id="box-sidebar-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-box-email-input">
+
+ <div class="mb20 position-relative" role="group">
+ <input class="nature-briefing-box__checkbox-checkbox" id="gdpr-briefing-box-sidebar-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-box-gdpr-checkbox" required>
+ <label class="nature-briefing-box__checkbox-label tighten-line-height" for="gdpr-briefing-box-sidebar-checkbox">Yes! Sign me up to receive the daily <em>Nature Briefing</em> email. I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+
+ <button type="submit" class="nature-briefing-box__submit-button c-btn--squared" data-test-element="briefing-box-signup-button">Sign up</button>
+
+ </form>
+ </div>
+
+
+</div>
+
+ </aside>
+ </div>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="publisher" itemtype="https://schema.org/Organization">
+ <meta content="Macmillan Publishers Limited, part of Springer Nature" itemprop="name"/>
+ </div>
+ <div data-microformat-only="" itemscope="" itemprop="author" itemtype="https://schema.org/Organization">
+ <meta content="Nature Editorial" itemprop="name"/>
+ </div>
+ <img src="/platform/track/article/d41586-020-02610-z" width="1" height="1" alt="" class="visually-hidden"/>
+</article>
+
+
+
+
+
+
+
+<div class="c-site-messages message hide u-hide-print c-site-messages--nature-briefing c-site-messages--nature-briefing-email-variant c-site-messages--nature-briefing-redesign-2020 sans-serif"
+data-component-id="nature-briefing-banner"
+data-component-expirydays="30"
+data-component-trigger-scroll-percentage="15"
+data-track="in-view"
+data-track-action="in-view"
+data-track-category="nature briefing"
+data-track-label="redesign banner visible">
+
+
+ <div class="c-site-messages__banner-large">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__form-container">
+
+
+
+ <div class="grid grid-12 last">
+ <div class="grid grid-4">
+ <img alt="Nature Briefing" src="/static/images/logos/nature-briefing-logo-n150-white.d81c9da3ec.svg" width="250" height="40">
+ <p class="c-site-messages--nature-briefing__strapline extra-tight-line-height">Sign up for the <em>Nature Briefing</em> newsletter — what matters in science, free to your inbox daily.</p>
+ </div>
+ <div class="grid grid-8 last">
+ <form action="/briefing/signup/formfeedback" method="post" data-location="banner" data-track="submit" data-track-action="transmit-form">
+ <input id="briefing-banner-signup-form-input-track-originReferralPoint" type="hidden" name="track_originReferralPoint" value="DirectEmailBannerRedesign2020">
+ <input id="briefing-banner-signup-form-input-track-formType" type="hidden" name="track_formType" value="DirectEmailBanner">
+ <label class="nature-briefing-banner__email-label" for="banner-EmailAddressInput">Email address</label>
+
+ <div class="nature-briefing-banner__email-wrapper">
+ <input class="nature-briefing-banner__email-input box-sizing text14" type="email" id="banner-EmailAddressInput" name="email" value="" placeholder="e.g. jo.smith@university.ac.uk" required="true" aria-required="true" data-test-element="briefing-emailbanner-email-input">
+ <button type="submit" class="nature-briefing-banner__submit-button box-sizing text14" data-test-element="briefing-emailbanner-signup-button">Sign up</button>
+ </div>
+
+ <div class="nature-briefing-banner__checkbox-wrapper grid grid-12 last">
+ <input class="nature-briefing-banner__checkbox-checkbox" id="gdpr-briefing-banner-checkbox" type="checkbox" name="gdpr" value="1" data-test-element="briefing-emailbanner-gdpr-checkbox" required>
+ <label class="nature-briefing-banner__checkbox-label box-sizing text13 sans-serif block tighten-line-height" for="gdpr-briefing-banner-checkbox">I agree my information will be processed in accordance with the <em>Nature</em> and Springer Nature Limited <a href="https://www.nature.com/info/privacy">Privacy Policy</a>.</label>
+ </div>
+ </form>
+ </div>
+ </div>
+
+
+ </div>
+
+ </div>
+
+
+ <div class="c-site-messages__banner-small">
+
+
+<div class="c-site-messages__close-container ">
+ <button class="c-site-messages__close"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner dismiss">
+ <span class="">
+ <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+ <svg width="25px" height="25px" focusable="false" aria-hidden="true" viewBox="0 0 25 25" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+ <title>Close banner</title>
+ <defs></defs>
+ <g stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+ <rect opacity="0" x="0" y="0" width="25" height="25"></rect>
+ <path d="M6.29679575,16.2772478 C5.90020818,16.6738354 5.90240728,17.3100587 6.29617427,17.7038257 C6.69268654,18.100338 7.32864195,18.0973145 7.72275218,17.7032043 L12,13.4259564 L16.2772478,17.7032043 C16.6738354,18.0997918 17.3100587,18.0975927 17.7038257,17.7038257 C18.100338,17.3073135 18.0973145,16.671358 17.7032043,16.2772478 L13.4259564,12 L17.7032043,7.72275218 C18.0997918,7.32616461 18.0975927,6.68994127 17.7038257,6.29617427 C17.3073135,5.89966201 16.671358,5.90268552 16.2772478,6.29679575 L12,10.5740436 L7.72275218,6.29679575 C7.32616461,5.90020818 6.68994127,5.90240728 6.29617427,6.29617427 C5.89966201,6.69268654 5.90268552,7.32864195 6.29679575,7.72275218 L10.5740436,12 L6.29679575,16.2772478 Z" fill="#ffffff"></path>
+ </g>
+ </svg>
+ </span>
+ <span class="visually-hidden">Close</span>
+ </button>
+</div>
+
+
+ <div class="c-site-messages__content text14">
+ <span class="c-site-messages--nature-briefing__strapline strong serif">Get the most important science stories of the day, free in your inbox.</span>
+ <a class="nature-briefing__link text14 sans-serif"
+ data-track="click"
+ data-track-category="nature briefing"
+ data-track-label="redesign banner CTA to site"
+ data-test-element="briefing-banner-link"
+ target="_blank"
+ rel="noreferrer noopener"
+ href="/briefing/signup/?origin=Nature&amp;originReferralPoint=EmailBanner">Sign up for Nature Briefing
+ </a>
+ </div>
+
+ </div>
+
+</div>
+
+ </section>
+</div>
+ <script>
+ window.onload = function () {
+ Array.prototype.slice.call(document.querySelectorAll(".magazine-infographic > iframe"))
+ .forEach(function (element) {
+ function listener(event) {
+ if (event.data.height) {
+ if (element.id === event.data.requestData.id) {
+ element.setAttribute("height", event.data.height)
+ }
+ }
+ }
+
+ window.addEventListener("message", listener);
+ element.contentWindow.postMessage({name: "getHeight", id: element.id}, "*");
+ });
+ }
+ </script>
+ <script>
+ var linkEl = document.querySelector('.js-ctm');
+ if (linkEl && window.matchMedia && window.matchMedia(linkEl.media).matches) {
+ var fragment = document.createDocumentFragment();
+ var polyfillScript = document.createElement('script');
+ var header150Script = null;
+ var appScript = document.createElement('script');
+ var sharedEs6Script = document.createElement('script');
+
+ polyfillScript.src = 'https://cdn.polyfill.io/v2/polyfill.min.js?features=default,IntersectionObserver,Array.prototype.includes,Promise';
+ polyfillScript.async = false;
+ fragment.appendChild(polyfillScript);
+
+ appScript.src = '/static/js/magazine/magazine-mosaic.71d8740808.js';
+ appScript.async = false;
+ fragment.appendChild(appScript);
+
+ sharedEs6Script.src = '/static/js/shared-es6-bundle.c83ed51f05.js';
+ sharedEs6Script.async = false;
+ fragment.appendChild(sharedEs6Script);
+
+ header150Script = document.createElement('script');
+ header150Script.src = '/static/js/header-150-bundle.aaea96385f.js';
+ header150Script.async = false;
+ fragment.appendChild(header150Script);
+
+ document.body.appendChild(fragment);
+ }
+ </script>
+ <script>
+ var idp = {
+ hasNatureUserProof: function (hasProof) {
+ if (!hasProof) {
+ document.getElementById("my-account").setAttribute("style", "display: none;");
+ document.getElementById("login-button").setAttribute("style", "");
+ }
+ }
+ }
+ </script>
+ <script src="https://verify.nature.com/verify/nature.min.js"></script>
+ <noscript>
+ <img src="https://verify.nature.com/verify/nature.png" alt="" width="0" height="0"/>
+ </noscript>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Explore-our-content" data-test="Explore-our-content" id="explore" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Explore-our-content" class="c-header-expander__heading u-js-hide">Explore our content</h2>
+ <ul class="c-header-expander__list">
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/research"
+ data-track="click"
+ data-track-action="research"
+ data-track-label="link">
+ Research
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/news"
+ data-track="click"
+ data-track-action="news"
+ data-track-label="link">
+ News
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/opinion"
+ data-track="click"
+ data-track-action="opinion"
+ data-track-label="link">
+ Opinion
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/research-analysis"
+ data-track="click"
+ data-track-action="research analysis"
+ data-track-label="link">
+ Research Analysis
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/careers"
+ data-track="click"
+ data-track-action="careers"
+ data-track-label="link">
+ Careers
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/books-culture"
+ data-track="click"
+ data-track-action="books and culture"
+ data-track-label="link">
+ Books and Culture
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/podcast"
+ data-track="click"
+ data-track-action="podcasts"
+ data-track-label="link">
+ Podcasts
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/videoarchive"
+ data-track="click"
+ data-track-action="videos"
+ data-track-label="link">
+ Videos
+ </a>
+ </li>
+
+
+
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/current-issue"
+ data-track="click"
+ data-track-action="current issue"
+ data-track-label="link">
+ Current Issue
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-issues"
+ data-track="click"
+ data-track-action="browse issues"
+ data-track-label="link">
+ Browse Issues
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/articles"
+ data-track="click"
+ data-track-action="browse articles"
+ data-track-label="link">
+ Browse Articles
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/collections"
+ data-track="click"
+ data-track-action="browse collections"
+ data-track-label="link">
+ Browse Collections
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/browse-subjects"
+ data-track="click"
+ data-track-action="browse subjects"
+ data-track-label="link">
+ Browse Subjects
+ </a>
+ </li>
+
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="https://www.nature.com/my-account/alerts/subscribe-journal?list-id&#x3D;1"
+ data-track="click"
+ data-track-action="Sign up for alerts"
+ data-track-label="link">Sign up for alerts<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m4 10h2.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-3.08578644l-1.12132034 1.1213203c-.18753638.1875364-.29289322.4418903-.29289322.7071068v.1715729h14v-.1715729c0-.2652165-.1053568-.5195704-.2928932-.7071068l-1.7071068-1.7071067v-3.4142136c0-2.76142375-2.2385763-5-5-5-2.76142375 0-5 2.23857625-5 5zm3 4c0 1.1045695.8954305 2 2 2s2-.8954305 2-2zm-5 0c-.55228475 0-1-.4477153-1-1v-.1715729c0-.530433.21071368-1.0391408.58578644-1.4142135l1.41421356-1.4142136v-3c0-3.3137085 2.6862915-6 6-6s6 2.6862915 6 6v3l1.4142136 1.4142136c.3750727.3750727.5857864.8837805.5857864 1.4142135v.1715729c0 .5522847-.4477153 1-1 1h-4c0 1.6568542-1.3431458 3-3 3-1.65685425 0-3-1.3431458-3-3z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+ <nav class="u-hide-print c-header-expander" aria-labelledby="Journal-information" id="journal-info" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <div class="c-header-expander__keyline">
+ <h2 id="Journal-information" class="c-header-expander__heading u-js-hide">Journal information</h2>
+ <ul class="c-header-expander__list">
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/about"
+ data-track="click"
+ data-track-action="about the journal"
+ data-track-label="link">
+ About the Journal
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-authors"
+ data-track="click"
+ data-track-action="for authors"
+ data-track-label="link">
+ For Authors
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/for-referees"
+ data-track="click"
+ data-track-action="for referees"
+ data-track-label="link">
+ For Referees
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/awards"
+ data-track="click"
+ data-track-action="awards"
+ data-track-label="link">
+ Awards
+ </a>
+ </li>
+
+ <li class="c-header-expander__item">
+ <a class="c-header-expander__link"
+ href="/nature/subscribe"
+ data-track="click"
+ data-track-action="subscribe"
+ data-track-label="link">
+ Subscribe
+ </a>
+ </li>
+
+
+ <li class="c-header-expander__item c-header-expander__item--keyline">
+ <a class="c-header-expander__link"
+ href="http://mts-nature.nature.com/"
+ data-track="click"
+ data-track-action="Submit manuscript"
+ data-track-label="link">Submit manuscript<svg role="img" aria-hidden="true" focusable="false" height="18" viewBox="0 0 18 18" width="18" xmlns="http://www.w3.org/2000/svg"><path d="m15 0c1.1045695 0 2 .8954305 2 2v5.5c0 .27614237-.2238576.5-.5.5s-.5-.22385763-.5-.5v-5.5c0-.51283584-.3860402-.93550716-.8833789-.99327227l-.1166211-.00672773h-9v3c0 1.1045695-.8954305 2-2 2h-3v10c0 .5128358.38604019.9355072.88337887.9932723l.11662113.0067277h7.5c.27614237 0 .5.2238576.5.5s-.22385763.5-.5.5h-7.5c-1.1045695 0-2-.8954305-2-2v-10.17157288c0-.53043297.21071368-1.0391408.58578644-1.41421356l3.82842712-3.82842712c.37507276-.37507276.88378059-.58578644 1.41421356-.58578644zm-.5442863 8.18867991 3.3545404 3.35454039c.2508994.2508994.2538696.6596433.0035959.909917-.2429543.2429542-.6561449.2462671-.9065387-.0089489l-2.2609825-2.3045251.0010427 7.2231989c0 .3569916-.2898381.6371378-.6473715.6371378-.3470771 0-.6473715-.2852563-.6473715-.6371378l-.0010428-7.2231995-2.2611222 2.3046654c-.2531661.2580415-.6562868.2592444-.9065605.0089707-.24295423-.2429542-.24865597-.6576651.0036132-.9099343l3.3546673-3.35466731c.2509089-.25090888.6612706-.25227691.9135302-.00001728zm-.9557137-3.18867991c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5zm-8.5-3.587-3.587 3.587h2.587c.55228475 0 1-.44771525 1-1zm8.5 1.587c.2761424 0 .5.22385763.5.5s-.2238576.5-.5.5h-6c-.27614237 0-.5-.22385763-.5-.5s.22385763-.5.5-.5z" fill="#fff"/></svg>
+ </a>
+ </li>
+
+ </ul>
+ </div>
+ </div>
+ </nav>
+
+
+
+
+
+ <div id="search-menu" class="c-header-expander c-header-expander--tray u-hide-print" data-track-component="nature-150-split-header">
+ <div class="c-header-expander__container">
+ <h2 class="u-visually-hidden">Search</h2>
+ <div data-test="inline-search">
+ <div class="c-header-expander__keyline u-mb-16">
+ <form action="/search"
+ method="get"
+ role="search"
+ class="c-header-expander__form"
+ autocomplete="off"
+ data-dynamic-track-label
+ data-track="submit" data-track-action="search" data-track-label="form">
+ <label class="c-header-expander__heading" for="keywords">Article Search</label>
+ <div class="c-form-field u-display-flex">
+ <input type="text"
+ class="c-form-field__input u-flex-shrink"
+ id="keywords"
+ name="q"
+ value=""
+ placeholder="Search by keywords or author"
+ data-test="search-keywords">
+ <button type="submit" class="c-button c-button--contrast u-flex-static u-ml-8" data-test="search-submit">Search</button>
+ </div>
+ <p class="u-ma-0">
+ <a href="/search/advanced"
+ data-track="click" data-track-action="advanced search" data-track-label="link">
+ Advanced search
+ </a>
+ </p>
+ </form>
+ </div>
+ <div class="c-header-expander__keyline">
+ <h3 class="c-header-expander__heading">Quick links</h3>
+ <ul class="u-list-reset">
+ <li class="u-display-inline-block u-mr-24"><a href="/subjects" data-track="click" data-track-action="explore articles by subject" data-track-label="link">Explore articles by subject</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/naturecareers" data-track="click" data-track-action="find a job" data-track-label="link">Find a job</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to authors</a></li>
+ <li class="u-display-inline-block u-mr-24"><a href="/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ </ul>
+ </div>
+ </div>
+ </div>
+ </div>
+
+
+
+
+<footer role="contentinfo" class="composite-layer">
+ <div class="u-mt-16 u-mb-16">
+ <div class="u-container">
+ <div class="u-display-flex u-flex-wrap u-justify-content-space-between">
+ <p class="c-meta u-ma-0 u-mr-24">
+
+</p>
+
+ <p class="c-meta u-ma-0">
+ <span aria-level="2" class="c-meta__item" itemprop="name">
+ Nature
+ </span>
+ <span class="c-meta__item">
+ <abbr title="International Standard Serial Number">ISSN</abbr> <span itemprop="issn">1476-4687</span> (online)
+ </span>
+ </p>
+ </div>
+ </div>
+</div>
+
+
+ <div itemscope itemtype="http://schema.org/Periodical">
+ <meta itemprop="publisher" content="Springer Nature">
+ <div class="c-footer">
+ <div class="u-container">
+ <div class="u-hide-print" data-track-component="footer">
+ <h2 aria-level="2" class="u-visually-hidden">nature.com sitemap</h2>
+ <div class="c-footer__header">
+ <div class="c-footer__logo">
+ <img alt="Nature Research" src="/static/images/logos/nature research-white-150.f4acf77e0c.svg" loading="lazy" width="200" height="26">
+ </div>
+ <ul class="c-menu c-menu--inherit u-mr-32">
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/company_info/index.html" data-track="click" data-track-action="about us" data-track-label="link">About us</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://www.nature.com/npg_/press_room/press_releases.html" data-track="click" data-track-action="press releases" data-track-label="link">Press releases</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://press.nature.com/" data-track="click" data-track-action="press office" data-track-label="link">Press office</a></li>
+ <li class="c-menu__item"><a class="c-menu__link" href="https://support.nature.com/support/home" data-track="click" data-track-action="contact us" data-track-label="link">Contact us</a></li>
+ </ul>
+ <ul class="c-menu c-menu--inherit">
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.facebook.com/nature/" aria-label="Nature on Facebook" data-track="click" data-track-action="facebook" data-track-label="link">
+ <svg class="u-icon u-mt-2 u-mb-2" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 20 20"><path d="M2.5 20C1.1 20 0 18.9 0 17.5v-15C0 1.1 1.1 0 2.5 0h15C18.9 0 20 1.1 20 2.5v15c0 1.4-1.1 2.5-2.5 2.5h-3.7v-7.7h2.6l.4-3h-3v-2c0-.9.2-1.5 1.5-1.5h1.6V3.1c-.3 0-1.2-.1-2.3-.1-2.3 0-3.9 1.4-3.9 4v2.2H8.1v3h2.6V20H2.5z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://twitter.com/nresearchnews?lang=en" aria-label="Nature on Twitter" data-track="click" data-track-action="twitter" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M17.6 4.1c.8-.5 1.5-1.4 1.8-2.4-.8.5-1.7.9-2.6 1-.7-.8-1.8-1.4-3-1.4-2.3 0-4.1 1.9-4.1 4.3 0 .3 0 .7.1 1-3.4 0-6.4-1.8-8.4-4.4C1 2.9.8 3.6.8 4.4c0 1.5.7 2.8 1.8 3.6C2 8 1.4 7.8.8 7.5v.1c0 2.1 1.4 3.8 3.3 4.2-.3.1-.7.2-1.1.2-.3 0-.5 0-.8-.1.5 1.7 2 3 3.8 3-1.3 1.1-3.1 1.8-5 1.8-.3 0-.7 0-1-.1 1.8 1.2 4 1.9 6.3 1.9C13.8 18.6 18 12 18 6.3v-.6c.8-.6 1.5-1.4 2-2.2-.7.3-1.5.5-2.4.6z"/></svg>
+ </a>
+ </li>
+ <li class="c-menu__item">
+ <a class="c-menu__link" href="https://www.youtube.com/channel/UCvCLdSgYdSTpWcOgEJgi-ng" aria-label="Nature on YouTube" data-track="click" data-track-action="youtube" data-track-label="link">
+ <svg class="u-icon" role="img" aria-hidden="true" focusable="false" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20"><path d="M7.9 12.6V6.9l5.4 2.8c0 .1-5.4 2.9-5.4 2.9zM19.8 6s-.2-1.4-.8-2c-.8-.8-1.6-.8-2-.9-2.8-.2-7-.2-7-.2s-4.2 0-7 .2c-.4 0-1.2 0-2 .9-.6.6-.8 2-.8 2S0 7.6 0 9.2v1.5c0 1.7.2 3.3.2 3.3s.2 1.4.8 2c.8.8 1.8.8 2.2.9 1.6.1 6.8.2 6.8.2s4.2 0 7-.2c.4 0 1.2-.1 2-.9.6-.6.8-2 .8-2s.2-1.6.2-3.3V9.2c0-1.6-.2-3.2-.2-3.2z"/></svg>
+ </a>
+ </li>
+ </ul>
+ </div>
+
+ <div class="c-footer__grid">
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Discover content</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/siteindex" data-track="click" data-track-action="journals a-z" data-track-label="link">Journals A-Z</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/subjects/" data-track="click" data-track-action="article by subject" data-track-label="link">Articles by subject</a></li>
+ <li class="c-footer__item"><a href="https://nano.nature.com/" data-track="click" data-track-action="nano" data-track-label="link">Nano</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/protocolexchange/" data-track="click" data-track-action="protocol exchange" data-track-label="link">Protocol Exchange</a></li>
+ <li class="c-footer__item"><a href="https://www.natureindex.com/" data-track="click" data-track-action="nature index" data-track-label="link">Nature Index</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Publish with us</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/author_resources/index.html" data-track="click" data-track-action="guide to authors" data-track-label="link">Guide to Authors</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/peer_review/" data-track="click" data-track-action="guide to referees" data-track-label="link">Guide to Referees</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/authors/editorial_policies/" data-track="click" data-track-action="editorial policies" data-track-label="link">Editorial policies</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/publishing-with-npg/" data-track="click" data-track-action="open access" data-track-label="link">Open access</a></li>
+ <li ><a href="https://www.nature.com/reprints/" data-track="click" data-track-action="reprints and permissions" data-track-label="link">Reprints &amp; permissions</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Researcher services</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/authors/research-data" data-track="click" data-track-action="data research service" data-track-label="link">Research data</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/go/nr" data-track="click" data-track-action="language editing" data-track-label="link">Language editing</a></li>
+ <li class="c-footer__item"><a href="https://authorservices.springernature.com/scientific-editing/" data-track="click" data-track-action="scientific editing" data-track-label="link">Scientific editing</a></li>
+ <li class="c-footer__item"><a href="https://masterclasses.nature.com/" data-track="click" data-track-action="nature masterclasses" data-track-label="link">Nature Masterclasses</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/researcher-training/" data-track="click" data-track-action="nature research academies" data-track-label="link">Nature Research Academies</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Libraries &amp; institutions</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/tools-services" data-track="click" data-track-action="librarian service and tools" data-track-label="link">Librarian service &amp; tools</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/gp/librarians/manage-your-account/librarianportal" data-track="click" data-track-action="librarian portal" data-track-label="link">Librarian portal</a></li>
+ <li class="c-footer__item"><a href="http://www.nature.com/openresearch/about-open-access/information-for-institutions/" data-track="click" data-track-action="open research" data-track-label="link">Open research</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Advertising &amp; partnerships</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/digital-advertising/" data-track="click" data-track-action="advertising" data-track-label="link">Advertising</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/" data-track="click" data-track-action="partnerships and services" data-track-label="link">Partnerships &amp; Services</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/media-kits/" data-track="click" data-track-action="media kits" data-track-label="link">Media kits</a></li>
+ <li class="c-footer__item"><a href="https://partnerships.nature.com/product/branded-content-native-advertising/" data-track-action="branded content" data-track-label="link">Branded content</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Career development</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/naturecareers" data-track="click" data-track-action="nature careers" data-track-label="link">Nature Careers</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureconferences/" data-track="click" data-track-action="nature conferences" data-track-label="link">Nature<span class="visually-hidden"> </span> Conferences</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/natureevents/" data-track="click" data-track-action="nature events" data-track-label="link">Nature<span class="visually-hidden"> </span> events</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Regional websites</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="http://www.naturechina.com" data-track="click" data-track-action="nature china" data-track-label="link">Nature China</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nindia" data-track="click" data-track-action="nature india" data-track-label="link">Nature India</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ja-jp/" data-track="click" data-track-action="nature japan" data-track-label="link">Nature Japan</a></li>
+ <li class="c-footer__item"><a href="https://www.natureasia.com/ko-kr/" data-track="click" data-track-action="nature korea" data-track-label="link">Nature Korea</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/nmiddleeast/" data-track="click" data-track-action="nature middle east" data-track-label="link">Nature Middle East</a></li>
+ </ul>
+ </div>
+
+ <div class="c-footer__group">
+ <h3 class="c-footer__heading">Legal &amp; Privacy</h3>
+ <ul class="c-footer__list">
+ <li class="c-footer__item"><a href="https://www.nature.com/info/privacy.html" data-track="click" data-track-action="privacy policy" data-track-label="link">Privacy Policy</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/cookies.html" data-track="click" data-track-action="use of cookies" data-track-label="link">Use of cookies</a></li>
+ <li class="c-footer__item"><a class="optanon-toggle-display" href="javascript:;" data-track="click" data-track-action="manage cookies" data-track-label="link">Manage cookies/Do not sell my data</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/legal_notice.html" data-track="click" data-track-action="legal notice" data-track-label="link">Legal notice</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/accessibility_statement.html" data-track="click" data-track-action="accessibility statement" data-track-label="link">Accessibility statement</a></li>
+ <li class="c-footer__item"><a href="https://www.nature.com/info/tandc.html" data-track="click" data-track-action="terms and conditions" data-track-label="link">Terms &amp; Conditions</a></li>
+ <li class="c-footer__item"><a href="https://www.springernature.com/ccpa" data-track="click" data-track-action="california privacy statement" data-track-label="link">California Privacy Statement</a></li>
+ </ul>
+ </div>
+ </div>
+</div>
+
+
+ </div>
+ </div>
+ </div>
+
+ <div class="c-corporate-footer">
+ <div class="u-container">
+ <img src="/static/images/logos/sn-logo-white.ea63208b81.svg" alt="Springer Nature" loading="lazy" width="140" height="14"/>
+ <p class="c-corporate-footer__legal" data-test="copyright">&copy; 2020 Springer Nature Limited</p>
+ </div>
+</div>
+
+
+ <svg class="u-hide hide">
+ <symbol id="global-icon-chevron-right" viewBox="0 0 16 16">
+ <path d="M7.782 7L5.3 4.518c-.393-.392-.4-1.022-.02-1.403a1.001 1.001 0 011.417 0l4.176 4.177a1.001 1.001 0 010 1.416l-4.176 4.177a.991.991 0 01-1.4.016 1 1 0 01.003-1.42L7.782 9l1.013-.998z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-download" viewBox="0 0 16 16">
+ <path d="M2 14c0-.556.449-1 1.002-1h9.996a.999.999 0 110 2H3.002A1.006 1.006 0 012 14zM9 2v6.8l2.482-2.482c.392-.392 1.022-.4 1.403-.02a1.001 1.001 0 010 1.417l-4.177 4.177a1.001 1.001 0 01-1.416 0L3.115 7.715a.991.991 0 01-.016-1.4 1 1 0 011.42.003L7 8.8V2c0-.55.444-.996 1-.996.552 0 1 .445 1 .996z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-email" viewBox="0 0 18 18">
+ <path d="M1.995 2h14.01A2 2 0 0118 4.006v9.988A2 2 0 0116.005 16H1.995A2 2 0 010 13.994V4.006A2 2 0 011.995 2zM1 13.994A1 1 0 001.995 15h14.01A1 1 0 0017 13.994V4.006A1 1 0 0016.005 3H1.995A1 1 0 001 4.006zM9 11L2 7V5.557l7 4 7-4V7z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-institution" viewBox="0 0 18 18">
+ <path d="M14 8a1 1 0 011 1v6h1.5a.5.5 0 01.5.5v.5h.5a.5.5 0 01.5.5V18H0v-1.5a.5.5 0 01.5-.5H1v-.5a.5.5 0 01.5-.5H3V9a1 1 0 112 0v6h8V9a1 1 0 011-1zM6 8l2 1v4l-2 1zm6 0v6l-2-1V9zM9.573.401l7.036 4.925A.92.92 0 0116.081 7H1.92a.92.92 0 01-.528-1.674L8.427.401a1 1 0 011.146 0zM9 2.441L5.345 5h7.31z" fill-rule="evenodd"/>
+ </symbol>
+ <symbol id="global-icon-search" viewBox="0 0 22 22">
+ <path fill-rule="evenodd" d="M21.697 20.261a1.028 1.028 0 01.01 1.448 1.034 1.034 0 01-1.448-.01l-4.267-4.267A9.812 9.811 0 010 9.812a9.812 9.811 0 1117.43 6.182zM9.812 18.222A8.41 8.41 0 109.81 1.403a8.41 8.41 0 000 16.82z"/>
+ </symbol>
+ <symbol id="global-icon-info" viewBox="0 0 18 18">
+ <path d="m9 0c4.9705627 0 9 4.02943725 9 9 0 4.9705627-4.0294373 9-9 9-4.97056275 0-9-4.0294373-9-9 0-4.97056275 4.02943725-9 9-9zm0 7h-1.5l-.11662113.00672773c-.49733868.05776511-.88337887.48043643-.88337887.99327227 0 .47338693.32893365.86994729.77070917.97358929l.1126697.01968298.11662113.00672773h.5v3h-.5l-.11662113.0067277c-.42082504.0488782-.76196299.3590206-.85696816.7639815l-.01968298.1126697-.00672773.1166211.00672773.1166211c.04887817.4208251.35902055.761963.76398144.8569682l.1126697.019683.11662113.0067277h3l.1166211-.0067277c.4973387-.0577651.8833789-.4804365.8833789-.9932723 0-.4733869-.3289337-.8699473-.7707092-.9735893l-.1126697-.019683-.1166211-.0067277h-.5v-4l-.00672773-.11662113c-.04887817-.42082504-.35902055-.76196299-.76398144-.85696816l-.1126697-.01968298zm0-3.25c-.69035594 0-1.25.55964406-1.25 1.25s.55964406 1.25 1.25 1.25 1.25-.55964406 1.25-1.25-.55964406-1.25-1.25-1.25z" fill-rule="evenodd"/>
+ </symbol>
+ </svg>
+
+</footer>
+
+
+</body>
+</html>
+
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
new file mode 100644
index 0000000..4154aa5
--- /dev/null
+++ b/python/tests/test_html_metadata.py
@@ -0,0 +1,137 @@
+
+import datetime
+
+from sandcrawler.html_metadata import *
+
+
+def test_html_metadata_plos() -> None:
+
+ with open('tests/files/plos_one_article.html', 'r') as f:
+ plos_html = f.read()
+
+ meta = html_extract_biblio(HTMLParser(plos_html))
+ assert meta is not None
+ assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ assert meta.doi == "10.1371/journal.pone.0213978"
+ assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert meta.contrib_names == [
+ "Yang Li",
+ "Tuanjie Wang",
+ "Lin Wang",
+ "Mingjun Sun",
+ "Zhizhong Cui",
+ "Shuang Chang",
+ "Yongping Wu",
+ "Xiaodong Zhang",
+ "Xiaohui Yu",
+ "Tao Sun",
+ "Peng Zhao",
+ ]
+ assert meta.container_name == "PLOS ONE"
+ assert meta.container_abbrev == "PLOS ONE"
+ # "Apr 22, 2019"
+ assert meta.release_date == datetime.date(year=2019, month=4, day=22)
+ assert meta.first_page == "e0213978"
+ assert meta.issue == "4"
+ assert meta.volume == "14"
+ assert meta.container_issn == "1932-6203"
+ assert meta.publisher == "Public Library of Science"
+ assert "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
+ assert meta.release_type == "article-journal"
+
+
+def test_html_metadata_elife() -> None:
+
+ with open('tests/files/elife_article.html', 'r') as f:
+ elife_html = f.read()
+
+ meta = html_extract_biblio(HTMLParser(elife_html))
+ assert meta is not None
+ assert meta.title == "Parallel visual circuitry in a basal chordate"
+ assert meta.doi == "10.7554/eLife.44753"
+ assert meta.contrib_names == [
+ "Matthew J Kourakis",
+ "Cezar Borba",
+ "Angela Zhang",
+ "Erin Newman-Smith",
+ "Priscilla Salas",
+ "B Manjunath",
+ "William C Smith",
+ ]
+ assert meta.container_name == "eLife"
+ # 2019-04-18
+ assert meta.release_date == datetime.date(year=2019, month=4, day=18)
+ assert meta.publisher == "eLife Sciences Publications Limited"
+
+
+def test_html_metadata_nature() -> None:
+
+ with open('tests/files/nature_article.html', 'r') as f:
+ nature_html = f.read()
+
+ meta = html_extract_biblio(HTMLParser(nature_html))
+ assert meta is not None
+ assert meta.title == "More than 100 scientific journals have disappeared from the Internet"
+ assert meta.doi == "10.1038/d41586-020-02610-z"
+ assert meta.contrib_names == [
+ "Diana Kwon",
+ ]
+ assert meta.container_name == "Nature"
+ # "2020-09-10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.publisher == "Nature Publishing Group"
+ # note: some error in dublin code in nature HTML resulting in duplication
+ assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+
+
+def test_html_metadata_ojs3() -> None:
+
+ with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ ojs3_html = f.read()
+
+ meta = html_extract_biblio(HTMLParser(ojs3_html))
+ assert meta is not None
+ assert meta.title == "Surveillance, stigma & sociotechnical design for HIV"
+ assert meta.doi == "10.5210/fm.v25i10.10274"
+ assert meta.contrib_names == [
+ "Calvin Liang",
+ "Jevan Alexander Hutson",
+ "Os Keyes",
+ ]
+ assert meta.container_name == "First Monday"
+ assert meta.container_abbrev == "1" # NOTE: bad source metadata
+ assert meta.container_issn == "1396-0466"
+ # "2020/09/10"
+ assert meta.release_date == datetime.date(year=2020, month=9, day=10)
+ assert meta.lang == "en"
+ assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
+ assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ assert meta.release_type == "article-journal"
+
+
+def test_html_metadata_dlib() -> None:
+
+ with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ dlib_html = f.read()
+
+ meta = html_extract_biblio(HTMLParser(dlib_html))
+ assert meta is not None
+ assert meta.doi == "10.1045/may2017-vanhyning"
+ # "2017-05-15"
+ assert meta.release_date == datetime.date(year=2017, month=5, day=15)
+
+def test_html_metadata_dc_case() -> None:
+ """
+ This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive.
+ """
+
+ snippet = """
+ <html>
+ <head>
+ <meta name="DC.Citation.Issue" content="123"/>
+ </head>
+ <body>Hi.</body>
+ </html>"""
+
+ meta = html_extract_biblio(HTMLParser(snippet))
+ assert meta.issue == "123"