From ae851f3f205b741dbc826c3197cdd3cc9bde8802 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Oct 2020 15:27:23 -0700 Subject: start HTML metadata extraction code --- python/tests/files/dlib_05vanhyning.html | 350 +++++ .../tests/files/first_monday_ojs3_landingpage.html | 616 +++++++++ python/tests/files/genders_g58_fairlie.html | 146 +++ python/tests/files/nature_article.html | 1379 ++++++++++++++++++++ python/tests/test_html_metadata.py | 137 ++ 5 files changed, 2628 insertions(+) create mode 100644 python/tests/files/dlib_05vanhyning.html create mode 100644 python/tests/files/first_monday_ojs3_landingpage.html create mode 100644 python/tests/files/genders_g58_fairlie.html create mode 100644 python/tests/files/nature_article.html create mode 100644 python/tests/test_html_metadata.py (limited to 'python/tests') diff --git a/python/tests/files/dlib_05vanhyning.html b/python/tests/files/dlib_05vanhyning.html new file mode 100644 index 0000000..dbe3ef7 --- /dev/null +++ b/python/tests/files/dlib_05vanhyning.html @@ -0,0 +1,350 @@ + + + + + + + + + + + + +Transforming Libraries and Archives through Crowdsourcing + + + +
+ +
+
+ +
+ +Search D-Lib: + + + + + + +
+ +
+
+
+
+
+
+
D-Lib-blocks5 +
+
+
+
+
+
The Magazine of Digital Library Research
+
+
+
+
+
+ +
+ +

D-Lib Magazine

+

May/June 2017
+Volume 23, Number 5/6
+Table of Contents +

+ +
 
+ +

Transforming Libraries and Archives through Crowdsourcing

+ +

Victoria Van Hyning, University of Oxford, Zooniverse
+victoria [at] zooniverse.org

+ +Samantha Blickhan, The Adler Planetarium, Zooniverse
+samantha [at] zooniverse.org

+ +Laura Trouille, The Adler Planetarium, Zooniverse
+trouille [at] zooniverse.org

+ +Chris Lintott, University of Oxford, Zooniverse
+chris [at] zooniverse.org

+ +
 
+ +

https://doi.org/10.1045/may2017-vanhyning

+ +
 
+ + +

Abstract

+ +

This article will showcase the aims and research goals of the project entitled "Transforming Libraries and Archives through Crowdsourcing", recipient of a 2016 Institute for Museum and Library Services grant. This grant will be used to fund the creation of four bespoke text and audio transcription projects which will be hosted on the Zooniverse, the world-leading research crowdsourcing platform. These transcription projects, while supporting the research of four separate institutions, will also function as a means to expand and enhance the Zooniverse platform to better support galleries, libraries, archives and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing.

+ +

Keywords: Crowdsourcing, Citizen Humanities, GLAM, Transcription, IMLS

+ + + +
 
+

1 Overview1

+ +

As libraries, museums, and other cultural repositories digitize their collections and place them online, the challenges of transforming these materials into useful and searchable sources of information are becoming increasingly apparent. While OCR and handwriting recognition technology have opened up some print and manuscript corpora, and image and voice recognition software are improving daily, there are still many tasks that require human intervention. For these, volunteer crowdsourcing is a viable and vibrant solution.

+ +

The Zooniverse is the world-leading research crowdsourcing platform, hosting over 50 active projects and over 100 projects total since its inception in 2007. The projects cover diverse subject areas from astronomy to zoology, engage over 1.5 million registered volunteers, and have produced data used in more than a hundred peer-reviewed articles.2 The Zooniverse also hosts the Project Builder, a free platform through which anyone can build their own project. The Zooniverse grew from a single project developed at the University of Oxford in 2007, and is now developed and managed by a team based in Oxford and at the Adler Planetarium in Chicago and the University of Minnesota (see Zooniverse Team for a more complete list).

+ +

In late 2016, the Institute for Museum and Library Services awarded a National Leadership Grant titled "Transforming Libraries and Archives through Crowdsourcing (LG-71-16-0028-16)" to the Adler Planetarium and its collaborators to support the work of the Zooniverse. Through this grant-funded effort, the Zooniverse will further expand and enhance its platform to better support galleries, libraries, archives, and museums (GLAM institutions) in unlocking their data and engaging the public through crowdsourcing.

+ +
 
+

1.1 What Can Crowdsourcing Offer GLAMs?

+ +

In 2010, author and professor Clay Shirky delivered a rousing TED talk in which he used the phrase "cognitive surplus" to describe the one trillion hours of leisure time humans collectively accumulate each year (a great deal of which is spent watching television), which could be harnessed to advance human knowledge through civic engagement. He concluded that: "free cultures get what they celebrate. [...If we] celebrate and support and reward the people trying to use cognitive surplus to create civic value [...] we'll be able to change society".[1] One way that GLAMs can harness this cognitive surplus is through web-based crowdsourcing. What Shirky was describing was a type of "social machine", which Tim Berners-Lee defined as "new form[s] of social processes" emergent from the Web, and involving both human and machine components.[2]

+ +

Academic crowdsourcing invites members of the public to work with specialists to conduct research: for example, to transcribe documents or add metadata to a collection of images, video or audio clips. This data is used in real science, social science, or humanities investigations and should, ideally, lead to publication. Crowdsourcing within GLAMs may not always be oriented around a specific research question or publication, but around making collections more accessible for future research and usability. GLAM crowdsourcing can be the seedbed of future scholarly research.

+ +

GLAMs have been engaging volunteers with their collections for well over a century, usually by inviting select individuals into an institution and training them to do work that cannot be done by staff due to time or money constraints. On-site volunteers often build up valuable knowledge and skills and contribute a great deal to their chosen institutions, but training and supervising them also poses challenges. There is a limit to how many volunteers can be trained, supported on site, and indeed attracted and retained in the first place. Online volunteering, enabled by crowdsourcing platforms such as Zooniverse.org, offers an alternative or complementary form of engagement that has many benefits. Online projects can reach a wider range of individuals, including those who are less able-bodied or geographically remote from the institution in which they want to volunteer and/or unable to travel. Such projects require less training and time commitment from volunteers and typically attract a larger number of participants than on-site programs. They also enable GLAMs to open up rare collections to the public without concern for their material safety and security.3

+ +

While crowdsourcing projects have proliferated in the last decade, few offer easy to use, open source, and free platforms on which GLAM academics and amateur users can rely. The Zooniverse has the infrastructure, community, and technical expertise to intervene at this critical stage.

+ +
 
+

1.2 How Does The Zooniverse Work?

+ +

All bespoke Zooniverse projects, including those built on the free Project Builder, have a few core components. Each image, audio or video file (data point) in each project is independently assessed by multiple individuals, whose responses are then aggregated using a variety of algorithms to determine what is in a given image. The amount of required responses for a task to be considered "complete" varies, depending on the project. With relatively quick tasks, such as animal identification in Snapshot Serengeti, upwards of 70 people will see each image. In tasks that require more time, such as transcription projects like Shakespeare's World and AnnoTate, at least three people transcribe each line on each page. If enough people transcribe the same line and our algorithms deem the line to be completed to a good enough standard, these are greyed out, while outstanding lines are available to future site visitors. This approach was designed along the same principles that underpin all other Zooniverse projects, in which it is assumed that volunteers should work independently on tasks, in order that no one individual should have undue influence over others in the crowd. In the current IMLS project, however, we will test whether allowing volunteers to transcribe and work collaboratively ultimately creates better data and/or better user experiences. We will be able to compare datasets from AnnoTate and Shakespeare's World with text transcription datasets from the two new bespoke text transcription projects and, hopefully, with datasets generated at other institutions that have online crowdsourcing projects. Zooniverse is in a unique position in being able to gather these two very different kinds of data and compare them in order to determine the best outcomes. These findings will ultimately drive our design of free tools on the Project Builder. + +

In addition to participating in the classification task, users have the opportunity to communicate with other volunteers through an active, object-oriented discussion forum, called "Talk", associated with each project. Here volunteers can ask questions, interact with researchers and fellow volunteers, create their own "collections", and use hashtags to group together posts or images of interest. An example of the latter is #female from the Science Gossip project, which indicates female authors, illustrators and printers contributing to the main scientific journals in the nineteenth century (visit the Science Gossip Talk board to view the discussion around this tag). These interactions provide a rich set of experiences that allow users to personally experience the community in which they are participating, beyond simply providing classifications. Additionally, the collections allow volunteers to create their own research focal points within existing projects. During the process of transcribing, users can save images that contain content that is pertinent to their research interests by adding them to a public collection. They can then use the Talk forum to publicize their search, allowing other users to add images to that collection as well. In this way, the volunteer base can be mobilized to help other volunteers with minimal effort required.

+ +
 
+

2 IMLS Funded Effort: Approach and Focus

+ +

Through the IMLS grant, the Zooniverse will engage in a research and development program to identify and implement crowdsourcing best practices in the arenas of text and audio transcription for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read. Though to date the majority of Zooniverse projects have been based in STEM fields rather than in the humanities, several text transcription projects have already been hosted on the site. For example, the first Zooniverse humanities project was Ancient Lives, which invited volunteers to transcribe ancient papyri one letter at a time using a clickable keyboard on their screen: volunteers did not have to be fluent in ancient Greek, they only needed to character match. Over 250,000 volunteers participated in the project, and made more than 1.5 million transcriptions between 2011 and 2014.[3] Furthermore, the computational pipeline used to convert individual identified letters into consensus-based transcriptions will benefit future classification projects attempting consensus letter or line sequence identifications.[4]

+ +

By 2018 we will build four bespoke projects, two projects for text transcription and two projects for audio transcription, identified through open calls, in order to test, iterate, and research the efficacy of new and existing approaches (including within current Zooniverse and other projects) in these arenas. We will also develop the foundation for a GLAM-friendly data pipeline to export data from a Zooniverse project into GLAM collections. These functionalities are among those most frequently requested by GLAM institutions. We will work closely with four different GLAM institutions to build these bespoke crowdsourcing projects and functionalities. The text transcription open call closed in February 2017, with thirty-one submissions. The audio transcription open call will occur in fall 2017 (see Call for Projects).

+ +

From the lessons learned in building these bespoke projects, we will explore adding new tools and functionality to the Project Builder, which is freely available to any institution or user who wishes to lead a project. It is a flexible, powerful, and easy-to-use resource for building crowdsourcing projects, with a wide range of potential applications for GLAM collections, including text transcription. A basic text transcription tool is currently available, but will be refined through this grant effort. The Zooniverse has previously used this model of building bespoke projects in order to learn which tools are most useful, before implementing these tools in the Project Builder. We recognize that volunteers' time is precious, and are therefore unwilling to waste it with tools that are not proven to extract data in an efficient, high quality, and useful form. We will also draw on lessons learned from previous experiences supporting transcription projects through Zooniverse and other platforms. For example, Operation War Diary which launched in 2014 to commemorate the outbreak of the First World War, is a partnership between the National Archives (UK), the Imperial War Museum, and the Zooniverse, which invites users to tag and transcribe dates, times, places, and names found in British WWI field diaries. Historian Richard Grayson has used the data to penetrate more deeply than ever before into records of soldiers' daily lives on the front.[5] All of the Operation War Diary metadata will eventually be integrated into the National Archive catalogues. The process of integrating new metadata into an existing catalogue can be complicated, raising an important question for any GLAM specialist seeking to harness crowdsourcing at their institution. For instance, it is essential to ensure, before starting a project, that the current content management system (CMS) supports the storage of additional metadata, such as large amounts of free-text. 
If not, it then becomes necessary to use an external resource to make available the results from the crowdsourcing project. Zooniverse can and will do more to facilitate GLAMs and research groups to use and store their data.

+ +

Over the course of the IMLS project, we will also address the following research questions:

+ +

Q1: How can crowdsourcing be deployed in the arenas of text and audio transcription and metadata extraction for the purposes of unlocking big data currently trapped in GLAM sources that cannot be machine read? What methods produce the best data and make for the best user experience?

+ +

Q2: Does the current Zooniverse methodology of multiple independent transcribers and aggregation render better results than allowing volunteers to see previous transcriptions by others or indeed collaborate to create a single transcription? How does each methodology impact the quality of data, as well as depth of analysis and participation?

+ +

Q3: How can we extend our crowdsourcing expertise to more GLAM professionals and learn from them, in turn, how to adjust the Zooniverse platform to best meet their research and curatorial needs?

+ +
 
+

2.1 Addressing Q1 (Crowdsourcing for GLAM)

+ +

Only a platform like the Zooniverse can systematically address a question such as Q1: the community that has developed within the platform is made up of volunteers who move across projects, allowing us to trace the impact of differences between projects on the same volunteers. Zooniverse also has the infrastructure to implement A/B split experiments within a single project. This allows us to develop projects incorporating different practices which are specifically aimed at understanding different methodologies. Through the bespoke text and audio transcription projects, we will expand on the lessons learned through current Zooniverse text transcription projects, including Ancient Lives, AnnoTate, Old Weather, Measuring the ANZACs, Shakespeare's World, Science Gossip, Decoding the Civil War, Orchid Observers and Operation War Diary, as well as from external text transcription projects including Transcribe Bentham, FromthePage, and Scripto.

+ +

In the bespoke projects created through the IMLS grant, the features optimizing volunteer engagement and retention will include:

+ +
    +
  • Volunteer choice: volunteers choose which document to transcribe and can transcribe as little as a single line or as much as an entire document. We have found through AnnoTate and Shakespeare's World that allowing users to transcribe smaller fragments of text (without being required to complete an entire page) mitigates against forced or uncertain readings. We hypothesize and plan to fully test whether allowing microtasking helps to retain volunteers, giving them the chance to build up their skills and not make forced readings.
  • + +
  • Keeping the task simple: in Shakespeare's World and AnnoTate, volunteers drop points at the start and end of individual lines of text (not grammatical sentences) and transcribe the text contained between these two points. They do not use XML markup itself, which has proven to be a major repellent to participants in other text transcription crowdsourcing projects.4 Instead, volunteers highlight words within the transcribed line and choose among different features (e.g., insertion, deletion, expansion, etc.). We propose to use these tagged words in each line to create simple TEI markup on the back-end, for output into commonly used CMSs such as Drupal and Omeka.
  • + +
  • Narrowing the content focus to support sense-making: In Shakespeare's World, the first release (or "chapter") consists of recipes and letters, with more genres to follow. This type of structured approach will be applied to the bespoke projects, as this supports creation of narratives within diverse collections, which in turn enables subject experts to more easily foster, and volunteers to contribute to, discussions in Talk.
  • +
+ +

Features optimizing best practice in regard to data production and management will include:

+ +
    +
  • Reliable, Scalable, Open Source Code Infrastructure: The foundation for the Zooniverse platform that includes the Project Builder is an application written in Ruby on Rails which supports a powerful Application Programming Interface (API). The API serves subjects — images, video or audio — for classification by volunteers via a workflow defined by the project, and receives and records these classifications into a database. The frontend Javascript web software presents user interfaces to volunteers and supports the Project Builder. All Zooniverse code is open source and available through Github.
  • + +
  • Data Ingestion into Zooniverse: In the current Project Builder, research teams can upload batches of 500 to 1000 subjects (images, videos, or audio clips) at a time by simply dragging and dropping the files. For larger collections and for bespoke projects, typically the research team provides a hard drive and the Zooniverse team uploads the subjects to the API. Through the projects proposed here, we will create a system to better support direct ingestion of large subject sets through a user-friendly web interface, adding functionality to the foundation we already have in place within the Project Builder.
  • + +
  • Useful Output for Curation: The Smithsonian Transcription Center is regularly cited as being successful in regard to their output being easily ingestible by CMSs.[6] Current Zooniverse transcription projects are not set up with this functionality. Currently, through our Project Builder for image annotation/marking projects, research teams can download the raw classification results (i.e. all classifications by all volunteers) as well as automatically-generated aggregated results that include confidence measures on consensus. Through this IMLS-funded effort, we will work with Meghan Ferriter of the Smithsonian Transcription Center, who is on our board of advisors, to design data outputs for full text transcription and full audio transcription that are suitable for ingestion into different GLAM CMSs. A key aspect of this effort is to continue exploring best practices and approaches for transcription aggregation and confidence metrics, building on our efforts with AnnoTate, Shakespeare's World, etc.
  • +
+ +
 
+

2.2 Addressing Research Q2 (Independent vs. Collaborative Transcription)

+ +

Through the two bespoke text transcription projects, we will investigate the impact on transcription quality and volunteer experience when volunteers transcribe in isolation versus with knowledge of how others have transcribed the same document.

+ +

In terms of measuring impact on transcription quality, we will compare the rate of accuracy for individuals who transcribe in isolation on projects such as AnnoTate and Shakespeare's World versus individuals who see previous transcriptions. We will also compare the rate of accuracy in aggregated results for lines transcribed only by those working in isolation versus for lines in which all but the first transcriber sees previous transcriptions. In order to measure impact on volunteer experience, we will analyze the user behavior statistics we gather, e.g., number of transcriptions completed in a given session, length of session, number of sessions overall, sentiment analysis of discussion forum comments, etc.

+ +

There are numerous open questions in this experiment: Does knowledge of other individuals' or collective transcriptions lead individuals down the wrong path? Is transcription more or less accurate if people work in isolation or with an awareness of other people's work? Does making transcriptions visible increase retention as a result of highlighting that an individual's effort is part of a broader community effort or have the opposite effect? What environment best promotes skills acquisition, i.e. improved paleography?

+ +
 
+

2.3 Addressing Research Q3 (Feedback/Training)

+ +

We will provide numerous opportunities for input and feedback from and training for the GLAM community, specifically by working closely with our advisory board and four GLAM project partners throughout. In 2018 we will host feedback sessions at GLAM conferences and summer schools targeting GLAM institutions with collections for which text transcription, audio transcription, or image annotation/marking are of interest (we will include image annotation/marking because those tools are already included via the Project Builder). This will allow for input from a broader set of institutions on our decisions and approach for building new functionality into the Project Builder. In 2018—2019 we will host training workshops for GLAM professionals in using the Project Builder to build their own crowdsourcing projects, incorporate the results into their databases and research, and sustain and nurture their online volunteer communities.

+ +
 
+

3 Future Steps: Community Engagement, Output & How to Get Involved

+ +

The IMLS-Funded Project "Transforming Libraries and Archives through Crowdsourcing" is still in its beginning stages. Currently, we are in the process of selecting the first two bespoke crowdsourcing text transcription projects to be built and incorporated into the Zooniverse platform. The detail of our research questions will evolve alongside these new transcription projects, and during the research and development process we will use conference presentations and feedback sessions to gather input which can then guide the overall project design. The open call for the two bespoke audio transcription projects will occur in the fall of 2017. At this point, the bespoke text transcriptions will be in beta review, allowing us to take advantage of lessons learned through that first round of new projects. We believe that this self-reflexive method will simultaneously benefit our ongoing project while offering new tools and ideas to the larger GLAM and academic community.

+ +

We anticipate this proposed effort will produce two peer-reviewed publications. One article will focus on the methodology for creating, processing, and evaluating the data produced by the new projects. The second will focus on the results of our research exploring the impact of individual versus collaborative text transcription. We also note that all Zooniverse code is freely available under a liberal open source license which serves as an additional or parallel form of publication.

+ +

GLAM organizations keen to develop their own crowdsourcing projects should explore the available documentation on how to build a project and best practices for the design, launch and long term phases of a project. While building a project is easy and requires relatively little technical support from Zooniverse or your institution, make sure you have the time to work with your resulting data, and time to support your online volunteer community. Advertising the project's existence should be a long-term task, to avoid a plateau or potential drop-off of user participation. For example, Shakespeare's World received a bump in the number of daily classifications after an article was published in The New Yorker in January of 2017, over a year after the project's launch date.[7] However, it does not suffice to merely advertise the existence of a project; researchers need to engage with their users on a regular basis.5 Zooniverse's Talk platform, social media such as blogging, Twitter, Instagram, and indeed in-person or on-site events all provide important channels for engaging current or potential volunteers with your collections. We believe that GLAM organizations, with their long history of volunteer engagement, have many of the skills to work effectively with online volunteers, and will benefit in new ways through cooperation with the crowd.

+ +

In conclusion, while this project is specifically focused on text and audio transcription, it is our hope that the results, including the new Project Builder tools and GLAM data pipeline, will ultimately be used across a variety of disciplines and domains. We hope to facilitate future partnerships between GLAM institutions and volunteer communities around the world, thus extending the aims and outcomes of the National Digital Platform funded through this generous IMLS grant into an international digital platform that will benefit many individuals and institutions.

+ +
 
+

Notes

+ + + + + + + + + + + + + + + + + + + + + + + + + +
1Part of this article appeared previously as a blog post for CILIP, The Library and Information Association. Material is reproduced by express permission of CILIP.
2For a partial list of publications, please visit https://www.zooniverse.org/about/publications.
3Further discussion of the use of crowdsourcing in GLAM contexts can be found in Melissa Terras, "Crowdsourcing in the Digital Humanities", in A New Companion to Digital Humanities, eds. Susan Schreibman, Ray Siemens, and John Unsworth (John Wiley & Sons, 2016), 420-438, particularly in the section entitled "The Growth of Crowdsourcing in Cultural and Heritage Applications" (pp. 423-28). See also Crowdsourcing Our Cultural Heritage, ed. Mia Ridge (Ashgate, 2014).
4Causer and Terras, "Many Hands Make Light Work", p. 81: "It would be fair to say that for volunteers, the XML mark-up complicates participation, and it has undoubtedly dissuaded many from participating more fully, or at all." For opinions from the volunteers about the process, the authors additionally refer the reader to Causer and Valerie Wallace, "Building a Volunteer Community: Results and Findings from Transcribe Bentham", Digital Humanities Quarterly 6.2 (2012).
5Or, as Zephyr Frank, et al. put it: "Paid advertising can generate large numbers of clicks on a website. It cannot, however, produce good metadata or newly uploaded material that is relevant to the scholarly questions posed by academic researchers." "Crowdsourcing for Humanities Research" (2016) Project White Paper.
+ +
 
+
 
+

References

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
[1]Clay Shirky, "How Cognitive Surplus Will Change the World", June 2010.
[2]Tim Berners-Lee with Mark Fischetti, Weaving the Web: The Original Design and Ultimate Destiny of the World Wide Web by its Inventor (San Francisco: Harper, 1999).
[3]"P.Oxy 5156, Plutarch Moralia 660C, 661B-C (Quaestiones Convivales IV PR., 1.2)", in The Oxyrhynchus Papyri, R.-L. Chang et al., eds, vol. 78 (London, Egypt Exploration Society, 2012), 97-98.
[4]Alex C. Williams et al., "A Computational Pipeline for Crowdsourced Transcriptions of Ancient Greek Papyrus Fragments", in IEEE International Conference on Big Data, October 2014. https://doi.org/10.1109/BigData.2014.7004460
[5]Richard Grayson, "A Life in the Trenches? The Use of Operation War Diary and Crowdsourcing Methods to Provide an Understanding of the British Army's Day-to-Day Life on the Western Front", British Journal for Military History, 2.2 (2016), 160-85.
[6]Katie Mika, "Transcription Tools: a survey by Katie Mika, NDSR Resident", Harvard University, Ernst Mayr Library Blog.
[7]Roberta Kwok, "Crowdsourcing For Shakespeare", The New Yorker, 16 Jan. 2017.
+ +
 
+
 
+

About the Authors

+ +

Victoria Van Hyning is a Junior Research Fellow at Pembroke College, and a British Academy Postdoctoral Fellow. Her current project, 'Court to Convent: Early Modern English Catholic Women's Autobiography', will reveal how Catholic women articulated selfhood in the period when it was illegal to practice Catholicism, 1535 to 1829. She is also the Humanities PI of Zooniverse.org, the world-leading academic crowdsourcing organization. Her projects include Science Gossip, Shakespeare's World and AnnoTate.

+ +
 
+ +

Samantha Blickhan is the IMLS Postdoctoral Fellow in the Department of Citizen Science at the Adler Planetarium, working on transcription projects for the Zooniverse. She received her Ph.D. in Musicology from Royal Holloway, University of London, with a thesis on the palaeography of British song notation in the 12th and 13th centuries. Her research interests include music and perception, and their relationships with writing systems, technology and pedagogy.

+ +
 
+ +

Laura Trouille is co-Investigator for Zooniverse and Director of Citizen Science at the Adler Planetarium where she leads the Zooniverse web development and Teen Programs teams. While earning her Ph.D. in astronomy in 2010 studying galaxy evolution, she also earned the Center for the Integration of Research, Teaching and Learning's Delta certificate for STEM education research. As a CIERA Postdoctoral Fellow at Northwestern University's CIERA Center for Astrophysics, she continued her research on active galaxies as well as co-led the Computational Thinking in STEM project, bringing computational thinking and modeling curricular materials to high school science and math teachers.

+ +
 
+ +

Chris Lintott is a professor of astrophysics at the University of Oxford, where he is also a research fellow at New College. He is the principal investigator for Galaxy Zoo and the Zooniverse, and his own research focuses on novel modes of crowdsourcing for anomaly detection.

+ +
 
+ + + +
+ +
+ +
+ +
+
+ + \ No newline at end of file diff --git a/python/tests/files/first_monday_ojs3_landingpage.html b/python/tests/files/first_monday_ojs3_landingpage.html new file mode 100644 index 0000000..2633256 --- /dev/null +++ b/python/tests/files/first_monday_ojs3_landingpage.html @@ -0,0 +1,616 @@ + + + + + + + Surveillance, stigma & sociotechnical design for HIV + | First Monday + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to main content + Skip to main navigation menu + Skip to site footer +
+
+ + + +
+
+ +
+ + +
+

+ Surveillance, stigma & sociotechnical design for HIV +

+ + +
+
+ + + + + +
+ + Keywords: + + + HIV, online dating, design, policy, surveillance, intimacy, social computing, social justice +
+ +
+

Abstract

+

Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.

+
+ + + +
+

+ Author Biographies +

+
+
+ Calvin Liang, University of Washington, Department of Human Centered Design & Engineering +
+
+

Calvin Liang is a PhD student in Human-Centered Design and Engineering at The University of Washington. Their research broadly focuses on technology’s role in and out of queerness, health, and queer health.

+
+
+
+
+ Jevan Alexander Hutson, University of Washington, School of Law +
+
+ Jevan Hutson is a third-year law student and Gregoire Fellow at the University of Washington School of Law. He holds an M.P.S. from the Department of Information Science at Cornell University, and a B.A. from the Department of Art History and Visual Studies at Cornell University. He has been published in venues including the Association for Computing Machinery’s conferences on Computer Human Interaction and Computer Supported Cooperative Work and Social Computing. +
+
+
+
+ Os Keyes, University of Washington, Department of Human Centered Design & Engineering +
+
+ Os Keyes is a PhD student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities. +
+
+
+ + +
+ +
+ +
+
+ + “Frank Moore, Digital Divide, 2001 gouache, oil and mixed media on paper 14 3/4 x 24 1/4 inches (36,4 x 61,6 cm) sheet” + +
+
+ +
+ +
+ +
+
+ Published +
+
+ 2020-09-10 +
+
+ +
+
+
+ How to Cite +
+
+
+
+
Liang, C., Hutson, J. A., & Keyes, O. (2020). Surveillance, stigma & sociotechnical design for HIV. First Monday, 25(10). https://doi.org/10.5210/fm.v25i10.10274
+
+
+
+ + +
+
+
+
+ +
+ + +
+
+ Section +
+
+ Articles +
+
+
+ + + + + + +
+
+ +
+ + + +
+ +
+ + +
+ + + +
+ + + + + + diff --git a/python/tests/files/genders_g58_fairlie.html b/python/tests/files/genders_g58_fairlie.html new file mode 100644 index 0000000..49cada8 --- /dev/null +++ b/python/tests/files/genders_g58_fairlie.html @@ -0,0 +1,146 @@ + + + +Genders OnLine Journal - Genders OnLine Journal - Presenting innovative theories in art, literature, history, music, TV and film. + + + + + +

+ + + + + + + +

Genders OnLine Journal

+ + + + + + +

Issue 58, Fall 2013

+

Reading Maeshowe
+ Recovering the Feminine in a Neolithic Tomb

+

By CHARLOTTE FAIRLIE

+

[1] Cuween, a small Neolithic cairn, perches on top of a hill on the Orkney Mainland. A flashlight waits in a bucket by the door, and visitors crawl on hands and knees, one by one, into the pitch-black interior. After savoring a degree of darkness rare in modern life, they direct beams of light up the tapering walls to marvel at the skill of the stonemasons. It is impossible to resist the impulse to clamber into the chambers and crouch where the bones once lay. Green and smooth, Maeshowe, another Orkney cairn, rises enigmatically from the field where it has stood since around 2700 BC. The designation of this monument and the surrounding Neolithic structures as a UNESCO World Heritage Site (WHS) in 1999 significantly increased tourism to the area (Card et al. 429), so while visitors may still enter Cuween unsupervised, access to the much larger Maeshowe now requires a timed ticket, bought in advance. Throughout the year, thousands of visitors, bending uncomfortably low, shuffle through the tunnel-like passage entry, making the physical journey from light to dark and a more psychological journey from present to past. Exploring any of the Neolithic sites in Orkney is to bridge time, to feel kinship with those who built them.

+

[2] Without doubt, a major reason Maeshowe attracts so many people is its symbiotic relationship with its environment. Most famously, at sundown during the December solstice, the winter sun lines up with the door of the tomb, shines down the passage, and focuses its rays on the stone wall within. Interest in this phenomenon, the moment when the light stabs the darkness, is so high that Historic Scotland provides web-cam coverage, but Maeshowe fascinates others besides tourists and solstice celebrants. Whether they are vacation visitors, archaeologists, anthropologists, or poets, explorers experience the sites differently, applying their own intellectual tools and imagining Neolithic lives from their respective points of view. Leslie Riddoch has written that these are “Stone Age marvels which inspire and astonish,” and Simon W. Hall expresses the experiences of many when he refers to “the profound impact of entering a tomb” (160). They imply that to enter a cairn is to become one with it, to undergo a transformation. Maeshowe, which can now be experienced only under the regimented conditions required by the Historic Scotland guides, clearly retains extraordinary power to inspire. Indeed, this ancient mound has attracted a great deal of literary attention from both noted and obscure writers. Considering these cumulative interpretations, rather than relying solely on the work of archaeologists, opens up a more comprehensive, textured, and, indeed, gendered understanding of ancient history and our commonality with Neolithic peoples.

+

[3] George Mackay Brown, Kathleen Jamie, Myra Schneider, and Dilys Rose are four of the more prominent authors for whom Maeshowe has proven inspirational. They have experienced the tomb through a doubly imaginative process: first by reading it as they would read a poem and then by expressing that interpretation in writing. While Brown was an Orcadian, living most of his life alongside the Neolithic sites, Jamie, Schneider, and Rose, all of whom have Scottish roots, experience Maeshowe as tourists, drawn across the Pentland Firth to enter the passage and travel into the darkness. Significantly, all three of these more contemporary writers are women. Hall, in his valuable survey, The History of Orkney Literature, contrasts the use of the prehistoric by female Scottish writers with that of their male counterparts, stating that it is less political, that women authors take “the opportunity to reestablish the place—and, significantly, the inner lives of women in the prehistoric or early historical northern landscape” (162-163). I would argue, however, that their work also engages the public world to a greater extent and is more ideological than this statement implies. Jamie’s, Schneider’s, and Rose’s experiences in Maeshowe lead to readings of the monument that build on the archaeological interpretations, allowing us to consider the possibility of ancient gender power struggles and raising our awareness of the deep roots of masculine dominance.

+

[4] Archaeologist Colin Richards, who has written extensively about The Heart of Neolithic Orkney WHS, describes how visiting cairns must also have affected prehistoric visitors: “the journey will be one of consequence.” Moving from the light of day to the dark mysteries of a tomb’s interior “is a passage from the profane to the sacred.” As such, “it will involve transformation” (“Doorways” 70-71). However, the nature of the transformation is mysterious. Referring to single-chambered structures divided into stalls, he continues, “If the Orkney-Cromarty ‘chambered’ tombs are principally conceived as a series of doorways, the question arises: where are they leading? To what goal?” (71). In discussing the relationship between buildings and the people who used them thousands of years ago, Richards considers the figurative significance of doors. In doing so, he treats the tombs as if they were literary texts with debatable meaning, having previously pointed out that “the architecture of a chambered tomb relied on analogy and metaphor for its understanding and interpretation” (“Doorways” 67). Rather than merely being repositories for bones, the tombs, Richards asserts, were “built to be experienced visually, physically and imaginatively,” an experience which may well result in some kind of “revelation” (“Doorways” 69, 70, 76). Since he argues that buildings carry metaphoric meaning, open to imaginative interpretation, it is entirely appropriate that, when explaining this, Richards also changes to the historical present tense. His grammatical shift emphasizes that like Beowulf, Hamlet, or Moby Dick, tombs such as Maeshowe transcend time and are open to new readings, whether by trained archaeologists, pilgrims, casual visitors, or writers.

+

[5] Robert Crawford draws more explicit parallels between Maeshowe itself and literature in his essay, “Maes Howe Sappho.” Noting the continuing appeal of the tomb, how today “people still treasure” the moment that the sun lines up with the passage, he compares the ancient monument to poetry:

However different we and our family groups, our tribes, have become, we can and do still savor that sense of alignment and attunement and have our own ways of articulating some sort of consonance between ourselves, our intimate groupings, and the universe that surrounds us. Though such patternings may be deconstructed, they seem to emerge from a deep need that recurs across generations, like a persistent internal rhyme, and poetry, this most nuanced way of making with words, is a way in which that need for attunement is repeatedly articulated through language. If prehistoric sites often appear to relate people to the stars and planets, then poems continue that impulse. (61) +
+

Ancient tombs, then, prompt us to ponder our place in the universe, our identity as humans, and in that also they resemble literature. According to Kenneth Brophy, Neolithic monuments “were and are locations that embodied the biography of the builders, users, spectators, and excavators” (10). It follows that if we think of Maeshowe as a text, Brophy’s assertion that the monument absorbs the “biography” of all who have used it or visited it, positions it as an example of intertextuality. Maeshowe has many constantly changing stories to tell to its different readers, and readers will respond differently to its figurative meanings.

+

[6] In a 1977 column for The Orcadian newspaper, George Mackay Brown describes how witnessing the midwinter solstice at Maeshowe affects him: “Winter after winter I never cease to wonder at the way primitive man arranged, in hewn stone, such powerful symbolism” (“Maeshowe at Midwinter” 88). Like Richards, Brown is emphasizing the figurative qualities of the structure, which he has further explored in poetry. However, the first of his 1999 “Two Maeshowe Poems” (often printed as a stand-alone) opens not at the tomb, but with an image of the neighboring stone circle, Brodgar. Perhaps surprising to most readers, this would resonate with archaeologists since current scholarship emphasizes that the sites comprising The Heart of Neolithic Orkney are not self-contained but exist and function in relation to one another and to the surrounding landscape (See “Heart of Neolithic Orkney WHS: Setting Project” 5). As such, they should not be interpreted as discrete entities. It is fitting, then, that Brown’s poem moves seamlessly through a series of images that integrate Brodgar’s “light and darkness” with Maeshowe’s “flowers [and] stone” (a reference to the runic graffiti carved by Vikings inside the tomb) and “skulls” (Lines 1, 9, 11). The first word of the poem, “Circle,” is semantically echoed in the initial word of each ensuing stanza, “Ring,” “Wheel,” and “Round,” subtly shifting from the geometrically circular Brodgar to the tumescent mound of Maeshowe and emphasizing the cycle of “life and death” (7). For this is a poem about regeneration, how “Out of those skulls / Breaks the first green shoot, the full ear, then the bread” (11-12). 
Throughout, juxtaposed images look for the positive to outweigh the negative: “We move in shadows,” but “Brodgar has burned on the moor a dance of sun”; “Ring of quern and plough” (a quern is a stone for grinding grain) are charged to “contain / Our tumults of blood”; “The stars’ chaos is caught in a strict rein”; the word “stone” is enveloped by “flowers,” and “beauty and love”; similarly, “snow” is flanked by “sun” and “seed.” So darkness becomes light, destructive violence is subservient to the raising and grinding of grain for bread, order makes sense of the universe, the beautiful and the warm temper the hard and the cold, and new life will follow death.

+

[7] Brown’s interpretation of these monuments, his use of the architectural circularity and roundness of the Ring of Brodgar and Maeshowe as metaphors for the lifecycle and the possibility of renewal, is shared by archaeologists, who despite its being a burial site, have also associated Maeshowe and its rituals with the agricultural year. Neolithic people were not nomadic but had gradually become settled farmers, living by the routines and rhythms of the seasons, which, according to Richards, constituted “an analogy with the human life cycle and past generations” (“Doorways” 65). Time’s passage was the organizational framework for survival as well as mortality, and the tombs, he writes, were “a metaphorical extension of daily life” (“Doorways” 76). Trevor Garnham, an architect, develops that idea further: “Burying bones in the earth was perhaps to seek some metaphoric relationship with the planting of seeds. In its maturity and death, the seed containing the essence of its own renewal served as the inspiration for the hope of life’s rebirth in some other form” (87). In pairing skeletal remains with seeds as an expression of hope for the future, Garnham’s analogy is comparable to the positive final image of Brown’s poem, the “skulls” engendering the “green shoots” and the “bread” of life.

+

[8] Brown had written earlier of Maeshowe in his 1996 poem, “Maeshowe: Midwinter,” choosing then to focus on the solstice. However, the imagery here is not rooted in the agricultural cycle, the earthly world of querns, ploughs, and bread; instead, he connects the pre-Christian tomb to the Christian calendar. The opening phrase, “Equinox to Hallowmass,” immediately integrates the astronomical with the sacred, giving the season of “darkness” both physical and spiritual dimensions (1). The religious imagery continues in the second stanza as it evokes “St Lucy,” whose feast day falls on the shortest day of the year (6). She is portrayed as a weaver whose “shuttle” creates “a dark web” that “fills the loom” (7-9), placing at the centre of the poem a world in which light is completely absent: “The blackness is solid as a / stone that locks a tomb. / No star shines there” (10-12). To be in such a void, with no guiding star, would seem like a moment of psychological despair, yet just as the days begin to lengthen immediately after the solstice, the poem also brightens. The moment when the sun enters the passage is the “true ceremony,” suggesting that perhaps the pagan reverence for nature carries particular authenticity. Then “the last fleeting solstice flame” is “caught up,” leading to an optimistic note as the children—the future—sing with “voices like leaves of light” (19). Again, the poem ends with an image of rebirth, but its tone is less biological and more cosmological.

+

[9] While Brown’s poems use these dual frames of reference in order to explore the themes of regeneration that Maeshowe expresses, the biological and cosmological are not at odds. Garnham defines the cosmos as “an all-encompassing world of things and phenomena [. . . .] The essential character of this early form of cosmos bound every aspect of a people’s life into reciprocal relationships with the forces that give shape to their world” (9). The central argument of his book places Neolithic Orkney in this context. Similarly, reading Brown’s two Maeshowe poems together reveals that the “green shoot” which produces the “bread” corresponds to the youthful “voices like leaves of light.” In fact, his insertion of “leaves,” with its agrarian connotations, into that final line establishes the connection, recognizes that the complex architectural system of domestic houses, burial chambers, and stone circles symbolizes the idea that the activities for which they were designed—working, eating, loving, sleeping, worshipping, dying, and the possibility of rebirth—are the web of human existence. The physical bread and the metaphysical song are one.

+

[10] In their respective responses to Maeshowe, Kathleen Jamie, Myra Schneider, and Dilys Rose also address the theme of the cycle of life and death. Jamie’s essay, “Darkness and Light,” describes a quest: she seeks a good, positive darkness because, in the 21st century, it has become impossible “to see the real dark for the metaphorical dark . . .the death-dark.” Enjoyment of the “natural, courteous dark,” she has come to believe, has been squeezed out by the Christian belief in a metaphorical darkness that stands for the opposite of salvation (9-10). However, as she is planning this trip, a friend points out that “Maes Howe is a metaphor,” perhaps exposing a flaw in Jamie’s thinking: possibly the natural and metaphorical darknesses are inseparable (10 emphasis added). Although her visit to Maeshowe takes place a couple of days before the solstice, the artificial lights of a surveyor’s crew assault her eyes, so she rediscovers no “courteous darkness” and witnesses “no resurrecting beam of sunlight” (19). Nevertheless, through Maeshowe, she becomes reconciled to the conventional negative concept of darkness. In terms of “wonder” similar to Brown’s in The Orcadian, she asks, “Were they the first people . . . to articulate this metaphor of light and dark, of life and death?” and reflects upon its significance:

For five thousand years we have used darkness as the metaphor of our mortality. We were at the mercy of merciless death, which is darkness. When we died, they sent a beam of midwinter light in among our bones. What a tender, potent gesture. In the Christian era, we were laid in our graves to face the rising sun. We’re still mortal, still don’t want to die, don’t want our loved ones to die. (19-20) +
+

Her rejection of a metaphor that she has considered “[worn] out” and “redundant” (4, 9) turns out to have been less literary and more personally psychological, for Jamie’s visit to the tomb leads to her acceptance of mortality. Whereas previously she has blamed Christianity, she now appreciates that the Christian concept of darkness is part of a continuum of dread traceable back to Neolithic times and forward to our own. The “tender, potent gesture” of the light penetrating the dark of the tomb, therefore, offers consolation, ameliorating our most profound fears (20).

+

[11] In her poem, “Maeshowe,” Myra Schneider also describes a guided tour of the cairn, during which the speaker uses the second person singular to address a hypothetical visitor, initially giving the sense that to enter the burial place feels like death as the “chill seeps into your body” (14). However, this ominous impression is immediately dismissed because “a stillness that’s other than death inhabits / this place where the undead gather to greet the dead” (15-17). The journey through the passage will take “you” to a place that is not oblivion but, instead, is where the living may consort with their ancestors. Again, the boundary between life and death, which can seem so irrevocable, becomes less absolute and, therefore, less threatening. After the visit is over, its impact will remain, and the speaker imagines her visitor’s memories:

In midwinter you’ll visualize the sun piercing the dark that swaddles seeds, see it falling on the aligned entrance, its white shine splitting to burnish the passage wall, flood the ground with gold. (22-26) +
+

These images recall Garnham’s theory: that the burial of bones is connected metaphorically to the planting of seeds. In the speaker’s memory, the dark cradles seeds, the germ of life, rather than bones. Once sunlight enters the tomb, a radiant moment occurs in which the “ground” will turn “gold,” like a field of ripe grain. Schneider’s poem, like Brown’s, affirms the archaeological reading of Maeshowe as a place of renewal, but in this case that renewal goes beyond the promise of the agricultural cycle. An individual will be able to experience, perhaps during times of psychological or spiritual gloom, the moment of glory when the sun is “piercing / the dark.” There is a Romantic quality to these lines: Maeshowe will stay with Schneider’s speaker as those daffodils stay with Wordsworth, “to flash upon the inward eye / That is the bliss of solitude,” to stimulate the imagination (24). Having herself benefited from the tomb’s restorative qualities, the speaker is inspired to spread the word, to share her revelation with “you,” the reader.

+

[12] Besides the drama of the solstice, another inspirational feature of Maeshowe is the Viking runes carved on the interior walls. Referring to these inscriptions as “The first island poems,” Brown quotes them emphatically in the second of the paired poems: “INGIBIORG IS THE LOVELIEST GIRL / HERMUND WITH A HARD AXE CARVED RUNES” (“Two” 13, 18-19). Many have been struck by the simple humanity of these statements, as well as the paradox inherent in this lusty youthful scrawling being hidden in a tomb. Dilys Rose, in “Maeshowe Nipple,” for instance, lists the prosaic concerns of the Vikings, portraying them as “intrepid” but also homesick, missing “sweethearts and family” (4, 9). At the ends of their respective poems, both Brown and Rose emphasize that Maeshowe was merely a temporary shelter for the Vikings: the “young seamen climbed out of Maeshowe, / Their nostrils wide to the salt wind”; “the dragon boats moved on” (Brown “Two” 23-24; Rose 11). Crawling out of the subterranean tomb and heading for further maritime adventures, the men re-enter the world, extending the overall theme of regeneration. Brown, as we have seen, has already linked the tomb with the life-giving promise of “the first green shoot, the full ear, then the bread” in the first of these paired poems. Rose, in similar terms, also connects the Viking runes with the reassuring knowledge that there will be a crop next year: over the centuries, “their tongue / took root and sprouted from invaded soil / green words for Father, Daughter, Bread” (11-13). Here, in the final lines, the Viking vocabulary is fresh and verdant, a harbinger of new human life and the grain that nourishes it. Since runic characters are “straight-branched” (Rose 4), they resemble rows of rudimentary skeletal stick figures which have been buried in the tomb. The bony runes, therefore, have become metaphorical seeds, and Rose’s speaker, like Garnham, sees hope in the bone/seed analogy.

+

[13] It is clear, to summarize briefly, that these four creative writers read Maeshowe much as archaeologists and historians of architecture have done, as an expression of hope for the future, particularly in relation to the coming of spring, but also at a more personal level. The texts suggest that to visit these tombs is, as Richards also emphasizes, transformative. Like their ancestors, contemporary visitors are changed, in some manner revitalized, especially if they witness the sun’s midwinter alignment, which Brown describes as a “pledge of renewal, a cry of resurrection” (“Maeshowe in Midwinter” 88). However, in the work of Jamie, Schneider, and Rose, a further, more political restoration is at work, for all three use images equating Maeshowe with the female body.

+

[14] Kathleen Jamie states early in her essay, “We are conceived and carried in the darkness,” emphasizing the positive, life-giving qualities of the dark, and inviting the reader to see Maeshowe as a uterus (4). The womb/tomb imagery is developed further when she eroticizes the winter solstice as “a complicit kiss,” during which “the beam of the setting sun shines along the passage, and onto the tomb’s back wall” (12). When she goes inside the tomb, she expects “not utter darkness, but perhaps a wombish red”; however, this is denied her because of the lights of the surveyors, one of whom is “folded, foetus-like, into the little cell in the back wall”: a foetus implanted in the very place where the sunbeam strikes (12,13). When Jamie leaves, she describes taking “the smallest and most challenging of journeys, squeezing down a passageway and out into the world of sound and moving air” (17). The tunnel that admits the beam has become a birth canal, so Jamie’s transformation is not only her intellectual reassessment of the metaphorical value of darkness; she visualizes her own rebirth in more literal terms too, with Maeshowe cast as the mother.

+

[15] Myra Schneider’s “Maeshowe” also hints that to visit the tomb is to return to the womb when the speaker remarks that although “you” are part of a tour group, you will realize that you are “alone” and have “never travelled so far back / so far in” (8-10). This analogy is made more explicit later in the poem when the sun enters the passage: “In that deep chamber / you’ll be bathed in red, not the red spilt in hatred—/the red that’s birth, the heart looming with the blood” (24-28). In the vision that the speaker evokes for the visitor’s memory, therefore, the “dark that swaddles seeds” not only nurtures and protects the grain that will ripen into crops, but also the fertilized ovum (23). With no dazzling and intrusive surveyors’ lights, Schneider suggests that it is possible for us to experience the “wombish red” that was denied Jamie, blood that is the force of life rather than the mark of violence.

+

[16] Dilys Rose’s poem, “Maeshowe Nipple,” on the other hand, in addressing the Viking use of the tomb, acknowledges that violence has taken place. The title, of course, immediately signals that Maeshowe is female, and the opening lines graphically describe the tomb’s external anatomy: a “breast,” with an “aureola / sandy-rimmed, the nipple leaking a pale trail / to hidden chambers” (1-3). Within, Maeshowe’s chambers have been “invaded” by men who “inscribed their conquests” and “totted up the loot” (12, 4, 6). Even though the poem has initially compared the cairn to a breast rather than a womb, this seems like a rape or an assault by men exercising their power and keeping track of their plunder. As human and homesick as the poem presents the young men, it does not forget that their presence in Maeshowe is as uninvited intruders who leave their runic seeds carved into the chamber walls.

+

[17] To make sense of this pattern of imagery, it is helpful to turn to an earlier female author, similarly inspired by her visit to a Neolithic site. Naomi Mitchison wrote Early in Orcadia after a friend took her to another of Orkney’s chambered tombs, Isbister, which has no passage entry, because “she knew it would waken something in me” (8). Set in Neolithic times, the novel follows a family and its descendants as they settle on Orkney, establish homes and villages, and erect the monuments in which they practice their religious rituals. Mitchison depicts the cairns predating the stone circles (both Isbister and Maeshowe are, in fact, thought to have been built before Brodgar) and imaginatively describes the changing beliefs prompting these architectural developments. Tradition holds that pregnant women must visit the tomb in order that the ancestral spirit will be passed to their children (132). One woman, Ba, making this journey, reflects that a “few moons” have passed since she became pregnant and stopped menstruating. She also knows that a powerful goddess, “the big bad Moon Woman had once had an honouring place,” had watched over the dead (119). However, the Moon Woman has been supplanted by the sun. The burial place was “pulled apart and scattered by the Sun Man and the bulls. After that came the beginning of their own honouring place where the bones lay and where you must go down on your knees before you could get in” (119). The later passage cairn, then, is a creation of the masculine sun, the same sun that shines down the passageway at midwinter. Accompanied by bulls, also male, the Sun Man has ravaged the Moon Woman’s tomb and designed a new one to suit his own needs. Even so, the burial place is still associated with female fertility. Nervously, Ba enters “on her hands and knees . . . 
under and between great stones.” Once inside, though, she thinks of the moments before she conceived her child: “She was waiting, almost as she had waited in the soft sand behind that rock in the sun-warmed geo a few moons back” (130). For Ba, the tomb is not frightening. She recalls not a violent rape, but a loving encounter, and the darkness feels as warm as the “geo” (an Orcadian word referring to a deep, narrow fissure in a cliff) where she met her lover. Following her memory of the moment of conception, she is “push[ed] . . . back, back to the way out, back to the square of light, to the way out into the real world on hands and knees as one must” (130). Like Jamie, Ba is compelled to crawl, to battle her way through the passage to be reborn.

+

[18] By the end of Early in Orcadia, the stone circle, with its emphasis on light rather than dark, is becoming the ultimate manifestation of the transfer of power from the Moon Woman to the Sun Man. Its significance is explained by the “Great Man,” who is “painted with sun circles,” to Moon Woman after he has summoned her to his presence: “The great tall stones . . . were so raised to show the way of the sun, who is our master and our maker” (169). Moon Woman, however, is aware of the injustice of this arrangement: “They said that the moon was the servant of the sun, to do what he wanted, but that, Moon Woman knew, was not right. In her own mind she unsaid it” (170). At first she is jealous and afraid, but the final vision of the novel is hers, and it is, to an extent, a reconciliation of powers:

If I were to say a few small and easy words to the Great Man, if I were to move myself in a certain way, then we would be sun and moon. Then I would put my fingers onto the colour, onto that knife, onto his eyes, . . . eyes, onto that round, shining sun that hangs over his heart, fingering it so that my fingers would meet his, me going . . . onto all parts of him. He would be mine as the sun is the moon’s. (176) +
+

She is picturing an intertwining of sun and moon, of masculine and feminine—a consummation. The partnership is not one of complete equality, though, for she also envisions not that the sun will be the master and the moon the servant, but that he will be hers, that the moon will possess the sun, that her status will be restored.

+

[19] Mitchison’s fictional representation of light/sun/man emerging as the object of worship and awe, assuming the rank previously held by dark/moon/woman, is an idea rooted across cultures: “A fundamental polarity in many creation myths,” according to Trevor Garnham, “contrasts the dark, fecund, harbouring earth with the up-drawing sun.” (145). He points out, for example, that “by the time of the Celtic occupation of Britain, there were well-established beliefs and practices focused on the sun” and that in Norse mythology, “a male hierarchy supplanted older, matriarchal law” (161, 109). Analyzing the archaeological sites within this paradigm, Garnham argues, supports the theory that religious practice fundamentally changed along with the architecture, that “ritual activity associated with burial cairns became transferred to stone circles” (152).

+

[20] Maeshowe, however, suggests a mid-point in this ritualistic shift because although, like earlier stalled cairns, it is dark and womb-like, its annual climactic moment is when the sun lights up the passage. Garnham sees the Neolithic architecture of Orkney as a progression. The first structures, the houses, were purely domestic; they had a “nurturing role” (66). The houses at the coastal village site, Scara Brae, therefore, “seem to be fundamentally powerful symbols of protection and gathering, echoing that of the pot and the basket” (70). Since the manufacture of both pots and baskets was the work of women, Garnham is reading the houses as essentially feminine. They were vessels, their stone walls embanked by earth. Both Garnham and Richards point out that the houses were models for the tombs: the passage graves are structurally similar to the houses at Scara Brae, and both were covered with turf (Garnham 48; Challands, Muir & Richards 242, 245). Cairns of the Maeshowe type, with passage entries, however, were the later forms. The earlier stalled structures, such as Midhowe, on the island of Rousay, did not feature the tunnel entrance.

+

[21] Archaeologists do not agree on the social significance of passage cairns and sun circles, the extent to which their development reveals a move to a more hierarchical society. Challands, Muir, and Richards state, “In many ways, everything about the architecture of Maeshowe enforces a notion of separation, division, and restriction” (247). Elsewhere, Richards and another co-writer are more guarded. They point out that the tomb resembles House 2 at the nearby Barnhouse settlement, a larger house than any at Scara Brae that was probably “highly restricted on the basis of an individual’s status, probably additionally defined in terms of age and gender.” However, they also warn that there is insufficient archaeological evidence to “leap to conclusions about a patriarchal group of ‘elders’ who used knowledge as a commodity to maintain their power over women and younger men” (Muir & Richards 204). Although cautious, they do acknowledge that “power and authority,” probably based on “cosmological beliefs,” would have been necessary to build the monuments (199). Leaning not only on physical but also anthropological evidence, Garnham’s view, on the other hand, is that the more formal structure does support the idea of hierarchy and that the estimated 100,000 man/hours that would have been necessary to build it point to a more complex social structure that had to extend beyond the local community (128). Furthermore, he writes, the layout of individual chambers “can be read as a metaphor of primogeniture” (74). Like Richards, Garnham interprets the passage as a symbol of privilege because it was hard to get inside. However, citing Eliade’s Patterns in Comparative Religion, he also emphasizes that there is “a close connection between solar theology and the elite” (163). In this context it seems that “allowing access to the sun . . . was more important that [sic] allowing access to members of the tribe” (131-132).

+

[22] Maeshowe can be seen, then, as expressing a point of tension between earth and sun in which the dark tomb is literally infiltrated by solar rays on one day only. The subsequent building of the Circle of Brodgar elevates the stature of the sun. Fully above ground, the center of its astronomical and religious year occurs not in December, but in June, at the midsummer solstice. Garnham points out that while a smaller circle, the Stones of Stenness, is open to the sun at its “point of maximum power,” Maeshowe allows the sun inside only when it is “at its lowest ebb.” Except at midwinter, “the tomb is dark, cold, and filled with white bones, echoing the whiteness of the moon” (207). Although Stenness actually predates Maeshowe by perhaps 400 years, throwing off the neat chronology of Early in Orcadia, Garnham’s interpretation of Maeshowe and the stone circles parallels Mitchison’s literary response to the Isbister tomb: compared to earlier cairns, Maeshowe is a more patriarchal development, the passageway allowing the masculine sun to displace the feminine “whiteness of the moon,” and yet the bones, the metaphorical seeds, still lie dormant; the presence of Moon Woman endures.

+

[23] Although Early in Orcadia ends with Moon Woman’s vision of a mingling of sun and moon, of masculine and feminine, there is a note of uncertainty as she asks herself, “Should I, then?” (176). She does not ask “Can I?” but “Should I?” Her question is not whether she is personally capable, but whether it would be wise to challenge the elite power structure in the name of justice. Readers are left without an answer, but since women are still fighting for equality in the institutions of politics and religion, it is reasonable to assume that if Moon Woman did attempt it, she met with a great deal of resistance. It is with this in mind, then, that we can return to the Maeshowe experiences of Jamie, Schneider and Rose. Their visits to the cairn suggest that to see it merely as a symbol of agricultural regeneration or even more broadly of hope, is incomplete. Something more needs to be resurrected, and their use of the female imagery effectively acknowledges and reclaims a feminine narrative for Maeshowe. In Rose’s poem, 12th century Vikings may take up residence inside, but 900 years later, the reader is instructed to “See,” to bear witness to “a green breast in a green field,” the most nurturing part of a woman’s body surrounded by the new growth of spring (1). When Schneider refers to the “red that’s birth” rather than the “red spilt in hatred,” and describes how the sun will “burnish the passage wall, / flood the ground with gold” and, similarly, when Jamie refers to the “complicit kiss,” it is as if Moon Woman’s consummation has finally taken place and justice restored.

+

[24] Richards asks where the doors of tombs lead, to what “revelation.” Indeed, the creative writing of Jamie, Schneider, and Rose transports readers through Maeshowe’s entryway towards “revelation.” Their collective responses help us to recognize the humanity of Neolithic peoples, to appreciate how common experiences connect us to the past. They ask us to consider the roots of sexual discrimination, the possible marginalization of women 5000 years ago. More universally, they honor the memory of displaced matriarchal societies and, thus, prompt us to reflect on the status of women today. While, as Hall points out, male authors of the mid-twentieth-century Scottish Literary Renaissance had a nationalist political agenda, “looking for Scotland in Scotland’s prehistory” (160), these female writers look to the past for a feminist renewal, both personal and political. As such, their interpretations complement and illuminate those of archaeologists. Naomi Mitchison, acknowledging that she may be “treading on the toes of archaeologists,” points out that their physical “evidence may not always offer a clear interpretation, in fact it very seldom does” (113). For despite their painstaking sifting (both literal and figurative) of physical evidence, archaeologists must, finally, apply their own imaginations.

+

[25] Archaeologists themselves recognize the uncertainty inherent in drawing conclusions about ancient societies from the surviving fragments of their lives. In reference to the recent discovery of a complex of temples at the Ness of Brodgar, Richards has said, “This was a ceremonial centre, and a vast one at that. But the religious beliefs of its builders remain a mystery” (qtd. in McKie). In fact, the excavation of this temple complex is prompting a reassessment of the entire Heart of Neolithic Orkney. Tom Muir, of the Orkney Museum, goes so far as to assert that “the whole text book of British archaeology for this period will have to be torn up and rewritten from scratch thanks to this place” (qtd. in McKie). Even as archaeologists, using sophisticated technology, scrape away the dust of time from this long-buried site, it remains true that “Insights can only come from interpretation” (Jones and Richards 195). It is in this interpretative arena that science must join forces with the arts and humanities in the search for knowledge, for a fuller understanding.

+

[26] George Mackay Brown has written, “People in 2000 AD are essentially the same as the stone-breakers [. . .] of 3000 BC” (“Brodgar Poems” lines 10-12). Knowing where we have come from, fleshing out our understanding of the prehistoric world and, therefore, ourselves, takes the skills and multiple perspectives not only of scientists, archaeologists, architects, and anthropologists, but also essayists, poets, and more. The interdisciplinary synergy involved in comparing archaeological, anthropological, and literary interpretations of Maeshowe sheds light on the shadows of the past, raises questions about the more elusive shadows of Neolithic women, and provides historical context for our understanding of gender relations across time. Like crawling through the passage into the dark and out to the light, the empirical and literary journeys into the mysteries of Maeshowe are indeed transformative, exhuming the bones of the past that we may better nurture the seeds of the future.

+

ACKNOWLEDGEMENTS. Thanks are due to Edward Gale Agran, Stephen Potthoff, and the anonymous reviewers for their time and valued advice.

+

WORKS CITED

+

Bevan, Archie, and Brian Murray. Eds. The Collected Poems of George Mackay Brown. London: John Murray, 2005. Print.

+

Brown, George Mackay. “Brodgar Poems (1992).” In Bevan and Murray. 308-312. Print.

+

---. “Maeshowe: Midwinter.” 1996. In Bevan and Murray. 320. Print.

+

---. “Maeshowe at Midwinter.” 1977. Under Binkie’s Brae. Edinburgh: Gordon Wright Publishing, 1979. 87-88. Print.

+

---. “Two Maeshowe Poems.” 1999. In Bevan and Murray. 420-421. Print.

+

Card, Nick, et al. “Bringing a Landscape to Life? Researching and Managing ‘The Heart of Neolithic Orkney’ World Heritage Site.” World Archaeology 39.3 (2007): 417-435. EBSCO Academic Search Complete. Web. 29 Jun. 2011.

+

Challands, Adrian, Tom Muir, and Colin Richards. “The Great Passage Grave of Maeshowe.” Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 229-248. Print.

+

Crawford, Robert. “Maes Howe Sappho.” Yale Review 95.1 (2007): 60-65. OhioLINK Electronic Journal Center. Web. 29 Jun. 2011.

+

Garnham, Trevor. Lines on the Landscape, Circles from the Sky: Monuments of Neolithic Orkney. Stroud, Gloucestershire: Tempus, 2004. Print.

+

Hall, Simon W. The History of Orkney Literature. Edinburgh: John Donald/Birlinn Ltd., 2010. Print.

+

“Heart of Neolithic Orkney WHS: Setting Project.” Historic Scotland. 2008. EBSCO Academic Search Complete. Web. 30 Jun. 2011.

+

Jamie, Kathleen. “Darkness and Light.” Findings: Essays on the Natural and Unnatural World. Ed. Jamie. St. Paul, MN: Graywolf, 2005. 3-22. Print.

+

McKie, Robin. “Neolithic Discovery: Why Orkney is the Centre of Ancient Britain.”

+

The Guardian / The Observer. 6 Oct. 2012. Web. 16 Mar. 2013.

+

Mitchison, Naomi. Early in Orcadia. Glasgow: Richard Drew, 1987. Print.

+

Jones, Siân, and Colin Richards. “The Villagers of Barnhouse.” Dwelling Among the Monuments: The Neolithic Village of Barnhouse, Maeshowe Passage Grave and Surrounding Monuments at Stenness, Orkney. Ed. Colin Richards. Cambridge: McDonald Inst. For Archaeological Research, 2005. 195-204. Print.

+

Richards, Colin. “Doorways into Another World: The Orkney-Cromarty Chambered Tombs.” Vessels for Ancestors: Essays on the Neolithic of Britain and Ireland in Honour of Audrey Henshall. Ed. Niall Sharples and Alison Sheridan. Edinburgh: Edinburgh UP, 1992. 62-76. Print.

+

Riddoch, Lesley. “Stone Age Marvels Which Inspire and Astonish: Wonders of Scotland.” The Scotsman. 13 Feb. 2006. Web. 30 Jun. 2011.

+

Rose, Dilys. “Maes Howe Nipple.” Bodywork. Edinburgh: Luath Press, 2007. Print.

+

Schneider, Myra. “Maeshowe.” Circling the Core. London: Enitharmon Press, 2008. 23-24. Print.

+

Wordsworth, William. “I wandered lonely as a cloud.” The Norton Anthology of English Literature. Eighth Ed. Ed. Stephen Greenblatt and M.H. Abrams. New York: Norton, 2006. 305-306. Print.

+

Contributor's Note

+

CHARLOTTE FAIRLIE teaches English at Wilmington College, in Wilmington, Ohio. Her published work focuses on Scottish literature and rural life in literature. She is currently co-editing an anthology of poetry relating to scythes and mowing.

+ Current Issue
+ Download
+ Editorial Board
+ Contributor Guidelines
+ Recent Issues
+ Links & Books
+
+ + + + + + +

Copyright 2010 Ann Kibbey. + + All Rights Reserved Worldwide.
+

+

+
+ Download || Editorial Board || Submission + + Guidelines || Current Issue || Recent Issues || Links + + & Books +
+

+

+ + + + + + + + + +

Genders

Genders Journal
+ 226 UCB
+ University of Colorado
+ Boulder, CO 80309
+ http://www.Genders.org
+

+

+

+

+ + \ No newline at end of file diff --git a/python/tests/files/nature_article.html b/python/tests/files/nature_article.html new file mode 100644 index 0000000..177da83 --- /dev/null +++ b/python/tests/files/nature_article.html @@ -0,0 +1,1379 @@ + + + + + + + + + + + + + + + + + + + More than 100 scientific journals have disappeared from the Internet + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
NEWS + +
+ +
+

More than 100 scientific journals have disappeared from the Internet

+ +
+ Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. +
+
+ +
+ +
+
+
+ +
+ +
+
+
+ + +
+

Search for this author in:

+ +
+ + + +
+
+
+
+ +
+
+ +
+ + + + + + +
+ +
+

Scholarly journals are supposed to provide a lasting record of science. But over the past two decades, 176 open-access journals — and many of the papers published in them — have disappeared from the Internet, according to an analysis published on 27 August1.

“There shouldn’t really be any decay or loss in scientific publications, particularly those that have been open on the web,” says Mikael Laakso, an information scientist at the Hanken School of Economics in Helsinki, and a co-author of the study, which was posted on the arXiv preprint server. He and his colleagues identified 176 titles whose online presence vanished between 2000 and 2019.

+

More than half of these journals were in the social sciences and humanities, although life sciences, health sciences, physical sciences and mathematics were also represented. Eighty-eight of the journals were affiliated with a scholarly society or a research institution. The analysis also identified 900 journals that are still online but seem to have stopped publishing papers, so might be vulnerable to vanishing in the near future.

The study lays out a "compelling case" for the vulnerability of online journals, says Elizabeth Lightfoot, a librarian at Florida International University in Miami.

Vanishing journals

Journals can disappear from the Internet for a number of reasons, says Laakso. The publisher might stop paying to keep its publication’s webpage afloat, for example, or journals might be hosted on an online platform that belongs to an academic institution and is left behind when the site or server is updated.

Journals are supposed to be preserved in digital archives when this happens. Services such as the LOCKSS (Lots of Copies Keep Stuff Safe) Program, which was launched by Stanford Libraries in 1999, aim to ensure that publications remain available even when the publisher is no longer around. LOCKSS works by making multiple copies of content that is stored on the servers of participating libraries, who pay an annual fee to have their collections preserved. Similar initiatives, including CLOCKSS, Portico and the Public Knowledge Project’s Preservation Network (PKP PN), have emerged over the past two decades. These vary in cost and coverage: Some work with libraries, others with publishers — services such as PKP PN are free for journals that sign up. Tens of thousands of titles are currently curated in such preservation schemes. But, Laakso says, there are dozens of journals that fall through the cracks.

+

Pinning down whether a journal is truly unavailable online is a challenge, because there is no single database that tracks the activity of open-access journals, says Lisa Matthias, one of the authors of the study and a PhD student at the Free University of Berlin. Databases such as the Directory of Open Access Journals (DOAJ) don’t keep track of journals that no longer publish — and journals that cease publishing or stop maintaining their presence on the web usually do so silently.

To find out how many journals had vanished, the team manually collected historical data from several lists of titles, including the DOAJ, Ulrichsweb and Scopus. Then they checked to see if any of the titles they identified were listed on the Keepers Registry, which keeps track of journals that are enrolled into digital preservation schemes. Finally, they went to the Internet Archive’s Wayback Machine to access snapshots of now-offline journals’ websites to see when they had last published, and when the content was last available on the Internet. Journals were considered “vanished” if less than 50% of their content was still freely available online (the researchers acknowledge that some journals could exist in print form or behind a paywall).

The majority of the 176 vanished journals had disappeared within 5 years of becoming inactive — the point at which they stopped publishing papers. Around one-third of them disappeared within one year of the last publication. The researchers used this ‘life cycle’ to estimate that another 900 inactive open-access journals could be at risk of vanishing.

Preserving the literature

Subscription journals were not included in the study, Laakso says, because paywalls mean that they would have had to have used a different method to collect the data. He adds that because of this and other limitations, the study probably underestimates the number of journals that have disappeared. “It’s really hard to pin down when something doesn't absolutely exist, but we tried our best,” Laakso says. “We hope that there will be more refined and automatic ways to detect these in the future.”

+

Thib Guicherd-Callin, the acting manager of the LOCKSS Program, says it’s not surprising that there are journals that aren't captured by existing preservation services. Although many groups have used the open-source LOCKSS software, efforts to launch digital preservation initiatives are still “woefully underfunded”, he adds. “The desire to preserve these at-risk works is there,” he adds, but few institutions are investing the resources necessary to identify these publications and make sure they’re included in a digital preservation scheme.

Matthias says that the responsibility for ensuring inactive journals don’t disappear should be shared between publishers, authors, librarians and preservation services. Lightfoot agrees that a coordinated and collaborative effort is necessary. However, she adds, “the twin challenges of what that effort might look like and who would fund it make the pathway forward murky at best”.

+
+ + + +
+

References

  1. 1.

    Laakso, M., Matthias, L. & Jahn, N. Preprint at https://arxiv.org/abs/2008.11933 (2020).

Download references

+
+ + + + + + +
+
+

Nature Briefing

+

An essential round-up of science news, opinion and analysis, delivered to your inbox every weekday.

+
+
+ + + + + +
+ + +
+ + + +
+
+ + + + +
+ + +
+
+
+ +
+ + +
+ + + + + + + +
+ + +
+ + +
+ +
+ + +
+ + + +
+
+ Nature Briefing +

Sign up for the Nature Briefing newsletter — what matters in science, free to your inbox daily.

+
+
+
+ + + + + + +
+ + +
+
+
+
+ + +
+ +
+ + +
+ + +
+ +
+ + +
+ Get the most important science stories of the day, free in your inbox. + Sign up for Nature Briefing + +
+ +
+ +
+ +
+
+ + + + + + + + + + + + + + + + + + +
+
+

Search

+
+
+ +
+ +
+
+
+ + + + + + + + + + diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py new file mode 100644 index 0000000..4154aa5 --- /dev/null +++ b/python/tests/test_html_metadata.py @@ -0,0 +1,137 @@ + +import datetime + +from sandcrawler.html_metadata import * + + +def test_html_metadata_plos() -> None: + + with open('tests/files/plos_one_article.html', 'r') as f: + plos_html = f.read() + + meta = html_extract_biblio(HTMLParser(plos_html)) + assert meta is not None + assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" + assert meta.doi == "10.1371/journal.pone.0213978" + assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + assert meta.contrib_names == [ + "Yang Li", + "Tuanjie Wang", + "Lin Wang", + "Mingjun Sun", + "Zhizhong Cui", + "Shuang Chang", + "Yongping Wu", + "Xiaodong Zhang", + "Xiaohui Yu", + "Tao Sun", + "Peng Zhao", + ] + assert meta.container_name == "PLOS ONE" + assert meta.container_abbrev == "PLOS ONE" + # "Apr 22, 2019" + assert meta.release_date == datetime.date(year=2019, month=4, day=22) + assert meta.first_page == "e0213978" + assert meta.issue == "4" + assert meta.volume == "14" + assert meta.container_issn == "1932-6203" + assert meta.publisher == "Public Library of Science" + assert "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. 
Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references + assert meta.release_type == "article-journal" + + +def test_html_metadata_elife() -> None: + + with open('tests/files/elife_article.html', 'r') as f: + elife_html = f.read() + + meta = html_extract_biblio(HTMLParser(elife_html)) + assert meta is not None + assert meta.title == "Parallel visual circuitry in a basal chordate" + assert meta.doi == "10.7554/eLife.44753" + assert meta.contrib_names == [ + "Matthew J Kourakis", + "Cezar Borba", + "Angela Zhang", + "Erin Newman-Smith", + "Priscilla Salas", + "B Manjunath", + "William C Smith", + ] + assert meta.container_name == "eLife" + # 2019-04-18 + assert meta.release_date == datetime.date(year=2019, month=4, day=18) + assert meta.publisher == "eLife Sciences Publications Limited" + + +def test_html_metadata_nature() -> None: + + with open('tests/files/nature_article.html', 'r') as f: + nature_html = f.read() + + meta = html_extract_biblio(HTMLParser(nature_html)) + assert meta is not None + assert meta.title == "More than 100 scientific journals have disappeared from the Internet" + assert meta.doi == "10.1038/d41586-020-02610-z" + assert meta.contrib_names == [ + "Diana Kwon", + ] + assert meta.container_name == "Nature" + # "2020-09-10" + assert meta.release_date == datetime.date(year=2020, month=9, day=10) + assert meta.publisher == "Nature Publishing Group" + # note: some error in dublin code in nature HTML resulting in duplication + assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk." 
+ + +def test_html_metadata_ojs3() -> None: + + with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f: + ojs3_html = f.read() + + meta = html_extract_biblio(HTMLParser(ojs3_html)) + assert meta is not None + assert meta.title == "Surveillance, stigma & sociotechnical design for HIV" + assert meta.doi == "10.5210/fm.v25i10.10274" + assert meta.contrib_names == [ + "Calvin Liang", + "Jevan Alexander Hutson", + "Os Keyes", + ] + assert meta.container_name == "First Monday" + assert meta.container_abbrev == "1" # NOTE: bad source metadata + assert meta.container_issn == "1396-0466" + # "2020/09/10" + assert meta.release_date == datetime.date(year=2020, month=9, day=10) + assert meta.lang == "en" + assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate." 
+ assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729" + assert meta.release_type == "article-journal" + + +def test_html_metadata_dlib() -> None: + + with open('tests/files/dlib_05vanhyning.html', 'r') as f: + dlib_html = f.read() + + meta = html_extract_biblio(HTMLParser(dlib_html)) + assert meta is not None + assert meta.doi == "10.1045/may2017-vanhyning" + # "2017-05-15" + assert meta.release_date == datetime.date(year=2017, month=5, day=15) + +def test_html_metadata_dc_case() -> None: + """ + This tests that CSS selector attribute lookups are not case-sensitive. + """ + + snippet = """ + + + + + Hi. + """ + + meta = html_extract_biblio(HTMLParser(snippet)) + assert meta.issue == "123" -- cgit v1.2.3 From c4cf72914560f92e914a5dbf7360637f6c24f323 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Oct 2020 15:52:54 -0700 Subject: HTML metadata: fix type warnings --- python/sandcrawler/html_metadata.py | 4 +++- python/tests/test_html_metadata.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'python/tests') diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 71715c2..a9536a6 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -219,7 +219,9 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]: raw_date = meta.pop('raw_date', None) if raw_date: - meta['release_date'] = dateparser.parse(raw_date).date() + parsed = dateparser.parse(raw_date) + if parsed: + meta['release_date'] = parsed.date() raw_release_type = meta.pop('raw_release_type', None) if raw_release_type: diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py index 4154aa5..4d670e5 100644 --- a/python/tests/test_html_metadata.py +++ b/python/tests/test_html_metadata.py @@ -36,7 +36,7 @@ def test_html_metadata_plos() -> None: assert meta.volume == "14" assert meta.container_issn == "1932-6203" assert 
meta.publisher == "Public Library of Science" - assert "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references + assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references assert meta.release_type == "article-journal" @@ -134,4 +134,5 @@ def test_html_metadata_dc_case() -> None: """ meta = html_extract_biblio(HTMLParser(snippet)) + assert meta is not None assert meta.issue == "123" -- cgit v1.2.3 From 3d56509ef83226a808ebb078f5cac9815afb5d9d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 29 Oct 2020 14:31:21 -0700 Subject: html: more metadata tests --- python/tests/files/peerj_oa_article.html | 2365 ++++++++++++++++++++++++++++++ python/tests/test_html_metadata.py | 88 ++ 2 files changed, 2453 insertions(+) create mode 100644 python/tests/files/peerj_oa_article.html (limited to 'python/tests') diff --git a/python/tests/files/peerj_oa_article.html b/python/tests/files/peerj_oa_article.html new file mode 100644 index 0000000..f2cf365 --- /dev/null +++ b/python/tests/files/peerj_oa_article.html @@ -0,0 +1,2365 @@ + + + + + + + The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles [PeerJ] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+ + 203 + Citations + + + +   + Views + + + +   + Downloads + +
+
+
+
+
+
+ +
+ + + + + + +
+ + + + +
+ + + +
+ + +
+ +
+
+
+

The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles

+
+
+
View article
+
+
+
+
+
+
+ + 11 days ago +
+
RT @AMAldanaS: También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradore…
+ +
+
+
+
+
+ +
@ces43 May I recommend Piwowar and Priem et al's article for that topic? https://t.co/Fnm0vtYtKS
+ +
+
+
+
+
+
+ + 40 days ago +
+
También revisamos el tema de la publicación en abierto: tipos y ventajas. Discutimos este artículo de Piwowar y colaboradores de 2018 en donde se evidencia la ventaja de publicar en green open access: . https://t.co/1HAmYlfoBP
+ +
+
+
+
+
+
+ + 62 days ago +
+
RT @InandVertebrate: How many articles are published in Open Access every year? +https://t.co/xkUMWA5jbJ +#openaccess #openscience #scicomm
+ +
+
+
+
+
+
+ + 62 days ago +
+
RT @InandVertebrate: How many articles are published in Open Access every year? +https://t.co/xkUMWA5jbJ +#openaccess #openscience #scicomm
+ +
+
+
+
+
+
+ + 62 days ago +
+
RT @InandVertebrate: How many articles are published in Open Access every year? +https://t.co/xkUMWA5jbJ +#openaccess #openscience #scicomm
+ +
+
+
+
+
+ +
How many articles are published in Open Access every year? +https://t.co/xkUMWA5jbJ +#openaccess #openscience #scicomm
+ +
+
+
+
+
+
+ + 90 days ago +
+
RT @InandVertebrate: How many articles are published in Open Access every year? +https://t.co/xkUMWzNIkb +#openaccess #openscience #scicomm
+ +
+
+
+
+
+
+ + 90 days ago +
+
RT @InandVertebrate: How many articles are published in Open Access every year? +https://t.co/xkUMWzNIkb +#openaccess #openscience #scicomm
+ +
+
+
+
+
+
+ + 90 days ago +
+
RT @InandVertebrate: How many articles are published in Open Access every year? +https://t.co/xkUMWzNIkb +#openaccess #openscience #scicomm
+ +
+
+
+
+
+ +
How many articles are published in Open Access every year? +https://t.co/xkUMWzNIkb +#openaccess #openscience #scicomm
+ +
+
+
+
+
+
+ + 102 days ago +
+
@Mietmensch @unpaywall Gotcha. It's tough to generalize the answer to that, as it depends a lot on the specific journal and field. We dove into the details more in this paper, though: https://t.co/HRus7k3P0B
+ +
+
+
+
+
+
+ + 103 days ago +
+
@dwhly @unpaywall @hpiwowar historical stats are in here: https://t.co/HRus7k3P0B + +prediction for future is here: https://t.co/ex0vvThc9G
+ +
+
+
+
+
+
+ + 104 days ago +
+
RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…
+ +
+
+
+
+
+
+ + 104 days ago +
+
RT @OxonAndrew: A look ‘under the hood’ of open access publishing: + +“The state of OA: a large-scale analysis of the prevalence and impact o…
+ +
+
+
+
+
+
+ + 104 days ago +
+
RT @egonwillighagen: the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJ…
+ +
+
+
+
+
+
+ + 104 days ago +
+
A look ‘under the hood’ of open access publishing: + +“The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles” ⁦@thePeerJ⁩ https://t.co/yCu96hCzMK
+ +
+
+
+
+
+ +
the vast majority of research cannot be accessed if you do not have a big pile of money #openaccess https://t.co/RZ7UJV72Uf https://t.co/DE9MPIKTdZ
+ +
+
+
+
+
+ +
RT @jasonpriem: @egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for valu…
+ +
+
+
+
+
+
+ + 105 days ago +
+
@egonwillighagen @unpaywall yes, we do have this for all years. see https://t.co/HRus7k3P0B and the data behind it for values.
+ +
+
+
+
+
+ +
RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018 +https://t.co/xkUMWA…
+ +
+
+
+
+
+
+ + 115 days ago +
+
RT @InandVertebrate: The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018 +https://t.co/xkUMWA…
+ +
+
+
+
+
+
+ + 115 days ago +
+
The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles, 2018 +https://t.co/xkUMWA5jbJ +#openaccess #openscience #scicomm
+ +
+
+
+
+
+
+ + 121 days ago +
+
@lisalibrarian @ashleydfarley @andy_nobes Usual def of "bronze" in @our_research is free to read, but does not have CC license. +https://t.co/T34fQja0nN
+ +
+
+
+
+
+
+ + 146 days ago +
+
RT @InandVertebrate: How many articles are published in Open Access every year? +https://t.co/xkUMWzNIkb +#openaccess #openscience #scicomm
+ +
+
+ +
+
+
+
+ + PeerJ +
+ + +
+ +
+ Note that a Preprint of this article also exists, first published August 2, 2017. +
+
+ + +
+
+

Introduction

+

The movement to provide open access (OA) to all research literature is now over fifteen years old. In the last few years, several developments suggest that after years of work, a sea change is imminent in OA. First, funding institutions are increasingly mandating OA publishing for grantees. In addition to the US National Institutes of Health, which mandated OA in 2008 (https://publicaccess.nih.gov/index.htm), the Bill and Melinda Gates Foundation (http://www.gatesfoundation.org/How-We-Work/General-Information/Open-Access-Policy), the European Commission (http://ec.europa.eu/research/participants/data/ref/h2020/grants_manual/hi/oa_pilot/h2020-hi-oa-pilot-guide_en.pdf), the US National Science Foundation (https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf), and the Wellcome Trust (https://wellcome.ac.uk/press-release/wellcome-trust-strengthens-its-open-access-policy), among others, have made OA diffusion mandatory for grantees. Second, several tools have sprung up to build value atop the growing OA corpus. These include discovery platforms like ScienceOpen and 1Science, and browser-based extensions like the Open Access Button, Canary Haz, and Unpaywall. Third, Sci-Hub (a website offering pirate access to full text articles) has built an enormous user base, provoking newly intense conversation around the ethics and efficiency of paywall publishing (Bohannon, 2016; Greshake, 2017). Academic social networks like ResearchGate and Academia.edu now offer authors an increasingly popular but controversial solution to author self-archiving (Björk, 2016a; Björk, 2016b). 
Finally, the increasing growth in the cost of toll-access subscriptions, particularly via so-called “Big Deals” from publishers, has begun to force libraries and other institutions to initiate large-scale subscription cancellations; recent examples include Caltech, the University of Maryland, University of Konstanz, Université de Montréal, and the national system of Peru (Université de Montréal, 2017; Schiermeier & Mega, 2017; Anderson, 2017a; Université Konstanz, 2014). As the toll-access status quo becomes increasingly unaffordable, institutions are looking to OA as part of their “Plan B” to maintain access to essential literature (Antelman, 2017).

+

Open access is thus provoking a new surge of investment, controversy, and relevance across a wide group of stakeholders. We may be approaching a moment of great importance in the development of OA, and indeed of the scholarly communication system. However, despite the recent flurry of development and conversation around OA, there is a need for large-scale, high-quality data on the growth and composition of the OA literature itself. In particular, there is a need for a data-driven “state of OA” overview that is (a) large-scale, (b) up-to-date, and (c) reproducible. This paper attempts to provide such an overview, using a new open web service called oaDOI that finds links to legally-available OA scholarly articles.1 Building on data provided by the oaDOI service, we answer the following questions:

+
    +
  1. +

    What percentage of the scholarly literature is OA, and how does this percentage vary according to publisher, discipline, and publication year?

    +
  2. +
  3. +

    Are OA papers more highly-cited than their toll-access counterparts?

    +
  4. +
+

The next section provides a brief review of the background literature for this paper, followed by a description of the datasets and methods used, as well as details on the definition and accuracy of the oaDOI categorization. Results are then presented, in turn, for each research question, and are followed by a general discussion and conclusions.

+
+
+

Literature Review

+

Fifteen years of OA research have produced a significant body of literature, a complete review of which falls outside the scope of this paper (for recent, in-depth reviews, see Tennant et al. (2016) and McKiernan et al. (2016)). Here we instead briefly review three major topics from the OA literature: defining OA and its subtypes, assessing the prevalence of OA, and examining the relative citation impact of OA.

+

Despite the large literature on OA, the term itself remains “somewhat fluid” (Antelman, 2004), making an authoritative definition challenging. The most influential definition of OA comes from the 2002 Budapest Open Access Initiative (BOAI), and defines OA as making content both free to read and free to reuse, requiring the opportunity of OA users to “crawl (articles) for indexing, pass them as data to software, or use them for any other lawful purpose.” In practice, the BOAI definition is roughly equivalent to the popular “CC-BY” Creative Commons license (Creative Commons, 2018). However, a number of other sources prefer a less strict definition, requiring only that OA “makes the research literature free to read online” (Willinsky, 2003), or that it is “digital, online, [and] free of charge.” (Matsubayashi et al., 2009). Others have suggested it is more valuable to think of OA as a spectrum (Chen & Olijhoek, 2016).

+

Researchers have identified a number of subtypes of OA; some of these have near-universal support, while others remain quite controversial. We will not attempt a comprehensive list of these, but instead note several that have particular relevance for the current study.

+
    +
  • +

    Libre OA (Suber, 2008): extends user’s rights to read and also to reuse literature for purposes like automated crawling, archiving, or other purposes. The Libre OA definition is quite similar to the BOAI definition of OA.

    +
  • +
  • +

    Gratis OA (Suber, 2008): in contrast to Libre, Gratis extends only rights to read articles.

    +
  • +
  • +

    Gold OA: articles are published in an “OA journal,” a journal in which all articles are open directly on the journal website. In practice, OA journals are most often defined by their inclusion in the Directory of Open Access Journals (DOAJ) (Archambault et al., 2014; Gargouri et al., 2012).

    +
  • +
  • +

    Green OA: Green articles are published in a toll-access journal, but self-archived in an OA archive. These “OA archives” are either disciplinary repositories like ArXiv, or “institutional repositories” (IRs) operated by universities, and the archived articles may be either the published versions, or electronic preprints (Harnad et al., 2008). Most Green OA articles do not meet the BOAI definition of OA since they do not extend reuse rights (making them Gratis OA).

    +
  • +
  • +

    Hybrid OA: articles are published in a subscription journal but are immediately free to read under an open license, in exchange for an article processing charge (APC) paid by authors (Walker & Soichi, 1998; Laakso & Björk, 2013).

    +
  • +
  • +

    Delayed OA: articles are published in a subscription journal, but are made free to read after an embargo period (Willinsky, 2009; Laakso & Björk, 2013).

    +
  • +
  • +

    Academic Social Networks (ASN): Articles are shared by authors using commercial online social networks like ResearchGate and Academia.edu. While some include these in definitions of OA (Archambault et al., 2013; Björk, 2016b), others argue that content shared on ASNs is not OA at all. Unlike Green OA repositories, ASNs do not check for copyright compliance, and therefore as much as half their content is illegally posted and hosted (Jamali, 2017). This raises concerns over the persistence of content, since, as was the case in October 2017, publishers can and do issue large-scale takedown notices to ASN ordering the removal of infringing content (Chawla, 2017). Others have raised questions about the sustainability and ethics of ASN services themselves (Fortney & Gonder, 2015). Due to these concerns, and inconsistent support from the literature, we exclude ASN-hosted content from our definition of OA.2

    +
  • +
  • +

    “Black OA”: Articles shared on illegal pirate sites, primarily Sci-Hub and LibGen. Although (Björk, 2017) labels these articles as a subtype of OA, the literature has nearly no support for including Sci-Hub articles in definitions of OA. Given this, we exclude Sci-Hub and LibGen content from our definition of OA.

    +
  • +
+

Based on the consensus (and in some cases, lack of consensus) around these definitions and subtypes, we will use the following definition of OA in the remainder of this paper: OA articles are free to read online, either on the publisher website or in an OA repository.

+
+

Prevalence of OA

+

Many studies have estimated what proportion of the literature is available OA, including Björk et al. (2010), Laakso et al. (2011), Laakso & Björk (2012), Gargouri et al. (2012), Archambault et al. (2013), Archambault et al. (2014) and Chen (2013). We are not aware of any studies since 2014. The most recent two analyses estimate that more than 50% of papers are now freely available online, when one includes both OA and ASNs. Archambault et al. (2014), the most comprehensive study to date, estimates that of papers published between 2011 and 2013, 12% of articles could be retrieved from the journal website, 6% from repositories, and 31% by other mechanisms (including ASNs). Archambault et al. (2014) also found that the availability of papers published between 1996 and 2011 increased by 4% between April 2013 and April 2014, noting that “backfilling” is a significant contributor to green OA. Their discipline-level analysis confirmed the findings of other studies, that the proportion of OA is relatively high in biomedical research and math, while notably low in engineering, chemistry, and the humanities.

+

This Archambault et al. (2014) study is of particular interest because it used automated web scraping to find and identify OA content; most earlier efforts have relied on laborious manual checking of the DOAJ, publisher webpages, Google, and/or Google Scholar (though see Hajjem, Harnad & Gingras (2006) for a notable early exception). By using automated methods, Archambault et al. were able to sample hundreds of thousands of articles, greatly improving statistical power and supporting more nuanced inferences. Moreover, by creating a system that indexes OA content, they address a major concern in the world of OA research; as Laakso et al. (2011) observes: “A major challenge for research...has been the lack of comprehensive indexing for both OA journals and their articles.” The automated system of Archambault et al. (2014) is very accurate—it only misclassifies a paper as OA 1% of the time, and finds about 75% of all OA papers that exist online, as per Archambault et al. (2016). However, the algorithm is not able to distinguish Gold from Hybrid OA. More problematically for researchers, the database used in the study is not open online for use in follow-up research. Instead, the data has since been used to build the commercial subscription-access database 1science (http://www.1science.com/oanumbr.html).

+
+
+

The open access citation advantage

+

Several dozen studies have compared the citation counts of OA articles and toll-access articles. Most of these have reported higher citation counts for OA, suggesting a so-called “open access citation advantage” (OACA); several annotated bibliographies have been created to track this literature (SPARC Europe, 2015; Wagner, 2010; Tennant, 2017). The OACA is not universally supported. Many studies supporting the OACA have been criticised on methodological grounds (Davis & Walters, 2011), and an investigation using the randomized-control trial method failed to find evidence of an OACA (Davis, 2011). However, recent investigations using robust methods have continued to observe an OACA. For instance, McCabe & Snyder (2014) used a complex statistical model to remove confounding effects of author selection (authors may selectively publish their higher-impact work as OA), reporting a small but meaningful 8% OACA. Archambault et al. (2014) describe a 40% OACA in a massive sample of over one million articles using field-normalized citation rates. Ottaviani (2016) used a natural experiment as articles (not selected by authors) emerged from embargoes to become OA, and reports a 19% OACA excluding the author self-selection bias for older articles outside their prime citation years.

+
+
+
+

Methods

+
+

OA determination

+
+

Classifications

+

We classify publications into two categories, OA and Closed. As described above, we define OA as free to read online, either on the publisher website or in an OA repository; all articles not meeting this definition were defined as Closed. We further divide the OA literature into one of four exclusive subcategories, resulting in a five-category classification system for articles:

+
    +
  • +

    Gold: Published in an open-access journal that is indexed by the DOAJ.

    +
  • +
  • +

    Green: Toll-access on the publisher page, but there is a free copy in an OA repository.

    +
  • +
  • +

    Hybrid: Free under an open license in a toll-access journal.

    +
  • +
  • +

    Bronze: Free to read on the publisher page, but without a clearly identifiable license.

    +
  • +
  • +

    Closed: All other articles, including those shared only on an ASN or in Sci-Hub.

    +
  • +
+

These categories are largely consistent with their use throughout the OA literature, although a few clarifications are useful. First, we (like many other OA studies) do not include ASN-hosted content as OA. Second, categories are exclusive, and publisher-hosted content takes precedence over self-archived content. This means that if an article is posted in both a Gold journal and an OA repository, we would classify it as Gold, not Green. Put another way, publisher-hosted content can “shadow” archived articles that would otherwise be Green. This definition of Green (“available in a repository but not available from the publisher”) is often used in the OA literature (including by Stevan Harnad, the coiner of the Green and Gold terms; Harnad et al., 2008), but this usage is not unanimous. Some studies allow a given article to be both Gold and Green; compared to these, our classification system does undercount Green. Hybrid articles share properties with Gold articles (both are free to read and are licensed for re-use), but differ in the venue of publication (i.e., Hybrid articles are published in journals not considered open access by the DOAJ) and in that Hybrid articles are not necessarily immediately available (i.e., they may only be freely available after an embargo). We also add a novel subcategory, Bronze. Bronze shares attributes of Gold and Hybrid; like both, Bronze OA articles are publisher-hosted. Unlike Gold OA, Bronze articles are not published in journals considered open access in the DOAJ. Unlike Hybrid, Bronze articles carry no license information. Although this lack of identifiable license may not be intentional, without an identifiable license, the articles are free to read but do not allow extended reuse rights beyond reading. It is also not clear if Bronze articles are temporarily or permanently available to read for free.

+

Finally, we should add that, although our categories of choice reflect the OA literature, they do not necessarily reflect the more complex reality of scholarly publishing today. Organizations like SciELO and Redalyc in Latin America have been acting simultaneously as publishers and repositories and many of the articles found on their site do not fall neatly into the above categories (Packer, 2010).

+
+
+

The oaDOI system

+

We assigned the categories above by calling the oaDOI service with a DOI for each item. The oaDOI returns a link to a legally-available OA version of the article, when one is available (https://oadoi.org/). It contains records for all 88 million Crossref DOIs.3 The oaDOI service crawls, aggregates, normalizes, and verifies data from many sources including PMC (https://www.ncbi.nlm.nih.gov/pmc/), BASE (https://www.base-search.net/about/en/), DOAJ (https://doaj.org/), and thousands of institutional repositories and publishers. The oaDOI system offers a fast, free API with no rate-limits, allowing it to support a variety of other services and tools. At the time of writing, oaDOI processes approximately 500,000 requests daily–roughly twice the daily uses of Sci-Hub4 (Bohannon, 2016; Himmelstein et al., 2017). The majority of this volume comes from around 700 academic libraries, who use oaDOI to help readers find articles where the library has no subscription access, addressing the discoverability problem (Chen, 2013). The oaDOI service also powers the Unpaywall browser extension, which helps readers to find legal OA copies of paywalled articles as they browse; Unpaywall currently has over 80,000 active users. The oaDOI codebase is open source, and the service is free and open via an open API.

+
+
+

Accuracy of oaDOI

+

To assess the accuracy of our automated OA determination, a random subsample of 500 articles was chosen from our main “Crossref-DOI” sample, described below. We manually searched the internet for each article in our subsample to determine if the paper was freely available on the publisher’s website, or on another website, such as an institutional repository, an academic social networking site, or on a personal webpage. DOIs were resolved by appending the DOI to “https://doi.org/”. If the full text was available through that link, articles were marked as being freely available from the publisher’s site. If articles required a subscription, the title of the article was entered into Google Scholar (GS) and into Google to find alternative versions (i.e., preprints or archived copies). If the fulltext was found on any publisher page or OA repository, these were marked as being freely available from an archive. If the only available open copy was hosted on an academic social network (like Academia.edu or ResearchGate), this was noted but for the sake of the study these were not counted as any category of OA, and were instead added to the “Closed” category.

+

The performance of oaDOI is summarized below, compared to these manual accuracy checks. The complete dataset behind this summary is available in supplementary information. Using this data we calculated the recall and precision of the system. “Recall” asks the question, “when an article is open, how often does oaDOI correctly identify it as open?” The recall of the service is 77.0%, meaning that 77% of the truly open articles are correctly identified as open by oaDOI. “Precision” asks the question, “When oaDOI says an article is open, how often is it correct?” The precision of the system is 96.6%, meaning that 96.6% of the time that oaDOI reports an article is open, it really is open.

+

These results can be roughly compared to the recall of 86.4% and precision of 99.1% reported by Archambault et al. (2014) for their automated system. Their accuracy estimate was also calculated based on a sample of 500 data points, giving each estimate a margin of error of ±4.5 percentage points. The Archambault study used a narrower date window for their sample (starting in 1996, versus our Crossref-DOI sample which was not time restricted), resulting in a more homogeneous task, which may partially explain their somewhat better performance.

+

The oaDOI service is optimized for high precision, rather than high recall. The very high precision of oaDOI means that any estimates derived from the database can be considered a conservative estimate of the actual percentage of open access in the literature. That is, we can safely assume that when oaDOI reports a certain percentage of open access, the real percentage is at least that high—and almost certainly higher given that recall was less than perfect. Put another way, oaDOI delivers very few false positives (where it mistakenly calls an article open), but a relatively high number of false negatives (where it mistakenly calls an article closed) (Table 1). Future improvements to the system are planned that will improve recall while keeping precision high.

+
+Table 1: +
Accuracy of the prototype version of the oaDOI service used in this study.
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
oaDOI reports OpenoaDOI reports ClosedManual count Total (ground truth)
Open14443187
Closed5308313
Total149351500
+
DOI: 10.7717/peerj.4375/table-1 +
+
+
+
+
+

Study samples

+

Three samples of DOI-assigned scholarly resources are summarized in Table 2 and described further below.

+
+

Crossref sample

+

The first sample, “Crossref-DOIs,” is a random sample of 100,000 journal articles with Crossref DOIs, across all publication years. There are approximately 88 million Crossref DOIs in total as of May 2017. In order to exclude books, datasets, and other non-article content, we sampled only items whose “type” was listed as “journal-article” in the Crossref API metadata; there are 66 million of these. To verify the accuracy of Crossref metadata, we manually checked 150 items assigned to type “journal-article,” and determined that 93% were indeed journal articles; the remaining 7% were mostly journal front-matter such as tables of contents or instructions to authors.

+
+Table 2: +
Summary of samples used in this study.
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Sample nameSample sizePopulation sampledPurposePopulation size
Crossref-DOIs100,000All journal articles with Crossref DOIs, all years.Estimate percentage of the literature that is OA.66,560,153
WoS-DOIs100,000All citable WoS articles with DOIs, 2009–2015.Estimate citation impact of recent OA papers, and also OA prevalence by discipline.8,083,613
Unpaywall-DOIs100,000All articles accessed by Unpaywall users over a 1-week period in 2017.Estimate percentage of OA experienced by users of the Unpaywall extension.213,323
+
DOI: 10.7717/peerj.4375/table-2 +
+
+

The purpose of this sample is to roughly proxy the scholarly literature as a whole. As such, it has strengths and weaknesses. One weakness is that although Crossref includes information on citation counts and discipline categorization, we found these to be quite incomplete, and therefore not useful for the present study. Another is that researchers in the scientometrics and OA fields have largely relied on other indexes, particularly Scopus and Web of Science (WoS), to represent the literature as a whole; this makes our results more difficult to compare to previous work. Finally, DOIs are known to be less frequently assigned by publishers in certain disciplines (like humanities; Gorraiz et al., 2016), in certain geographic regions (particularly the developing world), and among older articles (Boudry & Chartron, 2017); consequently, these segments will be underrepresented in our sample. This said, Scopus and WoS are also known to underrepresent important segments of the literature (Mongeon & Paul-Hus, 2016), and so this failing is not limited to Crossref. Moreover, the Crossref sample has important advantages of its own over other indexes. While no sample of the scholarly literature will be complete in every regard, the Crossref index is more expansive than other sources: in July 2017 there were 67 million journal articles indexed in Crossref compared to 30 million in Scopus (https://www.elsevier.com/solutions/scopus/content). Also, Crossref has the advantage of being entirely free and open to use, while Scopus and WoS are subscription-access databases; this allows the study data to also be free and open, promoting replication and reuse of our results in further research. However, we did turn to the subscription-access WoS in order to answer questions about the discipline and citation counts of OA articles, since Crossref data is lacking in these areas.

+
+
+

WoS sample

+

The second sample, “WoS-DOIs”, is a random sample of 100,000 journal articles with DOIs that are indexed by Web of Science. The sample was drawn from a local version of the WoS database at the Observatoire des sciences et des technologies (OST) at the Université du Québec à Montréal. Only articles that WoS defines as “citable items” are included in the sample; this excludes non-peer reviewed content such as editorial material and news items. This sample is restricted to articles published between 2009 and 2015, due to DOI availability constraints. The sample of 100,000 articles is randomly drawn from a population of 8 million articles and reviews with a DOI in WoS published between 2009 and 2015 as of May 2017.

+

Because the WoS sample is restricted to certain publication years, due to availability of DOIs in the WoS database, this sample is unsuitable for estimating the proportion of the total literature that is OA. However, it is more useful than the Crossref sample in some ways: the WoS sample included accurate discipline information for each article (described below), and also citation counts. Therefore we use the WoS sample to assess OA prevalence by discipline and also the citation impact of recent OA papers. We do not encourage comparisons between the OA percentages in the WoS sample and the Crossref sample, because of large differences in the sampling frames.

+

Documents in the WoS-DOIs sample were classified using the National Science Foundation (NSF) journal classification system. This system assigns every journal exactly one “discipline” (a high-level categorization) and exactly one “specialty” (a finer-grained categorization). Because this is a journal-level classification, all articles from a given journal are assigned the same discipline and specialty as the journal. A downside of this approach is that the system classifies multidisciplinary journals (e.g., Nature, PNAS, PLOS ONE) as “biomedical research”, despite their publishing many articles from other fields.5 In these cases, we used a ground-up, article-by-article classification approach. Each article published in a list of multidisciplinary journals was assigned to the NSF specialty which appeared most frequently in its own reference list. In other words, papers published in multidisciplinary journals were classified at the article level (instead of at the journal level) to the subject area which they cite most frequently.6

+

We assess the relative impact of open and closed articles, using citations as an indicator of their scholarly impact. There are several properties of articles, however, that can confound this kind of comparison. Chief among these are the article’s discipline (some fields are much more cited than others) and its age (older articles have had more time to gather citations). In order to address this, we computed a normalized expected number of citations for each article, based on its age and its NSF specialty, by comparing it to the average citations for similar articles.7

+

Using this approach, each article receives an average relative citation (ARC). An ARC of 1.0 indicates that a document was cited according to expectations based on documents published in the same year and NSF specialty, while an ARC above or below 1.0 indicates that the citation impact was above or below world average, respectively. Using these field-normalized citation rates, citation impact can be compared across scientific disciplines as well as across years. We can also compute mean ARCs for groups of articles, like “all open articles” or “all closed articles”, allowing us to compare normalized impact between these two groups. Analyzing results on the level of NSF disciplines, data is not shown for the Humanities (n = 1,091) and Arts (n = 164), because they are underrepresented both in the Web of Science and in terms of DOI coverage.

+
+
+

Unpaywall sample

+

The third sample, “Unpaywall-DOIs”, is a random sample of 100,000 articles accessed by users of the free, open-source Unpaywall browser extension, gathered over a one-week time window. We collected IP addresses and DOI requests made to the oaDOI service through the Unpaywall browser extension during the week of June 5–June 11, 2017. In that time period there were 374,703 total accesses, 213,323 unique DOIs, and 42,894 unique IP addresses gathered in total, from which 100,000 unique DOIs were randomly sampled.

+

This sample was used to assess the prevalence of OA experienced by users of the Unpaywall extension (since Unpaywall uses oaDOI data to find OA). It is a convenience sample of what articles people are interested in reading, and thereby lets us roughly estimate the percent of this literature that is OA. The sample has serious limitations, however: we don’t know the demographics of Unpaywall users, and we are aware of a bias towards users from the US (as determined by the IP addresses). As such, we cannot accurately generalize the results by education level, discipline, or purpose in reading the scholarly literature.

+
+
+
+
+

Results

+
+

RQ1. What percent of the literature is open access?

+
+

How much of the literature is OA?

+

We found 27.9% (95% CI [27.6–28.2]) of all DOI-assigned journal articles are OA, using the Crossref-DOI sample. Based on this, we estimate there are 18.6 million OA articles with Crossref DOIs (95% CI [18.4–18.8]). This is the total population of OA articles that can be identified and accessed by oaDOI. Given our finding (described in Methods above) that the oaDOI service finds 77% of OA compared to manual searches, we can further estimate that an additional 3.5 million articles are OA but not detectable by this version of oaDOI.

+

People reading the literature using the Unpaywall browser extension encounter a significantly higher proportion of OA: we found that 47.0% (95% CI [46.7–47.3]) of the Unpaywall-accessed sample is open access. The main reason for this is article age: since this sample is based on the behavior of actual readers, it is disproportionately comprised of recent articles. In fact, half the accessed articles were published in the last 2 years. Recent articles are much more likely to be OA than their older counterparts (see Results ‘How does Open Access vary by year of publication?’ below).

+
+
+

What types of Open Access are most common?

+

The proportion of OA by subtype is relatively similar across the samples, as shown in Fig. 1 and Table 3. Green OA represents a relatively small percentage of OA articles in all three samples. This is partly because self-archived articles are only counted as Green where there is no publisher-hosted option available; that is, Green OA is sometimes “shadowed” by Gold, Bronze, or Hybrid articles. Bronze is the most common OA subtype in all the samples, which is particularly interesting given that few studies have highlighted its role. We manually inspected a small sample of Bronze articles in order to understand this subcategory more; we found that while many Bronze articles were Delayed OA from toll-access publishers, nearly half were hosted on journals that published 100% of content as free-to-read but were not listed on the DOAJ and did not formally license content (using CC-BY or any other license). Such journals might be better described as “Dark Gold” or “Hidden Gold” than Bronze. A more complete examination of Bronze falls outside the scope of this study, and therefore further investigation will be undertaken in future work.

+
Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample.
+
+
+Figure 1: Percent of articles by OA status, Crossref-DOIs sample vs Unpaywall-DOIs sample.
+
+
+Table 3: +
Percent of the literature that is OA, by type, in three samples of 100,000 journal articles, with 95% confidence intervals.
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Access typeCrossref-DOI All journal articles with Crossref DOIs, all years. (“Articles with DOIs” in Fig. 1)WoS-DOIs All citable WoS articles with DOIs, 2009–2015Unpaywall-DOIs All articles accessed by Unpaywall users over a 1-week period in 2017
Estimate95% CIEstimate95% CIEstimate95% CI
OA (all types)27.9%27.6–28.236.1%36.0–36.247.0%46.7–47.3
Bronze OA16.2%16.0–16.512.9%12.6–13.215.3%15.0–15.6
Hybrid OA3.6%3.3–3.94.3%4.0–4.68.3%8.0–8.6
Gold OA3.2%2.9–3.57.4%7.1–7.714.3%14.0–14.6
Green OA4.8%4.5–5.111.5%11.2–11.89.1%8.8–9.4
Closed72.0%71.8–72.463.9%63.8–64.053.0%52.7–53.3
+
DOI: 10.7717/peerj.4375/table-3 +
+
+
+
+

How does Open Access vary by year of publication?

+

Figure 2 presents the number (Fig. 2A) and proportion (Fig. 2B) of papers by access category and publication date. Articles published in the last 20 years are increasingly OA, and this trend shows no sign of slowing. More recent articles are more likely to be OA, with the most recent year examined also containing the most OA: 44.7% of 2015 articles are OA (95% CI [43.3–46.2%]), including 17.6% Bronze (95% CI [16.2–19.1]), 9.4% Hybrid (95% CI [8.0–10.9]), 11.3% Gold (95% CI [9.9–12.8]), and 6.3% Green (95% CI [4.9–7.8]). Well over one million OA papers were published in 2015. This growth trend has largely been driven by dramatic growth in Gold and Hybrid OA since the year 2000. However, more than 20% of papers published before the digital age are also freely available. The majority of these older OA papers are Bronze, and based on their age they are probably more precisely Delayed OA, although additional investigation will be required to confirm this. Bronze OA remains remarkably constant as a proportion of the literature for all publication years examined.

+
Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs.
+
+
+Figure 2: Number of articles (A) and proportion of articles (B) with OA copies, estimated based on a random sample of 100,000 articles with Crossref DOIs.
+
+

The number and proportion of Green papers must be interpreted with particular caution, due to several factors. First, unlike publisher-hosted OA (Gold, Bronze, and Hybrid), the date when the Green article became open is generally different from the date the article was first published. Authors often self-archive articles years after (or before, in the case of preprints) their original publication, leading to so-called “backfilling” of Green stocks (Archambault et al., 2014). Consequently, the graph cannot show the growth of Green OA over time; this would require longitudinal analysis over several years, and so is outside the scope of this analysis. Instead it shows the number and proportion of Green OA by publication year of the article. Second, many articles cannot be legally self-archived until a certain number of months after publication; this embargoing likely influences the apparent plateau in Green shown in Fig. 2. Finally, as noted earlier, many self-archived articles would otherwise be Green except for being “shadowed” by a Gold, Bronze, or Hybrid of the same article elsewhere. For more detail on the growth of shadowed Green OA, see Figs. SA2 and SA3.

+
+
+

How does Open Access vary by publisher?

+

We analyzed a subset of the Crossref-DOIs sample by publisher (as listed on the Crossref metadata record) to understand how the extent and types of OA are common across publishers for recent publications (between 2009 and 2015). As we can see in Fig. 3A, the largest publishers by volume publish the most OA articles by volume, led by Elsevier. As a proportion of all articles published (Fig. 3B), however, PLOS and Hindawi distinguish themselves as being the only publishers in the top 20 with 100% OA. More than half of the papers published by Oxford University Press, Nature Publishing Group, IOP Publishing, and the American Physical Society (APS) are freely available online. In the case of APS this is largely driven by content available through repositories such as arXiv (for more details on repositories, see Fig. SA1).

+
Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015.
+
+
+Figure 3: Number (A) and proportion (B) of articles with OA copies, by publisher, for the 20 most prolific publishers. Based on sample of 27,894 Crossref DOI-assigned articles published between 2009–2015.
+
+
+
+

How does Open Access vary across disciplines?

+

We used the WoS-DOIs sample to examine OA prevalence differences by discipline, because of the easy availability of discipline metadata in the WoS index. Figure 4 displays our results. More than half of the publications are freely available in biomedical research and mathematics, while in chemistry and engineering & technology less than 20% of the papers are freely available. Figure 4 also highlights the popularity of Green OA in disciplines like physics and mathematics, where more than one fifth of papers are available only through online repositories (mainly arXiv). Hybrid articles are particularly prevalent in mathematics (9.4%), biomedical research (8.1%) and clinical medicine (6.3%), while authors in biomedical research (15.3%), health (11.7%), mathematics (11.2%) and clinical medicine (10.3%) often publish in Gold journals.

+
Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities).
+
+
+Figure 4: Percentage of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015 per NSF discipline (excluding Arts and Humanities).
+
+

Large variations can also be observed on the more detailed level of NSF specialties (Fig. SA5). At more than 80% of OA articles, astronomy & astrophysics (87%), fertility (86%), tropical medicine (84%), and embryology (83%) were the specialties where access to literature was the most open. At the other end of the spectrum are pharmacy (7%), inorganic & nuclear chemistry (7%), and chemical engineering (9%), where publications were hidden behind a paywall for more than 90% of papers. More detail on these and other NSF specialties can be seen in Fig. SA1.

+
+
+
+

RQ2. What is the scholarly impact of open access?

+

Comparing the average relative citation impact of different access categories, the OACA is corroborated: Papers hidden behind a paywall were cited 10% below world average (ARC = 0.90), while those that are freely available obtain, on average, 18% more citations than what is expected (ARC = 1.18). However, citation impact differs between the different manners in which papers are made available for free: those that are only available as Green OA (ARC = 1.33) and Hybrid OA papers (ARC = 1.31) are cited the most with an impact of more than 30% above expectations, those available as Bronze are cited 22% above world average, while papers published as Gold OA obtain an ARC of 0.83. This constitutes an average relative citation impact of 17% below world average and 9% below that of articles hidden behind a paywall. Figure 5 below describes these findings.

+
Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015.
+
+

+Figure 5: Average relative citations of different access types of a random sample of WoS articles and reviews with a DOI published between 2009 and 2015.

+
+

These trends vary over time, however, as shown in Fig. 6. While the ARC of closed access papers remains below world average throughout the period studied, it increased from .86 in 2009 to .93 in 2014 and 2015. Meanwhile, when looking across all open types, the mean citation rate is consistently above the world average, fluctuating between 1.15 and 1.22. This fluctuation is guided by differences between the access types, with the impact of Hybrid OA papers increasing over the time period. While Green OA papers’ mean citation rate remains relatively stable, the highest impact, for 2015, is obtained by Bronze and Hybrid. The only form of open for which mean impact has decreased steadily over time is Gold. The results for more recent years are only based on a short citation window, however, and results might change over the next years as citations accumulate.

+
Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication.
+
+

+Figure 6: Percentage and impact of different access types of a random sample of WoS articles and reviews with a DOI, by year of publication.

+
+
+
+
+

Discussion and Conclusion

+

Access to scholarly literature is at the heart of current debates in the research community. Research funders are increasingly mandating OA dissemination to their grantees while, at the same time, the growth in toll-access subscription costs has prompted more and more university libraries to cancel subscriptions. In this context, several tools have been developed to provide access–both legally and illegally–to scholarly literature. Using data from one of these tools (oaDOI), this paper addresses two broad research questions: what percent of the literature is OA and how does it vary by type of OA, and what is the mean scholarly impact of papers diffused through this form. Three large samples were used, to assess different aspects of OA patterns: (1) 100,000 articles that have Crossref DOIs, which allows us to assess the relative proportion of OA across all existing literature; (2) 100,000 WoS-indexed journal articles that have a DOI, which allows us to assess the scholarly impact of OA and non OA papers; (3) 100,000 articles accessed by users through the Unpaywall browser extension, which lets us assess the proportion of OA papers found by users of this free tool.

+

We found that 28% of all journal articles are freely available online (Crossref-DOI sample). Encouragingly for proponents of OA, this proportion has been growing steadily over the last 20 years, driven particularly by growth in Gold and Hybrid. Articles from 2015, the most recent year examined, had the highest proportion OA (45%), as well as the largest absolute number of OA articles published in a single year. This disproportionate level of OA in recent years, combined with readers’ preference for more recent articles, leads to a felicitous situation for readers: the proportion of OA they experience as they browse and search is better than the overall percentage of OA across the literature as a whole. Users of the Unpaywall browser extension, which gives individual readers access to the oaDOI service, encounter OA articles nearly half (47%) of the time. The effect almost certainly extends beyond Unpaywall users; one may assume readers in general also favor newer articles, and therefore benefit from the growth of Gold, Bronze, and Hybrid OA among recent papers, even without using Unpaywall. More studies of readership data from other sources would be useful to quantify this further.

+

Interestingly, we found that the majority of OA articles are Bronze–hosted on publisher websites, either without a license at all or without an open license. This is surprisingly high given that Bronze is relatively little-discussed in the OA literature, and suggests that this OA category deserves further attention from the OA community. In particular, Bronze OA may be significant in a policy context, since, unlike other publisher-hosted OA, Bronze articles do not extend any reuse rights beyond reading, making them Gratis OA. Much more research is needed into the characteristics of Bronze OA. How many Bronze articles are licensed openly, but do not make their license available? Is Bronze disproportionately non-peer-reviewed content? How much of Bronze OA is also Delayed OA? How much Bronze is Promotional, and how transient is the free-to-read status of this content? How many Bronze articles are published in “hidden gold” journals that are not listed in the DOAJ? Why are these journals not defining an explicit license for their content, and are there effective ways to encourage this? These and other questions are outside the scope of this study but may provide fruitful insights for future OA research and policy.

+

Only about 7% of the literature overall (and 17% of the OA literature) is Green. This may at first seem disappointing, given years of advocacy focused on Green OA as well as ongoing growth in the number of Green OA mandates (Björk et al., 2014). However, the full context of Green OA provides reasons for optimism. First, many papers are archived in repositories but are not counted as Green in this analysis because they are also available on the publisher site as Hybrid, Gold, or Bronze versions. These “shadowed Green” copies provide a useful safety net that preserves access in cases where publishers rescind it (as could potentially happen with Delayed OA and other Bronze articles). Further research is needed to determine the prevalence of shadowed Green OA in various disciplines. Second, the phenomenon of “backfilling” (authors self-archiving content published across all years, not just the current one) means that although the percentage graph of Green OA does not show the same year-over-year slope as Gold or Hybrid, the line itself may be rising across all years as authors gradually self-archive papers from years or even decades ago. This assumption is supported by results reported by Archambault et al. (2016). Finally, the relatively low proportion of green OA encouragingly leaves room for continued growth. While most journals published by major publishers (Elsevier, Wiley, Springer, etc.) allow for self-archiving, research shows that only a small proportion of papers from these publishers actually are self-archived in OA repositories; for example, Smith et al. (in press) report using a sample of Global Health Research papers that only 39% of them made use of available self-archiving rights.

+

Our results confirm the Open Access Citation Advantage found by other studies: open articles receive 18% more citations than otherwise expected. While at least some of this boost is likely due to the fact that more access allows more people to read and hence cite articles they otherwise would not, causation is difficult to establish and there are many possible confounders. Most discussed is the so-called “selection bias postulate”, (Craig et al., 2007) which suggests that authors choose only their most impactful work to make OA. The current study does not examine the cause or directionality of correlation, but does find that it exists in a very large sample that is relatively representative of the literature as a whole. Funder requirements may also play a role in the observed citation advantage: high-profile funders are more likely to have an OA publishing requirement; at the same time, well funded studies are independently more likely to receive more citations than poorly funded studies (Berg, 2010). Interestingly, Gold articles are actually cited less, likely due to an increase in the number of newer and smaller OA journals. Some of these journals are from regions of the world not historically indexed by WoS, are published in languages other than English, or might be considered to be less prestigious because they have not had time to become established or accumulate citations (Archambault et al., 2013). On the flip side, the citation disadvantage of Gold OA is likely also affected by the continued growth of so-called ‘mega journals’ such as PLOS ONE ( PLOS, 2018). Whatever the reason, the lower impact of Gold means the overall citation advantage is strongly driven by Green, Hybrid, and Bronze content. 
In sum, while several factors can affect the observed differences in citation rates, and causation remains difficult to establish, the fact remains that scholars are much more likely to read and cite papers to which they have access than those that they cannot obtain. Hopefully the existence of a free, open index of OA content will help support further research into the OACA question.

+

The relatively high percentage of OA found in this study, particularly among readers of the free Unpaywall extension, has important potential implications for academic libraries. Increasingly, these libraries are under pressure to meet growing prices of “Big Deal” subscription packages, and the once-unthinkable outcome of canceling these Big Deals is becoming an increasingly realistic option. In this environment, knowing that around half of the literature of interest is available without any subscription may tip the scales toward cancellation for some institutions–particularly given that this percentage seems to be growing steadily. Indeed, the Université de Montréal’s cancellation of their Taylor & Francis subscription package (Université de Montréal, 2017) is particularly interesting, given that their cancellation announcement directly pointed faculty to Unpaywall and other tools to help them access OA content. This may seem a radical suggestion, but cancellation of subscription journals has long been part of the universal OA roadmap (Anderson, 2017b). Even when the percentage of OA is not enough to support outright cancellation, it may be enough to negotiate better subscription rates by supporting calculation of “OA-adjusted Cost Per Access” (Antelman, 2017). However, much more study is needed to see how OA availability varies across journals and Big Deal packages, along with praxis-oriented work building OA analysis tools that help librarians make cancellation choices.

+

This study has several important limitations. Our dataset only includes journal articles with DOIs, which means that disciplines and geographical areas which rely more heavily on conference papers or articles without DOIs are underrepresented. Our Crossref sample includes about 7% journal “front matter” that the journal has assigned a DOI and Crossref labelled “journal article” but is actually a page describing the journal Editorial Board or similar. Our Bronze OA category includes articles published in OA journals which aren’t indexed in DOAJ; future work must identify these OA journals and classify such articles as Gold. As discussed in our definition of OA, when finding open copies we ignored free-to-read articles from academic social networks like ResearchGate and Academia.edu. The oaDOI system has some coverage of articles published on personal web pages, but this is quite limited compared to web-scale indexes like Google. The oaDOI system includes thousands of institutional and subject repositories, but there are some repositories that it misses. Our accuracy checks suggest that oaDOI, and therefore this study, are probably overlooking around 23% of OA otherwise discoverable using web searches, meaning that estimates reported in this paper undercount OA by approximately 30%. Finally, our approach did not detect when articles were deposited into repositories. Because repositories are often backfilled with content that has been published many years ago, this study does not measure any increase/decrease in prevalence of Green OA over time, but only the proportion of Green OA by article publication date at the moment of data collection.

+

In addition to the empirical results obtained, this paper clearly shows the potential of the oaDOI service for future research. The freely available oaDOI service provides scholars with the basis for assessing and monitoring the development of access to scholarly literature on a large scale, as well as the factors that affect it. For instance, our results show that the percentage of the literature available as OA is growing, and that articles diffused through this form are generally more cited than closed access articles. Several factors are likely to contribute to these trends; however, those remain poorly understood. Combined with other datasets–such as the WoS, Scopus, or Crossref–oaDOI allows one to assess at a large-scale the effects of various mandates on deposit rates, or to track the development of documents’ accessibility to determine, for example, when authors self-archive, or the sustainability of the promotional OA category. Aggregated at the level of journals and publishing platforms, these data can also provide librarians with indicators to help inform subscription cancellations and mitigate their effects. The application of the oaDOI algorithm on a large scale also allows for more complete analysis of the OA citation advantage across fields and time. As in Gargouri et al. (2010), confounding factors could be mitigated by using article-level metadata to identify article pairs published in the same journal issue, on the same topic or published by the same authors at the same time. We hope that other scholars will dig deeper in those data to better understand OA dissemination and the factors that drive it. This is of utmost importance for the future of scholarly communication.

+
+
+

Supplemental Information

+
+

Additional results

+ +
DOI: 10.7717/peerj.4375/supp-1 +
+ +
+
+
+
+
In the interest of full disclosure, it should be noted that two of the authors of the paper are the co-founders of Impactstory, the non-profit organization that developed oaDOI.
+
Repositories that were included are those covered by the Bielefeld Academic Search Engine (BASE) in May 2017. A full listing of repositories can be found on their website at: https://www.base-search.net/about/en/about_sources_date.php?menu=2&submenu=1 +
+
DOIs are short, unique identifiers for scholarly papers. Crossref is a nonprofit that helps manage the DOI system, and is by far the largest supplier of academic DOIs in academia.
+
Based on a Sci-Hub dataset released in 2016 (the most recent data available).
+
These journals were identified by selecting journals with over one thousand articles per year from those classified in the general “biomedical research” category. The full list of journals meeting these criteria was: PLOS ONE, Nature, Science, Scientific Reports, PNAS, Nature Communications, PeerJ, and Science Advances.
+
Ties between frequently cited specialties were resolved randomly; that is, if a paper cites exactly the same number of papers from two NSF specialties, it was assigned to one of the two at random.
+
Citations were normalized using the population of WoS articles and reviews with a DOI.
+
+
+ + + + + + + +
+
+
+
+
 
+
+ +
+ Questions +
+ + Ask a question + +
+
+
+
+
+ + +
+ + +
+ + + +
+ + + + +
+ + + + + + +
+ +
+ + + + + + + + + + + + + + + + +
+ + + + + +
+ + + + + + +
+ + +
+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+ +
+ + + + + diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py index 4d670e5..597520c 100644 --- a/python/tests/test_html_metadata.py +++ b/python/tests/test_html_metadata.py @@ -1,5 +1,6 @@ import datetime +import pytest from sandcrawler.html_metadata import * @@ -64,6 +65,31 @@ def test_html_metadata_elife() -> None: assert meta.publisher == "eLife Sciences Publications Limited" +def test_html_metadata_peerj() -> None: + + with open('tests/files/peerj_oa_article.html', 'r') as f: + peerj_html = f.read() + + meta = html_extract_biblio(HTMLParser(peerj_html)) + assert meta is not None + assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles" + assert meta.doi == "10.7717/peerj.4375" + assert meta.contrib_names == [ + "Heather Piwowar", + "Jason Priem", + "Vincent Larivière", + "Juan Pablo Alperin", + "Lisa Matthias", + "Bree Norlander", + "Ashley Farley", + "Jevin West", + "Stefanie Haustein", + ] + assert meta.container_name == "PeerJ" + # "2018-02-13" + assert meta.release_date == datetime.date(year=2018, month=2, day=13) + + def test_html_metadata_nature() -> None: with open('tests/files/nature_article.html', 'r') as f: @@ -136,3 +162,65 @@ def test_html_metadata_dc_case() -> None: meta = html_extract_biblio(HTMLParser(snippet)) assert meta is not None assert meta.issue == "123" + +@pytest.fixture +def adblock() -> Any: + return load_adblock_rules() + +def test_html_resources(adblock) -> None: + + with open('tests/files/dlib_05vanhyning.html', 'r') as f: + dlib_html = f.read() + + resources = html_extract_resources( + "http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html", + HTMLParser(dlib_html), + adblock, + ) + + assert dict(url="http://www.dlib.org/style/style1.css", type="stylesheet") in resources + + # check that adblock working + for r in resources: + assert '/ga.js' not in r['url'] + + with open('tests/files/plos_one_article.html', 'r') as f: + plos_html = 
f.read() + + resources = html_extract_resources( + "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978", + HTMLParser(plos_html), + adblock, + ) + + # check that custom adblock working + for r in resources: + assert 'crossmark-cdn.crossref.org' not in r['url'] + + with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f: + monday_html = f.read() + + resources = html_extract_resources( + "https://firstmonday.org/blah/", + HTMLParser(monday_html), + adblock, + ) + + with open('tests/files/elife_article.html', 'r') as f: + elife_html = f.read() + + resources = html_extract_resources( + "https://elife.org/blah/", + HTMLParser(elife_html), + adblock, + ) + + with open('tests/files/nature_article.html', 'r') as f: + nature_html = f.read() + + resources = html_extract_resources( + "https://nature.com/blah/", + HTMLParser(nature_html), + adblock, + ) + -- cgit v1.2.3 From ae37c8e32f1289816b69cd5a502a6bc5fd862414 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 29 Oct 2020 15:21:57 -0700 Subject: improve test running and config --- python/.pylintrc | 3 +-- python/Makefile | 5 ++++- python/tests/test_pdfextract.py | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'python/tests') diff --git a/python/.pylintrc b/python/.pylintrc index 91fea7c..387bca1 100644 --- a/python/.pylintrc +++ b/python/.pylintrc @@ -11,5 +11,4 @@ include-ids=yes notes=FIXME,XXX,DELETEME [TYPECHECK] -ignored-modules=responses -extension-pkg-whitelist=selectolax +extension-pkg-whitelist=selectolax,pydantic,responses diff --git a/python/Makefile b/python/Makefile index f783d0e..0a97437 100644 --- a/python/Makefile +++ b/python/Makefile @@ -17,6 +17,7 @@ lint: ## Run lints (eg, flake8, mypy) #pipenv run flake8 . --exit-zero pipenv run flake8 . 
--select=E9,F63,F7,F82 --exit-zero pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports + pipenv run pylint --rcfile=.pylintrc -E --jobs=4 sandcrawler tests *.py #pipenv run pytype sandcrawler/ .PHONY: fmt @@ -24,7 +25,9 @@ fmt: ## Run code formating on all source code pipenv run black *.py sandcrawler/ tests/ .PHONY: test -test: lint ## Run all tests and lints +test: ## Run all tests and lints + pipenv run flake8 . --select=E9,F63,F7,F82 --exit-zero + pipenv run mypy *.py sandcrawler/ tests/ --ignore-missing-imports pipenv run pytest .PHONY: coverage diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index ed93341..64e3137 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -2,6 +2,7 @@ import pytest import struct import responses +import poppler from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient from sandcrawler.pdfextract import process_pdf @@ -20,6 +21,7 @@ def test_process_fake_pdf(): resp = process_pdf(pdf_bytes) assert resp.status == 'not-pdf' +@pytest.mark.skipif(poppler.__version__ == '0.2.1', reason="unsupported version of poppler") def test_process_dummy_pdf(): with open('tests/files/dummy.pdf', 'rb') as f: pdf_bytes = f.read() -- cgit v1.2.3 From a8387ac21bf6f9693cef24f9ef39482b9337f3af Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 30 Oct 2020 15:18:32 -0700 Subject: tests: fix conditional on poppler version check --- python/tests/test_pdfextract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/tests') diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index 64e3137..255e3fb 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -21,7 +21,7 @@ def test_process_fake_pdf(): resp = process_pdf(pdf_bytes) assert resp.status == 'not-pdf' -@pytest.mark.skipif(poppler.__version__ == '0.2.1', reason="unsupported version of poppler") 
+@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler") def test_process_dummy_pdf(): with open('tests/files/dummy.pdf', 'rb') as f: pdf_bytes = f.read() -- cgit v1.2.3 From e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 30 Oct 2020 17:20:22 -0700 Subject: html: work around firstmonday DOCTYPE issue --- python/sandcrawler/html_ingest.py | 3 + python/tests/files/first_monday_ojs3_fulltext.html | 441 +++++++++++++++++++++ python/tests/test_html_ingest.py | 14 + 3 files changed, 458 insertions(+) create mode 100644 python/tests/files/first_monday_ojs3_fulltext.html create mode 100644 python/tests/test_html_ingest.py (limited to 'python/tests') diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index e86fa2b..acd336e 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -25,6 +25,9 @@ def html_extract_fulltext_teixml(doc: bytes) -> dict: ) if tei_xml: return dict(status="success", tei_xml=tei_xml) + elif doc.startswith(b''): + # hack for firstmonday.org + return html_extract_fulltext_teixml(doc[106:]) else: return dict(status="empty-xml") diff --git a/python/tests/files/first_monday_ojs3_fulltext.html b/python/tests/files/first_monday_ojs3_fulltext.html new file mode 100644 index 0000000..2248aed --- /dev/null +++ b/python/tests/files/first_monday_ojs3_fulltext.html @@ -0,0 +1,441 @@ + + + + + +Surveillance, stigma and sociotechnical design for HIV + + +
First Monday
+
+
+ +
Surveillance, stigma and sociotechnical design for HIV by Calvin Liang, Jevan Alexander Hutson, and Os Keyes
+ +


+ +

Abstract
Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate.

+ +

Contents

+

Introduction
+Methods
+Findings
+Discussion
+Conclusion

+ +

 


 

+

Introduction

+ +
“AIDS is essentially a crisis of governance, of what governments do and do not do, to and for their people — we have the drugs to treat HIV infection, we have the tools to confront the risks that drive HIV transmission and prevent infection itself — what we don’t have is national political will necessary to scale-up our response. We have demanded too little from our leaders, excused far too much.”
— Gregg Gonsalves, speech at the 2006 Toronto AIDS Conference.
+ +
“Design is inherently about change — not just in the creation of new material artifacts, but in the ways that new technological objects afford new practices, social habits, and ways of living and interacting.”
— Dombrowski, et al. (2016). “Social justice-oriented interaction design: Outlining key design strategies and commitments.”
+ +

Living and loving with HIV is a complicated task. HIV status and the stigma attached to it exists within a complex interplay of social norms and medicolegal infrastructures. The medicolegal history of HIV begins the moment that HIV and AIDS emerged, constituting a mix of medically justified legal norms and legally enforced medical requirements. The criminal justice and public health systems of modern states demarcated people living with HIV as a uniquely dangerous population, “one that needed to be sought out, tracked down, tested, reported, listed, tagged, monitored, regulated, and, increasingly, criminalized” [1].

+ +

The immediate policy response in the United States imposed significant criminal and civil liability upon people living with HIV (Hoppe, 2018; Harsono, et al., 2017; Sykes, et al., 2016; Thrasher, 2015; Galletly, et al., 2014; Lehman, et al., 2014; Gagnon, 2012; Pollard, 2006; Gostin, et al., 1999). Between 1986–2019, HIV-specific criminal laws and sentence enhancements applicable to people living with HIV have been enacted in 34 states and two U.S. territories (Center for HIV Law & Policy, 2019; Lehman, et al., 2014). Since 1986, these laws have criminalized nondisclosure of HIV and engagement in “risky” behaviors such as sexual activity, exposure to bodily fluids, needle sharing, sex work, blood/organ/semen donation, and, in a variety of instances, behaviors posing little, if any, risk of HIV transmission (Center for Disease Control and Prevention, 2019a; Center for HIV Law & Policy, 2019).

+ +

Despite claiming medical legitimacy for this punitive approach, researchers have long understood that the criminalization of HIV transmission was instead fueled by the associations between HIV and the gay community and communities of color (Hoppe, 2018; Gallo, 2006; Johnson, 1992; Banks, 1989) at a time when consensual sex between same-sex partners was a criminal offense in twenty-two states and over 61 percent of American evangelicals and 50 percent of non-evangelicals agreed with the statement “I sometimes think AIDS is a punishment for the decline in moral standards” (Gallup and Castelli, 1987).

+ +

A significant body of empirical social science work documents the harmful effects HIV laws have had on the lives of people living with HIV (Barré-Sinoussi, et al., 2018; Harsono, et al., 2017; Sweeney, et al., 2017; Adam, et al., 2014). HIV criminalization both reinforces and magnifies HIV-related stigma and discrimination, reduces the willingness of persons at risk for HIV to get tested or seek care, and imperils the collection of demographic health information (Harsono, et al., 2017; Burris and Cameron, 2008; Galletly and Pinkerton, 2006; Elliot, 2002). A survey of over 2,000 people living with HIV in the U.S. revealed that at least 25 percent of respondents knew one or more individuals who were afraid to get tested for fear of facing criminalization (Sero Project, 2012). HIV criminalization also ignores the reality that successful antiretroviral therapy can render the level of the virus undetectable, which, according to the National Institute of Health, means that HIV is then untransmittable (Eisinger, et al., 2019).

+ +

While HIV transmission was criminalized, other tools of control — in the form of surveillance — arose and were enforced. Early policy responses to HIV centered on overt surveillance and ostracism of those infected and perceived to be at risk (Fortin, 1995). This surveillance generally consists of disease reporting, sexual contact tracing, and data collection of people who have been diagnosed with HIV (Fan, 2012; 2011; Ward and Bell, 2014; Ward, 2005). The Center for Disease Control, for example, collects HIV data based on confidential name-based reporting laws implemented in all 50 states as of April 2008 (Center for Disease Control and Prevention, 2019b).

+ +

HIV surveillance (and sexually transmitted infection surveillance more broadly) centralizes information and power in the state (Fairchild, et al., 2007; Fan, 2012); because HIV intervention and surveillance is generally concentrated in lower income communities and health settings (McCree and Hogben, 2010), the most socially and economically marginalized communities bear the heaviest burden of HIV surveillance and its downstream consequences (Miller, et al., 2004; Banks, 1989; Brandt, 1987). There is a long-racialized history of HIV, one that, in combination with the background racism of the United States, has led to the systemic undertreatment and under-consideration of communities of color (Ford, et al., 2007; Anonymous, 2000; Johnson, 1992).

+ +

This infrastructure of surveillance in turn reinforces the stigma of HIV, which has dramatic consequences for the likelihood of unwanted disclosure, access to care, psychiatric well-being, housing and employment discrimination, and, consequently, quality (or probability) of life (Lazarus, et al., 2016; Mahajan, et al., 2008). Coupled with the overarching stigma of HIV and its criminalization in various contexts, HIV surveillance offers a tool through which the state can identify citizens to be punished.

+ +

In the era of “big data” and ubiquitous surveillance capitalism (Zuboff, 2019) — the private monetization of information about reality — HIV surveillance is not just in the hands of the state, but also in the hands of private organizations and individuals. In the context of widespread state surveillance and control and ongoing stigmatization of HIV, this opens yet more possibilities for harm through enabling the selling and redistribution of HIV status information, without the user’s meaningful consent, to parties who may themselves engage in discrimination or direct violence.

+ +

Many online platforms — including, as we trace out below, dating platforms — constitute not just spaces for the purposes outlined in their marketing materials but also tools for the police in tracing HIV status and criminalized behavior. In recent years, police have used technology to conduct Internet-based investigations for a similar purpose (POZ, 2015). Police now go undercover on Web sites and dating apps by creating fake identities online (Semitsu, 2011), and local law enforcement agencies and federal agencies increasingly employ these tactics in online investigations (Lichtblau and Arkin, 2014).

+ +

Legal and public health scholars and advocates continue to call for a paradigm shift in managing HIV that leaves behind historical responses like surveillance, ostracism, and incarceration and accounts for the rise of the Internet and mobile technology and their impact on sexual attitudes and behaviors (Lehman, et al., 2014; McCallum, 2014; Fan, 2011; Fenton, 2010). Since the criminalization of HIV, intimate platforms have become vital structures through which millions of people access the opportunity to engage in reciprocal romantic and sexual relationships (Hutson, et al., 2018; Taylor, et al., 2017; Rosenfeld and Thomas, 2012). By designing infrastructures for intimate affiliation, intimate platforms wield unmatched structural power to shape who meets whom and how within dating and sexual platforms (Hutson, et al., 2018; Levy and Barocas, 2018; Emens, 2008; Robinson, 2007). These platforms frame the circumstances within which users understand each other as prospective romantic or sexual partners and shape social norms, sexual scripts, and relative advantages among users (Hardy and Lindtner, 2017; Kannabiran, et al., 2012).

+ +

The design of intimate platforms provides opportunities to explore new ways of managing HIV that reduce the concentration of power and information in the state (Fan, 2012). Through the role that platform design plays in shaping cultural norms, which has been identified as a more effective way of achieving HIV transmission prevention than flexing the punitive and surveillant arms of the state (Sunstein, 1996), intimate platform design provides opportunities to explore new ways of managing HIV (Fan, 2012). Indeed, a meta-analysis of HIV prevention efforts found that strategies that intervene in social meaning by shaping social norms, cultural practices, and individual attitudes were more effective in empowering behavioral change than appeals to fear (Albarracin, et al., 2015).

+ +

However, designing intimate platforms to account for HIV also presents serious challenges for social computing researchers and human-computer interaction (HCI) designers. As Handel and Shklovski pointed out: “The minutiae of design decisions around profile options deserves particular attention because even the smallest changes can result in substantial differences for user interactions” (Handel and Shklovski, 2012). In addition to concerns around how to best design for HIV, platforms, Grindr in particular, have already come under fire for sharing user HIV information with third parties (Singer, 2018). Moreover, designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the serious risk of re-entrenching the status quo and its incumbent inequalities and power relations (Bardzell, 2010). While designing for HIV presents opportunities to redress stigma and harm, researchers in HCI must understand that “[i]t is not enough to have good intentions ... [we] must ground [our] efforts in clear political commitments and rigorous evaluations of the likely consequences” (Green, 2018).

+ +

From this comes the recognition that social computing designers and researchers seeking to design for disclosure cannot afford to ignore the ways that the lived experiences of people living with HIV are shaped by structural forces and, particularly, the reality of HIV criminalization and the State’s role in conducting STD surveillance. Platforms, after all, do not exist in a separate sphere from material reality: a redesign that eases HIV disclosure from user-to-user might also involve the storing of disclosure data by the platform — data that can then be accessed, requisitioned, and co-opted by arms of the state. In line with Jackson, et al.’s call for the social computing community to address the structural and lived consequences of law and policy that “establish the very terrain on which design and practice can be conceived, articulated, and imagined — and upon which battles of accountability are inevitably waged” [2], we wish to undertake a critical investigation of HIV disclosure in dating and hookup platforms. This involves not just investigating the implications of disclosure in a person-to-person sense, but also how platform design is shaped by legal and administrative regulation and how the risks of disclosure might open users up to systems of surveillance, stigma, and criminalization. We do so by using a range of platforms in an effort to gain a wide view, and to practice prefigurative politics — minimizing our assumptions about the “type” of people at risk of HIV infection and/or surveillance.

+ +

To do this, we analyze platforms’ consequences for HIV through the lens of user-to-user interactions, exploring the ways that design renders users visible and vulnerable to wider carceral and surveillance infrastructures, and the way that design shapes (and is shaped by) HIV’s legal status. We ground our discussion in a content analysis of 50 popular, mobile dating and hookup platforms, coding for design and policy choices related to HIV disclosure, prevention, destigmatization, surveillance, privacy, and criminalization. Through this, we reveal that many platforms fail to account for HIV, and of those that do, many neglect to attend to the downstream consequences of HIV disclosure and the data produced by it, while exacerbating the social, racial, and class stereotypes associated with the condition.

+ +

As scholars and designers consider how platform design might aid HIV prevention and destigmatization (Hutson, et al., 2018; Albury, et al., 2017; Wohlfeiler, et al., 2013; Rosser, et al., 2011), we aim to grapple with the structural and ethical implications of designing for HIV, particularly how intimate platform design might aid and abet the criminalization and surveillance of HIV (Sykes, et al., 2016; Kazatchkine, et al., 2015; Perone, 2013; Gagnon, 2012; Jürgens, et al., 2009). Drawing on principles from social justice-oriented design to investigate controversies and design possibilities in intimate platforms, we attempt to articulate an approach to intimate platform design that not only works to reduce the stigma of user disclosure, but also works to contest historic and present power imbalances and injustices between users, platforms, and the state.

+ +

 

+

++++++++++

+

Methods

+ +

Using a directed content analysis (Hsieh and Shannon, 2005), we reviewed 50 existing mobile dating and hookup platforms. Content analyses have proven effective in understanding platform design and governance and the ways design practices mediate user-to-user bias and discrimination (Levy and Barocas, 2018; Hutson, et al., 2018). We set out to capture a landscape of popular platforms and selected the first 50 dating and hook up platforms in the top 200 grossing social networking applications in the United States on the iOS App Store in March of 2019. Figure 1 lists the platforms selected in alphabetical order.

+ +

 

+ + + + + + +
50 dating and hookup platforms surveyed
 
Figure 1: The 50 dating and hookup platforms surveyed.
 
+

 

+ +

Utilizing the walkthrough method (Light, et al., 2018), we explored each platform’s HIV-related user experience. We examined design features on each of these platforms, systematically documenting design choices, policies, and informational interventions that mediate HIV. Building upon previous work around intimate platforms and HIV, we coded each of the 50 intimate platforms based on the following dimensions:

+ +

Prevention

+
  • Whether the app allows same-sex connections
  • +
  • Whether a user can disclose HIV/sexually transmitted infection (STI) status (Warner, et al., 2018)
  • +
  • If they can disclose, what are the options? (Warner, et al., 2018)
  • +
  • Whether a user can search for or filter out users with HIV/STIs? (Hutson, et al., 2018)
  • +
  • Whether the platforms provide informational interventions with respect to HIV/STI prevention (Wang, et al., 2019)
+

Stigma reduction

+
  • Whether a user can identify as having HIV/STI (e.g., “Poz”, etc.)
  • +
  • Whether a user can indicate interest in or acceptance of people living with HIV/STIs (e.g. outward presentation, separate from filtering, not simply via profile text) (Hutson, et al., 2018)
+

Policies

+
  • Whether the platform engages HIV/STIs in their policies (terms of service, privacy, and community policies, etc.) (Jackson, et al., 2014)
+ +

For ethical reasons, we did not interact with other users, only observed features, and deleted our accounts once data were collected when possible (not all platforms allowed for account deletion). The design and policy choices described and discussed below are not intended as an endorsement of any particular design intervention for managing HIV. Rather, we aim to capture the various ways intimate platforms currently manage and mediate HIV among users and how those choices map onto extant legal and surveillant infrastructures. Additionally, we highlight two limitations in how we chose which platforms to analyze. First, it is possible for a hook-up platform to not have an accompanying mobile app, meaning our selection of platforms from the iOS app store will have invariably missed Web site-based platforms. Second, we may have overlooked platforms that are more niche or community-specific, yet not as popular in the broader platform marketplace (i.e., not within the top grossing platforms).

+ +

 

+

++++++++++

+

Findings

+ +

 

+ + + + + + +
A visualization of our content analysis
 
Figure 2: A visualization of our content analysis.
 
+

 

+ +

Design features

+ +

Out of the 50 intimate platforms we examined, 13 were meant specifically for queer communities (11 specifically targeted at gay and bisexual men and two at lesbian and bisexual women). None of the platforms we reviewed were distinctly designed for trans people. The remaining 34 platforms were for general audiences, catering to heterosexual and homosexual connections, and three platforms were exclusively for heterosexual connections (eHarmony, Uniform Dating, and Waplog). Only queer-specific platforms (six) had explicit HIV disclosure options and allowed for filtering or searching based on HIV status. Figure 3 shows the disclosure options for each platform. Growlr, Taimi, and Scruff allowed users to indicate that they were accepting of people living with HIV. Grindr, Hornet, Mr. X, Xtremboy, and Scruff, five platforms all of which are queer-specific, provide informational interventions with respect to HIV/STI prevention (See Figure 4 for examples). Eight dating apps mentioned HIV in their policies (five queer-specific, three general). Four dating apps allowed users to identify with an HIV/STI-relevant identity category, often labeled “poz”. Please see Figure 2 for a visualization of our content analysis.

+ +

 

+ + + + + + +
Disclosure options
 
Figure 3: Disclosure options.
 
+

 

+ +

 

+ + + + + + +
Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right)
 
Figure 4: Examples of HIV/STI prevention features on Grindr (left, middle) and Hornet (right).
 
+

 

+ +

Policies

+ +

None of the 50 intimate platforms we reviewed explicitly mention HIV in their terms of service. Four platforms expressly discuss HIV in their privacy policies (Grindr, Hornet, Scruff, and Mr. X), and four platforms mention HIV in platform safety policies (Planet Romeo, Tinder, BlackPeopleMeet, and Our Time). No platform engaged any of the legal implications of HIV. No platform engaged the public health surveillance of HIV.

+ +

Of the four platforms that expressly engage HIV in their privacy policies (Grindr, Hornet, Mr. X, Scruff), only two (Grindr & Hornet) explicitly prohibit sharing HIV information with third parties. By disclosing their HIV status on Mr. X and Scruff, users consent to the platform’s processing of that information. Grindr warns that HIV status disclosure on a user profile is effectively public information; however, the platform does not share HIV status information with third party tracking, analytics, and advertising companies or service providers. Of all the platforms reviewed, Grindr’s privacy policy is the only one that devotes an entire section to HIV status, which is not particularly surprising given Grindr’s involvement in multiple controversies around sharing HIV information with third parties (Fitzsimons, 2019; Singer, 2018):

+ +
“HIV Status. At the recommendation of HIV prevention experts and the community of Grindr users, we give you the option of publishing your health characteristics, such as your HIV status, in your Grindr community profile. Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App. As a result, you should carefully consider whether you want to disclose your HIV status. We do not share HIV status with any third-party service advertisers or third-party service providers other than companies that host data on our behalf (e.g., Amazon Cloud). In addition, we do not use HIV status for advertising purposes and do not share this information with advertisers.”
+ +

According to Hornet’s privacy policies, they “[do] not share any HIV status information with third parties unless required to do so by law”. Of the 50 platforms reviewed, Hornet was the only one to enable users to opt into receiving “in-app reminders to undergo HIV tests and receive information on the location of nearby testing centers.” On Hornet, a user’s HIV status “is only searchable by users who have defined themselves as HIV positive.” Scruff’s privacy policy highlights that “there is no requirement to” provide them with “health details and whether part of the POZ (HIV positive) community (for example, in creating or updating your profile),” and that by doing so, users “are explicitly consenting to [Scruff’s] processing of [their] information.” Mr. X’s privacy policy notes that HIV status information “may be considered ‘special’ or ‘sensitive’ in certain jurisdictions,” and that by providing this information, users “consent to [Mr. X’s] processing that information”.

+ +

 

+

++++++++++

+

Discussion

+ +

Prevention

+ +

Platforms can act as an interventional tool to improve access to and perceptions of care for people living with HIV. Examples of HIV/STI prevention include a “Last Tested Date” section on a user’s profile and reminders to get tested for HIV/STIs. Some current platforms engage with HIV more critically by acknowledging, through specific features, that HIV is an issue their users should be aware of. Hornet, for instance, provides its users with HIV-relevant educational material and resources for getting tested. Hornet also limits searching based on HIV status to people who themselves have chosen the HIV positive option, thereby limiting the possibility of HIV status-based discrimination. Hornet and Grindr can also provide reminders for users to get tested. Scruff allows users to choose from sex safety practices that include using condoms, using pre-exposure prophylaxis (PrEP), and/or treatment as prevention (Warner, et al., 2019).

+ +

Due in large part to the history of HIV’s recognition as a medical condition, HIV has been generally classified as a “gay man’s problem” in North America — frequently (albeit almost as frequently unmarked) a white, cisgender gay man’s problem. This classification and framing acted to both separate normative society from the stigma associated with the condition and provide an avenue for activism by associating it with the most “acceptable” queer bodies: masculine, middle class, cisgender and white (Epstein, 1996).

+ +

HIV has disproportionately impacted gay communities specifically, but transmission does not fit a neat pattern of being binarized tidily along sexuality. It is disproportionately prevalent in communities of color, appears in heterosexual relationships and lives, and risk of transmission follows societal vulnerability and marginalization — transgender women, particularly transgender women of color, are particularly overrepresented in diagnosis rates (Clark, et al., 2017). While the partial normalization of HIV — holding it outside the direct concerns of white, cisgender, heterosexual people, but embodying it in people who look “just like them” — may have aided in assembling efforts to address the condition, the assumptions that it has created in who is at risk and who “counts” have been tremendous. One only has to look at the ethnographic work of Vivianne Namaste, who highlights how Montreal’s history of HIV, its recognition, and efforts at its prevention simultaneously elided the incidence rate amongst the Haitian community (which at one point had 65 percent of reported AIDS cases) and lacked any advice or conception of susceptibility for women, particularly heterosexual or bisexual women (Namaste, 2015).

+ +

Our platform analysis demonstrates that these same assumptions about vulnerability and risk are present in the design of intimate platforms. Generic platforms (i.e., those that cater to non-queer or broader, more heteronormative audiences) do not consider, engage, or design for HIV at all, while the platforms for queer people — and more specifically gay men — do. Even within the group of 13 queer-specific applications, neither of the two queer women-specific apps allowed for HIV disclosure, even though 23 percent of people with HIV in the U.S. are women (Center for Disease Control and Prevention, 2019c). Most, if not all, platforms dedicated to general audiences do nothing when it comes to HIV prevention, contributing to the knowledge gap for general audiences on sexual health, HIV specifically, and more. Because general audiences can go through online dating experiences without encountering HIV materials, platform designers allow these users to falsely believe that their sexual lives are excluded from important matters of sexual health.

+ +

Our intent is not to suggest that HIV should be narrated as a problem for everyone; to ignore sexuality in the impact and risk of HIV transmission is an ahistorical mistake. But treating it solely as a “gay man’s problem” simultaneously elides differences in vulnerability and risk within gay communities and perpetuates the silence around transmission for other populations, particularly trans women of color and/or heterosexual people. In other words, it is not that HIV is not frequently a risk for gay communities, but that drawing a line between sexuality and risk perpetuates the more nuanced disparities in risk and the discourse that HIV transmission is not something anyone else has to think about.

+ +

Platforms can and have implemented prevention efforts through Last Tested Date and Testing Reminders features. Doing so more ubiquitously, rather than solely on gay male-specific platforms, may be helpful in normalizing prevention efforts like getting tested regularly and knowing one’s status. Through opportunities like this, platform designers have the opportunity to promote HIV/STI prevention and care — an opportunity that is valuable precisely for its ability to normalize prevention efforts. This is not to say that such features are without risks, particularly with regard to state surveillance, intervention and structural forces, which is our next topic of concern and discussion.

+ +

Stigma & disclosure

+ +

Designing for HIV is not as simple as including disclosure fields and status-based filtering or not. Allowing disclosure and filtering can protect people living with HIV from negative and sometimes harmful interactions, help filter out people who might discriminate against them, fight HIV stigma, and promote much-needed awareness. However, disclosure and filtering can also lead to discriminatory practices (Hutson, et al., 2018), have potential for privacy unraveling (Warner, et al., 2018), and contribute to surveillance (Fan, 2012, 2011).

+ +

De-stigmatizing HIV offers designers an opportunity to engage in the structural dimensions of how HIV operates in social life and can possibly allow us to better tap into social norms around the condition that ultimately improve other outcomes. For instance, humanizing people living with HIV could lead to more people getting tested, being open about their status, and being communicative with their sexual partners. Platforms have the power to shift social norms and destigmatize HIV at scale due to their pervasiveness throughout modern connections, but designers need to contest the ethical implications of de-stigmatizing HIV on these platforms, especially through current features such as HIV-status-based filtering and disclosure options.

+ +

Filtering and searching tools based on HIV status can be instrumental for people living with HIV to find others who are either seropositive or otherwise accepting of seropositive people. Additionally, filtering out those who might discriminate against them for their HIV status anyway allows people living with HIV to avoid awkward or even violent interactions with users who harbor problematic beliefs about people living with HIV. Conversely, HIV status-based filtering and searching tools have representational and allocational harms. First, such filtering represents that there are particular psycho-social characteristics incumbent with HIV status. These stereotypes play out in a variety of different ways such as the framing that people living with HIV engage in “risky” sexual behavior. Second, HIV status-based filtering can be used to structurally exclude HIV positive users from the opportunity to engage in intimate affiliation (Hutson, et al., 2018). Platforms can and do provide users the ability to screen out other users who identify as “Poz” or disclose their HIV status. Not only do these design features facilitate exclusion, they may disincentivize HIV-related disclosures to the extent that such disclosures can be weaponized by other users to exclude them as potential intimate affiliates.

+ +

Disclosure fields as a way to de-stigmatize HIV are similarly complicated in that they can inhibit and benefit people living with HIV. For one, encouraging users to disclose, regardless of their status, can create a healthier culture and discussion around HIV, possibly making talking about one’s status an acceptable and common practice of intimate engagement. On the other hand, disclosure can be used for a variety of problematic ends that harm seropositive users. Other users may discriminate against users who have disclosed their HIV status, choosing to ignore or disengage with them entirely. Disclosure may have unintended consequences and lead to more personal and violent outcomes. Due to laws in particular jurisdictions, failure to disclose one’s status to a partner can lead to prosecution and potentially incarceration. People living with HIV might also face physical and emotional threats for disclosing their status either publicly or privately.

+ +

Due to these complexities, designers of dating platforms must face the question of how can we de-stigmatize HIV without creating additional obstacles for people living with HIV? Platforms need to critically unpack the possible consequences of well-intentioned design choices, including HIV status-based filtering and HIV status disclosure fields. Of the platforms we reviewed, Scruff is the only one to provide for HIV disclosure without using an express “HIV status” field, allowing instead two disclosure options, Poz and Treatment as Prevention. “Poz” constitutes an association and identification with a community (e.g., “I am a bear, daddy, poz”), while “Treatment as Prevention,” signals antiretroviral therapy (i.e., use of HIV medicines to treat HIV infection) and constitutes a link to sex safety practices.

+ +

Surveillance & criminalization

+ +

At the same time, given the questions of structural power and surveillance built into these platforms, we are leery of treating disclosure option design as the site of de-stigmatization and justice. Questions of privacy and stigma go wider than micro-interactions and touch on how HIV is seen and responded to societally and administratively. The dominant responses to HIV/AIDS “center on adjusting the traditional levers of criminal and tort law, and of public health law, with its surveillance and disciplinary regimes that concentrate information and decision-making in the state” [3]. Indeed, HIV continues to function as a “vector for the exercise of state power and the invention of novel logics and techniques of government,” whereby “[i]nfection with HIV virtually guarantees that a citizen will need to interact, either beneficently or coercively, with one or more state bureaucracies” [4].

+ +

The broader ecosystem of intimate platforms that we observed provided virtually no HIV-specific privacy information or protections for users living with HIV. Overall, both the platforms that account for HIV in their privacy policies and the platforms that enable disclosure but do not account for HIV in their privacy policies continue to place the risks and burden of surveillance, privacy, and disclosure on users with HIV. Grindr’s “HIV Status” policy puts it clearly: “Remember that if you choose to include information in your profile, that information will become public to other users of the Grindr App.” By surfacing this as a risk we do not mean to suggest that users lack agency — merely that the agency to choose between a range of options can be influenced by how those options are bounded and made available in addition to the affordances and norms that platform design provides. That a user makes information public does not mean that “consumable by all” is the framework of disclosure that they have in mind (Wittkower, 2016).

+ +

While some intimate platforms are working towards promoting HIV disclosure, prevention, and de-stigmatization, they are also failing to grapple with privacy implications of HIV and their responsibility in ensuring it. People living with HIV are already vulnerable and bear the weight of HIV disclosure’s downstream consequences. By continuing to offload the burdens and risk on those with HIV, platforms are likely contributing to issues of nondisclosure as well as HIV testing. Research shows that privacy fears can result in the non-disclosure of HIV status information within close personal relationships (Derlega, et al., 2004; Zea, et al., 2003; Derlega, et al., 2002).

+ +

In this context, proposals to design for HIV disclosure that do not consider the wider structural implications of surveillance are concerning. The focus of most research into HIV and online dating in HCI on micro-interactions and enabling trust and certainty between users elides the implications that providing this data to a platform outside user control has and the way that this data can be used to control. This is not an abstract risk; just this year, Grindr (the platform under study) has been the subject of scrutiny by the U.S. government over its Chinese ownership, due to fears that the Chinese government might access and copy Grindr’s data around HIV disclosure for the purpose of domestic policing and control (Fitzsimons, 2019). If we are designing to enable HIV disclosure, are we working to improve stigma associated with disclosure — or are we enabling new forms of control and surveillance?

+ +

In the United States today, intimate platforms operate within 29 states that have HIV criminal laws, which include laws that target sex/nondisclosure of HIV-positive status, sex work, exposure to bodily fluids, needle-sharing, and blood/organ/semen donation, nine states that have sentencing enhancements applicable to people living with HIV who commit an underlying assault crime, and 24 states that have prosecuted people living with HIV under non-HIV-specific general criminal laws (Center for HIV Law & Policy, 2019). Here, the design of intimate platforms cannot be removed from the reality of laws that criminalize HIV, particularly HIV non-disclosure.

+ +

People living with HIV in U.S. states with HIV-specific criminal laws must disclose their HIV status to sexual partners. Generally, “disclosure and consent” is an affirmative defense [5], whereby a person can avoid criminal and civil liability if they disclose their serostatus [6] and their sexual partner voluntarily consents to sexual activity with knowledge of that serostatus [7]. Many of the laws that criminalize HIV non-disclosure do not provide guidance as to what methods of disclosure and consent are enough to avoid prosecution and conviction (McCallum, 2014). No court or legislature has affirmatively stated whether verbal disclosure and consent are necessary under criminal HIV transmission statutes. Furthermore, non-verbal communication online creates uncertainty as to whether there is sufficient disclosure and consent to remove criminal liability for HIV-positive individuals. Both disclosure and consent can be ambiguous or misunderstood, a problem that is complicated by the design and widespread use of mobile dating and hookup platforms.

+ +

It remains unclear what constitutes appropriate disclosure and informed consent in the context of intimate platforms, such as HIV disclosure fields on user profiles or other communication in a profile’s free form text sections (e.g., “+” “Poz”, “undetectable”). Although some intimate platforms afford HIV-positive users the ability to disclose their serostatus in new ways, no court or legislature in the U.S. has answered whether disclosing HIV status on an intimate platform is enough to achieve informed consent and avoid criminal and civil liability. Yet many people living with HIV also use records of conversations on intimate platforms as a means of protection. For example, people disclose their status and use that record as a way to protect themselves from future allegations of non-disclosure. This ambiguity and incumbent legal risk place significant responsibility and pressure on users living with HIV. Research shows that fears around rejection, self-blame, criminalization, and privacy can result in the non-disclosure of HIV status information within close personal relationships (Derlega, et al., 2004; Zea, et al., 2003; Derlega, et al., 2002). Privacy concerns around HIV disclosure are often associated with the need to protect one’s self from HIV related stigma (Adam, et al., 2011; Serovich and Mosack, 2006; Greene, et al., 2003). As more and more people use platforms to meet intimate partners, the historical failure of HIV criminalization law to understand how disclosure and consent are negotiated in practice becomes all the more apparent.

+ +

It might seem from this that designers and developers are trapped in an impossible situation — disclosure to protect users simultaneously produces the possibility of structural harms for those disclosing. While we urge designers to take both needs seriously, we do not consider it impossible; in fact, there is a range of work within queer theory and technology that not only articulates this tension of privacy, disclosure and the reuse of data but suggests queer forms of resistance to it. Writing more broadly, Brian Schram highlights the way that the increasing possibilities of “big data” and its attendant surveillance structures “constitute an undoing of Queerness as a radical political injection” [8], advocating a politics of melancholia that features a haunting of archives: an insertion of the dead weight of our collective memory as Queer persons into the growing catalog of our digital information. In other words, Schram suggests the deliberate incorporation of masses of false data, profiles, and traces into data stores in order to render ambiguous the truth of any presence and provide cover for those queer persons existing within the platform(s) data highlights. What would this look like in the case of dating platforms? What are the possibilities raised by incorporating a deluge of false accounts, doppelgängers, and doubles, not as a deception of the platform or its users, but against state forces examining the database?

+ +

More broadly, we might see possibilities for the future through practices in the past. In how queer communities responded to HIV disclosure and protection protocols during the 1980s and 1990s, David Halperin has articulated the way that gay communities worked to articulate norms that balanced risks, trust, and vulnerability in the absence of structural norms, that “it is gay men themselves who have continued to define, and to redefine, the limits of safety through an ongoing history of sexual experimentation and mutual consultation, and who have thereby produced, over time, workable compromises and pragmatic solutions that balance safety and risk” [9]. Rather than taking universalized, top-down approaches to platform design for all, we might instead seek to work up and to create a diverse range of spaces that challenge the ease of surveillance built into large-scale platforms and afford individual users more agency in establishing those compromises and solutions and engaging in that consultation.

+ +

 

+

++++++++++

+

Conclusion

+ +

As HCI researchers and designers, we continue to push the boundaries of what is technologically possible but doing so requires us to first ask whether platform design is even an appropriate intervention in a given situation (Keyes, et al., 2019; Baumer and Silberman, 2011; Suchman, 2011). The current model of platform design for HIV cannot continue, as it is too closely tied to the collection and commodification of highly sensitive personal data. However, reimagining intimate platform design provides the social computing community an opportunity to intervene in the social norms around HIV and HIV disclosure in a manner that could unburden the weight of criminalization without centralizing the surveillant arms of the state.

+ +

We envision a future of dating platforms that does not force people living with HIV to sacrifice surveillance for intimate experiences. Because of their entanglements with sex and romance, intimate platforms need to take on more responsibility in the sexual health and data privacy of their users. Drawing from our analysis and our own lived experiences, we recommend platform-level changes, changes in platform, and mechanisms to prevent platforms from knowing their users’ statuses. First, platforms should make explicit to their users the consequences of storing sensitive, personal information like HIV status and their documentation processes. Next, they should also implement policies that manage how data are stored when users delete their accounts and protect these data from third-party consumers. Finally, ownership of users’ data should belong to the users themselves, rather than the platforms. Users should be able to pass along their information to other users without the platforms tracking it.

+ +

HIV is a medical condition, but its eradication requires not just technical, or even sociotechnical, but sociopolitical solutions. Indeed, the way in which designers and policy-makers frame HIV is an inherently political decision, one that will impose the contours and boundaries of our response. The social computing community cannot do nothing, but it also must resist the desire to do everything. Designing user interfaces and platform policies to account for HIV will require a rigorous analysis of possible outcomes and consequences as well as a bedrock commitment to centering the voices and experiences of those impacted by HIV and the state’s responses to it. Our commitments must account for the ways pathology and power intertwine to subjugate and otherize impacted communities at home and abroad.

+ +

Designing intimate platforms to unburden the risks of extant criminal and civil sexual regulations runs the risk of re-entrenching the status quo and its incumbent inequalities and power relations (Dombrowski, et al., 2016; Light, 2011; Irani, et al., 2010; Bardzell, 2010). The social computing community must ground its efforts to design for HIV in clear political commitments to decriminalizing HIV and decentralizing power and information from the state. We must strive to unburden the weight of surveillance and incarceration on vulnerable and marginalized communities and work towards offloading the significant social and legal risks and pressures for people living with HIV. Moreover, our commitment to designing for HIV must not exclude nor obfuscate our capacity for direct action within and outside of the realms of design and research. This means fighting for the rights, dignity, and safety of people living with HIV in the streets and in the halls of local, national, and international political, legislative, and executive bodies.

+ +

Our instinctual response to the failed and violent efforts of HIV criminalization and surveillance should not be “there’s an app for that,” but rather “there’s a zap for that!”. That is, the practice of designing for people with HIV should be a “critical technical practice” (Agre, 1997), undertaken with a mindset that sits uneasily between and is cognizant of both individual and structural power and consequence. Pioneered by the American gay liberation movement, a zap or “zap action” is a political action of direct and persistent public confrontation. Whether shouting down public figures or smashing pies into the faces of evangelicals, zaps aim to disrupt and disturb persons and institutions of authority to effect change (Cohen, 2018). In the words of AIDS Coalition to Unleash Power’s (ACT UP) “New Member Packet”:

+ +
“Zaps are a method for ACT UP members to register their disapproval of and anger toward the zap target. Zaps usually have more specific targets than actions. Because of this focus, numerous zapping techniques have been developed. ACT UP zaps individuals or organizations by: sending postcards or letters; invading offices and distributing fact sheets; sending (lots and lots of) faxes; picketing; outraged (and sometimes outrageous) phone calls. The more zappers who zap the zappee the better the zap.”
+ +

A critical approach to designing for HIV requires the contesting of histories of incarceration, stigmatization, and surveillance and the ways in which the state exerts power and domination through its medicolegal levers of criminal law and public health surveillance. Intimate platform design should not only work to reduce the prevalence and stigma of HIV, but also to contest historic and present power imbalances and injustices between users, platforms, and the state. End of article

+ +

 

+ +

About the authors

+ +

Calvin Liang is a Ph.D. student in the Human-Centered Design and Engineering Department at the University of Washington. His research broadly focuses on technology’s role in and out of queerness, health, and queer health.
E-mail: cliang02 [at] uw [dot] edu

+ +

Jevan Alexander Hutson, living with HIV for four years, is a technology policy advocate, human-computer interaction researcher, and J.D. candidate at the University of Washington School of Law. His research interests center on issues of technology, law, and social life, with a particular focus on intimate/sexual computing.
E-mail: jevanh [at] uw [dot] edu

+ +

Os Keyes is a Ph.D. student in Human-Centered Design and Engineering at the University of Washington, and an inaugural Ada Lovelace Fellow. Their research examines gender, technology and (counter)power, with a particular focus on the ways technologies of measurement shape and define queer communities.
E-mail: okeyes [at] uw [dot] edu

+ +

 

+

Acknowledgements

+ +

We dedicate this paper to the radical history of the AIDS Coalition to Unleash Power (ACT UP) and to all of the souls we’ve lost and continue to lose to HIV/AIDS. We would like to thank Mary Fan, Sean Munson, and Julie Kientz for valuable conversations and feedback, and Margret Wander and Margaret Hopkins for their ongoing care and support. This research was partially funded by a Microsoft Ada Lovelace Fellowship.

+ +

 

+

Notes

+ +

1. Halperin and Hoppe, 2017, p. 349.

+ +

2. Jackson, et al., 2014, p. 596.

+ +

3. Fan, 2011, p. 36.

+ +

4. Halperin and Hoppe, 2017, p. 255.

+ +

5. See FLA. STAT. ANN. § 775.0877 (2017) (“[I]t is an affirmative defense to a charge of violating this section that the person exposed knew that the offender was infected with HIV, knew that the action being taken could result in transmission of the HIV infection, and consented to the action voluntarily with that knowledge.”). See also http://www.hivlawandpolicy.org/states/florida.

+ +

6. Serostatus is defined as: “The state of either having or not having detectable antibodies against a specific antigen, as measured by a blood test (serologic test). For example, HIV seropositive means that a person has detectable antibodies to HIV; seronegative means that a person does not have detectable HIV antibodies.” U.S. Department of Health & Human Services, Education Materials, AIDSINFO, at https://aidsinfo.nih.gov/education-materials/glossary/1632/serostatus, accessed 30 August 2019.

+ +

7. Lehman, et al., 2014, p. 1,101.

+ +

8. Schram, 2019, p. 611.

+ +

9. Halperin, 2015, p. 207.

+ +

 

+

References

+ +

Barry D. Adam, Richard Elliott, Patrice Corriveau, and Ken English, 2014. “Impacts of criminalization on the everyday lives of people living with HIV in Canada,” Sexuality Research and Social Policy, volume 11, number 1, pp. 39–49.
doi: https://doi.org/10.1007/s13178-013-0131-8, accessed 5 September 2020.

+ +

Barry D. Adam, James Murray, Suzanne Ross, Jason Oliver, Stephen G. Lincoln, and Vicki Rynard, 2011. “Hivstigma.com, an innovative Web-supported stigma reduction intervention for gay and bisexual men,” Health Education Research, volume 26, number 5, pp. 795–807.
doi: https://doi.org/10.1093/her/cyq078, accessed 5 September 2020.

+ +

Philip E. Agre, 1997. “Toward a critical technical practice: Lessons learned in trying to reform AI,” In: Geof Bowker, Les Gasser, Leigh Star, and Bill Turner (editors). Bridging the great divide: Social science, technical systems, and cooperative work. Mahwah, N.J.: Erlbaum.

+ +

Anonymous, 2000. “Name brands: The effects of intrusive HIV legislation on high-risk demographic groups,” Harvard Law Review, volume 113, number 8, pp. 2,098–2,113.
doi: https://doi.org/10.2307/1342321, accessed 5 September 2020.

+ +

Taunya Lovell Banks, 1989. “Women and AIDS — Racism, sexism, and classism,” New York University Review of Law & Social Change, volume 17, pp. 351–385, and at https://digitalcommons.law.umaryland.edu/fac_pubs/328, accessed 5 September 2020.

+ +

Shaowen Bardzell, 2010. “Feminist HCI: Taking stock and outlining an agenda for design,” CHI ’10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems, pp. 1,301–1,310.
doi: https://doi.org/10.1145/1753326.1753521, accessed 5 September 2020.

+ +

Françoise Barré-Sinoussi, Salim S. Abdool Karim, Jan Albert, Linda-Gail Bekker, Chris Beyrer, Pedro Cahn, Alexandra Calmy, Beatriz Grinsztejn, Andrew Grulich, Adeeba Kamarulzaman, Nagalingeswaran Kumarasamy, Mona R. Loutfy, Kamal M. El Filali, Souleymane Mboup, Julio S.G. Montaner, Paula Munderi, Vadim Pokrovsky, Anne-Mieke Vandamme, Benjamin Young, and Peter Godfrey-Faussett, 2018. “Expert consensus statement on the science of HIV in the context of criminal law,” Journal of the International AIDS Society, volume 21, number 7.
doi: https://doi.org/10.1002/jia2.25161, accessed 5 September 2020.

+ +

Eric P.S. Baumer and M. Six Silberman, 2011. “When the implication is not to design (technology),” CHI ’11: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems, pp. 2,271–2,274.
doi: https://doi.org/10.1145/1978942.1979275, accessed 5 September 2020.

+ +

Allan M Brandt, 1987. No magic bullet: A social history of venereal disease in the United States since 1880. Expanded edition. Oxford: Oxford University Press.

+ +

Scott Burris and Edwin Cameron, 2008. “The case against criminalization of HIV transmission,” Journal of the American Medical Association, volume 300, number 5, pp. 578–581.
doi: https://doi.org/10.1001/jama.300.5.578, accessed 5 September 2020.

+ +

Center for Disease Control and Prevention, 2019a. “HIV and STD criminal laws,” at https://www.cdc.gov/hiv/policies/law/states/exposure.html, accessed 30 August 2019.

+ +

Center for Disease Control and Prevention, 2019b. “HIV surveillance reports,” at https://www.cdc.gov/hiv/library/reports/hiv-surveillance.html, accessed 30 August 2019.

+ +

Center for Disease Control and Prevention, 2019c. “HIV and women,” at https://www.cdc.gov/hiv/group/gender/women/, accessed 5 September 2020.

+ +

Center for HIV Law & Policy, 2019. “HIV criminalization in The United States,” at http://www.hivlawandpolicy.org/sourcebook, accessed 2 February 2020.

+ +

Hollie Clark, Aruna Surendera Babu, Ellen Weiss Wiewel, Jenevieve Opoku, and Nicole Crepaz, 2017. “Diagnosed HIV infection in transgender adults and adolescents: Results from the National HIV Surveillance System, 2009–2014,” AIDS and Behavior, volume 21, number 9, pp. 2,774–2,783.
doi: https://doi.org/10.1007/s10461-016-1656-7, accessed 5 September 2020.

+ +

Sascha Cohen, 2018. “How gay activists challenged the politics of civility,” Smithsonian Magazine (10 July), at https://www.smithsonianmag.com/history/how-gay-activists-challenged-politics-civility-180969579/, accessed 5 September 2020.

+ +

Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2004. “Reasons for HIV disclosure/nondisclosure in close relationships: Testing a model of HIV-disclosure decision making,” Journal of Social and Clinical Psychology, volume 23, number 6, pp. 747–767.
doi: https://doi.org/10.1521/jscp.23.6.747.54804, accessed 5 September 2020.

+ +

Valerian J. Derlega, Barbara A. Winstead, Kathryn Greene, Julianne Serovich, and William N. Elwood, 2002. “Perceived HIV-related stigma and HIV disclosure to relationship partners after finding out about the seropositive diagnosis,” Journal of Health Psychology, volume 7, number 4, pp. 415–432.
doi: https://doi.org/10.1177/1359105302007004330, accessed 5 September 2020.

+ +

Lynn Dombrowski, Ellie Harmon, and Sarah Fox, 2016. “Social justice-oriented interaction design: Outlining key design strategies and commitments,” DIS ’16: Proceedings of the 2016 ACM Conference on Designing Interactive Systems, pp. 656–671.
doi: https://doi.org/10.1145/2901790.2901861, accessed 5 September 2020.

+ +

Robert W. Eisinger, Carl W. Dieffenbach, and Anthony S. Fauci, 2019. “HIV viral load and transmissibility of HIV infection: Undetectable equals untransmittable,” Journal of the American Medical Association, volume 321, number 5, pp. 451–452.
doi: https://doi.org/10.1001/jama.2018.21167, accessed 5 September 2020.

+ +

Richard Elliot, 2002. “Criminal law, public health and HIV transmission: A policy options paper,” UNAIDS (Joint United Nations Programme on HIV/AIDS), at https://data.unaids.org/publications/irc-pub02/jc733-criminallaw_en.pdf, accessed 5 September 2020.

+ +

Elizabeth F. Emens, 2008. “Intimate discrimination: The state’s role in the accidents of sex and love,” Harvard Law Review, volume 122, number 5, pp. 1,307–1,402.
doi: https://doi.org/10.2307/40379752, accessed 5 September 2020.

+ +

Steven Epstein, 1996. Impure science: AIDS, activism, and the politics of knowledge. Berkeley: University of California Press.

+ +

Amy L. Fairchild, Ronald Bayer, and James Colgrove, with Daniel Wolfe, 2007. Searching eyes: Privacy, the state, and disease surveillance in America. Berkeley: University of California Press.

+ +

Mary D. Fan, 2012. “Decentralizing STD surveillance: Toward better informed sexual consent,” Yale Journal of Health Policy, Law, and Ethics, volume 12, number 1, pp. 1–38.

+ +

Mary D. Fan, 2011. “Sex, privacy, and public health in a casual encounters culture,” University of California Davis Law Review, volume 25, pp. 531–596.

+ +

Tim Fitzsimons, 2019. “Inside Grindr, fears that China wanted to access user data via HIV research,” NBC News (2 April), at https://www.nbcnews.com/feature/nbc-out/inside-grindr-fears-china-wanted-access-user-data-hiv-research-n989996, accessed 5 September 2020.

+ +

Chandra L. Ford, Kathryn D. Whetten, Susan A. Hall, Jay S. Kaufman, and Angela D. Thrasher, 2007. “Black sexuality, social construction, and research targeting ‘The Down Low’ (‘The DL’),” Annals of Epidemiology, volume 17, number 3, pp. 209–216.
doi: https://doi.org/10.1016/j.annepidem.2006.09.006, accessed 5 September 2020.

+ +

A.J. Fortin, 1995. “AIDS, surveillance, and public policy,” Research in Law and Policy Studies, volume 4, pp. 173–197.

+ +

Marilou Gagnon, 2012. “Toward a critical response to HIV criminalization: Remarks on advocacy and social justice,” Journal of the Association of Nurses in AIDS Care, volume 23, number 1, pp. 11–15.
doi: https://doi.org/10.1016/j.jana.2011.08.012, accessed 5 September 2020.

+ +

Carol L. Galletly and Steven D. Pinkerton, 2006. “Conflicting messages: How criminal HIV disclosure laws undermine public health efforts to control the spread of HIV,” AIDS and Behavior, volume 10, number 5, pp. 451–461.
doi: https://doi.org/10.1007/s10461-006-9117-3, accessed 5 September 2020.

+ +

C. Galletly, Z. Lazzarini, C. Sanders, and S.D. Pinkerton, 2014. “Criminal HIV exposure laws: Moving forward,” AIDS and Behavior, volume 18, number 6, pp. 1,011–1,013.
doi: https://doi.org/10.1007/s10461-014-0731-1, accessed 5 September 2020.

+ +

Robert C. Gallo, 2006. “A reflection on HIV/AIDS research after 25 years,” Retrovirology, volume 3, article number 72.
doi: https://doi.org/10.1186/1742-4690-3-72, accessed 5 September 2020.

+ +

George Gallup, Jr. and Jim Castelli, 1987. “Poll catalogs views on AIDS by religion,” Dallas Morning News (27 September), p. 45A.

+ +

Lawrence O. Gostin, Scott Burris, and Zita Lazzarini, 1999. “The law and the public’s health: A study of infectious disease law in the United States,” Columbia Law Review, volume 99, number 1, pp. 59–128.

+ +

Ben Green, 2018. “Data science as political action: Grounding data science in a politics of justice,” arXiv:1811.03435 (6 November), at https://arxiv.org/abs/1811.03435, accessed 5 September 2020.

+ +

Kathryn Greene, Valerian J. Derlega, Gust A. Yep, and Sandra Petronio, 2003. Privacy and disclosure of HIV in interpersonal relationships: A sourcebook for researchers and practitioners. Mahwah, N.J.: Lawrence Erlbaum Associates.

+ +

David M. Halperin, 2015. “The biopolitics of HIV prevention discourse,” In: Vernon W. Cisney and Nicolae Morar (editors). Biopower: Foucault and beyond. Chicago: University of Chicago Press, pp. 199–227.

+ +

David M. Halperin and Trevor Hoppe (editors), 2017. The war on sex. Durham, N.C.: Duke University Press.

+ +

Mark J. Handel and Irina Shklovski, 2012. “Disclosure, ambiguity and risk reduction in real-time dating sites,” GROUP ’12: Proceedings of the 17th ACM International Conference on Supporting Group Work, pp. 175–178.
doi: https://doi.org/10.1145/2389176.2389203, accessed 5 September 2020.

+ +

Jean Hardy and Silvia Lindtner, 2017. “Constructing a desiring user: Discourse, rurality, and design in location-based social networks,” CSCW ’17: Proceedings of the 2017 ACM Conference on Computer Supported Cooperative Work and Social Computing, pp. 13–25.
doi: https://doi.org/10.1145/2998181.2998347, accessed 5 September 2020.

+ +

Dini Harsono, Carol L. Galletly, Elaine O’Keefe, and Zita Lazzarini, 2017. “Criminalization of HIV exposure: A review of empirical studies in the United States,” AIDS and Behavior, volume 21, number 1, pp. 27–50.
doi: https://doi.org/10.1007/s10461-016-1540-5, accessed 5 September 2020.

+ +

Trevor Hoppe, 2018. Punishing disease: HIV and the criminalization of sickness. Berkeley: University of California Press.

+ +

Hsiu-Fang Hsieh and Sarah E. Shannon, 2005. “Three approaches to qualitative content analysis,” Qualitative Health Research, volume 15, number 9, pp. 1,277–1,288.
doi: https://doi.org/10.1177/1049732305276687, accessed 5 September 2020.

+ +

Jevan A. Hutson, Jessie G. Taft, Solon Barocas, and Karen Levy, 2018. “Debiasing desire: Addressing bias & discrimination on intimate platforms,” Proceedings of the ACM on Human-Computer Interaction, article number 73.
doi: https://doi.org/10.1145/3274342, accessed 5 September 2020.

+ +

Lilly Irani, Janet Vertesi, Paul Dourish, Kavita Philip, and Rebecca E. Grinter, 2010. “Postcolonial computing: A lens on design and development,” CHI ’10: Proceedings of the SIGCHI Conference on Human Factors in Computing Systems, pp. 1,311–1,320.
doi: https://doi.org/10.1145/1753326.1753522, accessed 5 September 2020.

+ +

Steven J. Jackson, Tarleton Gillespie, and Sandy Payette, 2014. “The policy knot: Re-integrating policy, practice and design in cscw studies of social computing,” CSCW ’14: Proceedings of the 17th ACM Conference on Computer Supported Cooperative Work & Social Computing, pp. 588–602.
doi: https://doi.org/10.1145/2531602.2531674, accessed 5 September 2020.

+ +

Paula C. Johnson, 1992. “Silence equals death: The response to AIDS within communities of color,” University of Illinois Law Review, volume 1992, pp. 1,075–1,083.

+ +

Ralf Jürgens, Jonathan Cohen, Edwin Cameron, Scott Burris, Michaela Clayton, Richard Elliott, Richard Pearshouse, Anne Gathumbi, and Delme Cupido, 2009. “Ten reasons to oppose the criminalization of HIV exposure or transmission,” Reproductive Health Matters, volume 17, number 34, pp. 163–172.
doi: https://doi.org/10.1016/S0968-8080(09)34462-6, accessed 5 September 2020.

+ +

Gopinaath Kannabiran, Shaowen Bardzell, and Jeffrey Bardzell, 2012. “Designing (for) desire: a critical study of technosexuality in HCI,” NordiCHI ’12: Proceedings of the Seventh Nordic Conference on Human-Computer Interaction: Making Sense Through Design, pp. 655–664.
doi: https://doi.org/10.1145/2399016.2399116, accessed 5 September 2020.

+ +

Cécile Kazatchkine, Edwin Bernard, and Patrick Eba, 2015. “Ending overly broad HIV criminalization: Canadian scientists and clinicians stand for justice,” Journal of the International AIDS Society, volume 18, number 1, article number 20126.
doi: https://doi.org/10.7448/IAS.18.1.20126, accessed 5 September 2020.

+ +

Os Keyes, Jevan Hutson, and Meredith Durbin, 2019. “A mulching proposal: Analysing and improving an algorithmic system for turning the elderly into high-nutrient slurry,” CHI EA ’19: Extended Abstracts of the 2019 CHI Conference on Human Factors in Computing Systems, paper number alt06.
doi: https://doi.org/10.1145/3290607.3310433, accessed 5 September 2020.

+ +

Jeffrey V. Lazarus, Kelly Safreed-Harmon, Simon E. Barton, Dominique Costagliola, Nikos Dedes, Julia del Amo Valero, Jose M. Gatell, Ricardo Baptista-Leite, Luís Mendão, Kholoud Porter, Stefano Vella, and Jürgen Kurt Rockstroh, 2016. “Beyond viral suppression of HIV — The new quality of life frontier,” BMC Medicine, volume 14, number 1, article number 94.
doi: https://doi.org/10.1186/s12916-016-0640-4, accessed 5 September 2020.

+ +

J. Stan Lehman, Meredith H. Carr, Allison J. Nichol, Alberto Ruisanchez, David W. Knight, Anne E. Langford, Simone C. Gray, and Jonathan H. Mermin, 2014. “Prevalence and public health implications of state laws that criminalize potential HIV exposure in the United States,” AIDS and Behavior, volume 18, number 6, pp. 997–1,006.
doi: https://doi.org/10.1007/s10461-014-0724-0, accessed 5 September 2020.

+ +

Karen Levy and Solon Barocas, 2018. “Designing against discrimination in online markets,” Berkeley Technology Law Journal, volume 32, number 3, pp. 1,183–1,237.
doi: https://doi.org/10.15779/Z38BV79V7K, accessed 5 September 2020.

+ +

Eric Lichtblau and William M. Arkin, 2014. “More federal agencies are using undercover operations,” New York Times (15 November), at https://www.nytimes.com/2014/11/16/us/more-federal-agencies-are-using-undercover-operations.html, accessed 5 September 2020.

+ +

Ann Light, 2011. “HCI as heterodoxy: Technologies of identity and the queering of interaction with computers,” Interacting with Computers, volume 23, number 5, pp. 430–438.
doi: https://doi.org/10.1016/j.intcom.2011.02.002, accessed 5 September 2020.

+ +

Ben Light, Jean Burgess, and Stefanie Duguay, 2018. “The walkthrough method: An approach to the study of apps,” New Media & Society, volume 20, number 3, pp. 881–900.
doi: https://doi.org/10.1177/1461444816675438, accessed 5 September 2020.

+ +

Anish P. Mahajan, Jennifer N. Sayles, Vishal A. Patel, Robert H. Remien, Daniel Ortiz, Greg Szekeres, and Thomas J. Coates, 2008. “Stigma in the HIV/AIDS epidemic: A review of the literature and recommendations for the way forward,” AIDS, volume 22, supplement 2, pp. S67–S79.
doi: https://doi.org/10.1097/01.aids.0000327438.13291.62, accessed 5 September 2020.

+ +

Alexandra McCallum, 2014. “Criminalizing the transmission of HIV: Consent, disclosure, and online dating,” Utah Law Review, volume 2014, number 3, article 5, at https://dc.law.utah.edu/ulr/vol2014/iss3/5, accessed 5 September 2020.

+ +

Donna Hubbard McCree and Matthew Hogben, 2010. “The contribution to and context of other sexually transmitted diseases and tuberculosis in the HIV/AIDS epidemic among African Americans,” In: Donna Hubbard McCree, Kenneth Jones, and Ann O’Leary (editors). African Americans and HIV/AIDS: Understanding and addressing the epidemic, New York: Springer, pp. 3–12.
doi: https://doi.org/10.1007/978-0-387-78321-5_1, accessed 5 September 2020.

+ +

William C. Miller, Carol A. Ford, Martina Morris, Mark S. Handcock, John L. Schmitz, Marcia M. Hobbs, Myron S. Cohen, Kathleen Mullan Harris, and J. Richard Udry, 2004. “Prevalence of chlamydial and gonococcal infections among young adults in the United States,” Journal of the American Medical Association, volume 291, number 18, pp. 2,229–2,236.
doi: https://doi.org/10.1001/jama.291.18.2229, accessed 5 September 2020.

+ +

Viviane Namaste, 2015. Oversight: Critical reflections on feminist research and politics. Toronto: Women’s Press.

+ +

Angela Perone, 2013. “From punitive to proactive: An alternative approach for responding to HIV criminalization that departs from penalizing marginalized communities,” Hastings Women’s Law Journal, volume 24, pp. 363–406, and at https://repository.uchastings.edu/hwlj/vol24/iss2/5, accessed 5 September 2020.

+ +

Deana A. Pollard, 2006. “Sex torts,” Minnesota Law Review, volume 91, pp. 769–824, and at https://www.minnesotalawreview.org/wp-content/uploads/2012/01/Pollard_Final.pdf, accessed 5 September 2020.

+ +

POZ, 2015. “Man with HIV arrested for seeking sex on social media” (22 July), at https://www.poz.com/article/stlouis-hiv-arrest-27534-4846, accessed 5 September 2020.

+ +

Russell K. Robinson, 2007. “Structural dimensions of romantic preferences,” Fordham Law Review, volume 76, pp. 2,787–2,820, and at http://fordhamlawreview.org/issues/structural-dimensions-of-romantic-preferences/, accessed 5 September 2020.

+ +

Michael J. Rosenfeld and Reuben J. Thomas, 2012. “Searching for a mate: The rise of the Internet as a social intermediary,” American Sociological Review, volume 77, number 4, pp. 523–547.
doi: https://doi.org/10.1177/0003122412448050, accessed 5 September 2020.

+ +

B.R. Simon Rosser, J. Michael Wilkerson, Derek J. Smolenski, J. Michael Oakes, Joseph Konstan, Keith J. Horvath, Gunna R. Kilian, David S. Novak, Gene P. Danilenko, and Richard Morgan, 2011. “The future of Internet-based HIV prevention: A report on key findings from the Men’s INTernet (MINTS-I, II) Sex Studies,” AIDS and Behavior, volume 15, supplement 1, pp. S91–S100.
doi: https://doi.org/10.1007/s10461-011-9910-5, accessed 5 September 2020.

+ +

Brian Schram, 2019. “Accidental orientations: Rethinking queerness in archival times,” Surveillance & Society, volume 17, number 5, pp. 602–617.
doi: https://doi.org/10.24908/ss.v17i5.8688, accessed 5 September 2020.

+ +

Junichi P. Semitsu, 2011. “From Facebook to mug shot: How the dearth of social networking privacy rights revolutionized online government surveillance,” Pace Law Review, volume 31, number 1, pp. 291–381, and at https://digitalcommons.pace.edu/plr/vol31/iss1/7, accessed 5 September 2020.

+ +

Sero Project, 2012. “National criminalization survey preliminary results,” (25 July), at https://toolkit.hivjusticeworldwide.org/resource/the-sero-project-national-criminalization-survey-preliminary-results-2/, accessed 30 August 2019.

+ +

Julianne M. Serovich and Katie E. Mosack, 2003. “Reasons for HIV disclosure or nondisclosure to casual sexual partners,” AIDS Education and Prevention, volume 15, number 1, pp. 70–80.

+ +

Natasha Singer, 2018. “Grindr sets off privacy firestorm after sharing users’ H.I.V.-status data,” New York Times (3 April), at https://www.nytimes.com/2018/04/03/technology/grindr-sets-off-privacy-firestorm-after-sharing-users-hiv-status-data.html, accessed 5 September 2020.

+ +

Lucy Suchman, 2011. “Anthropological relocations and the limits of design,” Annual Review of Anthropology, volume 40, pp. 1–18.
doi: https://doi.org/10.1146/annurev.anthro.041608.105640, accessed 5 September 2020.

+ +

Cass R. Sunstein, 1996. “Social norms and social roles,” Columbia Law Review, volume 96, number 4, pp. 903–968.

+ +

Patricia Sweeney, Simone C. Gray, David W. Purcell, Jenny Sewell, Aruna Surendera Babu, Brett A. Tarver, Joseph Prejean, and Jonathan Mermin, 2017. “Association of HIV diagnosis rates and laws criminalizing HIV exposure in the United States,” AIDS, volume 31, number 10, pp. 1,483–1,488.
doi: https://doi.org/10.1097/QAD.0000000000001501, accessed 5 September 2020.

+ +

Bryan L. Sykes, Trevor A. Hoppe, and Kristen D. Maziarka, 2016. “Cruel intentions? HIV prevalence and criminalization during an age of mass incarceration, U.S. 1999 to 2012,” Medicine (Baltimore), volume 95, number 16, e3352.
doi: https://doi.org/10.1097/MD.0000000000003352, accessed 5 September 2020.

+ +

Samuel Hardman Taylor, Jevan Alexander Hutson, and Tyler Richard Alicea, 2017. “Social consequences of Grindr use: Extending the Internet-enhanced self-disclosure hypothesis,” CHI ’17: Proceedings of the 2017 CHI Conference on Human Factors in Computing Systems, pp. 6,645–6,657.
doi: https://doi.org/10.1145/3025453.3025775, accessed 5 September 2020.

+ +

Steven Thrasher, 2015. “A Black body on trial: The conviction of HIV-positive ‘Tiger Mandingo’,” BuzzFeed News (30 November), at https://www.buzzfeednews.com/article/steventhrasher/a-black-body-on-trial-the-conviction-of-hiv-positive-tiger-m, accessed 5 September 2020.

+ +

Liming Wang, Dylan Podson, Zihuang Chen, Hongyan Lu, Vania Wang, Colin Shepard, John K. Williams, and Guodong Mi, 2019. “Using social media to increase HIV testing among men who have sex with men — Beijing, China, 2013–2017,” Morbidity and Mortality Weekly Report, volume 68, number 21, pp. 478–482.
doi: http://dx.doi.org/10.15585/mmwr.mm6821a3, accessed 5 September 2020.

+ +

Helen Ward, 2005. “Partner notification and contact-tracing,” Medicine, volume 33, number 9, pp. 28–30.
doi: https://doi.org/10.1383/medc.2005.33.9.28, accessed 5 September 2020.

+ +

Helen Ward and Gill Bell, 2014. “Partner notification,” Medicine (Abingdon), volume 42, number 6, pp. 314–317.
doi: https://doi.org/10.1016/j.mpmed.2014.03.013, accessed 5 September 2020.

+ +

Mark Warner, Andreas Gutmann, M. Angela Sasse, and Ann Blandford, 2018. “Privacy unraveling around explicit HIV status disclosure fields in the online geosocial hookup app Grindr,” Proceedings of the ACM on Human-Computer Interaction, article number 181.
doi: https://doi.org/10.1145/3274450, accessed 5 September 2020.

+ +

Mark Warner, Juan F. Maestre, Jo Gibbs, Chia-Fang Chung, and Ann Blandford, 2019. “Signal appropriation of explicit HIV status disclosure fields in sex-social apps used by gay and bisexual men,” CHI ’19: Proceedings of the 2019 CHI Conference on Human Factors in Computing Systems, paper number 692.
doi: https://doi.org/10.1145/3290605.3300922, accessed 5 September 2020.

+ +

Dylan Eric Wittkower, 2016. “Lurkers, creepers, and virtuous interactivity: From property rights to consent to care as a conceptual basis for privacy concerns and information ethics,” First Monday, volume 21, number 10, at https://firstmonday.org/article/view/6948/5628, accessed 5 September 2020.
doi: https://doi.org/10.5210/fm.v21i10.6948, accessed 5 September 2020.

+ +

Dan Wohlfeiler, Jennifer Hecht, Jonathan Volk, H. Fisher Raymond, Tom Kennedy, and Willi McFarland, 2013. “How can we improve online HIV and STD prevention for men who have sex with men? Perspectives of hook-up website owners, website users, and HIV/STD directors,” AIDS and Behavior, volume 17, number 9, pp. 3,024–3,033.
doi: https://doi.org/10.1007/s10461-012-0375-y, accessed 5 September 2020.

+ +

María Cecilia Zea, Carol A. Reisen, Paul J. Poppen, and Rafael M. Díaz, 2003. “Asking and telling: communication about HIV status among Latino HIV-positive gay men,” AIDS and Behavior, volume 7, number 2, pp. 143–152.
doi: https://doi.org/10.1023/A:1023994207984, accessed 5 September 2020.

+ +

Shoshana Zuboff, 2019. The age of surveillance capitalism: The fight for a human future at the new frontier of power. London: Profile Books.

+ +

 

+
+ +

Editorial history

+

Received 17 October 2019; revised 12 February 2020; accepted 28 August 2020.

+ +
+ +

Creative Commons License
This paper is licensed under a Creative Commons Attribution 4.0 International License.

+ +

Surveillance, stigma & sociotechnical design for HIV
by Calvin Liang, Jevan Alexander Hutson, and Os Keyes.
First Monday, Volume 25, Number 10 - 5 October 2020
https://firstmonday.org/ojs/index.php/fm/article/download/10274/9729
doi: http://dx.doi.org/10.5210/fm.v25i10.10274

+
+ + \ No newline at end of file diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py new file mode 100644 index 0000000..96b3883 --- /dev/null +++ b/python/tests/test_html_ingest.py @@ -0,0 +1,14 @@ + +import datetime +import pytest + +from sandcrawler.html_ingest import * + + +def test_html_extract_ojs3() -> None: + + with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f: + ojs3_html = f.read() + + fulltext = html_extract_fulltext_teixml(ojs3_html) + assert fulltext['status'] == 'success' -- cgit v1.2.3 From cefbc6fa46e6586d8735f40b3b5432a759edd5f1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 30 Oct 2020 17:33:37 -0700 Subject: html: syntax fixes; resolve relative URLs; extract more XML fulltext URLs --- python/sandcrawler/html_ingest.py | 6 +++--- python/sandcrawler/html_metadata.py | 17 ++++++++++++----- python/tests/test_html_metadata.py | 15 ++++++++------- 3 files changed, 23 insertions(+), 15 deletions(-) (limited to 'python/tests') diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index acd336e..284461e 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -6,7 +6,7 @@ import json import datetime import argparse import xml.etree.ElementTree as ET -from typing import List, Optional, Any +from typing import List, Optional, Any, Tuple import trafilatura import pydantic @@ -75,7 +75,7 @@ class IngestWebResult(pydantic.BaseModel): } -def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> (dict, ResourceResult): +def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]: if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip': print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr) inner_body = gzip.decompress(resource.body) @@ -233,7 +233,7 @@ def run_single(url: str, timestamp: Optional[str] = 
None, quick_mode: bool = Fal ) html_doc = HTMLParser(html_resource.body) - html_biblio = html_extract_biblio(html_doc) + html_biblio = html_extract_biblio(url, html_doc) html_fulltext = html_extract_fulltext_teixml(html_resource.body) html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml')) if html_scope not in ('article-fulltext', 'unknown'): diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index d3ca1b7..41157e0 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -232,11 +232,7 @@ class BiblioMetadata(pydantic.BaseModel): xml_fulltext_url: Optional[str] -def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]: - """ - TODO: - - meta dc.identifier: parse DOI - """ +def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]: meta: Any = dict() head = doc.css_first("head") @@ -262,6 +258,12 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]: meta[field].append(val.attrs['content']) break + # non- lookups + if not meta.get('xml_fulltext_url'): + val = head.css_first("link[rel='alternate'][type='application/xml']") + if val and val.attrs['href']: + meta['xml_fulltext_url'] = val.attrs['href'] + # TODO: replace with clean_doi() et al if meta.get('doi') and meta.get('doi').startswith('doi:'): meta['doi'] = meta['doi'][4:] @@ -290,6 +292,11 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]: if release_type: meta['release_type'] = release_type + # resolve relative URLs + for key in ('pdf_fulltext_url', 'html_fulltext_url', 'xml_fulltext_url'): + if meta.get(key): + meta[key] = urllib.parse.urljoin(doc_url, meta[key]) + return BiblioMetadata(**meta) def load_adblock_rules() -> braveblock.Adblocker: diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py index 597520c..b428b0d 100644 --- a/python/tests/test_html_metadata.py +++ 
b/python/tests/test_html_metadata.py @@ -10,7 +10,7 @@ def test_html_metadata_plos() -> None: with open('tests/files/plos_one_article.html', 'r') as f: plos_html = f.read() - meta = html_extract_biblio(HTMLParser(plos_html)) + meta = html_extract_biblio("http://example.org", HTMLParser(plos_html)) assert meta is not None assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" assert meta.doi == "10.1371/journal.pone.0213978" @@ -46,7 +46,7 @@ def test_html_metadata_elife() -> None: with open('tests/files/elife_article.html', 'r') as f: elife_html = f.read() - meta = html_extract_biblio(HTMLParser(elife_html)) + meta = html_extract_biblio("http://example.org", HTMLParser(elife_html)) assert meta is not None assert meta.title == "Parallel visual circuitry in a basal chordate" assert meta.doi == "10.7554/eLife.44753" @@ -70,7 +70,7 @@ def test_html_metadata_peerj() -> None: with open('tests/files/peerj_oa_article.html', 'r') as f: peerj_html = f.read() - meta = html_extract_biblio(HTMLParser(peerj_html)) + meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html)) assert meta is not None assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles" assert meta.doi == "10.7717/peerj.4375" @@ -88,6 +88,7 @@ def test_html_metadata_peerj() -> None: assert meta.container_name == "PeerJ" # "2018-02-13" assert meta.release_date == datetime.date(year=2018, month=2, day=13) + assert meta.xml_fulltext_url and ".xml" in meta.xml_fulltext_url def test_html_metadata_nature() -> None: @@ -95,7 +96,7 @@ def test_html_metadata_nature() -> None: with open('tests/files/nature_article.html', 'r') as f: nature_html = f.read() - meta = html_extract_biblio(HTMLParser(nature_html)) + meta = html_extract_biblio("http://example.org", HTMLParser(nature_html)) assert meta is not None assert meta.title == "More than 100 scientific journals 
have disappeared from the Internet" assert meta.doi == "10.1038/d41586-020-02610-z" @@ -115,7 +116,7 @@ def test_html_metadata_ojs3() -> None: with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f: ojs3_html = f.read() - meta = html_extract_biblio(HTMLParser(ojs3_html)) + meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html)) assert meta is not None assert meta.title == "Surveillance, stigma & sociotechnical design for HIV" assert meta.doi == "10.5210/fm.v25i10.10274" @@ -140,7 +141,7 @@ def test_html_metadata_dlib() -> None: with open('tests/files/dlib_05vanhyning.html', 'r') as f: dlib_html = f.read() - meta = html_extract_biblio(HTMLParser(dlib_html)) + meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html)) assert meta is not None assert meta.doi == "10.1045/may2017-vanhyning" # "2017-05-15" @@ -159,7 +160,7 @@ def test_html_metadata_dc_case() -> None: Hi. """ - meta = html_extract_biblio(HTMLParser(snippet)) + meta = html_extract_biblio("http://example.org", HTMLParser(snippet)) assert meta is not None assert meta.issue == "123" -- cgit v1.2.3 From c145488142d4b5413323322dfc1422efdece83f7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 16:23:34 -0800 Subject: html: some refactoring --- python/sandcrawler/html_ingest.py | 29 +++++++++++---------- python/sandcrawler/html_metadata.py | 50 +++++++++++++++++++++++++++++-------- python/tests/test_html_ingest.py | 2 +- 3 files changed, 57 insertions(+), 24 deletions(-) (limited to 'python/tests') diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index 823218b..fe883ba 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -16,19 +16,21 @@ from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_ from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules -def html_extract_fulltext_teixml(doc: bytes) -> 
dict: +TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}" + +def html_extract_body_teixml(doc: bytes) -> dict: tei_xml = trafilatura.extract(doc, tei_output=True, include_comments=False, include_formatting=True, ) if tei_xml: - return dict(status="success", tei_xml=tei_xml) + return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml) elif doc.startswith(b''): # hack for firstmonday.org - return html_extract_fulltext_teixml(doc[106:]) + return html_extract_body_teixml(doc[106:]) else: - return dict(status="empty-xml") + return dict(status="empty-xml", agent=TRAFILATURA_AGENT) def teixml_body_text(doc_xml: str) -> str: ns = {"tei": "http://www.tei-c.org/ns/1.0"} @@ -58,14 +60,15 @@ class WebResource(pydantic.BaseModel): class IngestWebResult(pydantic.BaseModel): status: str hit: bool + error_message: Optional[str] cdx: Optional[dict] terminal: Optional[Any] # TODO request: Optional[Any] # TODO file_meta: Optional[dict] html_biblio: Optional[BiblioMetadata] - html_scope: Optional[str] - html_fulltext: Optional[dict] - subresources: Optional[List[WebResource]] + scope: Optional[str] + html_body: Optional[dict] + html_resources: Optional[List[WebResource]] class Config: arbitrary_types_allowed = True @@ -228,8 +231,8 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal html_doc = HTMLParser(html_resource.body) html_biblio = html_extract_biblio(url, html_doc) - html_fulltext = html_extract_fulltext_teixml(html_resource.body) - html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml')) + html_body = html_extract_body_teixml(html_resource.body) + html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('tei_xml')) if html_scope not in ('article-fulltext', 'unknown'): return IngestWebResult( status="wrong-scope", @@ -237,7 +240,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx), 
file_meta=file_meta, html_biblio=html_biblio, - html_scope=html_scope, + scope=html_scope, ) raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock) @@ -256,10 +259,10 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal hit=True, cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx), file_meta=file_meta, - html_fulltext=html_fulltext, + html_body=html_body, html_biblio=html_biblio, - html_scope=html_scope, - subresources=full_resources, + scope=html_scope, + html_resources=full_resources, ) return output diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 41157e0..b23118b 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -1,6 +1,6 @@ import datetime -from typing import List, Optional, Any +from typing import List, Optional, Any, Tuple, Dict import urllib.parse import dateparser @@ -158,9 +158,6 @@ HEAD_META_PATTERNS: Any = { "meta[name='citation_fulltext_html_url']", "meta[name='bepress_citation_fulltext_html_url']", ], - "xml_fulltext_url": [ - "meta[name='citation_xml_url']", - ], "pdf_fulltext_url": [ "meta[name='citation_pdf_url']", "meta[name='bepress_citation_pdf_url']", @@ -188,6 +185,19 @@ HEAD_META_LIST_PATTERNS: Any = { ], } +XML_FULLTEXT_PATTERNS: List[dict] = [ + { + "selector": "meta[name='citation_xml_url']", + "attr": "content", + "why": "citation_xml_url", + }, + { + "selector": "link[rel='alternate'][type='application/xml']", + "attr": "href", + "why": "alternate link", + }, +] + RELEASE_TYPE_MAP = { "research article": "article-journal", "text.serial.journal": "article-journal", @@ -232,6 +242,27 @@ class BiblioMetadata(pydantic.BaseModel): xml_fulltext_url: Optional[str] +def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]: + """ + Tries to quickly extract fulltext URLs using a set of patterns. 
This + function is intendend to be generic across various extraction techniques. + + Returns null or a tuple of (url, why) + """ + for pattern in patterns: + if not 'selector' in pattern: + continue + elem = doc.css_first(pattern['selector']) + if not elem: + continue + if 'attr' in pattern: + val = elem.attrs[pattern['attr']] + if val: + val = urllib.parse.urljoin(doc_url, val) + assert val + return (val, pattern.get('why', 'unknown')) + return None + def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]: meta: Any = dict() @@ -258,11 +289,10 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat meta[field].append(val.attrs['content']) break - # non- lookups - if not meta.get('xml_fulltext_url'): - val = head.css_first("link[rel='alternate'][type='application/xml']") - if val and val.attrs['href']: - meta['xml_fulltext_url'] = val.attrs['href'] + # (some) fulltext extractions + xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS) + if xml_fulltext_url: + meta['xml_fulltext_url'] = xml_fulltext_url[0] # TODO: replace with clean_doi() et al if meta.get('doi') and meta.get('doi').startswith('doi:'): @@ -293,7 +323,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat meta['release_type'] = release_type # resolve relative URLs - for key in ('pdf_fulltext_url', 'html_fulltext_url', 'xml_fulltext_url'): + for key in ('pdf_fulltext_url', 'html_fulltext_url'): if meta.get(key): meta[key] = urllib.parse.urljoin(doc_url, meta[key]) diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index 96b3883..e6e48ac 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -10,5 +10,5 @@ def test_html_extract_ojs3() -> None: with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f: ojs3_html = f.read() - fulltext = html_extract_fulltext_teixml(ojs3_html) + fulltext = 
html_extract_body_teixml(ojs3_html) assert fulltext['status'] == 'success' -- cgit v1.2.3 From 653fac9632c6ae9dd036ad844454cf419cd5320b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 22:40:14 -0800 Subject: xml: re-encode XML docs into UTF-8 for persisting --- proposals/20201103_xml_ingest.md | 19 +- python/sandcrawler/ingest.py | 4 +- python/sandcrawler/xml.py | 7 + python/tests/files/scielo_article.jats.xml | 336 +++++++++++++++++++++++++++++ python/tests/test_xml.py | 18 ++ 5 files changed, 382 insertions(+), 2 deletions(-) create mode 100644 python/sandcrawler/xml.py create mode 100644 python/tests/files/scielo_article.jats.xml create mode 100644 python/tests/test_xml.py (limited to 'python/tests') diff --git a/proposals/20201103_xml_ingest.md b/proposals/20201103_xml_ingest.md index c0d0a79..181cc11 100644 --- a/proposals/20201103_xml_ingest.md +++ b/proposals/20201103_xml_ingest.md @@ -10,8 +10,8 @@ x differential JATS XML and scielo XML from generic XML? if startswith "
" => JATS x refactor ingest worker to be more general x have ingest code publish body to kafka topic +x write a persist worker / create/configure kafka topic -/ write a persist worker - test everything locally - fatcat: ingest tool to create requests - fatcat: entity updates worker creates XML ingest requests for specific sources @@ -27,6 +27,23 @@ that we currently ingest PDF fulltext. Currently this will just fetch the single XML document, which is often lacking figures, tables, and other required files. +## Text Encoding + +Because we would like to treat XML as a string in a couple contexts, but XML +can have multiple encodings (indicated in an XML header), we are in a bit of a +bind. Simply parsing into unicode and then re-encoding as UTF-8 could result in +a header/content mismatch. Any form of re-encoding will change the hash of the +document. For recording in fatcat, the file metadata will be passed through. +For storing in Kafka and blob store (for downstream analysis), we will parse +the raw XML document (as "bytes") with an XML parser, then re-output with UTF-8 +encoding. The hash of the *original* XML file will be used as the key for +referring to this document. This is unintuitive, but similar to what we are +doing with PDF and HTML documents (extracting in a useful format, but keeping +the original document's hash as a key). + +Unclear if we need to do this re-encode process for XML documents already in +UTF-8 encoding. 
+ ## Ingest Worker Could either re-use HTML metadata extractor to fetch XML fulltext links, or diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 1a42b6a..363485e 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -18,6 +18,7 @@ from sandcrawler.html import extract_fulltext_url from sandcrawler.html_metadata import html_extract_fulltext_url, XML_FULLTEXT_PATTERNS from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient +from sandcrawler.xml import xml_reserialize class IngestFileWorker(SandcrawlerWorker): @@ -316,10 +317,11 @@ class IngestFileWorker(SandcrawlerWorker): count), or attempting to fetch sub-resources. """ if self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml": + jats_xml = xml_reserialize(resource.body) msg = dict( sha1hex=file_meta["sha1hex"], status="success", - jats_xml=resource.body.encode('utf-8'), + jats_xml=jats_xml, ) self.xmldoc_sink.push_record(msg, key=file_meta['sha1hex']) return dict(status="success") diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py new file mode 100644 index 0000000..7a0086d --- /dev/null +++ b/python/sandcrawler/xml.py @@ -0,0 +1,7 @@ + +import xml.etree.ElementTree as ET + + +def xml_reserialize(raw: bytes) -> str: + root = ET.fromstring(raw) + return '\n' + ET.tostring(root, encoding="unicode") diff --git a/python/tests/files/scielo_article.jats.xml b/python/tests/files/scielo_article.jats.xml new file mode 100644 index 0000000..08c864e --- /dev/null +++ b/python/tests/files/scielo_article.jats.xml @@ -0,0 +1,336 @@ +
+ + +1683-9803 + + +1683-9803 + + + + + +S1683-98032015000200002 +10.18004/ped.2015.agosto.102-107 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Paraguay + + + + +Paraguay + + + + +Paraguay + + +30 +08 +2015 + + +30 +08 +2015 + +42 +2 +102 +107 + + +

+

+ + + + + + + + +
+
ARTÍCULO ORIGINAL

 

Prevalencia de desnutrición y hábitos alimentarios en  niños menores de 5 años en las comunidades indígenas de Yby Yau y Azote’y, 2011

Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous communities in Azote'y and Yby Yau, 2011

 

Syntia Carolina Ruiz Valiente(1), Manuel Ruiz Cañete(2), Bartola Cohene Velazquez(3)

1. Hospital General Pediátrico Niños Acosta Ñu. Reducto-San Lorenzo, Paraguay.

2. Centro de Salud de Yby Yau. Paraguay.

3. Puesto de Salud de Paso Tuya. Azote’y. Paraguay.

Correspondencia: Syntia Carolina Ruiz Valiente. E-mail: scrv_py@hotmail.com

]]> + Recibido: 24/01/2015; Aceptado: 10/06/2015.

Los autores declaran que no existen conflictos de interés en el presente estudio.

 


RESUMEN

Introducción: La infancia es una etapa trascendental en el desarrollo evolutivo del hombre, para lo cual es fundamental una adecuada nutrición. La desnutrición infantil no es solo un problema de falta de alimentos, es un conflicto social más profundo. La prevalencia de desnutrición en menores de 5 años del país es de 5,9% según datos del Instituto Nacional de Alimentación y Nutrición. Objetivo: Determinar la prevalencia de desnutrición y hábitos alimentarios en niños menores de 5 años de las comunidades indígenas de Yby Yaú y Azote’y. Materiales y Métodos: Estudio descriptivo, transversal, realizado de enero a abril del 2011, que identificó la prevalencia de desnutrición infantil en niños indígenas de las etnias Pa'i Tavyterã y Mbya Guaraní de 11 comunidades indígenas de Yby Yau y Azote’y. Fueron examinados 349 menores de 5 años de edad. Para la evaluación del estado nutricional se utilizó la curva de crecimiento de la OMS. Los niños/as fueron pesados/as en balanzas mecánicas. Para la medida de la altura, los mayores de dos años fueron medidos con el tallimetro y los menores de 2 años con cinta métrica. Resultados: Se observó desnutrición en 53 niños que equivale al 15% de la muestra. De estos 60,4% padecían de desnutrición moderada y 39,6% desnutrición grave. El mayor porcentaje de desnutrición se encontró en el grupo de edad de 0 a 24 meses con 71,6%. El 77% de los niños tenían desnutrición crónica. Conclusiones: La prevalencia de desnutrición en indígenas en Yby Yaú y Azote’y es de 15%, lo que sobrepasa los índices de desnutrición en menores de 5 años del país.

Palabras clave: Desnutrición aguda, desnutrición crónica, indígenas.

 

ABSTRACT

Introduction: Childhood is a crucial stage in the development of humans, which is why proper nutrition is essential for this stage. Child malnutrition is not just a problem of lack of food, it is rooted in deeper social problems. The prevalence of malnutrition in children under five years of age in Paraguay is 5.9%, according to the Paraguayan National Institute of Food and Nutrition. Objective: Determine the prevalence of malnutrition and the eating habits in children under five years of age in indigenous communities in the towns of Azote'y and Yby Yaú. Materials and Methods: This was a descriptive, cross-sectional study conducted from January to April 2011, which identified the prevalence of child malnutrition in indigenous children in 11 indigenous communities of the Pa'i Tavyterã and Mbya Guaraní ethnic groups in Azote'y and Yby Yaú. We examined 349 children under 5 years of age. The World Health Organization (WHO) growth charts were used to assess nutritional status. Children were weighed with mechanical scales. To measure height, children two and older were measured with a stadiometer and children younger than two were measured with tape. Results: Malnutrition was observed in 53 children (15% of the sample). Of these, 60.4% were suffering from moderate malnutrition and 39.6% from severe malnutrition. The highest percentage of malnutrition was found in the 0-24 month age group (71.6%). 77% of children had chronic malnutrition. Conclusions: The prevalence of malnutrition in indigenous children in Yby Yaú and Azote'y is 15%, which exceeds the national malnutrition rates in children under five years of age.

Keywords: Acute malnutrition, chronic malnutrition, indigenous.


]]> + 

INTRODUCCIÓN

La desnutrición es una enfermedad multisistémica, que afecta todos los órganos y sistemas del ser humano, es producida por una disminución drástica, aguda o crónica, en la disponibilidad de nutrimentos, ya sea por ingestión insuficiente, inadecuada absorción, exceso de pérdidas o la conjunción de dos o más de estos factores. Se manifiesta por grados de déficit antropométrico, signos y síntomas clínicos y alteraciones bioquímicas, hematológicas e inmunológicas (1).

La población indígena está gravemente afectada por este problema, tal vez por ser un estrato olvidado y descuidado por la población en general y por el estado paraguayo. A pesar de las leyes, y de todos los proyectos que favorecen a esta esfera de la sociedad, aún existe un abismo inimaginable entre lo ideal y lo real. Mientras se elaboran programas que buscan dar mejores condiciones de vida a estas comunidades, que la mayoría de las veces solo quedan plasmados en el papel, los índices de desnutrición son alarmantes. Esto se debe probablemente a que en la sociedad posmoderna, la deforestación, el uso de agrotóxicos, la invasión de los terratenientes despojó a los nativos de sus tierras, obligándolos a vivir en situaciones carenciales, pues estos debido a su cultura esperan que la naturaleza les ofrezca el sustento diario. Las costumbres, la economía y la religión en las etnias Paĩ Tavyterã y Mby`a Guaraní están íntimamente relacionadas a la producción alimenticia e ingesta.

Para el nativo guaraní es muy difícil comprender que el hombre es el que debe producir alimento para su sustento, pero como la sociedad actual obliga a ello, estos por no conseguir adaptarse a los cambios que se produjeron, están más expuestos a las carencias alimentarias. Según datos del gobierno central en el 2008, 41,8% de los niños indígenas menores de 5 años padecían de desnutrición.

En un estudio realizado en México, la prevalencia de desnutrición en indígenas fue 39,4%(2). Un 44% presentó uno o más signos clínicos de malnutrición. Según el Instituto Nacional de Encuestas y Censos del Ecuador (2001 y 2006) 40,1% de los niños indígenas menores de 5 años tienen desnutrición crónica (3).

En Caracas, se hizo un estudio con la población infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, y ellos obtuvieron el siguiente resultado: El diagnóstico nutricional hallado con mayor frecuencia fue Nutrición normal (55%) seguida por Desnutrición Subclínica (15%) y Desnutrición Leve (12%). En líneas generales, un 55% de la población se encontraba en rangos de nutrición normal, mientras el 45% restante presentaba problema de malnutrición comprendiendo ésta por déficit y por exceso (4).

En el Brasil, en un estudio realizado para determinar el perfil nutricional de los aborígenes Kaingáng menores de 5 años en Paraná, se vio que, al utilizar los criterios propuestos por la OMS, se registró una alta prevalencia de déficit de Estatura/Edad, con uno de cada cuatro niños (24,8%) presentando este diagnóstico. El déficit de Peso/Edad fue diagnosticado en 9,2% de los niños evaluados. Los índices de peso para la altura diagnosticaron solo a tres niños (2,1%) como desnutridos agudos (5).

En otro estudio realizado también en el Brasil, esta vez en Amazonia, con niños de la etnia Suruí se observó que los porcentajes de los niños con déficit en los índices de estatura para la edad fue 31,4%, peso para la edad 12,4% y peso para la estatura 0% (6).

El objetivo del presente estudio es determinar la prevalencia de desnutrición en niños menores de 5 años de las comunidades indígenas de Yby-Yaú y Azote’y y conocer el comportamiento alimentario de los niños/as de las comunidades indígenas estudiadas.

]]> + 

MATERIALES Y MÉTODOS

Estudio transversal, descriptivo realizado en el periodo de enero a abril del año 2011, donde se identificó la prevalencia de desnutrición infantil en niños indígenas de las etnias Paĩ Tavyterã y Mby`a Guaraní en los distritos de Yby-Yaú y Azote’y.

El tamaño muestral total fue de 370 niños, determinado a través del censo realizado por el Centro de Salud de Yby-Yaú y el Puesto de Salud de Paso Tuya. Para los fines del estudio fueron identificados 349 niños (94,3%), desde recién nacidos hasta menores de 5 años, en los distritos de Yby-Yaú y Azote'y.

Las etnias que se encuentran dentro del área de estudio son los mby`a guaraní y los paĩ tavyterã, distribuidas en las siguientes comunidades indígenas: Vy'apavẽ, Yrapey, Guyrakeha, Guyra Ñe'engatuamba, Satĩ, San Juan, Mbery'o Jaguarymi, Ka'aguy Poty Rory, Yvyra'ija, Tukambiju y Takuaritiy.

El trabajo se realizó por concentración, en los locales fijados por los líderes de las distintas comunidades. Fue aplicado un cuestionario a las madres, creado para el efecto por medio de entrevista. La edad de los niños fue dada por las madres, pues la mayoría de estas no cuentan con registro de nacimiento, ni siquiera certificado de nacido vivo.

Para la evaluación del estado nutricional de los niños se optó por la curva del gráfico de crecimiento de la Organización Mundial de la Salud (OMS) lo cual está contenido en la libreta del niño y la niña. Los niños/as fueron pesados/as en balanzas mecánicas, los que ya conseguían quedarse de pie fueron pesados en balanza de pie y los niños menores de 1 año en balanzas colgantes.

Para la medida de la altura, los niños mayores de dos años fueron colocados en posición de pie, bien rectos, y fueron medidos con el tallímetro. La talla de los niños menores de 2 años fue realizada con cinta métrica con el niño/a en decúbito supino en superficie recta.

Los datos fueron analizados manualmente, y los gráficos confeccionados con el programa Microsoft Office Excel 2007.

 

]]> +RESULTADOS

Se evaluaron 349 niños, que representan el 94,3% del total de aborígenes menores de 5 años de las comunidades de Yby-Yaú y Azote’y. Del total de 349 niños, 69% (240) son Paĩ Tavyterã y 31% (109) Mby`a Guaraní.

La comunidad con el mayor porcentaje de niños fue la de Vy'ãpavẽ (36,4%), y la de menor frecuencia fue la comunidad de Tekoha Kagãtã, que es una comunidad recién formada localizada en Pasiño (Figura 1).

 

Viendo el perfil nutricional de los niños, se pudo observar que 61% de los niños/as no están desnutridos, 24% de los niños/as están en riesgo de desnutrición y 15% están con desnutrición. Aunque se trata de un estrato social desfavorecido también se observa índice de sobrepeso y obesidad, en las comunidades de Vy'ãpavẽ e Yrapey (Figura 2).

 

]]> +Teniendo presente los gráficos de Talla/Edad la prevalencia de desnutrición crónica es bastante elevada, pues 77% de los niños padecen de desnutrición crónica. El mayor índice de desnutrición se encuentran en los primeros 24 meses de vida (Tabla 1). De los 53 niños con desnutrición, 60,4% padecen de desnutrición moderada, y el 39,6% desnutrición grave. Siendo que el mayor porcentaje de desnutrición se observa en Vy'ãpavẽ.

 

Se estudió además el comportamiento alimentario de estos niños, viendo que alimentos preferencialmente hacen parte de su dieta y la edad de introducción de los mismos, la mayoría de las madres introducen algún tipo alimento entre los 6 y 8 meses de edad (Figura 3) y los primeros alimentos introducidos dependen del lugar donde estos habitan. El caldo de pescado es uno de los primeros alimentos introducidos en las comunidades que viven cerca de los ríos, entretanto el 60% inician la alimentación con caldo de arroz y caldo de fideo.

 

Al observar la frecuencia en que se alimentan estos niños, el 64% se alimenta tres veces al día, el 20% menos de 3 veces al día y solo el 16 % más de tres veces al día.

El principal nutriente en la dieta son los carbohidratos, el 47% de los niños consumen carbohidratos más de 5 veces por semana, y el 21% menos de 3 veces por semana. El mayor porcentaje de consumo de proteínas se observa en las comunidades que se encuentran cerca de ríos (Guyra Ñe`engatuamba y Mbery'o Jaguarymi), siendo que 70% consume proteínas menos de 3 veces por semana, y solo el 3% más de cinco veces por semana. El consumo de verduras y hortalizas es muy escaso, el 91% consume verduras y hortalizas menos de 3 veces por semana, el 2% más de 5 veces y 7% entre 3 y 5 veces por semana.

]]> + 

DISCUSIÓN

A lo largo de toda la historia de la humanidad, la desnutrición ha sido una patología de las clases sociales menos privilegiadas, son los que no poseen las condiciones necesarias para tener una vida digna, donde la educación, salud, recursos económicos son miserables, donde esta dolencia alcanza su auge (7).

Según los datos del Censo realizado por la Unidad de Salud Indígena que se encuentra en el Distrito de Yby-Yaú, los Puestos de Salud de Yby-Yaú y Azote’y en el tercer trimestre del Año 2010, se encontraron 328 niños de hasta 60 meses (8). Al realizar los trabajos de campo, este número se elevó a 349 individuos, por lo que se hizo un nuevo censo solo con los niños de este grupo etario. Ese fenómeno tal vez se deba a las migraciones que se desarrollan normalmente entre los guaraní. Al observar la historia, y también por la experiencia que se adquirió durante el trabajo de campo, se pudo observar que los pueblos de la familia lingüística a la cual pertenecen los mby`a y los paĩ (la guaraní) son nómadas; es común que migren a otras comunidades, en un mismo Tekoha (9,10).

La población diana fue de 370 niños menores de 5 años de los cuales se llegó a entrevistar a las madres de 349 y se hizo las mediciones antropométricas posteriormente. En la mayoría de las comunidades indígenas se obtuvo el 100% de participación, son excepciones las comunidades de Yrapey y Takuaritiy.

Del total de niños/as, la etnia de mayor prevalencia fue la de Paĩ Tavyterã. En relación al sexo, las comunidades son bastante equilibradas, con una ligera prevalencia del sexo masculino sobre el femenino.

Según datos de la UNICEF en Paraguay se observa 3,4% de desnutrición aguda en niños menores de 5 años (11). La prevalencia de desnutrición en los niños paraguayos menores de 5 años en el área rural es de 5,9% y en el área urbana es de 4,5% (12). Existen pocas publicaciones sobre este tema en aborígenes menores de 5 años, siendo que el mayor número de publicaciones fue realizado por el Brasil (12,4%), México (39,4%) y Ecuador.

La prevalencia de desnutrición en las comunidades indígenas de Yby-Yaú y Azote’y es de 15,2%, observando los gráficos de Peso/Edad en menores de 2 años y Peso/Talla en mayores de 2 años y menores de 5 años. Las comunidades donde la desnutrición es más prevalente son Guyrakeha e Yvyra'ija; en Satĩ y Tekoha Kagatã no se encontraron niños desnutridos.

De 53 niños con desnutrición, 60,4% padecen de desnutrición moderada, y el 39,6% desnutrición grave. El grupo con mayor índice de desnutrición, se encuentra durante los primeros 24 meses, pues es en esta etapa donde el organismo requiere una mayor cantidad de nutrientes por el mayor crecimiento. Además, después de los 6 meses se inicia la introducción de otros alimentos. Estos dos factores, asociados aumentan el índice de desnutrición en este grupo de edad.

De la población total de los niños estudiados el 23,8% están con riesgo de desnutrición. Según el Instituto Nacional de Alimentación y Nutrición (INAN) en el año 2010, 13,6% de niños menores de 5 años del área urbana y 16,2% del área rural del Paraguay sufren desnutrición crónica. En una encuesta realizada por la Dirección General de Estadística, Encuestas y Censos en el año 2008, 41,8% de los niños/as indígenas menores de cinco años padecen de desnutrición crónica. Observadas las medidas de Talla/Edad el 77% de los niños padecen de desnutrición crónica. Ese dato es alarmante, porque la desnutrición crónica es consecuencia de una carencia prolongada de alimentos o enfermedades sucesivas. Tukambiju, Mbery'o Jaguarymi, Guyrakeha, Yvyra'ija y Satĩ son comunidades con una prevalencia mayor al 80% de niños/as con talla baja para la edad.

]]> +El índice de desnutrición en indígenas en los distritos de Yby-Yaú y Azote’y, sobrepasa la prevalencia general de desnutrición en menores de 5 años del país, lo cual está alrededor de 5.9% según datos del INAN.

En las comunidades indígenas se puede observar que un porcentaje razonable introduce alimentos entre los 6 meses y antes de los 9 meses. El porcentaje de los que introducen antes de los 6 meses es de 18,6% y entre los 9 meses y un año es de 27%. Se pudo observar que ocho niños tuvieron lactancia materna exclusiva por más de 1 año. Todos los niños/as con lactancia materna exclusiva en la fecha de la recolección de datos tenían 6 meses o menos. El caldo de fideo y de arroz ocupa el primer y segundo lugar respectivamente como primer alimento introducido por las madres. Los alimentos que deberían ser introducidos inicialmente como el puré de frutas y verduras ocupan un pequeño porcentaje en la lista. Otros alimentos que se tendrían que introducir después de los 9 meses, de preferencia al año, como por ejemplo el caldo de poroto, caldo de pescado, leche de vaca y huevo son los primeros alimentos que se introducen.

El 64% de los niños se alimentan tres veces al día, el 20,5% menos de tres veces y 15,5% más de tres veces al día.

El 69,5% de los niños/as de las comunidades indígenas de Yby-Yaú y Azote’y consumen proteínas menos de tres veces por semana; 27,3% consumen de tres a cinco veces por semana los diferentes tipos de proteínas, teniendo predominancia el consumo de pez. Solo 3,2% consume proteínas más de 5 veces. Las comunidades que viven cerca de bosques, ríos o arroyos son los que más consumen proteínas.

Los carbohidratos son la principal fuente de alimentación de los niños y niñas de las comunidades indígenas de Yby-Yaú y Azote’y. Eso se debe a que son los alimentos de más fácil adquisición y los más accesibles económicamente hablando.

En las comunidades indígenas el consumo de verduras y hortalizas es escaso. Las comunidades que más consumen verduras y hortalizas son Mberyo Jaguarymi y Takuaritiy.

Este trabajo refleja la realidad de las comunidades indígenas de los dos distritos observados, no podemos extrapolar estas mismas cifras en el departamento de Concepción, o en todo el país por el tamaño de la muestra, es necesario hacer nuevos estudios con un tamaño muestral mayor para obtener una visión del verdadero estado nutricional de los niños indígenas. El porcentaje de desnutrición es alto, pero se trata de distritos con no muchos recursos económicos, donde la pobreza es una realidad aún en otros estratos sociales.

La realidad indígena es un problema real, y una manera de reducir estas cifras es enseñándoles a producir su propio alimento. Para ello no debemos luchar con su cultura ni intentar hacerlos ver el mundo a través de nuestra realidad, sino dentro de sus costumbres encontrar formas de que ellos tengan condiciones de un mejor porvenir.

 

AGRADECIMIENTOS

]]> +A las comunidades indígenas que participaron en nuestro estudio, los profesionales de blanco del Centro de Salud de Yby-Yau y Azote’y, a la Comunidad de Hermanas de la Divina Providencia de Yby-Yau, a la Dra. Blanca Villalba y a la Dra. Gloria Martínez.

 

REFERENCIAS

1. Monteiro CA. Fome, desnutrição e pobreza: além da semântica. Saúde Soc. 2003;12(1):7-11.         [ Links ]

2. Viñas MR, Frías ML, Verdú JM. Entorno social y desnutrición en niños de 1 a 4 años de comunidades indígenas de México. Rev Esp Nutr Comunitaria. 2005;11(3):128-34.         [ Links ]

3. INEC. Ecuador: 40,1% de indígenas con desnutrición crónica. Ecuador: Estudio del INEC; 2009.         [ Links ]

4. Chumpitaz D, Russo A, Del NogaL B, Case C, Lares M. Evaluación nutricional de la población infantil warao en la comunidad de Yakariyene, estado Delta Amacuro, agosto-octubre 2004. AVFT. 2006;25(1):26-31.         [ Links ]

5. Kuhl AM, Tittoni C, Leite MS, Bastos JL. Perfil Nutricional e fatores associados à ocorrência de desnutrição entre crianças indígenas Kaingáng da Terra Indígena de Mangueirinha, Paraná, Brasil. Cad Saúde Pública. 2009;25(2):409-420.         [ Links ]

6. Orellana JD, Coimbra Jr. CE, Lourenço AE, Santos RV. Estado nutricional e anemia en crianças Suruí, Amazônia, Brasil. J Pediatr (Rio J). 2006;82(5):383-88.         [ Links ]

7. Organización de las Naciones Unidas. Foro permanente para las cuestiones indígenas: informe sobre el quinto período de sesiones (15 a 26 de mayo de 2006). Nueva York: Naciones Unidas; 2006.         [ Links ]

8. Centro de Salud de Yby-Yau. Censo local de las comunidades indígenas. Yby-Yau; 2010.         [ Links ]

9. Chase-Sardi M, Brun A, Enciso MA. Situación sociocultural, económica, jurídico-político actual de las comunidades indígenas del Paraguay. Asunción: UCA; 1989.         [ Links ]

10. Melià B, Grünberg G, Grünberg F. Paĩ-Tavyterã: etnografía guaraní del Paraguay contemporáneo. 2da. ed. Asunción: Centro de Estudios Antropológicos de la Universidad Católica; 2008.         [ Links ]

11. FAO. Panorama de la seguridad alimentaria y nutricional en América Latina y el Caribe 2013. FAO; 2014.         [ Links ]

12. Masi C, Sánchez Bernal S, Dallman D, Rodas A, Morinigo G, Mendoza L. Perfil nutricional de niños menores de 5 años que acuden a servicios públicos de salud en el Paraguay. Asunción: INAN; 2010.         [ Links ]

]]> + + + + + + + + + + + +2003 +12 +1 +1 +7-11 + + + + + + + + + + + + + + + + + + + +2005 +11 +3 +3 +128-34 + + + +INEC + +2009 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +2006 +25 +1 +1 +26-31 + + + + + + + + + + + + + + + + + + + + + + + +2009 +25 +2 +2 +409-420 + + + + + + + + + + + + + + + + + + + + + + + +2006 +82 +5 +5 +383-88 + + + +Organizacin de las Naciones Unidas + +2006 + + + + + + +Centro de Salud de Yby-Yau + +2010 + + + + + + + + + + + + + + + + + + + + +1989 + + + + + + + + + + + + + + + + + + + + + +2008 + + + + + + +FAO + +2014 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +2010 + + + + + + +
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py new file mode 100644 index 0000000..a996c56 --- /dev/null +++ b/python/tests/test_xml.py @@ -0,0 +1,18 @@ + +import pytest + +from sandcrawler.xml import xml_reserialize + + +def test_xml_reserialize() -> None: + + with open('tests/files/scielo_article.jats.xml', 'rb') as f: + raw_xml = f.read() + + assert b'encoding="ISO-8859-1"' in raw_xml + raw_xml.decode("ISO-8859-1") + with pytest.raises(UnicodeDecodeError): + raw_xml.decode("utf-8") + + str_xml = xml_reserialize(raw_xml) + assert 'encoding="UTF-8"' in str_xml -- cgit v1.2.3