about summary refs log tree commit diff stats
path: root/scratch
diff options
context:
space:
mode:
Diffstat (limited to 'scratch')
-rw-r--r-- scratch/.gitignore 2
-rw-r--r-- scratch/HelloWorld.ipynb 362
-rw-r--r-- scratch/README.md 10
3 files changed, 0 insertions, 374 deletions
diff --git a/scratch/.gitignore b/scratch/.gitignore
deleted file mode 100644
index 01e0dc1..0000000
--- a/scratch/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.env
-.ipynb_checkpoints/
diff --git a/scratch/HelloWorld.ipynb b/scratch/HelloWorld.ipynb
deleted file mode 100644
index e382aa4..0000000
--- a/scratch/HelloWorld.ipynb
+++ /dev/null
@@ -1,362 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "6974daac",
- "metadata": {},
- "outputs": [],
- "source": [
- "from pyspark.sql import SparkSession\n",
- "from pyspark.context import SparkContext"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "e72fbe47",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "pyspark.sql.session.SparkSession"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "SparkSession"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "9333e07a",
- "metadata": {},
- "outputs": [],
- "source": [
- "spark = SparkSession.builder.master(\"local\").appName(\"HelloWorld\").getOrCreate()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "adf79021",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " <div>\n",
- " <p><b>SparkSession - hive</b></p>\n",
- " \n",
- " <div>\n",
- " <p><b>SparkContext</b></p>\n",
- "\n",
- " <p><a href=\"http://192.168.179.73:4040\">Spark UI</a></p>\n",
- "\n",
- " <dl>\n",
- " <dt>Version</dt>\n",
- " <dd><code>v3.1.1</code></dd>\n",
- " <dt>Master</dt>\n",
- " <dd><code>local[*]</code></dd>\n",
- " <dt>AppName</dt>\n",
- " <dd><code>PySparkShell</code></dd>\n",
- " </dl>\n",
- " </div>\n",
- " \n",
- " </div>\n",
- " "
- ],
- "text/plain": [
- "<pyspark.sql.session.SparkSession at 0x7f63b4051650>"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "324b74ca",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'3.1.1'"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.version"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "856f2700",
- "metadata": {},
- "outputs": [],
- "source": [
- "sc = SparkContext.getOrCreate()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "7218a07f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " <div>\n",
- " <p><b>SparkContext</b></p>\n",
- "\n",
- " <p><a href=\"http://192.168.179.73:4040\">Spark UI</a></p>\n",
- "\n",
- " <dl>\n",
- " <dt>Version</dt>\n",
- " <dd><code>v3.1.1</code></dd>\n",
- " <dt>Master</dt>\n",
- " <dd><code>local[*]</code></dd>\n",
- " <dt>AppName</dt>\n",
- " <dd><code>PySparkShell</code></dd>\n",
- " </dl>\n",
- " </div>\n",
- " "
- ],
- "text/plain": [
- "<SparkContext master=local[*] appName=PySparkShell>"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sc"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "17114da3",
- "metadata": {},
- "outputs": [],
- "source": [
- "txt = sc.textFile('file:////usr/share/doc/python3/copyright')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "c1af9b77",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "file:////usr/share/doc/python3/copyright MapPartitionsRDD[4] at textFile at NativeMethodAccessorImpl.java:0"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "txt"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "ee37564c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "pyspark.rdd.RDD"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "type(txt)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "208ef97a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "319"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "txt.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "06f807b8",
- "metadata": {},
- "outputs": [],
- "source": [
- "python_lines = txt.filter(lambda line: 'python' in line.lower())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "ff2835b7",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "52\n"
- ]
- }
- ],
- "source": [
- "print(python_lines.count())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "9c8ec8c7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['This is the Debian GNU/Linux prepackaged version of the Python programming',\n",
- " 'language. Python was written by Guido van Rossum <guido@cwi.nl> and others.',\n",
- " 'sources from ftp.python.org:/pub/python, based on the Debianization by',\n",
- " 'Python was created in the early 1990s by Guido van Rossum at Stichting',\n",
- " \"as a successor of a language called ABC. Guido remains Python's\",\n",
- " 'In 1995, Guido continued his work on Python at the Corporation for',\n",
- " 'In May 2000, Guido and the Python core development team moved to',\n",
- " 'BeOpen.com to form the BeOpen PythonLabs team. In October of the same',\n",
- " 'year, the PythonLabs team moved to Digital Creations (now Zope',\n",
- " 'Corporation, see http://www.zope.com). In 2001, the Python Software',\n",
- " 'Foundation (PSF, see http://www.python.org/psf/) was formed, a',\n",
- " 'non-profit organization created specifically to own Python-related',\n",
- " 'All Python releases are Open Source (see http://www.opensource.org for',\n",
- " 'the Open Source Definition). Historically, most, but not all, Python',\n",
- " \"(1) GPL-compatible doesn't mean that we're distributing Python under\",\n",
- " ' the GPL. All Python licenses, unlike the GPL, let you distribute',\n",
- " ' GPL-compatible licenses make it possible to combine Python with',\n",
- " 'B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON',\n",
- " 'PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2',\n",
- " '1. This LICENSE AGREEMENT is between the Python Software Foundation',\n",
- " 'otherwise using this software (\"Python\") in source or binary form and',\n",
- " 'distribute, and otherwise use Python alone or in any derivative version,',\n",
- " 'Python Software Foundation; All Rights Reserved\" are retained in Python alone or',\n",
- " 'or incorporates Python or any part thereof, and wants to make',\n",
- " 'the changes made to Python.',\n",
- " '4. PSF is making Python available to Licensee on an \"AS IS\"',\n",
- " 'FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT',\n",
- " '5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON',\n",
- " 'A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,',\n",
- " '8. By copying, installing or otherwise using Python, Licensee',\n",
- " 'BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0',\n",
- " 'BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1',\n",
- " '2. Subject to the terms and conditions of this BeOpen Python License',\n",
- " 'provided, however, that the BeOpen Python License is retained in the',\n",
- " 'third party. As an exception, the \"BeOpen Python\" logos available at',\n",
- " 'http://www.pythonlabs.com/logos.html may be used according to the',\n",
- " 'CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1',\n",
- " '(\"Licensee\") accessing and otherwise using Python 1.6.1 software in',\n",
- " 'prepare derivative works, distribute, and otherwise use Python 1.6.1',\n",
- " 'Reserved\" are retained in Python 1.6.1 alone or in any derivative',\n",
- " 'quotes): \"Python 1.6.1 is made available subject to the terms and',\n",
- " 'Python 1.6.1 may be located on the Internet using the following',\n",
- " 'or incorporates Python 1.6.1 or any part thereof, and wants to make',\n",
- " 'the changes made to Python 1.6.1.',\n",
- " '4. CNRI is making Python 1.6.1 available to Licensee on an \"AS IS\"',\n",
- " 'FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT',\n",
- " '5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON',\n",
- " 'A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1,',\n",
- " 'on Python 1.6.1 that incorporate non-separable material that was',\n",
- " 'installing or otherwise using Python 1.6.1, Licensee agrees to be',\n",
- " 'CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2',\n",
- " 'py3compile, py3clean and debpython module:']"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "python_lines.collect()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9c60ff17",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "cgraph",
- "language": "python",
- "name": "cgraph"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/scratch/README.md b/scratch/README.md
deleted file mode 100644
index 4c3fa65..0000000
--- a/scratch/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# PySpark Test Run
-
-* 2020-04-02
-
-Goal: We want to understand which URLs of the citation corpus have been
-preserved. We also want the GWB URL, if possible. We'll try pyspark.
-
-Our cluster runs Hadoop 2.6, so we'll try:
-
- $ PYSPARK_HADOOP_VERSION=2.7 pip install pyspark