-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Vincent-Philippe Lauzon
authored and
Vincent-Philippe Lauzon
committed
Jan 13, 2018
1 parent
32b1074
commit 434dd24
Showing
8 changed files
with
99,898 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"cells":[{"cell_type":"code","source":["# Replace with your container and storage account: \"wasbs://<container>@<storage account>.blob.core.windows.net/\"\npathPrefix = \"wasbs://[email protected]/\"\n# Fetch porgat.txt from storage account\nfile = sc.textFile(pathPrefix + \"porgat.txt\")"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["# Remove the headers from the file: lines starting with a star\nnoHeaders = file.filter(lambda x: len(x)>0 and x[0]!='*')\n# Extract a pair from each line: the leading integer and a string for the rest of the line\npaired = noHeaders.map(lambda l: l.partition(' ')).filter(lambda t: len(t)==3 and len(t[0])>0 and len(t[2])>0).map(lambda t: (int(t[0]), t[2]))\n# Keep only the relationships (their text does not start with a quote), then split the integer list\nscatteredRelationships = paired.filter(lambda (charId, text): text[0]!='\"').map(lambda (charId, text): (charId, [int(x) for x in text.split(' ')]))\n# Relationships for the same character id sometimes span more than one line in the file, so let's group them together\nrelationships = scatteredRelationships.reduceByKey(lambda pubList1, pubList2: pubList1 + pubList2)\n# Keep only the non-relationships (their text starts with a quote); remove the quotes\nnonRelationships = paired.filter(lambda (index, text): text[0]=='\"').map(lambda (index, text): (index, text[1:-1].strip()))\n# Characters stop at a certain line (part of the initial header; we hardcode it here)\ncharacters = nonRelationships.filter(lambda (charId, name): charId<=6486)\n# Publications start after the characters\npublications = nonRelationships.filter(lambda (charId, name): charId>6486)"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["from pyspark.sql import Row\n\n# Let's create dataframes out of the RDDs and register them as temporary views for SQL to use\n\n# Relationships has a list as a component, let's flatten that\nflatRelationships = relationships.flatMap(lambda (charId, 
pubList): [(charId, pubId) for pubId in pubList])\n# Let's map the relationships to an RDD of rows in order to create a data frame out of it\nrelationshipsDf = spark.createDataFrame(flatRelationships.map(lambda t: Row(charId=t[0], pubId=t[1])))\n# Register relationships as a temporary view\nrelationshipsDf.createOrReplaceTempView(\"relationships\")\n\n# Let's do the same for characters\ncharactersDf = spark.createDataFrame(characters.map(lambda t: Row(charId=t[0], name=t[1])))\ncharactersDf.createOrReplaceTempView(\"characters\")\n\n# and for publications\npublicationsDf = spark.createDataFrame(publications.map(lambda t: Row(pubId=t[0], name=t[1])))\npublicationsDf.createOrReplaceTempView(\"publications\")\n"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["%sql\n\nSELECT c1.name AS name1, c2.name AS name2, sub.charId1, sub.charId2, sub.pubCount\nFROM\n(\n SELECT r1.charId AS charId1, r2.charId AS charId2, COUNT(r1.pubId, r2.pubId) AS pubCount\n FROM relationships AS r1\n CROSS JOIN relationships AS r2\n WHERE r1.charId < r2.charId\n AND r1.pubId=r2.pubId\n GROUP BY r1.charId, r2.charId\n) AS sub\nINNER JOIN characters c1 ON c1.charId=sub.charId1\nINNER JOIN characters c2 ON c2.charId=sub.charId2\nORDER BY sub.pubCount DESC\nLIMIT 10"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["%sql\n\nSELECT c1.name AS name1, c2.name AS name2, c3.name AS name3, sub.charId1, sub.charId2, sub.charId3, sub.pubCount\nFROM\n(\n SELECT r1.charId AS charId1, r2.charId AS charId2, r3.charId AS charId3, COUNT(r1.pubId, r2.pubId, r3.pubId) AS pubCount\n FROM relationships AS r1\n CROSS JOIN relationships AS r2\n CROSS JOIN relationships AS r3\n WHERE r1.charId < r2.charId\n AND r2.charId < r3.charId\n AND r1.pubId=r2.pubId\n AND r2.pubId=r3.pubId\n GROUP BY r1.charId, r2.charId, r3.charId\n) AS sub\nINNER JOIN characters c1 ON c1.charId=sub.charId1\nINNER JOIN characters c2 ON c2.charId=sub.charId2\nINNER JOIN characters c3 
ON c3.charId=sub.charId3\nORDER BY sub.pubCount DESC\nLIMIT 10"],"metadata":{},"outputs":[],"execution_count":5}],"metadata":{"name":"SQL Blog","notebookId":3185064906819836},"nbformat":4,"nbformat_minor":0} |
Oops, something went wrong.