Skip to content

Commit

Permalink
2 blogs
Browse files Browse the repository at this point in the history
  • Loading branch information
Vincent-Philippe Lauzon authored and Vincent-Philippe Lauzon committed Jan 13, 2018
1 parent 32b1074 commit 434dd24
Show file tree
Hide file tree
Showing 8 changed files with 99,898 additions and 0 deletions.
30,520 changes: 30,520 additions & 0 deletions dataframes/porgat.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions dataframes/sql-notebook.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","source":["# Replace with your container and storage account: \"wasbs://<container>@<storage account>.blob.core.windows.net/\"\npathPrefix = \"wasbs://<container>@<storage account>.blob.core.windows.net/\"\n# Fetch porgat.txt from storage account\nfile = sc.textFile(pathPrefix + \"porgat.txt\")"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["# Remove the headers from the file: lines starting with a star\nnoHeaders = file.filter(lambda x: len(x)>0 and x[0]!='*')\n# Extract a pair from each line: the leading integer and a string for the rest of the line\npaired = noHeaders.map(lambda l: l.partition(' ')).filter(lambda t: len(t)==3 and len(t[0])>0 and len(t[2])>0).map(lambda t: (int(t[0]), t[2]))\n# Filter relationships as they do not start with quotes, then split the integer list\nscatteredRelationships = paired.filter(lambda (charId, text): text[0]!='\"').map(lambda (charId, text): (charId, [int(x) for x in text.split(' ')]))\n# Relationships for the same character id sometimes span more than one line in the file, so let's group them together\nrelationships = scatteredRelationships.reduceByKey(lambda pubList1, pubList2: pubList1 + pubList2)\n# Filter non-relationships as they start with quotes; remove the quotes\nnonRelationships = paired.filter(lambda (index, text): text[0]=='\"').map(lambda (index, text): (index, text[1:-1].strip()))\n# Characters stop at a certain line (part of the initial header; we hardcode it here)\ncharacters = nonRelationships.filter(lambda (charId, name): charId<=6486)\n# Publications start after the characters\npublications = nonRelationships.filter(lambda (charId, name): charId>6486)"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["from pyspark.sql import Row\n\n# Let's create dataframes out of the RDDs and register them as temporary views for SQL to use\n\n# Relationships have a list as a component, let's flatten that\nflatRelationships = relationships.flatMap(lambda (charId, 
pubList): [(charId, pubId) for pubId in pubList])\n# Let's map the relationships to an RDD of rows in order to create a data frame out of it\nrelationshipsDf = spark.createDataFrame(flatRelationships.map(lambda t: Row(charId=t[0], pubId=t[1])))\n# Register relationships as a temporary view\nrelationshipsDf.createOrReplaceTempView(\"relationships\")\n\n# Let's do the same for characters\ncharactersDf = spark.createDataFrame(characters.map(lambda t: Row(charId=t[0], name=t[1])))\ncharactersDf.createOrReplaceTempView(\"characters\")\n\n# and for publications\npublicationsDf = spark.createDataFrame(publications.map(lambda t: Row(pubId=t[0], name=t[1])))\npublicationsDf.createOrReplaceTempView(\"publications\")\n"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["%sql\n\nSELECT c1.name AS name1, c2.name AS name2, sub.charId1, sub.charId2, sub.pubCount\nFROM\n(\n SELECT r1.charId AS charId1, r2.charId AS charId2, COUNT(r1.pubId, r2.pubId) AS pubCount\n FROM relationships AS r1\n CROSS JOIN relationships AS r2\n WHERE r1.charId < r2.charId\n AND r1.pubId=r2.pubId\n GROUP BY r1.charId, r2.charId\n) AS sub\nINNER JOIN characters c1 ON c1.charId=sub.charId1\nINNER JOIN characters c2 ON c2.charId=sub.charId2\nORDER BY sub.pubCount DESC\nLIMIT 10"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["%sql\n\nSELECT c1.name AS name1, c2.name AS name2, c3.name AS name3, sub.charId1, sub.charId2, sub.charId3, sub.pubCount\nFROM\n(\n SELECT r1.charId AS charId1, r2.charId AS charId2, r3.charId AS charId3, COUNT(r1.pubId, r2.pubId, r3.pubId) AS pubCount\n FROM relationships AS r1\n CROSS JOIN relationships AS r2\n CROSS JOIN relationships AS r3\n WHERE r1.charId < r2.charId\n AND r2.charId < r3.charId\n AND r1.pubId=r2.pubId\n AND r2.pubId=r3.pubId\n GROUP BY r1.charId, r2.charId, r3.charId\n) AS sub\nINNER JOIN characters c1 ON c1.charId=sub.charId1\nINNER JOIN characters c2 ON c2.charId=sub.charId2\nINNER JOIN characters c3 
ON c3.charId=sub.charId3\nORDER BY sub.pubCount DESC\nLIMIT 10"],"metadata":{},"outputs":[],"execution_count":5}],"metadata":{"name":"SQL Blog","notebookId":3185064906819836},"nbformat":4,"nbformat_minor":0}
Loading

0 comments on commit 434dd24

Please sign in to comment.