Skip to content

Commit

Permalink
Adding SQL Konferenz 2017 Keynote example scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
MikeRys committed May 3, 2018
1 parent cde2358 commit 9eb8703
Show file tree
Hide file tree
Showing 13 changed files with 390 additions and 0 deletions.
34 changes: 34 additions & 0 deletions Examples/SQLKonferenz2017-Keynote/SQLKonferenz2017-Keynote.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
VisualStudioVersion = 12.0.31101.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{182E2583-ECAD-465B-BB50-91101D7C24CE}") = "SQLKonferenz2017-Keynote", "SQLKonferenz2017-Keynote\SQLKonferenz2017-Keynote.usqlproj", "{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|x64.ActiveCfg = Debug|x64
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|x64.Build.0 = Debug|x64
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|x86.ActiveCfg = Debug|x86
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|x86.Build.0 = Debug|x86
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|Any CPU.Build.0 = Release|Any CPU
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|x64.ActiveCfg = Release|x64
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|x64.Build.0 = Release|x64
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|x86.ActiveCfg = Release|x86
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|x86.Build.0 = Release|x86
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Extract tweet records (date, time, author, tweet text) from a single
// sample CSV file and write them back out unchanged as CSV.
// All columns are extracted as string; no schema conversion is applied.
@data = EXTRACT date string,
time string,
author string,
tweet string
FROM "/Samples/Data/Tweets/MikeDoesBigDataTweets.csv"
USING Extractors.Csv();

// Round-trip the rowset to the output location.
OUTPUT @data TO "/Output/sqlkonf.csv" USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
using Microsoft.Analytics.Interfaces;
using Microsoft.Analytics.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Extract tweets from ALL files matching the file-set pattern
// "{origin}Tweets.csv". The {origin} token is a virtual column: it is
// populated from the matched part of each file name, so every row also
// records which file it came from.
@data = EXTRACT date string,
time string,
author string,
tweet string,
origin string
FROM "/Samples/Data/Tweets/{origin}Tweets.csv"
USING Extractors.Csv();

// Write the combined rowset (including the file-origin column) as CSV.
OUTPUT @data TO "/Output/sqlkonf.csv" USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// (Re)create the managed U-SQL table TweetData and populate it from the
// tweet file set. The origin column aggregates all source-file origins for
// a given (date, time, author, tweet) into a U-SQL array.
DROP TABLE IF EXISTS TweetData;
CREATE TABLE TweetData (
date string,
time string,
author string,
tweet string,
// One tweet can appear in several source files; keep all origins.
origin SqlArray<string>,
// Clustered index and hash distribution on author for author-centric queries.
INDEX idx CLUSTERED (author)
DISTRIBUTED BY HASH(author) INTO 2
);

// Extract from all files matching the pattern; {origin} is a virtual
// column filled from the matched file-name fragment.
@data = EXTRACT date string,
time string,
author string,
tweet string,
origin string
FROM "/Samples/Data/Tweets/{origin}Tweets.csv"
USING Extractors.Csv();

INSERT INTO TweetData
// Collapse duplicate tweets across files, collecting their origins
// (lower-cased) into a single array per tweet.
SELECT date, time, author, tweet, ARRAY_AGG(origin.ToLowerInvariant()) AS origin
FROM @data

// NOTE(review): only tweets whose text ends with "stop" are loaded —
// presumably a demo/sentinel filter; confirm this is intended.
WHERE tweet.EndsWith("stop")
GROUP BY date, time, author, tweet;
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

// Count, per twitter handle, how often it appears as a tweet author versus
// how often it is mentioned inside other tweets.
// Requires the TweetData table and the TweetAnalysis assembly (get_mentions UDF).

// Extract the array of @-mentions from every tweet via the C# UDF.
@m = SELECT TweetAnalysis.Udfs.get_mentions(tweet) AS mentions
FROM TweetData;

// Flatten the mention arrays to one row per mention, strip the leading '@',
// and tag each row with the "mention" category. Bare "@" tokens are dropped.
@m = SELECT m.Substring(1) AS m
, "mention" AS category
FROM @m CROSS APPLY EXPLODE(mentions) AS t(m)
WHERE m != "@";

// Union the authors (tagged "author") with the flattened mentions.
@t =
SELECT author, "author" AS category
FROM TweetData
UNION ALL
SELECT *
FROM @m;

// Count tweets per (handle, category), normalizing handles to lower case.
@res = SELECT author.ToLowerInvariant() AS author
, category
, COUNT( * ) AS tweetcount
FROM @t
GROUP BY author.ToLowerInvariant(), category;

// Persist the counts, most active handles first.
OUTPUT @res
TO "/Output/TweetAnalysis/MyTwitterAnalysis.csv"
ORDER BY tweetcount DESC
USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using Microsoft.Analytics.Interfaces;
using Microsoft.Analytics.Types.Sql;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Linq;

// TweetAnalysis Code Behind
// Show the use of a U-SQL user-defined function (UDF)
//
namespace TweetAnalysis
{
    // TweetAnalysis code-behind: U-SQL user-defined functions (UDFs).
    public class Udfs
    {
        // SqlArray<string> get_mentions(string tweet)
        //
        // Returns a U-SQL array of strings containing the twitter handles
        // (tokens starting with '@') that were mentioned inside the tweet.
        // Tokens are produced by splitting on common whitespace/punctuation.
        //
        // A null tweet (possible when the CSV extractor yields a missing
        // field) is treated as empty instead of throwing a
        // NullReferenceException from Split.
        public static SqlArray<string> get_mentions(string tweet)
        {
            return new SqlArray<string>(
                (tweet ?? string.Empty)
                    .Split(new char[] { ' ', ',', '.', ':', '!', ';', '"', '“' })
                    .Where(x => x.StartsWith("@"))
            );
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Requires 4-USQL-ObjectModel.usql to be executed to create the TVF

// Analyzes cooked TweetAuthorsAndMentions
//
// Does some analytics using windowing expressions.
//
// 1. Shows use of windowing expressions
// 2. Constant table with VALUES
// 3. ARRAY Contains
// 4. Joins

// For each author and category with tweetcount > 50, get their tweetcount, the median tweetcount in the category, their percentile and absolute rank in rank order
@res =
SELECT DISTINCT
author, category, tweetcount
// Median tweet count per handle within each category.
, PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY tweetcount ASC)
OVER (PARTITION BY category) AS median_tweetcount_perhandle_category
// Relative position (0..1) of this handle within its category.
, PERCENT_RANK() OVER
(PARTITION BY category ORDER BY tweetcount ASC) AS relative_rank
// Absolute rank: 1 = most tweets in the category.
, ROW_NUMBER() OVER
(PARTITION BY category ORDER BY tweetcount DESC) AS absolute_rank
FROM TweetAndMentionsTVF(DEFAULT) AS t
WHERE tweetcount >= 50;

OUTPUT @res
TO "/Output/TweetAnalysis/MyTwitterAnalysis6.csv"
ORDER BY absolute_rank, category ASC
USING Outputters.Csv();

// For each author who provided their tweet feed (to be fair), provide their influence measured as mentioned/authored
// Account for changes to tweet handles with a constant lookup table
// (could also be done as a standard table, or as a file that gets deployed as resource and looked up with a UDF)

// Constant (inline VALUES) lookup table mapping old handles to new ones.
@tweet_handle_mapping = SELECT * FROM (VALUES ("sqlservermike","mikedoesbigdata")) AS T(old_handle, new_handle);

// Normalize handles: use the new handle where a mapping exists, otherwise
// keep the original author (?? is the C# null-coalescing operator).
@t =
SELECT n.new_handle ?? t.author AS author,
category,
tweetcount,
file_origin
FROM TweetAndMentionsTVF(DEFAULT) AS t
LEFT OUTER JOIN
@tweet_handle_mapping AS n
ON t.author == n.old_handle;

// Keep only handles whose own tweet feed was provided (their handle appears
// in one of the source-file origins), then re-aggregate after renaming.
@t =
SELECT author,
category,
SUM(tweetcount) AS tweetcount
FROM @t
WHERE file_origin != null && file_origin.Contains(author)
GROUP BY author,
category;

// Self-join authored counts against mentioned counts to compute influence
// = mentions / authored tweets per handle.
@res =
SELECT m.author,
a.tweetcount AS authored_count,
m.tweetcount AS mentioned_count,
(double) m.tweetcount / (double) a.tweetcount AS influence
FROM @t AS a
JOIN
@t AS m
ON a.author == m.author
WHERE a.category == "author" AND m.category == "mention";

OUTPUT @res
TO "/Output/TweetAnalysis/influencer.csv"
ORDER BY influence DESC
USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Demonstrates OUTER UNION BY NAME: rows are matched by column NAME (not
// position); columns listed in ON must exist on both sides, all other
// columns are kept and null-padded where the other side has no value.

// Left side: columns K, A, C (C is a nullable int).
@left =
SELECT *
FROM (VALUES ( 1, "x", (int?) 50 ),
( 1, "y", (int?) 60 )
) AS L(K, A, C);

// Right side: columns B, A, K — note the different column order and the
// extra column B instead of C.
@right =
SELECT *
FROM (VALUES ( 5, "x", 1 ),
( 6, "x", 2 ),
(10, "y", 3 )
) AS R(B, A, K);

// Union on the shared columns A and K; C and B survive null-padded.
@res =
SELECT * FROM @left
OUTER UNION BY NAME ON (A, K)
SELECT * FROM @right;

OUTPUT @res TO "/output/docsamples/outerunion.csv" USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Table-valued function returning per-handle author/mention counts from the
// TweetData table, optionally filtered to a single file origin.
// @origin = null (the DEFAULT) means: include all origins.
DROP FUNCTION IF EXISTS TweetAndMentionsTVF;

CREATE FUNCTION TweetAndMentionsTVF(@origin string = null)
RETURNS @res
AS
BEGIN

// In order to see the user-code inside the TVF, you have to reference the assembly inside the TVF body.
REFERENCE ASSEMBLY TweetAnalysis; // Generated from TweetAnalysis Example

// Extract mentions
// get_mentions returns the array of @-handles found in the tweet text;
// mentioned_by records who wrote the mentioning tweet.
@m =
SELECT origin,
TweetAnalysis.Udfs.get_mentions(tweet) AS mentions,
author AS mentioned_by
FROM TweetData
WHERE String.IsNullOrEmpty(@origin) || origin.Contains(@origin);

// Combine authors and mentions
// Authors get an empty mentioned_by so both branches have the same schema.
@t =
SELECT origin,
author,
"author" AS category,
"" AS mentioned_by
FROM TweetData
UNION ALL
// Flatten the mention arrays, strip the leading '@', drop bare "@" tokens.
SELECT origin,
m.Substring(1) AS m,
"mention" AS category,
mentioned_by
FROM @m
CROSS APPLY
EXPLODE(mentions) AS t(m)
WHERE m != "@";

// Count authors and mentions
// file_origin flattens the per-row origin arrays into one distinct list;
// mentioned_by collects the distinct set of mentioning authors.
@res =
SELECT author.ToLowerInvariant() AS author,
category,
COUNT( * ) AS tweetcount,
new SQL.ARRAY<string>(ARRAY_AGG(origin).SelectMany(x => x).Distinct()) AS file_origin,
ARRAY_AGG(DISTINCT mentioned_by) AS mentioned_by
FROM @t
GROUP BY author.ToLowerInvariant(),
category;
END;
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Merge overlapping time ranges per user with a custom C# reducer
// (ReduceSample.RangeReducer in the code-behind).
// Input format: begin-end-user, '-' delimited text.
@in = EXTRACT begin DateTime, end DateTime, user string
FROM "/Samples/Blogs/MRys/Ranges/ranges.txt"
USING Extractors.Text(delimiter:'-');

// PRESORT begin: the reducer requires rows ordered by interval start.
// READONLY user: the reduce key passes through unchanged, which lets the
// optimizer push predicates on user through the reducer.
@r = REDUCE @in PRESORT begin
ON user
PRODUCE begin DateTime, end DateTime, user string
READONLY user
USING new ReduceSample.RangeReducer();

// Remove comment to show filter push through the reducer
// @r = SELECT * FROM @r WHERE user == "ABC";

OUTPUT @r
TO "/temp/result.csv"
USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
using Microsoft.Analytics.Interfaces;
using Microsoft.Analytics.Types.Sql;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ReduceSample
{
    // Merges overlapping [begin, end] time ranges into maximal
    // non-overlapping intervals, one group (user) at a time.
    //
    // Requires the rowset to be PRESORTED on begin and the reduce key to be
    // READONLY, as done by the accompanying U-SQL script.
    //
    // NOTE(review): IsRecursive = true claims the reducer can be applied to
    // partial input and re-combined; the original author noted this was
    // untested with large data sets — verify before relying on it.
    [SqlUserDefinedReducer(IsRecursive = true)]
    public class RangeReducer : IReducer
    {
        public override IEnumerable<IRow> Reduce(IRowset input, IUpdatableRow output)
        {
            // Current open interval; only meaningful once the first row is seen.
            bool first_row_processed = false;
            var begin = DateTime.MaxValue; // dummy value until first row
            var end = DateTime.MinValue;   // dummy value until first row

            foreach (var row in input.Rows)
            {
                var b = row.Get<DateTime>("begin");
                var e = row.Get<DateTime>("end");
                // If end is just a time (no date) it can sort earlier than
                // begin, meaning the range wraps past midnight: move end to
                // the next day.
                if (e < b) { e = e.AddDays(1); }

                if (!first_row_processed)
                {
                    // Initialize the open interval from the first row.
                    first_row_processed = true;
                    begin = b;
                    end = e;
                }
                else if (b <= end)
                {
                    // Row overlaps the open interval; extend it if it
                    // reaches further.
                    if (e > end) { end = e; }
                }
                else
                {
                    // Gap found: emit the completed interval and start a
                    // new one.
                    output.Set<DateTime>("begin", begin);
                    output.Set<DateTime>("end", end);
                    yield return output.AsReadOnly();
                    begin = b;
                    end = e;
                }
            } // foreach

            // Emit the last open interval — but only if any input arrived.
            // (The original unconditionally emitted a bogus row with the
            // MaxValue/MinValue dummies when the rowset was empty.)
            if (first_row_processed)
            {
                output.Set<DateTime>("begin", begin);
                output.Set<DateTime>("end", end);
                yield return output.AsReadOnly();
            }
        } // Reduce

    } // RangeReducer
} // ReduceSample
Loading

0 comments on commit 9eb8703

Please sign in to comment.