[FLINK-3907] [gelly] Directed Clustering Coefficient
This closes apache#2079
greghogan authored and EC2 Default User committed Jun 24, 2016
1 parent d92aeb7 commit d34bdaf
Showing 35 changed files with 2,107 additions and 262 deletions.
10 changes: 6 additions & 4 deletions docs/apis/batch/libs/gelly.md
@@ -2112,8 +2112,9 @@ divided by the number of potential edges between neighbors.
See the [Triangle Enumeration](#triangle-enumeration) library method for a detailed explanation of triangle enumeration.

#### Usage
The algorithm takes a simple, undirected graph as input and outputs a `DataSet` of tuples containing the vertex ID,
vertex degree, and number of triangles containing the vertex. The graph ID type must be `Comparable` and `Copyable`.
Directed and undirected variants are provided. The algorithms take a simple graph as input and output a `DataSet` of
tuples containing the vertex ID, vertex degree, and number of triangles containing the vertex. The graph ID type must be
`Comparable` and `Copyable`.
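
A minimal sketch of running the directed variant, following the example driver added in this commit; the import and the `Result<K>` parameterization of the output are assumptions, not part of this change:

{% highlight java %}
import org.apache.flink.graph.library.clustering.directed.LocalClusteringCoefficient;

Graph<LongValue, NullValue, NullValue> graph = ...

// tuples of vertex ID, vertex degree, and number of triangles containing the vertex
DataSet<LocalClusteringCoefficient.Result<LongValue>> localClusteringCoefficient = graph
    .run(new LocalClusteringCoefficient<LongValue, NullValue, NullValue>());
{% endhighlight %}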

### Global Clustering Coefficient

@@ -2126,8 +2127,9 @@ See the [Local Clustering Coefficient](#local-clustering-coefficient) library method for a detailed explanation of the local
clustering coefficient.

#### Usage
The algorithm takes a simple, undirected graph as input and outputs a result containing the total number of triplets and
triangles in the graph. The graph ID type must be `Comparable` and `Copyable`.
Directed and undirected variants are provided. The algorithm takes a simple graph as input and outputs a result
containing the total number of triplets and triangles in the graph. The graph ID type must be `Comparable` and
`Copyable`.
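
A similar sketch for the directed global variant; the `GraphAnalytic` run/execute pattern mirrors the example driver added in this commit, while the variable names are illustrative:

{% highlight java %}
import org.apache.flink.graph.GraphAnalytic;
import org.apache.flink.graph.library.clustering.directed.GlobalClusteringCoefficient;

Graph<LongValue, NullValue, NullValue> graph = ...

GraphAnalytic globalClusteringCoefficient = graph
    .run(new GlobalClusteringCoefficient<LongValue, NullValue, NullValue>());

// execute() runs the program and returns the result containing the
// total number of triplets and triangles in the graph
System.out.println(globalClusteringCoefficient.execute());
{% endhighlight %}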


{% top %}
@@ -204,7 +204,7 @@ public boolean equals(Object obj) {

@Override
public int hashCode() {
return (int) (this.count + this.hashCode());
return (int) (this.count + this.checksum);
}

@Override
@@ -0,0 +1,235 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.graph.examples;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.text.WordUtils;
import org.apache.commons.math3.random.JDKRandomGenerator;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.CsvOutputFormat;
import org.apache.flink.api.java.utils.DataSetUtils;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.graph.Graph;
import org.apache.flink.graph.GraphAnalytic;
import org.apache.flink.graph.asm.translate.LongValueToIntValue;
import org.apache.flink.graph.asm.translate.TranslateGraphIds;
import org.apache.flink.graph.generator.RMatGraph;
import org.apache.flink.graph.generator.random.JDKRandomGeneratorFactory;
import org.apache.flink.graph.generator.random.RandomGenerableFactory;
import org.apache.flink.types.IntValue;
import org.apache.flink.types.LongValue;
import org.apache.flink.types.NullValue;

import java.text.NumberFormat;

/**
* Driver for the library implementations of Global and Local Clustering Coefficient.
*
* This example reads a simple directed or undirected graph from a CSV file or
generates an RMat graph with the given scale and edge factor, then calculates
* the local clustering coefficient for each vertex and the global clustering
* coefficient for the graph.
*
* @see org.apache.flink.graph.library.clustering.directed.GlobalClusteringCoefficient
* @see org.apache.flink.graph.library.clustering.directed.LocalClusteringCoefficient
* @see org.apache.flink.graph.library.clustering.undirected.GlobalClusteringCoefficient
* @see org.apache.flink.graph.library.clustering.undirected.LocalClusteringCoefficient
*/
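// Example invocations, mirroring the usage text printed by printUsage() below
// (the flag names come from this file; the concrete values and input file name
// are illustrative only):
//
//   ClusteringCoefficient --directed true --input rmat --scale 12 --edge_factor 8 --output hash
//   ClusteringCoefficient --directed false --input csv --input_filename edges.csv --output print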
public class ClusteringCoefficient {

public static final int DEFAULT_SCALE = 10;

public static final int DEFAULT_EDGE_FACTOR = 16;

public static final boolean DEFAULT_CLIP_AND_FLIP = true;

private static void printUsage() {
System.out.println(WordUtils.wrap("The local clustering coefficient measures the connectedness of each" +
" vertex's neighborhood and the global clustering coefficient measures the connectedness of the graph." +
" Scores range from 0.0 (no edges between neighbors or vertices) to 1.0 (neighborhood or graph" +
" is a clique).", 80));
System.out.println();
System.out.println(WordUtils.wrap("This algorithm returns tuples containing the vertex ID, the degree of" +
" the vertex, and the number of edges between vertex neighbors.", 80));
System.out.println();
System.out.println("usage: ClusteringCoefficient --directed <true | false> --input <csv | rmat [options]> --output <print | hash | csv [options]");
System.out.println();
System.out.println("options:");
System.out.println(" --input csv --input_filename FILENAME [--input_line_delimiter LINE_DELIMITER] [--input_field_delimiter FIELD_DELIMITER]");
System.out.println(" --input rmat [--scale SCALE] [--edge_factor EDGE_FACTOR]");
System.out.println();
System.out.println(" --output print");
System.out.println(" --output hash");
System.out.println(" --output csv --output_filename FILENAME [--output_line_delimiter LINE_DELIMITER] [--output_field_delimiter FIELD_DELIMITER]");
}

public static void main(String[] args) throws Exception {
// Set up the execution environment
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
env.getConfig().enableObjectReuse();

ParameterTool parameters = ParameterTool.fromArgs(args);
if (! parameters.has("directed")) {
printUsage();
return;
}
boolean directedAlgorithm = parameters.getBoolean("directed");

// global and local clustering coefficient results
GraphAnalytic gcc;
DataSet lcc;

switch (parameters.get("input", "")) {
case "csv": {
String lineDelimiter = StringEscapeUtils.unescapeJava(
parameters.get("input_line_delimiter", CsvOutputFormat.DEFAULT_LINE_DELIMITER));

String fieldDelimiter = StringEscapeUtils.unescapeJava(
parameters.get("input_field_delimiter", CsvOutputFormat.DEFAULT_FIELD_DELIMITER));

Graph<LongValue, NullValue, NullValue> graph = Graph
.fromCsvReader(parameters.get("input_filename"), env)
.ignoreCommentsEdges("#")
.lineDelimiterEdges(lineDelimiter)
.fieldDelimiterEdges(fieldDelimiter)
.keyType(LongValue.class);

if (directedAlgorithm) {
gcc = graph
.run(new org.apache.flink.graph.library.clustering.directed.GlobalClusteringCoefficient<LongValue, NullValue, NullValue>());
lcc = graph
.run(new org.apache.flink.graph.library.clustering.directed.LocalClusteringCoefficient<LongValue, NullValue, NullValue>());
} else {
gcc = graph
.run(new org.apache.flink.graph.library.clustering.undirected.GlobalClusteringCoefficient<LongValue, NullValue, NullValue>());
lcc = graph
.run(new org.apache.flink.graph.library.clustering.undirected.LocalClusteringCoefficient<LongValue, NullValue, NullValue>());
}
} break;

case "rmat": {
int scale = parameters.getInt("scale", DEFAULT_SCALE);
int edgeFactor = parameters.getInt("edge_factor", DEFAULT_EDGE_FACTOR);

RandomGenerableFactory<JDKRandomGenerator> rnd = new JDKRandomGeneratorFactory();

long vertexCount = 1L << scale;
long edgeCount = vertexCount * edgeFactor;

Graph<LongValue, NullValue, NullValue> graph = new RMatGraph<>(env, rnd, vertexCount, edgeCount)
.generate();

if (directedAlgorithm) {
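// 2^scale vertex IDs fit in an IntValue only for scale <= 32: larger graphs
// keep their LongValue IDs, smaller graphs are first translated to IntValue IDs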
if (scale > 32) {
Graph<LongValue, NullValue, NullValue> newGraph = graph
.run(new org.apache.flink.graph.asm.simple.directed.Simplify<LongValue, NullValue, NullValue>());

gcc = newGraph
.run(new org.apache.flink.graph.library.clustering.directed.GlobalClusteringCoefficient<LongValue, NullValue, NullValue>());
lcc = newGraph
.run(new org.apache.flink.graph.library.clustering.directed.LocalClusteringCoefficient<LongValue, NullValue, NullValue>());
} else {
Graph<IntValue, NullValue, NullValue> newGraph = graph
.run(new TranslateGraphIds<LongValue, IntValue, NullValue, NullValue>(new LongValueToIntValue()))
.run(new org.apache.flink.graph.asm.simple.directed.Simplify<IntValue, NullValue, NullValue>());

gcc = newGraph
.run(new org.apache.flink.graph.library.clustering.directed.GlobalClusteringCoefficient<IntValue, NullValue, NullValue>());
lcc = newGraph
.run(new org.apache.flink.graph.library.clustering.directed.LocalClusteringCoefficient<IntValue, NullValue, NullValue>());
}
} else {
boolean clipAndFlip = parameters.getBoolean("clip_and_flip", DEFAULT_CLIP_AND_FLIP);

if (scale > 32) {
Graph<LongValue, NullValue, NullValue> newGraph = graph
.run(new org.apache.flink.graph.asm.simple.undirected.Simplify<LongValue, NullValue, NullValue>(clipAndFlip));

gcc = newGraph
.run(new org.apache.flink.graph.library.clustering.undirected.GlobalClusteringCoefficient<LongValue, NullValue, NullValue>());
lcc = newGraph
.run(new org.apache.flink.graph.library.clustering.undirected.LocalClusteringCoefficient<LongValue, NullValue, NullValue>());
} else {
Graph<IntValue, NullValue, NullValue> newGraph = graph
.run(new TranslateGraphIds<LongValue, IntValue, NullValue, NullValue>(new LongValueToIntValue()))
.run(new org.apache.flink.graph.asm.simple.undirected.Simplify<IntValue, NullValue, NullValue>(clipAndFlip));

gcc = newGraph
.run(new org.apache.flink.graph.library.clustering.undirected.GlobalClusteringCoefficient<IntValue, NullValue, NullValue>());
lcc = newGraph
.run(new org.apache.flink.graph.library.clustering.undirected.LocalClusteringCoefficient<IntValue, NullValue, NullValue>());
}
}
} break;

default:
printUsage();
return;
}

switch (parameters.get("output", "")) {
case "print":
if (directedAlgorithm) {
for (Object e: lcc.collect()) {
org.apache.flink.graph.library.clustering.directed.LocalClusteringCoefficient.Result result =
(org.apache.flink.graph.library.clustering.directed.LocalClusteringCoefficient.Result)e;
System.out.println(result.toVerboseString());
}
} else {
for (Object e: lcc.collect()) {
org.apache.flink.graph.library.clustering.undirected.LocalClusteringCoefficient.Result result =
(org.apache.flink.graph.library.clustering.undirected.LocalClusteringCoefficient.Result)e;
System.out.println(result.toVerboseString());
}
}
System.out.println(gcc.getResult());
break;

case "hash":
System.out.println(DataSetUtils.checksumHashCode(lcc));
System.out.println(gcc.getResult());
break;

case "csv":
String filename = parameters.get("output_filename");

String lineDelimiter = StringEscapeUtils.unescapeJava(
parameters.get("output_line_delimiter", CsvOutputFormat.DEFAULT_LINE_DELIMITER));

String fieldDelimiter = StringEscapeUtils.unescapeJava(
parameters.get("output_field_delimiter", CsvOutputFormat.DEFAULT_FIELD_DELIMITER));

lcc.writeAsCsv(filename, lineDelimiter, fieldDelimiter);

System.out.println(gcc.execute());
break;

default:
printUsage();
return;
}

JobExecutionResult result = env.getLastJobExecutionResult();

NumberFormat nf = NumberFormat.getInstance();
System.out.println("Execution runtime: " + nf.format(result.getNetRuntime()) + " ms");
}
}
@@ -97,10 +97,10 @@ public static void main(String[] args) throws Exception {

Graph<LongValue, NullValue, NullValue> graph = Graph
.fromCsvReader(parameters.get("input_filename"), env)
.ignoreCommentsEdges("#")
.lineDelimiterEdges(lineDelimiter)
.fieldDelimiterEdges(fieldDelimiter)
.keyType(LongValue.class);
.ignoreCommentsEdges("#")
.lineDelimiterEdges(lineDelimiter)
.fieldDelimiterEdges(fieldDelimiter)
.keyType(LongValue.class);

ji = graph
.run(new org.apache.flink.graph.library.similarity.JaccardIndex<LongValue, NullValue, NullValue>());
@@ -162,6 +162,7 @@ public static void main(String[] args) throws Exception {

env.execute();
break;

default:
printUsage();
return;