Skip to content

Commit

Permalink
Support for merging two datasets (vmware-archive#703)
Browse files Browse the repository at this point in the history
* Support for merging two datasets
  • Loading branch information
Mihai Budiu authored Oct 26, 2020
1 parent bd5f730 commit 61ed65b
Show file tree
Hide file tree
Showing 12 changed files with 138 additions and 27 deletions.
10 changes: 9 additions & 1 deletion docs/userManual.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ one row for an airline flight. Columns in this dataset include: the date of the
the origin and destination cities, the origin and destination states,
the origin airport code, the distance flown, the departure and arrival delay.

Updated on 2020 Oct 21.
Updated on 2020 Oct 26.

# Contents
* 1 [Basic concepts](#1-basic-concepts)
Expand Down Expand Up @@ -830,6 +830,14 @@ closed, this option will reopen it.

* Refresh: fetches and redraws all the views associated with this dataset.

* Merge with...: allows the user to specify another dataset, loaded in a separate tab.
The two datasets will be merged, by taking the union of their rows; the result will
be loaded in a new tab. The datasets can be merged only if their schemas are compatible.
Two schemas are incompatible if there is a common column name with different types in the
two datasets. If two datasets are incompatible the user needs to rename columns
or convert datatypes prior to the merging operation. If a column does not exist in
one of the two datasets, it is assumed to contain only null values.

* Edit privacy policy: this is an experimental feature related to
differentially-private data visualizations, which is not yet documented.
This option is only enabled for a data curator while visualizing a
Expand Down
8 changes: 8 additions & 0 deletions docs/userManual.src
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,14 @@ closed, this option will reopen it.

* Refresh: fetches and redraws all the views associated with this dataset.

* Merge with...: allows the user to specify another dataset, loaded in a separate tab.
The two datasets will be merged, by taking the union of their rows; the result will
be loaded in a new tab. The datasets can be merged only if their schemas are compatible.
Two schemas are incompatible if there is a common column name with different types in the
two datasets. If two datasets are incompatible the user needs to rename columns
or convert datatypes prior to the merging operation. If a column does not exist in
one of the two datasets, it is assumed to contain only null values.

* Edit privacy policy: this is an experimental feature related to
differentially-private data visualizations, which is not yet documented.
This option is only enabled for a data curator while visualizing a
Expand Down
2 changes: 1 addition & 1 deletion platform/src/main/java/org/hillview/table/BaseTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public List<IColumn> getColumns(Schema schema) {
if (this.columns.containsKey(cd.name))
result.add(this.columns.get(cd.name));
else
result.add(new EmptyColumn(cd));
result.add(new EmptyColumn(cd, this.getNumOfRows()));
}
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public static IAppendableColumn create(ColumnDescription desc) {
case Json:
return new StringListColumn(desc);
case None:
return new EmptyColumn(desc);
return new EmptyColumn(desc, 0);
case Integer:
return new IntListColumn(desc);
case Date:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ public EmptyColumn(String name, int size) {
this.sealed = true;
}

public EmptyColumn(ColumnDescription desc) {
public EmptyColumn(ColumnDescription desc, int size) {
super(desc);
this.size = 0;
this.size = size;
this.sealed = false;
}

Expand Down Expand Up @@ -112,7 +112,7 @@ public IColumn rename(String newName) {

@Override
public IColumn convertKind(ContentsKind kind, String newColName, IMembershipSet unused) {
return new EmptyColumn(new ColumnDescription(newColName, kind));
return new EmptyColumn(new ColumnDescription(newColName, kind), this.size);
}

@Override
Expand Down
91 changes: 83 additions & 8 deletions web/src/main/webapp/datasetView.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import {HeatmapView} from "./dataViews/heatmapView";
import {HeavyHittersView} from "./dataViews/heavyHittersView";
import {Histogram2DView} from "./dataViews/histogram2DView";
import {HistogramView} from "./dataViews/histogramView";
import {SchemaView} from "./modules";
import {BaseReceiver, SchemaView} from "./modules";
import {SpectrumView} from "./dataViews/spectrumView";
import {SchemaReceiver, TableView} from "./modules";
import {DataLoaded, getDescription, RemoteTableReceiver} from "./initialObject";
Expand Down Expand Up @@ -48,6 +48,7 @@ import {Dialog, saveAs} from "./ui/dialog";
import {showBookmarkURL} from "./ui/dialog";
import {CorrelationHeatmapView} from "./dataViews/correlationHeatmapView";
import {GeoView} from "./dataViews/geoView";
import {SchemaClass} from "./schemaClass";

export interface IViewSerialization {
viewKind: ViewKind;
Expand Down Expand Up @@ -221,7 +222,7 @@ export class DatasetView implements IHtmlElement {
}

protected merge(): void {
const names = HillviewToplevel.instance.getDatasetNames();
const names = HillviewToplevel.instance.datasets.filter(d => d != this).map(d => d.name);
const dialog = new Dialog("Merge", "Merge with another dataset");
dialog.addSelectFieldAsObject("dataset", "dataset",
names.map((_, i) => i), i => names[i],
Expand All @@ -230,7 +231,7 @@ export class DatasetView implements IHtmlElement {
const index = dialog.getFieldValueAsObject<number>("dataset");
if (index == null)
return;
const dataset = HillviewToplevel.instance.getDataset(index);
const dataset = HillviewToplevel.instance.datasets[index];
this.mergeWith(dataset);
});
dialog.show();
Expand All @@ -247,10 +248,40 @@ export class DatasetView implements IHtmlElement {
/**
 * Merges this dataset with another dataset.
 * The schemas of the two datasets are combined first; when they cannot be
 * combined an error is reported on the load menu page and nothing else is done
 * by the schema-based flow.
 * @param view  Dataset to merge with; the call is a no-op when it is null.
 */
protected mergeWith(view: DatasetView | null): void {
if (view == null)
return;
// NOTE(review): the next four statements look like the earlier implementation,
// kept by this commit view (the hunk header reports 10 lines before the change
// and 40 after). They issue the merge request directly, without reconciling
// the schemas first. Confirm which statements belong to the final file.
const rr = this.remoteObject.createMergeRequest(view.remoteObjectId);
const name = this.name + "+" + view.name;
const observer = new RemoteTableReceiver(this.loadMenuPage, rr, this.loaded, name, true);
rr.invoke(observer);
// Get the schemas from any of the views.
const schemaA = this.getSchema();
const schemaB = view.getSchema();
if (schemaA == null || schemaB == null) {
this.loadMenuPage.reportError("Could not find schema");
return;
}
// Combine the two schemas; the result carries an error when they cannot be combined.
const schemaRes = schemaA.merge(schemaB);
if (schemaRes.isErr) {
this.loadMenuPage.reportError("Conflicting schema: " + schemaRes.error);
return;
}
const schema = schemaRes.unwrap();
// A different column count means the combined schema differs from this dataset's
// schema: request a projection of this table using the combined schema and let the
// receiver continue with the second table once the projection is available.
if (schema.schema.length != schemaA.length) {
const proj = this.remoteObject.createProjectRequest(schema.schema);
const receiver = new ProjectFirstReceiver(schema,
this.loadMenuPage, proj, this, view, schemaB);
proj.invoke(receiver);
} else {
// No projection is requested for this table: the receiver is created without an
// operation and driven by hand with this dataset's own remote object id.
const receiver = new ProjectFirstReceiver(schema,
this.loadMenuPage, null, this, view, schemaB);
receiver.run(this.remoteObjectId);
receiver.finished();
}
}

/**
 * Looks for a schema among the pages of this dataset.
 * @returns  The schema of the first page that has a data view,
 *           or null when no page has one.
 */
public getSchema(): SchemaClass | null {
    for (const page of this.allPages) {
        const tableView = page.dataView as BigTableView;
        // Pages without a data view cannot supply a schema; keep looking.
        if (tableView != null)
            return tableView.meta.schema;
    }
    return null;
}

/**
Expand Down Expand Up @@ -676,4 +707,48 @@ class CreateBookmarkURLReceiver extends OnCompleteReceiver<string> {
showBookmarkURL(url);
console.log("Bookmark has been created.");
}
}
}

/**
 * Part of the workflow of merging two tables with distinct schemas.
 * Receives the id of the first table (already projected when that was needed)
 * and requests a projection of the second table when its column count differs
 * from the target schema. The merge itself is issued by ProjectSecondReceiver.
 */
class ProjectFirstReceiver extends BaseReceiver {
    constructor(protected schema: SchemaClass, loadMenuPage: FullPage,
                operation: ICancellable<RemoteObjectId> | null,
                protected left: DatasetView, protected right: DatasetView,
                protected rightSchema: SchemaClass) {
        super(loadMenuPage, operation, "reconcile schemas", null);
    }

    /**
     * Continues the merge once the first table is available.
     * @param firstProjectionId  Remote object id of the first table.
     */
    public run(firstProjectionId: RemoteObjectId): void {
        const leftTable = new TableTargetAPI(firstProjectionId);

        if (this.schema.schema.length == this.rightSchema.length) {
            // Same column count: no projection is requested for the second table,
            // so the next receiver is driven by hand with the table's current id.
            const next = new ProjectSecondReceiver(
                this.page, null, this.left, this.right, leftTable);
            next.run(this.right.remoteObjectId);
            next.finished();
            return;
        }

        // Column counts differ: request a projection of the second table using the
        // target schema; the next receiver runs when that request delivers its result.
        const rightTable = new TableTargetAPI(this.right.remoteObjectId);
        const projection = rightTable.createProjectRequest(this.schema.schema);
        projection.invoke(new ProjectSecondReceiver(
            this.page, projection, this.left, this.right, leftTable));
    }
}

/**
 * Final step of the merge workflow: receives the id of the second table and
 * requests the merge of the first table with it. The result is handed to a
 * RemoteTableReceiver together with a "Merged" description of the two datasets.
 */
class ProjectSecondReceiver extends BaseReceiver {
    constructor(loadMenuPage: FullPage, operation: ICancellable<RemoteObjectId> | null,
                protected left: DatasetView, protected right: DatasetView,
                protected firstProjection: TableTargetAPI) {
        super(loadMenuPage, operation, "reconcile schemas", null);
    }

    /**
     * Issues the merge request.
     * @param secondObjectId  Remote object id of the second table.
     */
    public run(secondObjectId: RemoteObjectId): void {
        const mergeRequest = this.firstProjection.createMergeRequest(secondObjectId);
        // Title of the merged dataset: the names of both inputs joined by " + ".
        const title = `${this.left.name} + ${this.right.name}`;
        mergeRequest.invoke(new RemoteTableReceiver(
            this.page,
            mergeRequest,
            { kind: "Merged", first: this.left.loaded, second: this.right.loaded },
            title,
            true));
    }
}

12 changes: 11 additions & 1 deletion web/src/main/webapp/initialObject.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,13 @@ export interface SSTableFilesLoaded {
description: CassandraConnectionInfo;
}

export type DataLoaded = FilesLoaded | TablesLoaded | HillviewLogs | IDatasetSerialization | SSTableFilesLoaded;
/**
 * Describes a dataset obtained by merging two other datasets.
 */
export interface Merged {
// Tag that distinguishes this variant from the other members of DataLoaded.
kind: "Merged";
// Description of how the first merged dataset was loaded.
first: DataLoaded;
// Description of how the second merged dataset was loaded.
second: DataLoaded;
}

export type DataLoaded = FilesLoaded | TablesLoaded | HillviewLogs | IDatasetSerialization | SSTableFilesLoaded | Merged;

export function getDescription(data: DataLoaded): PageTitle {
switch (data.kind) {
Expand All @@ -66,6 +72,10 @@ export function getDescription(data: DataLoaded): PageTitle {
return new PageTitle("logs", "Hillview installation logs");
case "SSTable":
return new PageTitle(data.description.database + "/" + data.description.table, "loaded from files");
case "Merged":
const name = getDescription(data.first).format + "+" + getDescription(data.second).format;
return new PageTitle("Merged " + name,
getDescription(data.first).provenance + "+" + getDescription(data.second).provenance);
}
}

Expand Down
2 changes: 1 addition & 1 deletion web/src/main/webapp/loadView.ts
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ class GenericLogDialog extends Dialog {
"shell pattern with path that describes the names of the files to load (comma-separated patterns allowed)");
pattern.required = true;
// TODO: This should perhaps be read from the back-end service.
const logFormats = ["%{HADOOP}", "%{RFC5424}", "%{VSANTRACE}", "%{PROTON}", "%{PROTON_PROXY}",
const logFormats = ["%{HADOOP}", "%{RFC5424}", "%{VSANTRACE}", "%{NSXT_PROTON}", "%{NSXT_PROXY}",
"%{SYSLOG}", "%{BLOCKTRACE}"];
const format = this.addSelectField("logFormat", "Log format", logFormats, "%{SYSLOG}",
"Log format : https://github.com/vmware/hillview/blob/master/docs/userManual.md" +
Expand Down
3 changes: 2 additions & 1 deletion web/src/main/webapp/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
"file-saver": "1.3.8",
"jsoneditor": "7.2.1",
"pako": "1.0.6",
"rx": "4.1.0"
"rx": "4.1.0",
"@badrap/result": "^0.2.6"
},
"devDependencies": {
"@types/geojson": "^1.0.5",
Expand Down
17 changes: 17 additions & 0 deletions web/src/main/webapp/schemaClass.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import {IColumnDescription, Schema} from "./javaBridge";
import {assert, cloneArray, Serializable} from "./util";
import {Result} from "@badrap/result";

/**
* A SchemaClass is a class containing a Schema and some indexes and methods
Expand Down Expand Up @@ -144,4 +145,20 @@ export class SchemaClass implements Serializable<SchemaClass> {
});
return cds;
}

/**
 * Combines this schema with another one into a schema holding the columns of both.
 * Columns that exist only in this schema come first, in their original order,
 * followed by all the columns of the other schema in their original order.
 * @param other  Schema to merge with.
 * @returns  A result wrapping the merged schema, or an error result describing
 *           the first column of this schema that is also present in `other`
 *           with a different kind.
 */
public merge(other: SchemaClass): Result<SchemaClass> {
    const result: IColumnDescription[] = [];
    for (const cd of this.schema) {
        const otherCd = other.find(cd.name);
        if (otherCd == null) {
            // Column exists only in this schema.
            result.push(cd);
        } else if (otherCd.kind !== cd.kind) {
            return Result.err(new Error(
                "Conflicting types for column " + cd.name + ": " + cd.kind + " and " + otherCd.kind));
        }
        // else: shared column with the same kind; it is pushed when the other schema is scanned
    }
    for (const cd of other.schema) {
        result.push(cd);
    }
    return Result.ok(new SchemaClass(result));
}
}
2 changes: 1 addition & 1 deletion web/src/main/webapp/tableTarget.ts
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ export abstract class BaseReceiver extends OnCompleteReceiver<RemoteObjectId> {
protected remoteObject: TableTargetAPI;

protected constructor(public page: FullPage,
public operation: ICancellable<RemoteObjectId>,
public operation: ICancellable<RemoteObjectId> | null,
public description: string,
protected dataset: DatasetView | null) { // may be null for the first table
super(page, operation, description);
Expand Down
10 changes: 1 addition & 9 deletions web/src/main/webapp/toplevel.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import {UIConfig} from "./javaBridge";
*/
export class HillviewToplevel implements IHtmlElement {
private readonly topLevel: HTMLElement;
private readonly datasets: DatasetView[];
public readonly datasets: DatasetView[];
private readonly strip: HTMLDivElement;
private readonly tabs: HTMLElement[];
private readonly content: HTMLDivElement;
Expand Down Expand Up @@ -165,19 +165,11 @@ export class HillviewToplevel implements IHtmlElement {
dataset.resize();
}

public getDataset(index: number): DatasetView | null {
return this.datasets[index];
}

public resize(): void {
if (this.current != null) {
this.current.resize();
}
}

getDatasetNames(): string[] {
return this.datasets.map(d => d.name);
}
}

export function createHillview(): void {
Expand Down

0 comments on commit 61ed65b

Please sign in to comment.