-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathindex.ts
161 lines (148 loc) · 4.92 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import * as async from 'async';
import {basename, extname} from 'path';
import {createReadStream} from 'fs';
import {logger} from 'loge';
import {Connection} from 'sqlcmd-sql';
import * as xlsx from 'xlsx';
import {Parser} from '@chbrown/sv';
import * as excel from './excel';
interface RawTable {
name: string;
data: string[][];
}
/**
Prepare a string for use as a SQL-friendly identifier, like a table or column name.
It is idempotent (can be run multiple times with no ill effect)
*/
export function toIdentifier(input: string): string {
return input
// replace # with 'id'
.replace(/#/g, 'id')
// replace illegal characters with whitespace
.replace(/\W+/g, ' ')
.trim()
// replace whitespace with underscores
.replace(/\s+/g, '_');
}
/**
Cut off the basename part of a filepath, without the extension, for use as a SQL-friendly identifier.
*/
export function pathToIdentifier(path: string) {
const ext = extname(path);
const base = basename(path, ext);
return toIdentifier(base);
}
function isEmpty(value: string): boolean {
return (value === undefined) || /^\s*$/.test(value);
}
const regExpTests = [
{
id: 'DATETIME',
// DATETIME: '2016-01-18T01:45:53Z', '2016-01-18 15:10:20'
regExp: /^[12]\d{3}(-?)[01]\d\1[0123]\d[T ][012]?\d:[0-5]\d(:[0-5]\d)?Z?$/
},
{
id: 'DATE',
// DATE: '2016-01-18', '20160118' (but not '2016-01-40', '2016-0118', or '201601-18')
regExp: /^[12]\d{3}(-?)[01]\d\1[0123]\d$/
},
{
// INTEGER is a subset of some DATE formats, so it must come after
id: 'INTEGER',
// INTEGER: '-100', '0', '99' (but not '-' or '9223372036854775808')
regExp: /^-?\d{1,10}$/
},
{
// BIGINT is a superset of INTEGER, but we want to prefer INTEGER if possible
id: 'BIGINT',
// BIGINT: '-1000000000000000000', '0', or '9223372036854775808' (but not '-')
regExp: /^-?\d{1,19}$/
},
{
// REAL is a subset of INTEGER, so it must come after
id: 'REAL',
// REAL: '-100.05', '20', '99.004' (but not '.')
regExp: /^-?(\d+|\.\d+|\d+\.\d*)$/
},
{
id: 'TIME',
// TIME: '23:54', '01:45', '4:90' (but not '2016-0118' or '201601-18')
regExp: /^[012]?\d:[0-5]\d$/
},
];
/**
@param {string[]} values -
@returns {string} A SQL column definition, e.g., "created_at TIMESTAMP NOT NULL"
*/
function inferColumnType(values: string[]): string {
const nonEmptyValues = values.filter(value => !isEmpty(value));
let dataType = 'TEXT';
if (nonEmptyValues.length > 0) {
regExpTests.some(({id, regExp}) => {
const matches = nonEmptyValues.every(value => regExp.test(value));
if (matches) {
dataType = id;
}
// short-circuit by returning true if we found a match
return matches;
});
}
return [
dataType,
...((nonEmptyValues.length === values.length) ? ['NOT NULL'] : []),
].join(' ');
}
export function createTable(name: string,
data: string[][],
callback: (error?: Error) => void) {
const db = new Connection({outputStream: process.stdout});
const [columns, ...rows] = data;
const columnDeclarations = columns.map((column, i) => {
const columnName = toIdentifier(column || `column_${i}`).toLowerCase();
const values = rows.map(row => row[i]);
const columnType = inferColumnType(values);
return `"${columnName}" ${columnType}`;
});
const tableIdentifier = toIdentifier(name).toLowerCase();
logger.info(`Creating table ${tableIdentifier}`);
db.CreateTable(tableIdentifier)
.add(...columnDeclarations)
.execute(error => {
if (error) return callback(error);
logger.info(`Inserting ${rows.length} rows`);
async.eachSeries(rows, (row, callback) => {
const args: any[] = [];
const values = row.map(value => {
const argIndex = args.push(isEmpty(value) ? null : value);
return `$${argIndex}`;
});
db.executeSQL(`INSERT INTO ${tableIdentifier} VALUES (${values.join(', ')})`, args, callback);
}, callback);
});
}
/**
Read RawTable objects from Excel spreadsheet (no identifier sanitization)
*/
export function readExcel(filename: string): RawTable[] {
const workbook = xlsx.readFile(filename, {});
return workbook.SheetNames.map(sheetName => {
const worksheet = workbook.Sheets[sheetName];
const data = excel.readTable(worksheet);
return {name: sheetName, data};
});
}
export function readSV(filename: string, callback: (error: Error, table?: RawTable) => void) {
const name = pathToIdentifier(filename);
const objects = [];
createReadStream(filename).pipe(new Parser())
.on('error', error => callback(error))
.on('data', object => objects.push(object))
.on('end', () => {
// TODO: customizing sv.Parser so that we can get out string[] rows if we want
const columns = Object.keys(objects[0]);
const rows = objects.map(object => {
return columns.map(column => object[column]);
});
callback(null, {name, data: [columns, ...rows]});
});
}