index.ts

import * as async from 'async';
import {basename, extname} from 'path';
import {createReadStream} from 'fs';
import {logger} from 'loge';
import {Connection} from 'sqlcmd-sql';
import * as xlsx from 'xlsx';
import {Parser} from '@chbrown/sv';

import * as excel from './excel';

interface RawTable {
  name: string;
  data: string[][];
}

/**
Prepare a string for use as a SQL-friendly identifier, like a table or column name.

It is idempotent (can be run multiple times with no ill effect)
*/
export function toIdentifier(input: string): string {
  return input
    // replace # with 'id'
    .replace(/#/g, 'id')
    // replace illegal characters with whitespace
    .replace(/\W+/g, ' ')
    .trim()
    // replace whitespace with underscores
    .replace(/\s+/g, '_');
}

/**
Cut off the basename part of a filepath, without the extension, for use as a SQL-friendly identifier.
*/
export function pathToIdentifier(path: string) {
  const ext = extname(path);
  const base = basename(path, ext);
  return toIdentifier(base);
}

function isEmpty(value: string): boolean {
  return (value === undefined) || /^\s*$/.test(value);
}

const regExpTests = [
  {
    id: 'DATETIME',
    // DATETIME: '2016-01-18T01:45:53Z', '2016-01-18 15:10:20'
    regExp: /^[12]\d{3}(-?)[01]\d\1[0123]\d[T ][012]?\d:[0-5]\d(:[0-5]\d)?Z?$/
  },
  {
    id: 'DATE',
    // DATE: '2016-01-18', '20160118' (but not '2016-01-40', '2016-0118', or '201601-18')
    regExp: /^[12]\d{3}(-?)[01]\d\1[0123]\d$/
  },
  {
    // INTEGER is a subset of some DATE formats, so it must come after
    id: 'INTEGER',
    // INTEGER: '-100', '0', '99' (but not '-' or '9223372036854775808')
    regExp: /^-?\d{1,10}$/
  },
  {
    // BIGINT is a superset of INTEGER, but we want to prefer INTEGER if possible
    id: 'BIGINT',
    // BIGINT: '-1000000000000000000', '0', or '9223372036854775808' (but not '-')
    regExp: /^-?\d{1,19}$/
  },
  {
    // REAL is a subset of INTEGER, so it must come after
    id: 'REAL',
    // REAL: '-100.05', '20', '99.004' (but not '.')
    regExp: /^-?(\d+|\.\d+|\d+\.\d*)$/
  },
  {
    id: 'TIME',
    // TIME: '23:54', '01:45', '4:90' (but not '2016-0118' or '201601-18')
    regExp: /^[012]?\d:[0-5]\d$/
  },
];

/**
@param {string[]} values -
@returns {string} A SQL column definition, e.g., "created_at TIMESTAMP NOT NULL"
*/
function inferColumnType(values: string[]): string {
  const nonEmptyValues = values.filter(value => !isEmpty(value));
  let dataType = 'TEXT';
  if (nonEmptyValues.length > 0) {
    regExpTests.some(({id, regExp}) => {
      const matches = nonEmptyValues.every(value => regExp.test(value));
      if (matches) {
        dataType = id;
      }
      // short-circuit by returning true if we found a match
      return matches;
    });
  }
  return [
    dataType,
    ...((nonEmptyValues.length === values.length) ? ['NOT NULL'] : []),
  ].join(' ');
}

export function createTable(name: string,
                            data: string[][],
                            callback: (error?: Error) => void) {
  const db = new Connection({outputStream: process.stdout});
  const [columns, ...rows] = data;
  const columnDeclarations = columns.map((column, i) => {
    const columnName = toIdentifier(column || `column_${i}`).toLowerCase();
    const values = rows.map(row => row[i]);
    const columnType = inferColumnType(values);
    return `"${columnName}" ${columnType}`;
  });

  const tableIdentifier = toIdentifier(name).toLowerCase();
  logger.info(`Creating table ${tableIdentifier}`);
  db.CreateTable(tableIdentifier)
  .add(...columnDeclarations)
  .execute(error => {
    if (error) return callback(error);

    logger.info(`Inserting ${rows.length} rows`);
    async.eachSeries(rows, (row, callback) => {
      const args: any[] = [];
      const values = row.map(value => {
        const argIndex = args.push(isEmpty(value) ? null : value);
        return `$${argIndex}`;
      });
      db.executeSQL(`INSERT INTO ${tableIdentifier} VALUES (${values.join(', ')})`, args, callback);
    }, callback);
  });
}

/**
Read RawTable objects from Excel spreadsheet (no identifier sanitization)
*/
export function readExcel(filename: string): RawTable[] {
  const workbook = xlsx.readFile(filename, {});
  return workbook.SheetNames.map(sheetName => {
    const worksheet = workbook.Sheets[sheetName];
    const data = excel.readTable(worksheet);
    return {name: sheetName, data};
  });
}

export function readSV(filename: string, callback: (error: Error, table?: RawTable) => void) {
  const name = pathToIdentifier(filename);
  const objects = [];
  createReadStream(filename).pipe(new Parser())
  .on('error', error => callback(error))
  .on('data', object => objects.push(object))
  .on('end', () => {
    // TODO: customizing sv.Parser so that we can get out string[] rows if we want
    const columns = Object.keys(objects[0]);
    const rows = objects.map(object => {
      return columns.map(column => object[column]);
    });
    callback(null, {name, data: [columns, ...rows]});
  });
}