Skip to content
This repository has been archived by the owner on Jul 13, 2024. It is now read-only.

Commit

Permalink
Find grapheme break property in log time.
Browse files Browse the repository at this point in the history
  • Loading branch information
wilx committed Nov 15, 2020
1 parent 5170b7f commit 13e7d89
Show file tree
Hide file tree
Showing 4 changed files with 13,091 additions and 1,910 deletions.
80 changes: 80 additions & 0 deletions scripts/converter-new.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
const formatCategory = (category) =>
category === 'Extended_Pictographic'
? `${category.toUpperCase()}`
: `CLUSTER_BREAK.${category.toUpperCase()}`;

const ifTemplate = (condition, category, comment) => `\
//${comment}
if(${condition}) { return ${formatCategory(category)}; }`;

const conditionTemplate = (codepoints) => {
if (codepoints.length === 1) {
return `0x${codepoints[0].toString(16)} === code`;
} else if (codepoints.length === 2) {
return `0x${codepoints[0].toString(
16,
)} <= code && code <= 0x${codepoints[1].toString(16)}`;
}
throw new Error(`Unexpected codepoints length: ${codepoints.length}`);
};

function processOneProperty(line) {
const rangeAndRest = line.split(';');
if (rangeAndRest.length != 2) {
console.error('rangeAndRest.lengh != 2: ' + line);
throw Error('rangeAndRest.lengh != 2: ' + line);
}
const category = rangeAndRest[1].split('#')[0].trim();
const [codepointRange, others] = line.split(';');
const codepoints = codepointRange
.trimRight()
.split('..')
.map((x) => Number.parseInt(x, 16));
const comment = others.split('#')[1];
return {
range: codepoints,
category: category,
comment: comment,
};
}

const hexDigitsRe = /^[0-9A-F]/i;
function splitPropertyChunk(content) {
return content.split('\n').filter((line) => hexDigitsRe.test(line));
}

function genTree(ranges) {
const len = ranges.length;
if (len === 1) {
const r = ranges[0];
let result = ifTemplate(conditionTemplate(r.range), r.category, r.comment);
return result;
} else {
const mid = Math.floor(len / 2);
const loRange = ranges.slice(0, mid);
const hiRange = ranges.slice(mid);
const m = ranges[mid];
let result = `if (code < 0x${m.range[0].toString(16)}) {
${genTree(loRange)}
} else {
${genTree(hiRange)}
}`;
return result;
}
}

function convertNew(content, categoryFilter = () => true) {
const propertyLines = splitPropertyChunk(content);
let propertyRecords = propertyLines
.map((line) => processOneProperty(line))
.filter((x) => categoryFilter(x.category))
.sort((a, b) => a.range[0] - b.range[0]);

const tree = genTree(propertyRecords);
return `${tree}
// unlisted code points are treated as a break property of "Other"
return ${formatCategory('Other')};
`;
}

module.exports = convertNew;
16 changes: 16 additions & 0 deletions scripts/generate-emoji-extended-pictographic-new.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/**
* Generate the JavaScript code snippet to retrieve the grapheme emoji properties used in Graphemer.
*/
const fs = require('fs');
const path = require('path');
const convertNew = require('./converter-new');

const content = fs.readFileSync(path.resolve(__dirname, './emoji-data.txt'), {
encoding: 'utf8',
});

console.log(
convertNew(content, function (category) {
return category === 'Extended_Pictographic';
}),
);
18 changes: 18 additions & 0 deletions scripts/generate-grapheme-break-new.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/**
* Generate the JavaScript code snippet to retrieve the grapheme breaks properties used in Graphemer.
*/
const fs = require('fs');
const path = require('path');
const convertNew = require('./converter-new');

const content = fs.readFileSync(
path.resolve(__dirname, './GraphemeBreakProperty.txt'),
{
encoding: 'utf8',
},
);

const converted = convertNew(content);
//console.log("size of converted array: " + converted.length);
//console.log(JSON.stringify(converted, null, 2));
console.log(converted);
Loading

0 comments on commit 13e7d89

Please sign in to comment.