tool/enc-unicode.rb

#!/usr/bin/env ruby

# Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes
#
# To use this, get UnicodeData.txt, DerivedCoreProperties.txt, Scripts.txt,
# PropList.txt, PropertyAliases.txt and PropertyValueAliases.txt from
# unicode.org (http://unicode.org/Public/UNIDATA/) and put them in one
# directory. Then run the following command:
#   ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
# This produces the source file for gperf.
# After this, simply make ruby.

# The script takes exactly one argument, the directory that holds the Unicode
# data files. With any other argument count, print a usage line to stderr and
# exit with status 1.
if ARGV.size != 1
  $stderr.puts "Usage: #{$0} data_directory"
  exit(1)
end

# Names of the POSIX bracket classes, in table order. The position of a name
# in this list is its slot in the generated CodeRanges table and the number
# printed next to it in the gperf keyword list, so reordering the list changes
# the generated indices. Frozen so the order cannot be altered by accident.
POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII].freeze

# Collapses a list of codepoints into inclusive [first, last] ranges.
#
# codepoints:: Array of Integer codepoints, in any order; duplicates are
#              allowed. NOTE: the array is sorted in place, so the caller's
#              array is reordered as a side effect.
#
# Returns an Array of two-element Arrays in ascending order, one per run of
# consecutive codepoints. It is intended that a run of a single codepoint
# begins with the value with which it ends, e.g. [0x0020, 0x0020]. An empty
# input yields an empty Array.
def pair_codepoints(codepoints)
  # Without at least one codepoint there is nothing to seed the first range
  # with, so an empty property simply has no ranges.
  return [] if codepoints.empty?

  codepoints.sort!
  pairs = [[codepoints.first, codepoints.first]]
  codepoints.each do |codepoint|
    current = pairs.last

    # The input is sorted, so the codepoint is either a duplicate of the
    # current range's end, its direct successor, or the start of a new range.
    if codepoint <= current[1].next
      current[1] = codepoint
    else
      pairs << [codepoint, codepoint]
    end
  end
  pairs
end

# Reads UnicodeData.txt and collects codepoints per General Category.
#
# file:: path of the file to read. Every line is split on ';': field 0 is the
#        codepoint in hex, field 1 the character name and field 2 the General
#        Category (e.g. Lu). A pair of lines named "<..., First>" and
#        "<..., Last>" stands for the whole inclusive range between them.
#
# Returns a two-element Array [gcps, data]:
# gcps:: the sorted keys of data minus the names listed in POSIX_NAMES
# data:: Hash of property name => Array of codepoints. It holds 'Any',
#        'Assigned', 'ASCII', 'NEWLINE', 'Cn', one entry per General Category
#        found in the file and one per Major Category (the first letter of a
#        General Category).
def parse_unicode_data(file)
  # Start just below U+0000 so that a file whose first entry is not U+0000
  # still gets its leading hole recorded as unassigned.
  last_cp = -1
  data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
    'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
  beg_cp = nil
  File.foreach(file) do |line|
    fields = line.split(';')
    cp = fields[0].to_i(16)

    case fields[1]
    when /\A<(.*),\s*First>\z/
      # Remember where the range starts; the matching Last line does the work
      beg_cp = cp
      next
    when /\A<(.*),\s*Last>\z/
      cps = (beg_cp..cp).to_a
    else
      beg_cp = cp
      cps = [cp]
    end

    # The Cn category represents unassigned characters. These are not listed in
    # UnicodeData.txt so we must derive them by looking for 'holes' in the range
    # of listed codepoints. If the current entry starts beyond the codepoint
    # that follows the last one seen, everything in between is a hole and is
    # added to the Cn category.
    data['Cn'].concat((last_cp.next...beg_cp).to_a)

    # Assigned - Defined in unicode.c; interpreted as every character in the
    # Unicode range minus the unassigned characters
    data['Assigned'].concat(cps)

    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0,1]] ||= []).concat(cps)
    last_cp = cp
  end

  # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
  # codepoints to Cn.
  data['Cn'].concat((last_cp.next..0x10ffff).to_a)

  # Cn is a General Category like any other, so every unassigned codepoint
  # (the holes found while reading as well as the trailing run) also belongs
  # to the Major Category C. C may be absent if the file lists no C* category.
  data['C'] = (data['C'] || []) + data['Cn']

  # Define General Category properties
  gcps = data.keys.sort - POSIX_NAMES

  # Returns General Category Property names and the data
  [gcps, data]
end

# Derives the character classes (POSIX brackets), e.g. [[:alpha:]], from
# properties that are already present in data and stores them under their
# POSIX names.
#
# data:: Hash of property name => Array of codepoints. It is expected to
#        already hold 'Alphabetic', 'Uppercase', 'Lowercase', 'Punctuation',
#        'Decimal_Number', 'White_Space', 'Line_Separator',
#        'Paragraph_Separator', 'Cc', 'Mark', 'Connector_Punctuation', 'Any',
#        'Surrogate' and 'Unassigned'. The Hash is modified in place.
#
# Returns the Array stored under 'Print' (the value of the last assignment).
def define_posix_props(data)
  # Classes that are just another name for an existing property share the
  # very same Array object with it.
  alpha = data['Alpha'] = data['Alphabetic']
  data['Upper'] = data['Uppercase']
  data['Lower'] = data['Lowercase']
  data['Punct'] = data['Punctuation']
  digit = data['Digit'] = data['Decimal_Number']

  # 0-9, A-F, a-f
  data['XDigit'] = [*0x0030..0x0039, *0x0041..0x0046, *0x0061..0x0066]
  data['Alnum'] = alpha + digit
  space = data['Space'] = data['White_Space']

  # Blank is White_Space without the listed line-breaking controls and
  # without the line and paragraph separators.
  not_blank = [[0x0A, 0x0B, 0x0C, 0x0D, 0x85],
               data['Line_Separator'], data['Paragraph_Separator']]
  blank = data['Blank'] = not_blank.inject(space) { |rest, removed| rest - removed }

  cntrl = data['Cntrl'] = data['Cc']
  data['Word'] = [alpha, data['Mark'], digit, data['Connector_Punctuation']].inject(:+)

  # Graph is everything except space, control, surrogate and unassigned
  not_graph = [space, cntrl, data['Surrogate'], data['Unassigned']]
  graph = data['Graph'] = not_graph.inject(data['Any']) { |rest, removed| rest - removed }

  data['Print'] = graph + blank - cntrl
end

# Reads DerivedCoreProperties.txt, Scripts.txt and PropList.txt from the data
# directory and emits one constant per property found in them.
#
# A data line has the form "XXXX ; Name" or "XXXX..YYYY ; Name" (hex
# codepoints). Codepoints are accumulated until a "# Total code points:" line
# is seen; at that point they are stored in data under the most recently read
# property name, passed to make_const and a fresh list is started.
#
# data:: Hash of property name => Array of codepoints; modified in place.
#
# Returns the Array of property names in the order they were emitted.
def parse_scripts(data)
  files = [
    {fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
    {fn: 'Scripts.txt', title: 'Script'},
    {fn: 'PropList.txt', title: 'Binary Property'}
  ]
  current = nil
  cps = []
  names = []
  files.each do |file|
    File.foreach(get_file(file[:fn])) do |line|
      if /^# Total code points: / =~ line
        # End of the section for the current property: flush what we have
        data[current] = cps
        make_const(current, cps, file[:title])
        names << current
        cps = []
      elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)/ =~ line
        # The range separator is a literal "..". Left unescaped, the dots
        # would match any two characters, so ";X" followed by hex letters
        # could be misread as the end of a range.
        first = $1.to_i(16)
        last = $2 ? $2.to_i(16) : first
        current = $3
        cps.concat((first..last).to_a)
      end
    end
  end
  names
end

# Reads PropertyAliases.txt and PropertyValueAliases.txt from the data
# directory and registers every alias it finds.
#
# PropertyAliases.txt:: "First ; Second" makes First an alias of Second.
# PropertyValueAliases.txt:: only "gc" and "sc" lines are used. On a gc line
#                            the second column names the existing data and
#                            the third (and optional fourth) column are its
#                            aliases; on an sc line the third column names
#                            the existing data and the second (and optional
#                            fourth) column are its aliases.
#
# data:: Hash of property name => Array of codepoints. Every alias is added
#        as a key pointing at the same Array as the name it stands for.
#
# Returns a Hash mapping each normalized alias to the normalized name it
# stands for, in the order the aliases were read.
def parse_aliases(data)
  kv = {}
  File.foreach(get_file('PropertyAliases.txt')) do |line|
    next unless /^(\w+)\s*; (\w+)/ =~ line
    data[$1] = data[$2]
    kv[normalize_propname($1)] = normalize_propname($2)
  end
  File.foreach(get_file('PropertyValueAliases.txt')) do |line|
    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
    if $1 == 'gc'
      canonical, aliases = $2, [$3, $4]
    else
      canonical, aliases = $3, [$2, $4]
    end

    # The fourth column is optional; drop it when absent so that no nil key
    # ends up in data.
    aliases.compact.each do |alias_name|
      data[alias_name] = data[canonical]
      kv[normalize_propname(alias_name)] = normalize_propname(canonical)
    end
  end
  kv
end

# Property name => codepoint Array for every table that has been printed so
# far. Used to spot a later property whose codepoints are equal to an earlier
# one.
$const_cache = {}

# Prints the C definition of one property to stdout.
#
# prop:: property name; becomes part of the identifier CR_<prop>
# data:: Array of codepoints belonging to the property (handed to
#        pair_codepoints to be turned into ranges)
# name:: human-friendly name of the group, printed in the leading comment
#
# If an already printed property has equal codepoints, only a #define that
# aliases the earlier table is printed. Otherwise a 'static const' array is
# printed: the number of ranges first, then one "start, end" line per range.
def make_const(prop, data, name)
  puts "\n/* '#{prop}': #{name} */"

  origprop = $const_cache.key(data)
  if origprop
    puts "#define CR_#{prop} CR_#{origprop}"
    return
  end

  $const_cache[prop] = data
  ranges = pair_codepoints(data)

  # "%#x" prints zero without the 0x prefix, hence the special case
  hex = lambda { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }

  puts "static const OnigCodePoint CR_#{prop}[] = {"
  # The first element of the constant is the number of pairs of codepoints
  puts "\t#{ranges.size},"
  ranges.each do |first, last|
    puts "\t#{hex.call(first)}, #{hex.call(last)},"
  end
  puts "}; /* CR_#{prop} */"
end

# Returns a lower-cased copy of name with every hyphen, space and underscore
# removed. The argument itself is left untouched.
def normalize_propname(name)
  name.downcase.delete('- _')
end

# Returns the path of the data file called name inside the data directory,
# i.e. the first command line argument.
def get_file(name)
  data_dir = ARGV.first
  File.join(data_dir, name)
end


# Write Data
#
# Everything below prints the gperf input to stdout, in this order:
#   1. the code range tables: General/Major Categories, then the properties
#      from parse_scripts, then the POSIX brackets
#   2. the CodeRanges table listing those tables, POSIX brackets first
#   3. the keyword list mapping each normalized name (and each alias) to its
#      index in CodeRanges
#   4. the uniname2ctype() lookup function
puts '%{'
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
print "\n#ifdef USE_UNICODE_PROPERTIES"
category_titles = {1 => 'Major Category', 2 => 'General Category'}
props.each do |prop|
  # One-letter names are Major Categories, two-letter names General Categories
  make_const(prop, data[prop], category_titles.fetch(prop.size, '-'))
end
props.concat(parse_scripts(data))
puts '#endif /* USE_UNICODE_PROPERTIES */'
aliases = parse_aliases(data)
define_posix_props(data)
POSIX_NAMES.each { |posix| make_const(posix, data[posix], "[[:#{posix}:]]") }

puts(<<'__HEREDOC')

static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC
POSIX_NAMES.each { |posix| puts "  CR_#{posix}," }
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each { |prop| puts "  CR_#{prop}," }
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
};
struct uniname2ctype_struct {
  int name, ctype;
};

static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__HEREDOC

# Normalized name => index into CodeRanges, for everything that has a table
name_to_index = {}

# Prints the keyword line for one table and remembers its index
emit_keyword = lambda do |name, index|
  key = normalize_propname(name)
  name_to_index[key] = index
  puts "%-40s %3d" % [key + ',', index]
end

POSIX_NAMES.each_with_index { |posix, index| emit_keyword.call(posix, index) }
puts "#ifdef USE_UNICODE_PROPERTIES"
# The property tables follow the POSIX ones in CodeRanges, so their indices
# continue where the POSIX indices stop
props.each_with_index do |prop, offset|
  emit_keyword.call(prop, POSIX_NAMES.size + offset)
end

# An alias gets a keyword line of its own only if its name is not taken yet
# and the name it stands for has a table
aliases.each_pair do |alias_name, target|
  next if name_to_index[alias_name]
  index = name_to_index[target]
  next unless index
  puts "%-40s %3d" % [alias_name + ',', index]
end
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
  const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
  if (p) return p->ctype;
  return -1;
}
__HEREDOC