diff options
author | Julian Andres Klode <jak@debian.org> | 2017-06-26 21:42:54 +0200 |
---|---|---|
committer | Julian Andres Klode <jak@debian.org> | 2017-06-26 21:42:54 +0200 |
commit | bbd26e1127605f328726962af7ca27b24c1ad494 (patch) | |
tree | 7cfc198043e4ddc78f4ecaf04544e1a5836fd279 /triehash.pl | |
parent | e2073b0276226b625897ef475f225bf8f508719e (diff) |
Squashed 'triehash/' changes from 565fde4e7..0ca66b761
0ca66b761 Redefine ambiguous to be much more simple
3d9adfb3f Add more comments
2896e78c2 Render C code to match longest prefix
21e620cf0 fix various typos reported by spellintian
git-subtree-dir: triehash
git-subtree-split: 0ca66b761aa56d42d35c4cc254f455424764895a
Diffstat (limited to 'triehash.pl')
-rwxr-xr-x | triehash.pl | 129 |
1 files changed, 80 insertions, 49 deletions
diff --git a/triehash.pl b/triehash.pl index c722cc62d..25300a765 100755 --- a/triehash.pl +++ b/triehash.pl @@ -195,6 +195,15 @@ GetOptions ("code|C=s" => \$code_name, or die("Could not parse options!"); +# This implements a simple trie. Each node has three attributes: +# +# children - A hash of keys to other nodes +# value - The value to be stored here +# label - A named representation of the value. +# +# The key at each level of the trie can consist of one or more bytes, and the +# trie can be normalized to a form where all keys at a level have the same +# length using rebuild_tree(). package Trie { sub new { @@ -229,6 +238,8 @@ package Trie { return (substr($key, 0, $split), substr($key, $split)); } + # Given a key, a label, and a value, insert that into the tree, possibly + # replacing an existing node. sub insert { my ($self, $key, $label, $value) = @_; @@ -245,6 +256,10 @@ package Trie { $self->{children}{$child}->insert($tail, $label, $value); } + # Construct a new trie that only contains words of a given length. This + # is used to split up the common trie after knowing all words, so we can + # switch on the expected word length first, and have the per-trie function + # implement simple longest prefix matching. sub filter_depth { my ($self, $togo) = @_; @@ -269,6 +284,7 @@ package Trie { return $new; } + # (helper for rebuild_tree) # Reinsert all value nodes into the specified $trie, prepending $prefix # to their $paths. sub reinsert_value_nodes_into { @@ -281,7 +297,17 @@ package Trie { } } - # Find an earlier split due a an ambiguous character + # (helper for rebuild_tree) + # Find the earliest point to split a key. Normally, we split at the maximum + # power of 2 that is greater or equal than the length of the key. When we + # are building an ASCII-optimised case-insensitive trie that simply ORs + # each byte with 0x20, we need to split at the first ambiguous character: + # + # For example, the words a-bc and a\rbc are identical in such a situation: + # '-' | 0x20 == '-' == '\r' | 0x20 + # We cannot simply switch on all 4 bytes at once, but need to split before + # the ambigious character so we can process the ambiguous character on its + # own. sub find_ealier_split { my ($self, $key) = @_; @@ -296,7 +322,10 @@ package Trie { return $self->alignpower2(length $key); } - # Rebuild the trie, splitting at ambigous chars, and unifying key lengths + # This rebuilds the trie, splitting each key before ambiguous characters + # as explained in find_earlier_split(), and then chooses the smallest + # such split at each level, so that all keys at all levels have the same + # length (so we can use a multi-byte switch). sub rebuild_tree { my $self = shift; # Determine if/where we need to split before an ambiguous character @@ -387,54 +416,61 @@ package CCodeGen { return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8); } + # Render the trie so that it matches the longest prefix. sub print_table { my ($self, $trie, $fh, $indent, $index) = @_; $indent //= 0; $index //= 0; - if (defined $trie->{value}) { - printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label}); - return; - } + # If we have children, try to match them. + if (%{$trie->{children}}) { + # The difference between lowercase and uppercase alphabetical characters + # is that they have one bit flipped. If we have alphabetical characters + # in the search space, and the entire search space works fine if we + # always turn on the flip, just OR the character we are switching over + # with the bit. + my $want_use_bit = 0; + my $can_use_bit = 1; + my $key_length = 0; + foreach my $key (sort keys %{$trie->{children}}) { + $can_use_bit &= not main::ambiguous($key); + $want_use_bit |= ($key =~ /^[a-zA-Z]+$/); + $key_length = length($key); + } - # The difference between lowercase and uppercase alphabetical characters - # is that they have one bit flipped. If we have alphabetical characters - # in the search space, and the entire search space works fine if we - # always turn on the flip, just OR the character we are switching over - # with the bit. - my $want_use_bit = 0; - my $can_use_bit = 1; - my $key_length = 0; - foreach my $key (sort keys %{$trie->{children}}) { - $can_use_bit &= not main::ambiguous($key); - $want_use_bit |= ($key =~ /^[a-zA-Z]+$/); - $key_length = length($key); - } + if ($ignore_case && $can_use_bit && $want_use_bit) { + printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length); + } else { + printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length)); + } - if ($ignore_case && $can_use_bit && $want_use_bit) { - printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length); - } else { - printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length)); - } + my $notfirst = 0; + foreach my $key (sort keys %{$trie->{children}}) { + if ($notfirst) { + printf $fh (" " x $indent . " break;\n"); + } + if ($ignore_case) { + printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key))); + printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit); + } else { + printf $fh (" " x $indent . "case %s:\n", $self->case_label($key)); + } - my $notfirst = 0; - foreach my $key (sort keys %{$trie->{children}}) { - if ($notfirst) { - printf $fh (" " x $indent . " break;\n"); - } - if ($ignore_case) { - printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key))); - printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit); - } else { - printf $fh (" " x $indent . "case %s:\n", $self->case_label($key)); + $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key)); + + $notfirst=1; } - $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key)); + printf $fh (" " x $indent . "}\n"); + } + - $notfirst=1; + # This node has a value, so it is a possible end point. If no children + # matched, we have found our longest prefix. + if (defined $trie->{value}) { + printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label}); } - printf $fh (" " x $indent . "}\n"); } sub print_words { @@ -524,22 +560,17 @@ package CCodeGen { } } -# Check if the word can be reached by exactly one word in (alphabet OR 0x20). +# A character is ambiguous if the 1<<5 (0x20) bit does not correspond to the +# lower case bit. A word is ambiguous if any character is. This definition is +# used to check if we can perform the |0x20 optimization when building a case- +# insensitive trie. sub ambiguous { my $word = shift; foreach my $char (split //, $word) { - # Setting the lowercase flag in the character produces a different - # character, the character would thus not be matched. - return 1 if ((ord($char) | 0x20) != ord(lc($char))); - - # A word is also ambiguous if any character in lowercase can be reached - # by ORing 0x20 from another character in the charset that is not a - # lowercase character of the current character. - # Assume that we have UTF-8 and the most significant bit can be set - for my $i (0..255) { - return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char)); - } + # If 0x20 does not solely indicate lowercase, it is ambiguous + return 1 if ord(lc($char)) != (ord($char) | 0x20); + return 1 if ord(uc($char)) != (ord($char) & ~0x20); } return 0; |