summaryrefslogtreecommitdiff
path: root/triehash
diff options
context:
space:
mode:
authorJulian Andres Klode <jak@debian.org>2017-06-26 21:43:31 +0200
committerJulian Andres Klode <jak@debian.org>2017-06-26 21:43:58 +0200
commit5318af3002654ce881fbbc0f69a3e0a70f3984f1 (patch)
tree7ca651417f0143149015bd0ec8e4ebbaa2eb5850 /triehash
parent12a2f66b7b62f70d234d0b7c6fca3271873e93cd (diff)
parentbbd26e1127605f328726962af7ca27b24c1ad494 (diff)
Merge triehash v0.2
Diffstat (limited to 'triehash')
-rwxr-xr-xtriehash/triehash.pl129
1 files changed, 80 insertions, 49 deletions
diff --git a/triehash/triehash.pl b/triehash/triehash.pl
index c943c1d9f..25300a765 100755
--- a/triehash/triehash.pl
+++ b/triehash/triehash.pl
@@ -195,6 +195,15 @@ GetOptions ("code|C=s" => \$code_name,
or die("Could not parse options!");
+# This implements a simple trie. Each node has three attributes:
+#
+# children - A hash of keys to other nodes
+# value - The value to be stored here
+# label - A named representation of the value.
+#
+# The key at each level of the trie can consist of one or more bytes, and the
+# trie can be normalized to a form where all keys at a level have the same
+# length using rebuild_tree().
package Trie {
sub new {
@@ -229,6 +238,8 @@ package Trie {
return (substr($key, 0, $split), substr($key, $split));
}
+ # Given a key, a label, and a value, insert that into the tree, possibly
+ # replacing an existing node.
sub insert {
my ($self, $key, $label, $value) = @_;
@@ -245,6 +256,10 @@ package Trie {
$self->{children}{$child}->insert($tail, $label, $value);
}
+ # Construct a new trie that only contains words of a given length. This
+ # is used to split up the common trie after knowing all words, so we can
+ # switch on the expected word length first, and have the per-trie function
+ # implement simple longest prefix matching.
sub filter_depth {
my ($self, $togo) = @_;
@@ -269,6 +284,7 @@ package Trie {
return $new;
}
+ # (helper for rebuild_tree)
# Reinsert all value nodes into the specified $trie, prepending $prefix
# to their $paths.
sub reinsert_value_nodes_into {
@@ -281,7 +297,17 @@ package Trie {
}
}
- # Find an earlier split due a an ambiguous character
+ # (helper for rebuild_tree)
+ # Find the earliest point to split a key. Normally, we split at the maximum
+ # power of 2 that is greater or equal than the length of the key. When we
+ # are building an ASCII-optimised case-insensitive trie that simply ORs
+ # each byte with 0x20, we need to split at the first ambiguous character:
+ #
+ # For example, the words a-bc and a\rbc are identical in such a situation:
+ # '-' | 0x20 == '-' == '\r' | 0x20
+ # We cannot simply switch on all 4 bytes at once, but need to split before
+ # the ambigious character so we can process the ambiguous character on its
+ # own.
sub find_ealier_split {
my ($self, $key) = @_;
@@ -296,7 +322,10 @@ package Trie {
return $self->alignpower2(length $key);
}
- # Rebuild the trie, splitting at ambiguous chars, and unifying key lengths
+ # This rebuilds the trie, splitting each key before ambiguous characters
+ # as explained in find_earlier_split(), and then chooses the smallest
+ # such split at each level, so that all keys at all levels have the same
+ # length (so we can use a multi-byte switch).
sub rebuild_tree {
my $self = shift;
# Determine if/where we need to split before an ambiguous character
@@ -387,54 +416,61 @@ package CCodeGen {
return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
}
+ # Render the trie so that it matches the longest prefix.
sub print_table {
my ($self, $trie, $fh, $indent, $index) = @_;
$indent //= 0;
$index //= 0;
- if (defined $trie->{value}) {
- printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
- return;
- }
+ # If we have children, try to match them.
+ if (%{$trie->{children}}) {
+ # The difference between lowercase and uppercase alphabetical characters
+ # is that they have one bit flipped. If we have alphabetical characters
+ # in the search space, and the entire search space works fine if we
+ # always turn on the flip, just OR the character we are switching over
+ # with the bit.
+ my $want_use_bit = 0;
+ my $can_use_bit = 1;
+ my $key_length = 0;
+ foreach my $key (sort keys %{$trie->{children}}) {
+ $can_use_bit &= not main::ambiguous($key);
+ $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
+ $key_length = length($key);
+ }
- # The difference between lowercase and uppercase alphabetical characters
- # is that they have one bit flipped. If we have alphabetical characters
- # in the search space, and the entire search space works fine if we
- # always turn on the flip, just OR the character we are switching over
- # with the bit.
- my $want_use_bit = 0;
- my $can_use_bit = 1;
- my $key_length = 0;
- foreach my $key (sort keys %{$trie->{children}}) {
- $can_use_bit &= not main::ambiguous($key);
- $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
- $key_length = length($key);
- }
+ if ($ignore_case && $can_use_bit && $want_use_bit) {
+ printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
+ } else {
+ printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
+ }
- if ($ignore_case && $can_use_bit && $want_use_bit) {
- printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
- } else {
- printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
- }
+ my $notfirst = 0;
+ foreach my $key (sort keys %{$trie->{children}}) {
+ if ($notfirst) {
+ printf $fh (" " x $indent . " break;\n");
+ }
+ if ($ignore_case) {
+ printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key)));
+ printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
+ } else {
+ printf $fh (" " x $indent . "case %s:\n", $self->case_label($key));
+ }
- my $notfirst = 0;
- foreach my $key (sort keys %{$trie->{children}}) {
- if ($notfirst) {
- printf $fh (" " x $indent . " break;\n");
- }
- if ($ignore_case) {
- printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key)));
- printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
- } else {
- printf $fh (" " x $indent . "case %s:\n", $self->case_label($key));
+ $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
+
+ $notfirst=1;
}
- $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
+ printf $fh (" " x $indent . "}\n");
+ }
+
- $notfirst=1;
+ # This node has a value, so it is a possible end point. If no children
+ # matched, we have found our longest prefix.
+ if (defined $trie->{value}) {
+ printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
}
- printf $fh (" " x $indent . "}\n");
}
sub print_words {
@@ -524,22 +560,17 @@ package CCodeGen {
}
}
-# Check if the word can be reached by exactly one word in (alphabet OR 0x20).
+# A character is ambiguous if the 1<<5 (0x20) bit does not correspond to the
+# lower case bit. A word is ambiguous if any character is. This definition is
+# used to check if we can perform the |0x20 optimization when building a case-
+# insensitive trie.
sub ambiguous {
my $word = shift;
foreach my $char (split //, $word) {
- # Setting the lowercase flag in the character produces a different
- # character, the character would thus not be matched.
- return 1 if ((ord($char) | 0x20) != ord(lc($char)));
-
- # A word is also ambiguous if any character in lowercase can be reached
- # by ORing 0x20 from another character in the charset that is not a
- # lowercase character of the current character.
- # Assume that we have UTF-8 and the most significant bit can be set
- for my $i (0..255) {
- return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char));
- }
+ # If 0x20 does not solely indicate lowercase, it is ambiguous
+ return 1 if ord(lc($char)) != (ord($char) | 0x20);
+ return 1 if ord(uc($char)) != (ord($char) & ~0x20);
}
return 0;