summaryrefslogtreecommitdiff
path: root/triehash
diff options
context:
space:
mode:
Diffstat (limited to 'triehash')
-rw-r--r--triehash/.travis.yml14
-rw-r--r--triehash/LICENSE.md17
-rw-r--r--triehash/README.md58
-rw-r--r--triehash/tests/framework.sh84
-rwxr-xr-xtriehash/tests/run-tests.sh22
-rwxr-xr-xtriehash/tests/test-basic245
-rwxr-xr-xtriehash/tests/test-case-insensitive109
-rwxr-xr-xtriehash/tests/test-enum-include-name-flags129
-rwxr-xr-xtriehash/tests/test-multi-byte-level427
-rwxr-xr-xtriehash/triehash.pl691
10 files changed, 0 insertions, 1796 deletions
diff --git a/triehash/.travis.yml b/triehash/.travis.yml
deleted file mode 100644
index c66ea9697..000000000
--- a/triehash/.travis.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-language: perl
-perl:
- - "5.24"
- - "5.22"
- - "5.20"
- - "5.18"
- - "5.16"
- - "5.14"
-install:
- - cpanm --quiet --notest --skip-satisfied Devel::Cover Devel::Cover::Report::Codecov
-script:
- - ./tests/run-tests.sh
-after_script:
- - cover -report codecov
diff --git a/triehash/LICENSE.md b/triehash/LICENSE.md
deleted file mode 100644
index 89de35479..000000000
--- a/triehash/LICENSE.md
+++ /dev/null
@@ -1,17 +0,0 @@
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/triehash/README.md b/triehash/README.md
deleted file mode 100644
index d6f6e7fc3..000000000
--- a/triehash/README.md
+++ /dev/null
@@ -1,58 +0,0 @@
-# Order-preserving minimal perfect hash function generator
-
-Build order-preserving minimal perfect hash functions.
-
-[![codecov](https://codecov.io/gh/julian-klode/triehash/branch/master/graph/badge.svg)](https://codecov.io/gh/julian-klode/triehash)
-[![Build Status](https://travis-ci.org/julian-klode/triehash.svg?branch=master)](https://travis-ci.org/julian-klode/triehash)
-
-## Performance
-
-Performance was evaluated against other hash functions. As an input set, the
-fields of Debian Packages and Sources files was used, and each hash function
-was run 1,000,000 times for each word. The byte count of the words were then
-summed up and divided by the total number of nanoseconds each function ran, so
-all speeds below are given in bytes per nanosecond, AKA gigabyte per second.
-
-arch/function|jak-x230 (amd64)|backup (amd64)|asachi.d.o (arm64)|asachi.d.o (armel)|asachi.d.o (armhf)|plummer.d.o (ppc64el)|eller.d.o (mipsel)
--------------|----------------|--------------|------------------|------------------|------------------|---------------------|------------------
-Trie | 2.4 | 1.9 | 1.2 | 0.9 | 0.8 | 2.0 | 0.2
-Trie (*) | 2.2 | 1.7 | 0.8 | 0.7 | 0.7 | 1.8 | 0.2
-re2c | 1.7 | 1.3 | 0.9 | 0.9 | 0.7 | 1.6 | 0.2
-re2c (*) | 1.2 | 0.9 | 0.6 | 0.6 | 0.5 | 1.1 | 0.1
-gperf (*) | 0.7 | 0.5 | 0.2 | 0.2 | 0.2 | 0.5 | 0.1
-gperf | 1.3 | 0.9 | 0.3 | 0.3 | 0.2 | 0.4 | 0.1
-djb (*) | 0.7 | 0.5 | 0.3 | 0.3 | 0.3 | 0.5 | 0.1
-djb (**) | 1.0 | 0.7 | 0.4 | 0.5 | 0.5 | 0.6 | 0.2
-djb | 0.9 | 0.7 | 0.5 | 0.5 | 0.5 | 0.7 | 0.2
-apt (*) | 1.2 | 0.9 | 0.7 | 0.7 | 0.7 | 1.1 | 0.2
-apt (**) | 2.3 | 1.7 | 0.7 | 0.9 | 0.8 | 1.9 | 0.2
-
-And transposed:
-
-function/arch |Trie |Trie (*) |re2c |re2c (*) |gperf (*)|gperf |djb (*) |djb (**) |djb |apt (*) |apt (**)
----------------------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------
-jak-x230 (amd64) | 2.4| 2.2| 1.7| 1.2| 0.7| 1.3| 0.7| 1.0| 0.9| 1.2| 2.3
-backup (amd64) | 1.9| 1.7| 1.3| 0.9| 0.5| 0.9| 0.5| 0.7| 0.7| 0.9| 1.7
-asachi.d.o (arm64) | 1.2| 0.8| 0.9| 0.6| 0.2| 0.3| 0.3| 0.4| 0.5| 0.7| 0.7
-asachi.d.o (armel) | 0.9| 0.7| 0.9| 0.6| 0.2| 0.3| 0.3| 0.5| 0.5| 0.7| 0.9
-asachi.d.o (armhf) | 0.8| 0.7| 0.7| 0.5| 0.2| 0.2| 0.3| 0.5| 0.5| 0.7| 0.8
-plummer.d.o (ppc64el)| 2.0| 1.8| 1.6| 1.1| 0.5| 0.4| 0.5| 0.6| 0.7| 1.1| 1.9
-eller.d.o (mipsel) | 0.2| 0.2| 0.2| 0.1| 0.1| 0.1| 0.1| 0.2| 0.2| 0.2| 0.2
-
-
-Legend:
-
-* The (*) variants are case-insensitive, (**) are more optimised versions
- of the (*) versions.
-* DJB (*) is a DJB Hash with naive lowercase conversion, DJB (**) just ORs one
- bit into each value to get alphabetical characters to be lowercase
-* APT (*) is the AlphaHash function from APT which hashes the last 8 bytes in a
- word in a case-insensitive manner. APT (**) is the same function unrolled.
-* All hosts except the x230 are Debian porterboxes. The x230 has a Core i5-3320M,
- barriere has an Opteron 23xx.
-
-Notes:
-
-* The overhead is larger than needed on some platforms due to gcc inserting
- unneeded zero extend instructions, see:
- https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77729
diff --git a/triehash/tests/framework.sh b/triehash/tests/framework.sh
deleted file mode 100644
index 51d4580a6..000000000
--- a/triehash/tests/framework.sh
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/bin/sh
-# Simple integration test framework
-
-set -e
-
-
-cleanup() {
- rm -f test.output test.c test.h test.tree
-}
-
-dumpone() {
- if [ -e "$@" ]; then
- echo "Content of $@:"
- cat "$@" | sed "s#^#\t#g"
- fi
-
-}
-
-dump() {
- dumpone test.output
- dumpone test.c
- dumpone test.h
- dumpone test.tree
- return 1
-}
-
-testsuccess() {
- [ "$INNER" ] || cleanup
- [ "$INNER" ] || echo "Testing success of $@"
- if ! "$@" > test.output 2>&1; then
- echo "ERROR: Running $@ failed with error $?, messages were:" >&2
- dump
- return 1
- fi
-}
-
-testfailure() {
- [ "$INNER" ] || cleanup
- [ "$INNER" ] || echo "Testing failure of $@"
- if "$@" > test.output 2>&1; then
- echo "ERROR: Running $@ unexpectedly succeeded, messages were:" >&2
- dump
- return 1
- fi
-}
-
-testfileequal() {
- [ "$INNER" ] || echo "Testing output of $2"
- printf "%b\n" "$1" > expected
- if ! diff -u "expected" "$2" > test.diff; then
- echo "ERROR: Differences between expected output and and $2:" >&2
- cat test.diff | sed "s#^#\t#g"
- dump
- return 1
- fi
-}
-
-testgrep() {
- [ "$INNER" ] || echo "Testing grep $@"
- INNER=1 testsuccess grep "$@"
- unset INNER
-}
-
-testsuccessequal() {
- expect="$1"
- shift
- cleanup
- echo "Testing success and output of $@"
- INNER=1 testsuccess "$@"
- INNER=1 testfileequal "$expect" test.output
- unset INNER
-}
-
-
-WORDS="Word-_0
-Word = 42
-VeryLongWord
-Label ~ Word2
-= -9"
-
-triehash() {
- printf "%b\n" "$WORDS" | perl -MDevel::Cover=-summary,0,-silent,1 $(dirname $(dirname $(readlink -f $0)))/triehash.pl "$@" || return $?
- return $?
-}
diff --git a/triehash/tests/run-tests.sh b/triehash/tests/run-tests.sh
deleted file mode 100755
index b9c1ec309..000000000
--- a/triehash/tests/run-tests.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-DIR=$(dirname $(readlink -f $0))
-
-# Let's go into triehash.pl's dir
-cd $(dirname "$DIR")
-
-rm -rf cover_db
-
-count=$(cd "$DIR" && echo test-* | wc -w)
-i=1
-
-for test in $DIR/test-*; do
- echo "[$i/$count] Running testcase $test"
- if ! $test > test.summary 2>&1; then
- cat test.summary
- exit 1
- fi
- i=$((i + 1))
-done
-
-
-cover
diff --git a/triehash/tests/test-basic b/triehash/tests/test-basic
deleted file mode 100755
index 19cb08684..000000000
--- a/triehash/tests/test-basic
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/bin/sh
-. $(dirname $(readlink -f $0))/framework.sh
-
-# Check for non-existing files
-testfailure triehash -C /does/not/exist1 -H /does/not/exist1 /does/not/exist/input
-
-# Check that we can specify - for -C and -H
-testsuccessequal "#ifndef TRIE_HASH_PerfectHash
-#define TRIE_HASH_PerfectHash
-#include <stddef.h>
-#include <stdint.h>
-enum PerfectKey {
- Unknown = -1,
-};
-static enum PerfectKey PerfectHash(const char *string, size_t length);
-static enum PerfectKey PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- default:
- return Unknown;
- }
-}
-#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 -C - -H -
-
-# Check that split files work
-testsuccess triehash -C test.c -H test.h --multi-byte=0
-testfileequal "#include \"test.h\"
- enum PerfectKey PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- default:
- return Unknown;
- }
-}" test.c
-testfileequal "#ifndef TRIE_HASH_PerfectHash
-#define TRIE_HASH_PerfectHash
-#include <stddef.h>
-#include <stdint.h>
-enum PerfectKey {
- Unknown = -1,
-};
- enum PerfectKey PerfectHash(const char *string, size_t length);
-#endif /* TRIE_HASH_PerfectHash */" test.h
-
-
-# Check the C code generator
-testsuccess triehash -C test.c -H test.c /dev/stdin
-testfileequal "#ifndef TRIE_HASH_PerfectHash
-#define TRIE_HASH_PerfectHash
-#include <stddef.h>
-#include <stdint.h>
-enum PerfectKey {
- VeryLongWord = 43,
- Word = 42,
- Word___0 = 0,
- Label = 44,
- Unknown = -9,
-};
-static enum PerfectKey PerfectHash(const char *string, size_t length);
-#ifdef __GNUC__
-typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
-typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
-typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
-typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
-typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
-typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define onechar(c, s, l) (((uint64_t)(c)) << (s))
-#else
-#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
-#endif
-#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
-#define TRIE_HASH_MULTI_BYTE
-#endif
-#endif /*GNUC */
-#ifdef TRIE_HASH_MULTI_BYTE
-static enum PerfectKey PerfectHash4(const char *string)
-{
- switch(*((triehash_uu32*) &string[0])) {
- case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
- return Word;
- }
- return Unknown;
-}
-static enum PerfectKey PerfectHash5(const char *string)
-{
- switch(*((triehash_uu32*) &string[0])) {
- case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
- switch(string[4]) {
- case 0| onechar('2', 0, 8):
- return Label;
- }
- }
- return Unknown;
-}
-static enum PerfectKey PerfectHash7(const char *string)
-{
- switch(*((triehash_uu32*) &string[0])) {
- case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
- switch(string[4]) {
- case 0| onechar('-', 0, 8):
- switch(string[5]) {
- case 0| onechar('_', 0, 8):
- switch(string[6]) {
- case 0| onechar('0', 0, 8):
- return Word___0;
- }
- }
- }
- }
- return Unknown;
-}
-static enum PerfectKey PerfectHash12(const char *string)
-{
- switch(*((triehash_uu64*) &string[0])) {
- case 0| onechar('V', 0, 64)| onechar('e', 8, 64)| onechar('r', 16, 64)| onechar('y', 24, 64)| onechar('L', 32, 64)| onechar('o', 40, 64)| onechar('n', 48, 64)| onechar('g', 56, 64):
- switch(*((triehash_uu32*) &string[8])) {
- case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
- return VeryLongWord;
- }
- }
- return Unknown;
-}
-#else
-static enum PerfectKey PerfectHash4(const char *string)
-{
- switch(string[0]) {
- case 'W':
- switch(string[1]) {
- case 'o':
- switch(string[2]) {
- case 'r':
- switch(string[3]) {
- case 'd':
- return Word;
- }
- }
- }
- }
- return Unknown;
-}
-static enum PerfectKey PerfectHash5(const char *string)
-{
- switch(string[0]) {
- case 'W':
- switch(string[1]) {
- case 'o':
- switch(string[2]) {
- case 'r':
- switch(string[3]) {
- case 'd':
- switch(string[4]) {
- case '2':
- return Label;
- }
- }
- }
- }
- }
- return Unknown;
-}
-static enum PerfectKey PerfectHash7(const char *string)
-{
- switch(string[0]) {
- case 'W':
- switch(string[1]) {
- case 'o':
- switch(string[2]) {
- case 'r':
- switch(string[3]) {
- case 'd':
- switch(string[4]) {
- case '-':
- switch(string[5]) {
- case '_':
- switch(string[6]) {
- case '0':
- return Word___0;
- }
- }
- }
- }
- }
- }
- }
- return Unknown;
-}
-static enum PerfectKey PerfectHash12(const char *string)
-{
- switch(string[0]) {
- case 'V':
- switch(string[1]) {
- case 'e':
- switch(string[2]) {
- case 'r':
- switch(string[3]) {
- case 'y':
- switch(string[4]) {
- case 'L':
- switch(string[5]) {
- case 'o':
- switch(string[6]) {
- case 'n':
- switch(string[7]) {
- case 'g':
- switch(string[8]) {
- case 'W':
- switch(string[9]) {
- case 'o':
- switch(string[10]) {
- case 'r':
- switch(string[11]) {
- case 'd':
- return VeryLongWord;
- }
- }
- }
- }
- }
- }
- }
- }
- }
- }
- }
- }
- return Unknown;
-}
-#endif /* TRIE_HASH_MULTI_BYTE */
-static enum PerfectKey PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- case 4:
- return PerfectHash4(string);
- case 5:
- return PerfectHash5(string);
- case 7:
- return PerfectHash7(string);
- case 12:
- return PerfectHash12(string);
- default:
- return Unknown;
- }
-}
-#endif /* TRIE_HASH_PerfectHash */" test.c
diff --git a/triehash/tests/test-case-insensitive b/triehash/tests/test-case-insensitive
deleted file mode 100755
index 25ab2dd78..000000000
--- a/triehash/tests/test-case-insensitive
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/bin/sh
-. $(dirname $(readlink -f $0))/framework.sh
-
-WORDS="Halllo\nH-lllo\nHalll1"
-
-# Case-insensitive test
-testsuccessequal "#include \"/dev/null\"
-#ifdef __GNUC__
-typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
-typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
-typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
-typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
-typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
-typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define onechar(c, s, l) (((uint64_t)(c)) << (s))
-#else
-#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
-#endif
-#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
-#define TRIE_HASH_MULTI_BYTE
-#endif
-#endif /*GNUC */
-#ifdef TRIE_HASH_MULTI_BYTE
-static enum PerfectKey PerfectHash6(const char *string)
-{
- switch(string[0] | 0x20) {
- case 0| onechar('h', 0, 8):
- switch(string[1]) {
- case 0| onechar('-', 0, 8):
- switch(*((triehash_uu32*) &string[2]) | 0x20202020) {
- case 0| onechar('l', 0, 32)| onechar('l', 8, 32)| onechar('l', 16, 32)| onechar('o', 24, 32):
- return H_lllo;
- }
- break;
- case 0| onechar('a', 0, 8):
- case 0| onechar('A', 0, 8):
- switch(*((triehash_uu16*) &string[2]) | 0x2020) {
- case 0| onechar('l', 0, 16)| onechar('l', 8, 16):
- switch(string[4] | 0x20) {
- case 0| onechar('l', 0, 8):
- switch(string[5]) {
- case 0| onechar('1', 0, 8):
- return Halll1;
- break;
- case 0| onechar('o', 0, 8):
- case 0| onechar('O', 0, 8):
- return Halllo;
- }
- }
- }
- }
- }
- return Unknown;
-}
-#else
-static enum PerfectKey PerfectHash6(const char *string)
-{
- switch(string[0] | 0x20) {
- case 'h':
- switch(string[1]) {
- case '-':
- switch(string[2] | 0x20) {
- case 'l':
- switch(string[3] | 0x20) {
- case 'l':
- switch(string[4] | 0x20) {
- case 'l':
- switch(string[5] | 0x20) {
- case 'o':
- return H_lllo;
- }
- }
- }
- }
- break;
- case 'a':
- case 'A':
- switch(string[2] | 0x20) {
- case 'l':
- switch(string[3] | 0x20) {
- case 'l':
- switch(string[4] | 0x20) {
- case 'l':
- switch(string[5]) {
- case '1':
- return Halll1;
- break;
- case 'o':
- case 'O':
- return Halllo;
- }
- }
- }
- }
- }
- }
- return Unknown;
-}
-#endif /* TRIE_HASH_MULTI_BYTE */
- enum PerfectKey PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- case 6:
- return PerfectHash6(string);
- default:
- return Unknown;
- }
-}" triehash --multi-byte=3210 --ignore-case -H /dev/null /dev/stdin
diff --git a/triehash/tests/test-enum-include-name-flags b/triehash/tests/test-enum-include-name-flags
deleted file mode 100755
index 33bd97c0f..000000000
--- a/triehash/tests/test-enum-include-name-flags
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/bin/sh
-. $(dirname $(readlink -f $0))/framework.sh
-
-# Need a short word, we really just need to check if the labels work
-WORDS=w
-
-testsuccessequal "\
-#ifndef TRIE_HASH_PerfectHash
-#define TRIE_HASH_PerfectHash
-#include <stddef.h>
-#include <stdint.h>
-#include <foo.h>
-enum PerfectKey {
- w = 0,
- Unknown = -1,
-};
- enum PerfectKey PerfectHash(const char *string, size_t length);
-#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 -C /dev/null --include="<foo.h>" /dev/stdin
-
-# Check for --enum-class support
-testsuccessequal "\
-#ifndef TRIE_HASH_PerfectHash
-#define TRIE_HASH_PerfectHash
-#include <stddef.h>
-#include <stdint.h>
-enum class PerfectKey {
- w = 0,
- Unknown = -1,
-};
-static enum PerfectKey PerfectHash(const char *string, size_t length);
-static enum PerfectKey PerfectHash1(const char *string)
-{
- switch(string[0]) {
- case 'w':
- return PerfectKey::w;
- }
- return PerfectKey::Unknown;
-}
-static enum PerfectKey PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- case 1:
- return PerfectHash1(string);
- default:
- return PerfectKey::Unknown;
- }
-}
-#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 --enum-class /dev/stdin
-
-# Check for --enum-name support
-testsuccessequal "\
-#ifndef TRIE_HASH_PerfectHash
-#define TRIE_HASH_PerfectHash
-#include <stddef.h>
-#include <stdint.h>
-enum Foo {
- Unknown = -1,
-};
-static enum Foo PerfectHash(const char *string, size_t length);
-static enum Foo PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- default:
- return Unknown;
- }
-}
-#endif /* TRIE_HASH_PerfectHash */\
-" triehash --multi-byte=0 --enum-name="Foo"
-
-# Check for --enum-class support
-testsuccessequal "\
-#ifndef TRIE_HASH_PerfectHash
-#define TRIE_HASH_PerfectHash
-#include <stddef.h>
-#include <stdint.h>
-enum class Foo::Bar {
- Unknown = -1,
-};
-static enum Foo::Bar PerfectHash(const char *string, size_t length);
-static enum Foo::Bar PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- default:
- return Foo::Bar::Unknown;
- }
-}
-#endif /* TRIE_HASH_PerfectHash */\
-" triehash --multi-byte=0 --enum-class --enum-name="Foo::Bar"
-
-# Check for --function-name support
-testsuccessequal "\
-#ifndef TRIE_HASH_NonSense
-#define TRIE_HASH_NonSense
-#include <stddef.h>
-#include <stdint.h>
-enum PerfectKey {
- Unknown = -1,
-};
-static enum PerfectKey NonSense(const char *string, size_t length);
-static enum PerfectKey NonSense(const char *string, size_t length)
-{
- switch (length) {
- default:
- return Unknown;
- }
-}
-#endif /* TRIE_HASH_NonSense */\
-" triehash --multi-byte=0 --function-name="NonSense"
-
-# Check for --counter-name support
-testsuccessequal "\
-#ifndef TRIE_HASH_PerfectHash
-#define TRIE_HASH_PerfectHash
-#include <stddef.h>
-#include <stdint.h>
-enum { MyCounter = 0 };
-enum PerfectKey {
- Unknown = -1,
-};
-static enum PerfectKey PerfectHash(const char *string, size_t length);
-static enum PerfectKey PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- default:
- return Unknown;
- }
-}
-#endif /* TRIE_HASH_PerfectHash */\
-" triehash --multi-byte=0 --counter-name="MyCounter"
diff --git a/triehash/tests/test-multi-byte-level b/triehash/tests/test-multi-byte-level
deleted file mode 100755
index ddfb8cd1b..000000000
--- a/triehash/tests/test-multi-byte-level
+++ /dev/null
@@ -1,427 +0,0 @@
-#!/bin/sh
-. $(dirname $(readlink -f $0))/framework.sh
-
-# Check that building a single-byte trie works
-testsuccessequal "\
-┌────────────────────────────────────────────────────┐
-│ Initial trie │
-└────────────────────────────────────────────────────┘
-
-├── V
-│ ├── e
-│ │ ├── r
-│ │ │ ├── y
-│ │ │ │ ├── L
-│ │ │ │ │ ├── o
-│ │ │ │ │ │ ├── n
-│ │ │ │ │ │ │ ├── g
-│ │ │ │ │ │ │ │ ├── W
-│ │ │ │ │ │ │ │ │ ├── o
-│ │ │ │ │ │ │ │ │ │ ├── r
-│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d → Word
-│ │ │ │ ├── -
-│ │ │ │ │ ├── _
-│ │ │ │ │ │ ├── 0 → Word-_0
-│ │ │ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Rebuilt trie │
-└────────────────────────────────────────────────────┘
-
-├── V
-│ ├── e
-│ │ ├── r
-│ │ │ ├── y
-│ │ │ │ ├── L
-│ │ │ │ │ ├── o
-│ │ │ │ │ │ ├── n
-│ │ │ │ │ │ │ ├── g
-│ │ │ │ │ │ │ │ ├── W
-│ │ │ │ │ │ │ │ │ ├── o
-│ │ │ │ │ │ │ │ │ │ ├── r
-│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d → Word
-│ │ │ │ ├── -
-│ │ │ │ │ ├── _
-│ │ │ │ │ │ ├── 0 → Word-_0
-│ │ │ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 4 │
-└────────────────────────────────────────────────────┘
-
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d → Word
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 5 │
-└────────────────────────────────────────────────────┘
-
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d
-│ │ │ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 7 │
-└────────────────────────────────────────────────────┘
-
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d
-│ │ │ │ ├── -
-│ │ │ │ │ ├── _
-│ │ │ │ │ │ ├── 0 → Word-_0
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 12 │
-└────────────────────────────────────────────────────┘
-
-├── V
-│ ├── e
-│ │ ├── r
-│ │ │ ├── y
-│ │ │ │ ├── L
-│ │ │ │ │ ├── o
-│ │ │ │ │ │ ├── n
-│ │ │ │ │ │ │ ├── g
-│ │ │ │ │ │ │ │ ├── W
-│ │ │ │ │ │ │ │ │ ├── o
-│ │ │ │ │ │ │ │ │ │ ├── r
-│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord" triehash --multi-byte=0 -l tree /dev/stdin
-
-# Two byte optimization
-testsuccessequal "\
-┌────────────────────────────────────────────────────┐
-│ Initial trie │
-└────────────────────────────────────────────────────┘
-
-├── Ve
-│ ├── ry
-│ │ ├── Lo
-│ │ │ ├── ng
-│ │ │ │ ├── Wo
-│ │ │ │ │ ├── rd → VeryLongWord
-├── Wo
-│ ├── rd → Word
-│ │ ├── -_
-│ │ │ ├── 0 → Word-_0
-│ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Rebuilt trie │
-└────────────────────────────────────────────────────┘
-
-├── Ve
-│ ├── ry
-│ │ ├── Lo
-│ │ │ ├── ng
-│ │ │ │ ├── Wo
-│ │ │ │ │ ├── rd → VeryLongWord
-├── Wo
-│ ├── rd → Word
-│ │ ├── -
-│ │ │ ├── _0 → Word-_0
-│ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 4 │
-└────────────────────────────────────────────────────┘
-
-├── Wo
-│ ├── rd → Word
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 5 │
-└────────────────────────────────────────────────────┘
-
-├── Wo
-│ ├── rd
-│ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 7 │
-└────────────────────────────────────────────────────┘
-
-├── Wo
-│ ├── rd
-│ │ ├── -_
-│ │ │ ├── 0 → Word-_0
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 12 │
-└────────────────────────────────────────────────────┘
-
-├── Ve
-│ ├── ry
-│ │ ├── Lo
-│ │ │ ├── ng
-│ │ │ │ ├── Wo
-│ │ │ │ │ ├── rd → VeryLongWord" triehash --multi-byte=1 -l tree /dev/stdin
-# Four byte optimization
-testsuccessequal "\
-┌────────────────────────────────────────────────────┐
-│ Initial trie │
-└────────────────────────────────────────────────────┘
-
-├── Very
-│ ├── Long
-│ │ ├── Word → VeryLongWord
-├── Word → Word
-│ ├── -
-│ │ ├── _
-│ │ │ ├── 0 → Word-_0
-│ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Rebuilt trie │
-└────────────────────────────────────────────────────┘
-
-├── Very
-│ ├── Long
-│ │ ├── Word → VeryLongWord
-├── Word → Word
-│ ├── -
-│ │ ├── _
-│ │ │ ├── 0 → Word-_0
-│ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 4 │
-└────────────────────────────────────────────────────┘
-
-├── Word → Word
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 5 │
-└────────────────────────────────────────────────────┘
-
-├── Word
-│ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 7 │
-└────────────────────────────────────────────────────┘
-
-├── Word
-│ ├── -
-│ │ ├── _
-│ │ │ ├── 0 → Word-_0
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 12 │
-└────────────────────────────────────────────────────┘
-
-├── Very
-│ ├── Long
-│ │ ├── Word → VeryLongWord" triehash --multi-byte=2 -l tree /dev/stdin
-# Eigh byte optimization
-testsuccessequal "\
-┌────────────────────────────────────────────────────┐
-│ Initial trie │
-└────────────────────────────────────────────────────┘
-
-├── VeryLong
-│ ├── W
-│ │ ├── o
-│ │ │ ├── r
-│ │ │ │ ├── d → VeryLongWord
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d → Word
-│ │ │ │ ├── -
-│ │ │ │ │ ├── _
-│ │ │ │ │ │ ├── 0 → Word-_0
-│ │ │ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Rebuilt trie │
-└────────────────────────────────────────────────────┘
-
-├── V
-│ ├── eryLongW
-│ │ ├── o
-│ │ │ ├── r
-│ │ │ │ ├── d → VeryLongWord
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d → Word
-│ │ │ │ ├── -
-│ │ │ │ │ ├── _
-│ │ │ │ │ │ ├── 0 → Word-_0
-│ │ │ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 4 │
-└────────────────────────────────────────────────────┘
-
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d → Word
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 5 │
-└────────────────────────────────────────────────────┘
-
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d
-│ │ │ │ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 7 │
-└────────────────────────────────────────────────────┘
-
-├── W
-│ ├── o
-│ │ ├── r
-│ │ │ ├── d
-│ │ │ │ ├── -
-│ │ │ │ │ ├── _
-│ │ │ │ │ │ ├── 0 → Word-_0
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 12 │
-└────────────────────────────────────────────────────┘
-
-├── VeryLong
-│ ├── W
-│ │ ├── o
-│ │ │ ├── r
-│ │ │ │ ├── d → VeryLongWord" triehash --multi-byte=3 -l tree /dev/stdin
-
-
-# Check that building a multi-byte trie works
-testsuccessequal "\
-┌────────────────────────────────────────────────────┐
-│ Initial trie │
-└────────────────────────────────────────────────────┘
-
-├── VeryLong
-│ ├── Word → VeryLongWord
-├── Word → Word
-│ ├── -
-│ │ ├── _
-│ │ │ ├── 0 → Word-_0
-│ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Rebuilt trie │
-└────────────────────────────────────────────────────┘
-
-├── Very
-│ ├── LongWord → VeryLongWord
-├── Word → Word
-│ ├── -
-│ │ ├── _
-│ │ │ ├── 0 → Word-_0
-│ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 4 │
-└────────────────────────────────────────────────────┘
-
-├── Word → Word
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 5 │
-└────────────────────────────────────────────────────┘
-
-├── Word
-│ ├── 2 → Label
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 7 │
-└────────────────────────────────────────────────────┘
-
-├── Word
-│ ├── -
-│ │ ├── _
-│ │ │ ├── 0 → Word-_0
-┌────────────────────────────────────────────────────┐
-│ Trie for words of length 12 │
-└────────────────────────────────────────────────────┘
-
-├── VeryLong
-│ ├── Word → VeryLongWord" triehash -l tree /dev/stdin
-
-
-###### CHANGE THE WORDS FOR THE FOLLOWING TESTS #######
-WORDS="Word"
-
-# Check that we are generating the proper multi-byte and fallback sessions
-testsuccessequal "#include \"/dev/null\"
-#ifdef __GNUC__
-typedef uint16_t __attribute__((aligned (1))) triehash_uu16;
-typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1];
-typedef uint32_t __attribute__((aligned (1))) triehash_uu32;
-typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1];
-typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
-typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1];
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define onechar(c, s, l) (((uint64_t)(c)) << (s))
-#else
-#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))
-#endif
-#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)
-#define TRIE_HASH_MULTI_BYTE
-#endif
-#endif /*GNUC */
-#ifdef TRIE_HASH_MULTI_BYTE
-static enum PerfectKey PerfectHash4(const char *string)
-{
- switch(*((triehash_uu32*) &string[0])) {
- case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32):
- return Word;
- }
- return Unknown;
-}
-#else
-static enum PerfectKey PerfectHash4(const char *string)
-{
- switch(string[0]) {
- case 'W':
- switch(string[1]) {
- case 'o':
- switch(string[2]) {
- case 'r':
- switch(string[3]) {
- case 'd':
- return Word;
- }
- }
- }
- }
- return Unknown;
-}
-#endif /* TRIE_HASH_MULTI_BYTE */
- enum PerfectKey PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- case 4:
- return PerfectHash4(string);
- default:
- return Unknown;
- }
-}" triehash -H /dev/null /dev/stdin
-
-
-# Check that we are generating no multi-byte session
-testsuccessequal "#include \"/dev/null\"
-static enum PerfectKey PerfectHash4(const char *string)
-{
- switch(string[0]) {
- case 'W':
- switch(string[1]) {
- case 'o':
- switch(string[2]) {
- case 'r':
- switch(string[3]) {
- case 'd':
- return Word;
- }
- }
- }
- }
- return Unknown;
-}
- enum PerfectKey PerfectHash(const char *string, size_t length)
-{
- switch (length) {
- case 4:
- return PerfectHash4(string);
- default:
- return Unknown;
- }
-}" triehash --multi-byte=0 -H /dev/null /dev/stdin
diff --git a/triehash/triehash.pl b/triehash/triehash.pl
deleted file mode 100755
index 841c0e7e2..000000000
--- a/triehash/triehash.pl
+++ /dev/null
@@ -1,691 +0,0 @@
-#!/usr/bin/perl -w
-#
-# Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-
-=head1 NAME
-
-triehash - Generate a perfect hash function derived from a trie.
-
-=cut
-
-use strict;
-use warnings;
-use Getopt::Long;
-
-=head1 SYNOPSIS
-
-B<triehash> [S<I<option>>] [S<I<input file>>]
-
-=head1 DESCRIPTION
-
-triehash takes a list of words in input file and generates a function and
-an enumeration to describe the word
-
-=head1 INPUT FILE FORMAT
-
-The file consists of multiple lines of the form:
-
- [label ~ ] word [= value]
-
-This maps word to value, and generates an enumeration with entries of the form:
-
- label = value
-
-If I<label> is undefined, the word will be used, the minus character will be
-replaced by an underscore. If value is undefined it is counted upwards from
-the last value.
-
-There may also be one line of the format
-
- [ label ~] = value
-
-Which defines the value to be used for non-existing keys. Note that this also
-changes default value for other keys, as for normal entries. So if you place
-
- = 0
-
-at the beginning of the file, unknown strings map to 0, and the other strings
-map to values starting with 1. If label is not specified, the default is
-I<Unknown>.
-
-=head1 OPTIONS
-
-=over 4
-
-=item B<-C>I<.c file> B<--code>=I<.c file>
-
-Generate code in the given file.
-
-=item B<-H>I<header file> B<--header>=I<header file>
-
-Generate a header in the given file, containing a declaration of the hash
-function and an enumeration.
-
-=item B<--enum-name=>I<word>
-
-The name of the enumeration.
-
-=item B<--function-name=>I<word>
-
-The name of the function.
-
-=item B<--namespace=>I<name>
-
-Put the function and enum into a namespace (C++)
-
-=item B<--class=>I<name>
-
-Put the function and enum into a class (C++)
-
-=item B<--enum-class>
-
-Generate an enum class instead of an enum (C++)
-
-=item B<--counter-name=>I<name>
-
-Use I<name> for a counter that is set to the latest entry in the enumeration
-+ 1. This can be useful for defining array sizes.
-
-=item B<--extern-c>
-
-Wrap everything into an extern "C" block. Not compatible with the C++
-options, as a header with namespaces, classes, or enum classes is not
-valid C.
-
-=item B<--multi-byte>=I<value>
-
-Generate code reading multiple bytes at once. The value is a string of power
-of twos to enable. The default value is 320 meaning that 8, 4, and single byte
-reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you
-also want to allow 2-byte reads. 2-byte reads are disabled by default because
-they negatively affect performance on older Intel architectures.
-
-This generates code for both multiple bytes and single byte reads, but only
-enables the multiple byte reads of GNU C compatible compilers, as the following
-extensions are used:
-
-=over 8
-
-=item Byte-aligned integers
-
-We must be able to generate integers that are aligned to a single byte using:
-
- typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
-
-=item Byte-order
-
-The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.
-
-=back
-
-We forcefully disable multi-byte reads on platforms where the variable
-I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
-as there is a measurable overhead from emulating the unaligned reads on
-ARM.
-
-=item B<--language=>I<language>
-
-Generate a file in the specified language. Currently known are 'C' and 'tree',
-the latter generating a tree.
-
-=item B<--include=>I<header>
-
-Add the header to the include statements of the header file. The value must
-be surrounded by quotes or angle brackets for C code. May be specified multiple
-times.
-
-=back
-
-=cut
-
-my $unknown = -1;
-my $unknown_label = "Unknown";
-my $counter_start = 0;
-my $enum_name = "PerfectKey";
-my $function_name = "PerfectHash";
-my $enum_class = 0;
-
-my $code_name = "-";
-my $header_name = "-";
-my $code;
-my $header;
-my $ignore_case = 0;
-my $multi_byte = "320";
-my $language = 'C';
-my $counter_name = undef;
-my @includes = ();
-
-
-Getopt::Long::config('default',
- 'bundling',
- 'no_getopt_compat',
- 'no_auto_abbrev',
- 'permute',
- 'auto_help');
-
-GetOptions ("code|C=s" => \$code_name,
- "header|H=s" => \$header_name,
- "function-name=s" => \$function_name,
- "ignore-case" => \$ignore_case,
- "enum-name=s" => \$enum_name,
- "language|l=s" => \$language,
- "multi-byte=s" => \$multi_byte,
- "enum-class" => \$enum_class,
- "include=s" => \@includes,
- "counter-name=s" => \$counter_name)
- or die("Could not parse options!");
-
-
-# This implements a simple trie. Each node has three attributes:
-#
-# children - A hash of keys to other nodes
-# value - The value to be stored here
-# label - A named representation of the value.
-#
-# The key at each level of the trie can consist of one or more bytes, and the
-# trie can be normalized to a form where all keys at a level have the same
-# length using rebuild_tree().
-package Trie {
-
- sub new {
- my $class = shift;
- my $self = {};
- bless $self, $class;
-
- $self->{children} = {};
- $self->{value} = undef;
- $self->{label} = undef;
-
- return $self;
- }
-
- # Return the largest power of 2 smaller or equal to the argument
- sub alignpower2 {
- my ($self, $length) = @_;
-
- return 8 if ($length >= 8 && $multi_byte =~ /3/);
- return 4 if ($length >= 4 && $multi_byte =~ /2/);
- return 2 if ($length >= 2 && $multi_byte =~ /1/);
-
- return 1;
- }
-
- # Split the key into a head block and a tail
- sub split_key {
- my ($self, $key) = @_;
- my $length = length $key;
- my $split = $self->alignpower2($length);
-
- return (substr($key, 0, $split), substr($key, $split));
- }
-
- # Given a key, a label, and a value, insert that into the tree, possibly
- # replacing an existing node.
- sub insert {
- my ($self, $key, $label, $value) = @_;
-
- if (length($key) == 0) {
- $self->{label} = $label;
- $self->{value} = $value;
- return;
- }
-
- my ($child, $tail) = $self->split_key($key);
-
- $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child}));
-
- $self->{children}{$child}->insert($tail, $label, $value);
- }
-
- # Construct a new trie that only contains words of a given length. This
- # is used to split up the common trie after knowing all words, so we can
- # switch on the expected word length first, and have the per-trie function
- # implement simple longest prefix matching.
- sub filter_depth {
- my ($self, $togo) = @_;
-
- my $new = Trie->new;
-
- if ($togo != 0) {
- my $found = 0;
- foreach my $key (sort keys %{$self->{children}}) {
- if ($togo > length($key) || defined $self->{children}{$key}->{value}) {
- my $child = $self->{children}{$key}->filter_depth($togo - length($key));
-
- $new->{children}{$key}= $child if defined $child;
- $found = 1 if defined $child;
- }
- }
- return undef if (!$found);
- } else {
- $new->{value} = $self->{value};
- $new->{label} = $self->{label};
- }
-
- return $new;
- }
-
- # (helper for rebuild_tree)
- # Reinsert all value nodes into the specified $trie, prepending $prefix
- # to their $paths.
- sub reinsert_value_nodes_into {
- my ($self, $trie, $prefix) = @_;
-
- $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value});
-
- foreach my $key (sort keys %{$self->{children}}) {
- $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
- }
- }
-
- # (helper for rebuild_tree)
- # Find the earliest point to split a key. Normally, we split at the maximum
- # power of 2 that is greater or equal than the length of the key. When we
- # are building an ASCII-optimised case-insensitive trie that simply ORs
- # each byte with 0x20, we need to split at the first ambiguous character:
- #
- # For example, the words a-bc and a\rbc are identical in such a situation:
- # '-' | 0x20 == '-' == '\r' | 0x20
- # We cannot simply switch on all 4 bytes at once, but need to split before
- # the ambiguous character so we can process the ambiguous character on its
- # own.
- sub find_ealier_split {
- my ($self, $key) = @_;
-
- if ($ignore_case) {
- for my $i (0..length($key)-1) {
- # If the key starts with an ambiguous character, we need to
- # take only it. Otherwise, we need to take everything
- # before the character.
- return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1)));
- }
- }
- return $self->alignpower2(length $key);
- }
-
- # This rebuilds the trie, splitting each key before ambiguous characters
- # as explained in find_earlier_split(), and then chooses the smallest
- # such split at each level, so that all keys at all levels have the same
- # length (so we can use a multi-byte switch).
- sub rebuild_tree {
- my $self = shift;
- # Determine if/where we need to split before an ambiguous character
- my $new_split = 99999999999999999;
- foreach my $key (sort keys %{$self->{children}}) {
- my $special_length = $self->find_ealier_split($key);
- $new_split = $special_length if ($special_length < $new_split);
- }
-
- # Start building a new uniform trie
- my $newself = Trie->new;
- $newself->{label} = $self->{label};
- $newself->{value} = $self->{value};
- $newself->{children} = {};
-
- foreach my $key (sort keys %{$self->{children}}) {
- my $head = substr($key, 0, $new_split);
- my $tail = substr($key, $new_split);
- # Rebuild the child node at $head, pushing $tail downwards
- $newself->{children}{$head} //= Trie->new;
- $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail);
- # We took up to one special character of each key label. There might
- # be more, so we need to rebuild recursively.
- $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree();
- }
-
- return $newself;
- }
-}
-
-# Code generator for C and C++
-package CCodeGen {
- my $static = ($code_name eq $header_name) ? "static" : "";
- my $enum_specifier = $enum_class ? "enum class" : "enum";
-
- sub new {
- my $class = shift;
- my $self = {};
- bless $self, $class;
-
- return $self;
- }
-
- sub open_output {
- my $self = shift;
- if ($code_name ne "-") {
- open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
- } else {
- $code = *STDOUT;
- }
- if($code_name eq $header_name) {
- $header = $code;
- } elsif ($header_name ne "-") {
- open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
- } else {
- $header = *STDOUT;
- }
- }
-
- sub word_to_label {
- my ($class, $word) = @_;
-
- $word =~ s/_/__/g;
- $word =~ s/-/_/g;
- return $word;
- }
-
- # Return a case label, by shifting and or-ing bytes in the word
- sub case_label {
- my ($self, $key) = @_;
-
- return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;
-
- my $output = '0';
-
- for my $i (0..length($key)-1) {
- $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
- }
-
- return $output;
- }
-
- # Return an appropriate read instruction for $length bytes from $offset
- sub switch_key {
- my ($self, $offset, $length) = @_;
-
- return "string[$offset]" if $length == 1;
- return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
- }
-
- # Render the trie so that it matches the longest prefix.
- sub print_table {
- my ($self, $trie, $fh, $indent, $index) = @_;
- $indent //= 0;
- $index //= 0;
-
- # If we have children, try to match them.
- if (%{$trie->{children}}) {
- # The difference between lowercase and uppercase alphabetical characters
- # is that they have one bit flipped. If we have alphabetical characters
- # in the search space, and the entire search space works fine if we
- # always turn on the flip, just OR the character we are switching over
- # with the bit.
- my $want_use_bit = 0;
- my $can_use_bit = 1;
- my $key_length = 0;
- foreach my $key (sort keys %{$trie->{children}}) {
- $can_use_bit &= not main::ambiguous($key);
- $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
- $key_length = length($key);
- }
-
- if ($ignore_case && $can_use_bit && $want_use_bit) {
- printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
- } else {
- printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
- }
-
- my $notfirst = 0;
- foreach my $key (sort keys %{$trie->{children}}) {
- if ($notfirst) {
- printf $fh (" " x $indent . " break;\n");
- }
- if ($ignore_case) {
- printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key)));
- printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
- } else {
- printf $fh (" " x $indent . "case %s:\n", $self->case_label($key));
- }
-
- $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
-
- $notfirst=1;
- }
-
- printf $fh (" " x $indent . "}\n");
- }
-
-
- # This node has a value, so it is a possible end point. If no children
- # matched, we have found our longest prefix.
- if (defined $trie->{value}) {
- printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
- }
-
- }
-
- sub print_words {
- my ($self, $trie, $fh, $indent, $sofar) = @_;
-
- $indent //= 0;
- $sofar //= "";
-
-
- printf $fh (" " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value};
-
- foreach my $key (sort keys %{$trie->{children}}) {
- $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key);
- }
- }
-
- sub print_functions {
- my ($self, $trie, %lengths) = @_;
- foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
- print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
- print $code ("{\n");
- $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
- printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
- print $code ("}\n");
- }
- }
-
- sub main {
- my ($self, $trie, $num_values, %lengths) = @_;
- print $header ("#ifndef TRIE_HASH_${function_name}\n");
- print $header ("#define TRIE_HASH_${function_name}\n");
- print $header ("#include <stddef.h>\n");
- print $header ("#include <stdint.h>\n");
- foreach my $include (@includes) {
- print $header ("#include $include\n");
- }
- printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
- print $header ("${enum_specifier} ${enum_name} {\n");
- $self->print_words($trie, $header, 1);
- printf $header (" $unknown_label = $unknown,\n");
- print $header ("};\n");
- print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");
-
- print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);
-
- if ($multi_byte) {
- print $code ("#ifdef __GNUC__\n");
- for (my $i=16; $i <= 64; $i *= 2) {
- print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
- print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
- }
-
- print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
- print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
- print $code ("#else\n");
- print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
- print $code ("#endif\n");
- print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
- print $code ("#define TRIE_HASH_MULTI_BYTE\n");
- print $code ("#endif\n");
- print $code ("#endif /*GNUC */\n");
-
- print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
- $self->print_functions($trie, %lengths);
- $multi_byte = 0;
- print $code ("#else\n");
- $self->print_functions($trie, %lengths);
- print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
- } else {
- $self->print_functions($trie, %lengths);
- }
-
- print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
- print $code ("{\n");
- print $code (" switch (length) {\n");
- foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
- print $code (" case $local_length:\n");
- print $code (" return ${function_name}${local_length}(string);\n");
- }
- print $code (" default:\n");
- printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
- print $code (" }\n");
- print $code ("}\n");
-
- # Print end of header here, in case header and code point to the same file
- print $header ("#endif /* TRIE_HASH_${function_name} */\n");
- }
-}
-
-# A character is ambiguous if the 1<<5 (0x20) bit does not correspond to the
-# lower case bit. A word is ambiguous if any character is. This definition is
-# used to check if we can perform the |0x20 optimization when building a case-
-# insensitive trie.
-sub ambiguous {
- my $word = shift;
-
- foreach my $char (split //, $word) {
- # If 0x20 does not solely indicate lowercase, it is ambiguous
- return 1 if ord(lc($char)) != (ord($char) | 0x20);
- return 1 if ord(uc($char)) != (ord($char) & ~0x20);
- }
-
- return 0;
-}
-
-sub build_trie {
- my $codegen = shift;
- my $trie = Trie->new;
-
- my $counter = $counter_start;
- my %lengths;
-
- open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
- while (my $line = <$input>) {
- my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/;
-
- if (defined $word) {
- $counter = $value if defined($value);
- $label //= $codegen->word_to_label($word);
-
- $trie->insert($word, $label, $counter);
- $lengths{length($word)} = 1;
- $counter++;
- } elsif (defined $value) {
- $unknown = $value;
- $unknown_label = $label if defined($label);
- $counter = $value + 1;
- } else {
- die "Invalid line: $line";
- }
- }
-
- return ($trie, $counter, %lengths);
-}
-
-# Generates an ASCII art tree
-package TreeCodeGen {
-
- sub new {
- my $class = shift;
- my $self = {};
- bless $self, $class;
-
- return $self;
- }
-
- sub word_to_label {
- my ($self, $word) = @_;
- return $word;
- }
-
- sub main {
- my ($self, $trie, $counter, %lengths) = @_;
- printf $code ("┌────────────────────────────────────────────────────┐\n");
- printf $code ("│ Initial trie │\n");
- printf $code ("└────────────────────────────────────────────────────┘\n");
- $self->print($trie);
- printf $code ("┌────────────────────────────────────────────────────┐\n");
- printf $code ("│ Rebuilt trie │\n");
- printf $code ("└────────────────────────────────────────────────────┘\n");
- $self->print($trie->rebuild_tree());
-
- foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
- printf $code ("┌────────────────────────────────────────────────────┐\n");
- printf $code ("│ Trie for words of length %-4d │\n", $local_length);
- printf $code ("└────────────────────────────────────────────────────┘\n");
- $self->print($trie->filter_depth($local_length)->rebuild_tree());
- }
- }
-
- sub open_output {
- my $self = shift;
- if ($code_name ne "-") {
- open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
- } else {
- $code = *STDOUT;
- }
- }
-
- # Print a trie
- sub print {
- my ($self, $trie, $depth) = @_;
- $depth //= 0;
-
- print $code (" → ") if defined($trie->{label});
- print $code ($trie->{label} // "", "\n");
- foreach my $key (sort keys %{$trie->{children}}) {
- print $code ("│ " x ($depth), "├── $key");
- $self->print($trie->{children}{$key}, $depth + 1);
- }
- }
-}
-
-my %codegens = (
- C => "CCodeGen",
- tree => "TreeCodeGen",
-);
-
-
-defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
-my $codegen = $codegens{$language}->new();
-my ($trie, $counter, %lengths) = build_trie($codegen);
-
-$codegen->open_output();
-$codegen->main($trie, $counter, %lengths);
-
-
-=head1 LICENSE
-
-triehash is available under the MIT/Expat license, see the source code
-for more information.
-
-=head1 AUTHOR
-
-Julian Andres Klode <jak@jak-linux.org>
-
-=cut
-