diff options
author | Timo Teräs <timo.teras@iki.fi> | 2010-08-13 10:17:31 +0300 |
---|---|---|
committer | Timo Teräs <timo.teras@iki.fi> | 2010-08-13 10:17:31 +0300 |
commit | 8bc76c78a69360efc7a07a3c4e92f393cca22543 (patch) | |
tree | 447d60cf98355698ef9d6ad2480550b0d71ed266 | |
parent | e0a013397a51963039c43877be3afe954e519be0 (diff) | |
download | squark-8bc76c78a69360efc7a07a3c4e92f393cca22543.tar.bz2 squark-8bc76c78a69360efc7a07a3c4e92f393cca22543.tar.xz |
db: smarter string pointer encoding (include length field)
So we don't need an explicit null terminator in most cases, which saves
space. It also speeds up comparisons, as getting the string blob is
now constant time (no strlen needed).
-rw-r--r-- | lua-squarkdb.c | 21 | ||||
-rw-r--r-- | squark-filter.c | 8 | ||||
-rw-r--r-- | squarkdb.c | 16 | ||||
-rw-r--r-- | squarkdb.h | 5 |
4 files changed, 38 insertions, 12 deletions
diff --git a/lua-squarkdb.c b/lua-squarkdb.c index dbac6d0..09d4afe 100644 --- a/lua-squarkdb.c +++ b/lua-squarkdb.c @@ -227,7 +227,7 @@ static int Lsqdb_map_strings(lua_State *L) { struct sqdb *db; const char *str; - char *ptr; + unsigned char *ptr; size_t len, total, pos; db = Lsqdb_checkarg(L, 1); @@ -238,7 +238,9 @@ static int Lsqdb_map_strings(lua_State *L) lua_pushnil(L); while (lua_next(L, 2) != 0) { str = luaL_checklstring(L, -2, &len); - total += len + 1; + total += len; + if (len >= (1 << SQDB_LENGTH_BITS)) + total++; lua_pop(L, 1); } @@ -252,15 +254,20 @@ static int Lsqdb_map_strings(lua_State *L) lua_pushnil(L); while (lua_next(L, 2) != 0) { str = lua_tolstring(L, -2, &len); - memcpy(&ptr[pos], str, len + 1); lua_pop(L, 1); - /* table[key] = pos */ + /* table[key] = encoded_string_pointer */ lua_pushvalue(L, -1); - lua_pushinteger(L, pos); - lua_rawset(L, 2); + if (len >= (1 << SQDB_LENGTH_BITS)) { + lua_pushinteger(L, pos << SQDB_LENGTH_BITS); + ptr[pos++] = len; + } else { + lua_pushinteger(L, (pos << SQDB_LENGTH_BITS) + len); + } + memcpy(&ptr[pos], str, len); + pos += len; - pos += len + 1; + lua_rawset(L, 2); } return 0; diff --git a/squark-filter.c b/squark-filter.c index 8973d04..e47cbf5 100644 --- a/squark-filter.c +++ b/squark-filter.c @@ -112,12 +112,10 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db) void *cmph; struct sqdb_index_entry *indx; uint32_t *categories; - char *strings; cmph_uint32 i = -1, previ; cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL); indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL); - strings = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL); /* search for most qualified domain match; do first lookup * with two domain components */ @@ -132,14 +130,14 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db) previ = i; i = cmph_search_packed(cmph, key.ptr, key.len); - if (blob_cmp(got, BLOB_STR(&strings[indx[i].component])) != 0) { + if (blob_cmp(got, 
sqdb_get_string_literal(db, indx[i].component)) != 0) { /* the subdomain did no longer match, use * parents classification */ i = previ; goto parent_dns_match; } if (!blob_is_null(tld)) { - if (blob_cmp(tld, BLOB_STR(&strings[indx[indx[i].parent].component])) != 0) { + if (blob_cmp(tld, sqdb_get_string_literal(db, indx[indx[i].parent].component)) != 0) { /* top level domain did not match */ i = -1; goto parent_dns_match; @@ -163,7 +161,7 @@ parent_dns_match: categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL); printf("%d\n", indx[i].category); - return BLOB_STR(&strings[categories[indx[i].category]]); + return sqdb_get_string_literal(db, categories[indx[i].category]); } int main(int argc, char **argv) @@ -138,3 +138,19 @@ void *sqdb_section_get(struct sqdb *db, int id, uint32_t *size) return db->mmap_base + hdr->section[id].offset; } +blob_t sqdb_get_string_literal(struct sqdb *db, uint32_t encoded_ptr) +{ + unsigned char *ptr; + unsigned int len, off; + + ptr = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL); + if (ptr == NULL) + return BLOB_NULL; + + off = encoded_ptr >> SQDB_LENGTH_BITS; + len = encoded_ptr & ((1 << SQDB_LENGTH_BITS) - 1); + if (len == 0) + len = ptr[off++]; + + return BLOB_PTR_LEN(ptr + off, len); +} @@ -3,6 +3,9 @@ #include <stddef.h> #include <stdint.h> +#include "blob.h" + +#define SQDB_LENGTH_BITS 5 #define SQDB_SECTION_STRINGS 0 #define SQDB_SECTION_CATEGORIES 1 @@ -39,6 +42,7 @@ struct sqdb_index_entry { uint32_t component; }; + const char *sqdb_section_names[SQDB_SECTION_MAX]; int sqdb_create(struct sqdb *db, const char *fn); @@ -47,5 +51,6 @@ void sqdb_close(struct sqdb *db); void *sqdb_section_create(struct sqdb *db, int id, uint32_t size); void *sqdb_section_get(struct sqdb *db, int id, uint32_t *size); +blob_t sqdb_get_string_literal(struct sqdb *db, uint32_t encoded_ptr); #endif |