summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTimo Teräs <timo.teras@iki.fi>2010-08-13 10:17:31 +0300
committerTimo Teräs <timo.teras@iki.fi>2010-08-13 10:17:31 +0300
commit8bc76c78a69360efc7a07a3c4e92f393cca22543 (patch)
tree447d60cf98355698ef9d6ad2480550b0d71ed266
parente0a013397a51963039c43877be3afe954e519be0 (diff)
downloadsquark-8bc76c78a69360efc7a07a3c4e92f393cca22543.tar.bz2
squark-8bc76c78a69360efc7a07a3c4e92f393cca22543.tar.xz
db: smarter string pointer encoding (include length field)
This way we don't need an explicit null terminator in most cases, which saves space. It also speeds up comparisons, as getting a string blob is now constant time (no strlen needed).
-rw-r--r--lua-squarkdb.c21
-rw-r--r--squark-filter.c8
-rw-r--r--squarkdb.c16
-rw-r--r--squarkdb.h5
4 files changed, 38 insertions, 12 deletions
diff --git a/lua-squarkdb.c b/lua-squarkdb.c
index dbac6d0..09d4afe 100644
--- a/lua-squarkdb.c
+++ b/lua-squarkdb.c
@@ -227,7 +227,7 @@ static int Lsqdb_map_strings(lua_State *L)
{
struct sqdb *db;
const char *str;
- char *ptr;
+ unsigned char *ptr;
size_t len, total, pos;
db = Lsqdb_checkarg(L, 1);
@@ -238,7 +238,9 @@ static int Lsqdb_map_strings(lua_State *L)
lua_pushnil(L);
while (lua_next(L, 2) != 0) {
str = luaL_checklstring(L, -2, &len);
- total += len + 1;
+ total += len;
+ if (len >= (1 << SQDB_LENGTH_BITS))
+ total++;
lua_pop(L, 1);
}
@@ -252,15 +254,20 @@ static int Lsqdb_map_strings(lua_State *L)
lua_pushnil(L);
while (lua_next(L, 2) != 0) {
str = lua_tolstring(L, -2, &len);
- memcpy(&ptr[pos], str, len + 1);
lua_pop(L, 1);
- /* table[key] = pos */
+ /* table[key] = encoded_string_pointer */
lua_pushvalue(L, -1);
- lua_pushinteger(L, pos);
- lua_rawset(L, 2);
+ if (len >= (1 << SQDB_LENGTH_BITS)) {
+ lua_pushinteger(L, pos << SQDB_LENGTH_BITS);
+ ptr[pos++] = len;
+ } else {
+ lua_pushinteger(L, (pos << SQDB_LENGTH_BITS) + len);
+ }
+ memcpy(&ptr[pos], str, len);
+ pos += len;
- pos += len + 1;
+ lua_rawset(L, 2);
}
return 0;
diff --git a/squark-filter.c b/squark-filter.c
index 8973d04..e47cbf5 100644
--- a/squark-filter.c
+++ b/squark-filter.c
@@ -112,12 +112,10 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db)
void *cmph;
struct sqdb_index_entry *indx;
uint32_t *categories;
- char *strings;
cmph_uint32 i = -1, previ;
cmph = sqdb_section_get(db, SQDB_SECTION_INDEX_MPH, NULL);
indx = sqdb_section_get(db, SQDB_SECTION_INDEX, NULL);
- strings = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL);
/* search for most qualified domain match; do first lookup
* with two domain components */
@@ -132,14 +130,14 @@ static blob_t url_classify(struct url_info *url, struct sqdb *db)
previ = i;
i = cmph_search_packed(cmph, key.ptr, key.len);
- if (blob_cmp(got, BLOB_STR(&strings[indx[i].component])) != 0) {
+ if (blob_cmp(got, sqdb_get_string_literal(db, indx[i].component)) != 0) {
/* the subdomain did no longer match, use
* parents classification */
i = previ;
goto parent_dns_match;
}
if (!blob_is_null(tld)) {
- if (blob_cmp(tld, BLOB_STR(&strings[indx[indx[i].parent].component])) != 0) {
+ if (blob_cmp(tld, sqdb_get_string_literal(db, indx[indx[i].parent].component)) != 0) {
/* top level domain did not match */
i = -1;
goto parent_dns_match;
@@ -163,7 +161,7 @@ parent_dns_match:
categories = sqdb_section_get(db, SQDB_SECTION_CATEGORIES, NULL);
printf("%d\n", indx[i].category);
- return BLOB_STR(&strings[categories[indx[i].category]]);
+ return sqdb_get_string_literal(db, categories[indx[i].category]);
}
int main(int argc, char **argv)
diff --git a/squarkdb.c b/squarkdb.c
index e05f514..f289b28 100644
--- a/squarkdb.c
+++ b/squarkdb.c
@@ -138,3 +138,19 @@ void *sqdb_section_get(struct sqdb *db, int id, uint32_t *size)
return db->mmap_base + hdr->section[id].offset;
}
+blob_t sqdb_get_string_literal(struct sqdb *db, uint32_t encoded_ptr) /* decode an encoded string pointer into a blob over the strings section */
+{
+ unsigned char *ptr; /* base of the strings section */
+ unsigned int len, off; /* decoded string length and byte offset into the section */
+
+ ptr = sqdb_section_get(db, SQDB_SECTION_STRINGS, NULL); /* section size is not requested (NULL) */
+ if (ptr == NULL)
+ return BLOB_NULL; /* strings section lookup returned NULL */
+
+ off = encoded_ptr >> SQDB_LENGTH_BITS; /* upper bits: byte offset into the strings section */
+ len = encoded_ptr & ((1 << SQDB_LENGTH_BITS) - 1); /* low SQDB_LENGTH_BITS bits: inline length */
+ if (len == 0) /* zero inline length: the length is read from a single byte at the offset */
+ len = ptr[off++]; /* read the one-byte length and step past it to the string data */
+
+ return BLOB_PTR_LEN(ptr + off, len); /* NOTE(review): off/len are not checked against the section size; an inline-encoded empty string would also have len == 0 and be misread as length-prefixed -- confirm the writer never emits that */
+}
diff --git a/squarkdb.h b/squarkdb.h
index 3733ec1..743325e 100644
--- a/squarkdb.h
+++ b/squarkdb.h
@@ -3,6 +3,9 @@
#include <stddef.h>
#include <stdint.h>
+#include "blob.h"
+
+#define SQDB_LENGTH_BITS 5
#define SQDB_SECTION_STRINGS 0
#define SQDB_SECTION_CATEGORIES 1
@@ -39,6 +42,7 @@ struct sqdb_index_entry {
uint32_t component;
};
+
const char *sqdb_section_names[SQDB_SECTION_MAX];
int sqdb_create(struct sqdb *db, const char *fn);
@@ -47,5 +51,6 @@ void sqdb_close(struct sqdb *db);
void *sqdb_section_create(struct sqdb *db, int id, uint32_t size);
void *sqdb_section_get(struct sqdb *db, int id, uint32_t *size);
+blob_t sqdb_get_string_literal(struct sqdb *db, uint32_t encoded_ptr);
#endif