[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Bacula-devel] space saving in the database


Here are the current pieces of code.
If you want to try it, here's how :
- first, you have to be using pg 8.3 (i'll backport, there is very little work 
to do on that)
- second, a word of caution : it's a postgresql/C stored procedure. It means 
it can crash postgresql. It doesn't do it on my computer (anymore :) ). Don't 
use it on production for now (I guess nobody would do it, but I just don't 
want to take the blame :) )
- run make.sh, it will create a .so file, that has to go in postgresql's lib 
directory (it won't copy it there, just to /tmp ... nevermind, it will be 
cleaner later too...)
- then, run the SQL script. It will create a new lstat type, that you can use 
in place of the text type you have right now for lstat (you can't 'alter 
table alter column type', you have to create a new table ...)

I've included a pl script. That's the one I used to generate the huffman tree
(it comes from here: http://www.perlmonks.org/?node_id=603111 )
Then I put statistics from our own database (corrected so that anything except 
A,B and ' ' are equiprobable). It's dirty as it was a one pass job, but it's 
output is easier to read than the constants from the C code.

> Please post it.  I think seeing it, knowing how it is used, and how it
> works will be useful in the evaluation of the patch.   Well, it's not a
> Bacula patch, it's a patch against the database.
>
> > Anyway, if someone can remove my doubts about bacula's base64, I'd be
> > very thankful.


#include "postgres.h"
#include "fmgr.h"
#include <sys/types.h>
#include <string.h>
#include <stdlib.h>

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

/* The structure for storing all nodes */
typedef struct huffman_node_struct
{
	u_int8_t encoding;
	int numbits;
} huffman_node;

huffman_node codereverse[512];

#define Ob(x)  ((unsigned)Ob_(0 ## x ## uL))
#define Ob_(x) ((x & 1) | (x >> 2 & 2) | (x >> 4 & 4) | (x >> 6 & 8) |          \
        (x >> 8 & 16) | (x >> 10 & 32) | (x >> 12 & 64) | (x >> 14 & 128))

huffman_node code[256]={
{Ob(10100010),8},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),1},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(10111001),8},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(10100100),8},
{Ob(11010001),8},
{Ob(11000000),8},
{Ob(10111101),8},
{Ob(11010111),8},
{Ob(11011101),8},
{Ob(11011111),8},
{Ob(11010100),8},
{Ob(10100110),8},
{Ob(11011011),8},
{Ob(11011001),8},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000111),3},
{Ob(00000100),3},
{Ob(10110101),8},
{Ob(10101110),8},
{Ob(10101000),8},
{Ob(11001100),8},
{Ob(11001010),8},
{Ob(11000111),8},
{Ob(11001001),8},
{Ob(10101101),8},
{Ob(10100101),8},
{Ob(10110110),8},
{Ob(10110100),8},
{Ob(10111100),8},
{Ob(11010101),8},
{Ob(10111000),8},
{Ob(10110011),8},
{Ob(11011110),8},
{Ob(10100011),8},
{Ob(10111010),8},
{Ob(11001011),8},
{Ob(11001111),8},
{Ob(11000100),8},
{Ob(10110111),8},
{Ob(10101001),8},
{Ob(10111111),8},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(10111011),8},
{Ob(10101111),8},
{Ob(11001000),8},
{Ob(10100111),8},
{Ob(10101100),8},
{Ob(11010010),8},
{Ob(10101011),8},
{Ob(11010000),8},
{Ob(11010011),8},
{Ob(10111110),8},
{Ob(11000010),8},
{Ob(11011010),8},
{Ob(11011000),8},
{Ob(11010110),8},
{Ob(01010000),7},
{Ob(11011100),8},
{Ob(10110000),8},
{Ob(11001101),8},
{Ob(11000110),8},
{Ob(11000011),8},
{Ob(11000001),8},
{Ob(11000101),8},
{Ob(10110010),8},
{Ob(11001110),8},
{Ob(10101010),8},
{Ob(10110001),8},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0},
{Ob(00000000),0}
};

huffman_node code_reverse[256]={
{' ',1},
{0,0},
{0,0},
{0,0},
{'B',3},
{0,0},
{0,0},
{'A',3},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{'o',7},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,8},
{'S',8},
{'/',8},
{'K',8},
{'7',8},
{'d',8},
{'E',8},
{'Y',8},
{'y',8},
{'g',8},
{'e',8},
{'J',8},
{'D',8},
{'b',8},
{'q',8},
{'z',8},
{'w',8},
{'Q',8},
{'M',8},
{'C',8},
{'L',8},
{'X',8},
{'P',8},
{'+',8},
{'T',8},
{'a',8},
{'N',8},
{'2',8},
{'j',8},
{'Z',8},
{'1',8},
{'u',8},
{'k',8},
{'t',8},
{'W',8},
{'v',8},
{'s',8},
{'H',8},
{'c',8},
{'I',8},
{'G',8},
{'U',8},
{'F',8},
{'r',8},
{'x',8},
{'V',8},
{'h',8},
{'0',8},
{'f',8},
{'i',8},
{'6',8},
{'O',8},
{'n',8},
{'3',8},
{'m',8},
{'9',8},
{'l',8},
{'8',8},
{'p',8},
{'4',8},
{'R',8},
{'5',8},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0},
{0,0}
};

void set_bitrange(unsigned char* output, int offset, unsigned char value, int numbits)
{
	int block, blockoffset;
	unsigned char valueleft;
	unsigned char valueright;
	block = offset >> 3; //divide by 8
	blockoffset = offset - (block << 3);
	// We move the input buffer to the left (aligned with the left of the buffer
	// |00000110|A, on 3 bits becomes |11000000|
	value <<= 8-numbits;

	// Now we put this buffer on the current buffer and the next :
	// First : we align our data on the current block and OR them
	// Then : we align our data on the next block and OR them
	valueleft=value>>blockoffset;
	valueright=value<<(8-blockoffset);
	output[block] |= valueleft;
	output[block+1] |= valueright;
}

unsigned char get_bitrange(unsigned char* input, int offset, int size)
{
	unsigned char buffer, bufferleft, bufferright;
	int block, blockoffset;
	block = offset >> 3; // divide by 8
	blockoffset = offset - (block <<3);
	bufferleft = input[block]<<blockoffset;
	if (offset+8 <=size) // we can still read a full byte (size is byte aligned)
	{
		bufferright = input[block+1]>>(8-blockoffset);
	}
	else // we do a fake buffer
	{
		bufferright = 0x00;
	}
	buffer = bufferleft|bufferright;
	return buffer;
}


/*This function converts the char stream to encoded stream */
PG_FUNCTION_INFO_V1(huffman_encode);
Datum huffman_encode (PG_FUNCTION_ARGS)
{
	char *input = PG_GETARG_CSTRING(0);
	unsigned char *output;
	bytea *realoutput;
	unsigned char buffer;
	unsigned char *p;
	int numbits;
	int outputpos=0;
	int inputsize=strlen(input)+1;
	int i;

	// We over allocated to save time
	output=(unsigned char*)palloc(2*(inputsize+1));
	memset(output,0,2*(inputsize+1));
	p =(unsigned char*) input;
	for (i=0; i<inputsize ; i++)
	{
		numbits = code[*p].numbits;
		// Gerer si numbits = 0 ?
		buffer = code[*p].encoding;
		set_bitrange(output, outputpos , buffer, numbits);
		outputpos += numbits;
		p++;
	}
	numbits = code[0].numbits;
	buffer = code[0].encoding;
	set_bitrange(output, outputpos , buffer, numbits);
	outputpos += numbits;
	// We have to convert outputpos to bytes. If there is at least one bit in the
	// last byte, we have to count one byte more
	outputpos = (outputpos +7 )>> 3;

	realoutput = (bytea *) palloc(outputpos+VARHDRSZ);
	SET_VARSIZE(realoutput,outputpos+VARHDRSZ);
	memcpy(VARDATA(realoutput),output,outputpos);
	pfree(output);
	PG_RETURN_BYTEA_P(realoutput);
}

// Decodes the encoded stream back to cstring
PG_FUNCTION_INFO_V1(huffman_decode);
Datum huffman_decode (PG_FUNCTION_ARGS)
{
	bytea     *input = PG_GETARG_BYTEA_P(0);
	unsigned char* output;
	unsigned char buffer, buffertemp;
	int inputpos=0;
	int outputsize=0;
	int numbits;
	int inputsize=VARSIZE(input)-VARHDRSZ;
	int end=0;
	// to be sure, we need one byte per bit of input
	output=(unsigned char*)palloc(inputsize*8);
	unsigned char* p = output;
	// We multiply inputsize by 8 (it is now in bits...)
	inputsize <<= 3;
	while ( (end != 1) && inputpos<=inputsize)
	{
		buffer=get_bitrange((unsigned char*)VARDATA(input), inputpos, inputsize);
		/* We try from 1 to 8 bits (or remaining bits in input) */
		int offset=8;
		int minoffset=1;
		if (inputsize-inputpos<=8)
		{
			minoffset=inputsize-inputpos;
		}
		do
		{
			offset--;
			buffertemp = buffer >> offset ;
			numbits = code_reverse[buffertemp].numbits;
		} while (numbits == 0 && offset >= 1);
		if(numbits == 0 && offset == 0)
		{
			// We had a strange input, it seems-we couldn't find it
			pfree(output);
			ereport(ERROR,(errmsg("not a lstat input - unrecognised symbol")));
			PG_RETURN_NULL();
		}
		*p = code_reverse[buffertemp].encoding;
		if (*p == '\0') 
		{
			end=1;
		}
		p++;
		outputsize++;
		inputpos+=numbits;
	}
	// We check we've seen the end, else it's not properly formatted
	if (! end)
	{
		pfree(output);
		ereport(ERROR,(errmsg("not a lstat input - not terminated")));
		PG_RETURN_NULL();
	}

	PG_RETURN_CSTRING(output);
}

Attachment: make.sh
Description: application/shellscript

Attachment: huffman.pl
Description: Perl program

DROP TYPE lstat cascade;
CREATE TYPE lstat;
CREATE FUNCTION huffman_encode(cstring) returns lstat as 'lstat_pg','huffman_encode' language C strict;
CREATE FUNCTION huffman_decode(lstat) returns cstring as 'lstat_pg','huffman_decode' language C strict;
CREATE TYPE lstat (
   storage = extended,
   alignment = int4,
   internallength = variable,
   input = huffman_encode,
   output = huffman_decode
);
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Bacula-devel mailing list
Bacula-devel@xxxxxxxxxxxxxxxxxxxxx
https://lists.sourceforge.net/lists/listinfo/bacula-devel


This mailing list archive is a service of Copilot Consulting.