cmph-2.0.2/ 0000755 0001750 0001750 00000000000 13411542035 012002 5 ustar joseph joseph cmph-2.0.2/cmph.vcproj 0000644 0001750 0001750 00000010764 13411542035 014166 0 ustar joseph joseph
cmph-2.0.2/m4/ 0000755 0001750 0001750 00000000000 13411542035 012322 5 ustar joseph joseph cmph-2.0.2/m4/cxx0x.m4 0000644 0001750 0001750 00000005123 13411542035 013637 0 ustar joseph joseph dnl Check for baseline language coverage in the compiler for the C++0x standard.
dnl AC_COMPILE_STDCXX_0X
dnl --------------------
dnl Probe the C++ compiler for baseline C++0x/C++11 language support
dnl (static_assert, ">>" closing nested template argument lists, decltype).
dnl The same test program is compiled four times; each result ("yes"/"no")
dnl is cached in its own variable:
dnl   ac_cv_cxx_compile_cxx0x_native   no additional flags
dnl   ac_cv_cxx_compile_cxx11_cxx      with -std=c++11
dnl   ac_cv_cxx_compile_cxx0x_cxx      with -std=c++0x
dnl   ac_cv_cxx_compile_cxx0x_gxx      with -std=gnu++0x
dnl CXXFLAGS is restored after every probe, so the caller must inspect the
dnl cache variables and add the flag it wants.  HAVE_STDCXX_0X is defined
dnl when any of the four probes succeeds.
dnl NOTE(review): the header name on the #include line was missing from the
dnl copy this was restored from; <unordered_map> is a reconstruction --
dnl confirm against upstream.
AC_DEFUN([AC_COMPILE_STDCXX_0X], [
  AC_CACHE_CHECK([if compiler supports C++0x features without additional flags],
  [ac_cv_cxx_compile_cxx0x_native],
  [AC_LANG_SAVE
  AC_LANG_CPLUSPLUS
  AC_TRY_COMPILE([
#include <unordered_map>
template <typename T>
struct check
{
  static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;
],,
  ac_cv_cxx_compile_cxx0x_native=yes, ac_cv_cxx_compile_cxx0x_native=no)
  AC_LANG_RESTORE
  ])
  AC_CACHE_CHECK([if compiler supports C++0x features with -std=c++11],
  [ac_cv_cxx_compile_cxx11_cxx],
  [AC_LANG_SAVE
  AC_LANG_CPLUSPLUS
  ac_save_CXXFLAGS="$CXXFLAGS"
  CXXFLAGS="$CXXFLAGS -std=c++11"
  AC_TRY_COMPILE([
#include <unordered_map>
template <typename T>
struct check
{
  static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;],,
  ac_cv_cxx_compile_cxx11_cxx=yes, ac_cv_cxx_compile_cxx11_cxx=no)
  CXXFLAGS="$ac_save_CXXFLAGS"
  AC_LANG_RESTORE
  ])
  AC_CACHE_CHECK([if compiler supports C++0x features with -std=c++0x],
  [ac_cv_cxx_compile_cxx0x_cxx],
  [AC_LANG_SAVE
  AC_LANG_CPLUSPLUS
  ac_save_CXXFLAGS="$CXXFLAGS"
  CXXFLAGS="$CXXFLAGS -std=c++0x"
  AC_TRY_COMPILE([
#include <unordered_map>
template <typename T>
struct check
{
  static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;],,
  ac_cv_cxx_compile_cxx0x_cxx=yes, ac_cv_cxx_compile_cxx0x_cxx=no)
  CXXFLAGS="$ac_save_CXXFLAGS"
  AC_LANG_RESTORE
  ])
  AC_CACHE_CHECK([if compiler supports C++0x features with -std=gnu++0x],
  [ac_cv_cxx_compile_cxx0x_gxx],
  [AC_LANG_SAVE
  AC_LANG_CPLUSPLUS
  ac_save_CXXFLAGS="$CXXFLAGS"
  CXXFLAGS="$CXXFLAGS -std=gnu++0x"
  AC_TRY_COMPILE([
#include <unordered_map>
template <typename T>
struct check
{
  static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;],,
  ac_cv_cxx_compile_cxx0x_gxx=yes, ac_cv_cxx_compile_cxx0x_gxx=no)
  CXXFLAGS="$ac_save_CXXFLAGS"
  AC_LANG_RESTORE
  ])
  if test "$ac_cv_cxx_compile_cxx0x_native" = yes ||
     test "$ac_cv_cxx_compile_cxx11_cxx" = yes ||
     test "$ac_cv_cxx_compile_cxx0x_cxx" = yes ||
     test "$ac_cv_cxx_compile_cxx0x_gxx" = yes; then
    AC_DEFINE(HAVE_STDCXX_0X,,[Define if g++ supports C++0x features. ])
  fi
])
cmph-2.0.2/m4/spoon.m4 0000644 0001750 0001750 00000000300 13411542035 013713 0 ustar joseph joseph AC_DEFUN([AC_CHECK_SPOON], [
dnl Placeholder check: registers a --with-spoon option whose value is never
dnl consulted here, prints the "checking" message and always answers "no".
AC_ARG_WITH([spoon], [ --with-spoon=SPOON this is innocuous, since the truth is that there is no spoon ])
AC_MSG_CHECKING([if there is spoon])
AC_MSG_RESULT([no])
])
cmph-2.0.2/m4/acinclude.m4 0000644 0001750 0001750 00000001105 13411542035 014510 0 ustar joseph joseph AC_DEFUN([AC_ENABLE_CXXMPH], [
dnl Adds the --enable-cxxmph configure switch.  Sets the shell variable
dnl "cxxmph" to true or false (false when the switch is not given) and
dnl aborts configure on any value other than yes/no.
AC_ARG_ENABLE([cxxmph],
  [ --enable-cxxmph enable the c++ cxxmph library ],
  [if test "x${enableval}" = xyes; then
     cxxmph=true
   elif test "x${enableval}" = xno; then
     cxxmph=false
   else
     AC_MSG_ERROR([bad value ${enableval} for --enable-cxxmph])
   fi],
  [cxxmph=false])
])
AC_DEFUN([AC_ENABLE_BENCHMARKS], [
dnl Adds the --enable-benchmarks configure switch.  Sets the shell variable
dnl "benchmarks" to true or false (false when the switch is not given) and
dnl aborts configure on any value other than yes/no.
AC_ARG_ENABLE([benchmarks],
  [ --enable-benchmarks enable cxxmph benchmarks against other libs ],
  [if test "x${enableval}" = xyes; then
     benchmarks=true
   elif test "x${enableval}" = xno; then
     benchmarks=false
   else
     AC_MSG_ERROR([bad value ${enableval} for --enable-benchmarks])
   fi],
  [benchmarks=false])
])
cmph-2.0.2/m4/largefile.m4 0000644 0001750 0001750 00000007217 13411542035 014525 0 ustar joseph joseph dnl By default, many hosts won't let programs access large files;
dnl one must use special compiler options to get large-file access to work.
dnl For more details about this brain damage please see:
dnl http://www.sas.com/standards/large.file/x_open.20Mar96.html
dnl Written by Paul Eggert.
dnl Internal subroutine of AC_SYS_EXTRA_LARGEFILE.
dnl AC_SYS_EXTRA_LARGEFILE_FLAGS(FLAGSNAME)
dnl
dnl Caches in ac_cv_sys_largefile_FLAGSNAME the output of
dnl "$GETCONF LFS_FLAGSNAME".  If that command fails the value becomes "no".
dnl For FLAGSNAME = CFLAGS only, a fallback then runs on IRIX hosts matching
dnl the pattern below: when $GCC is not "yes" the value is set to -n32, a
dnl link test is run with that value appended to $CC, and the value is reset
dnl to "no" if the link fails.  $CC is restored afterwards.
dnl The changequote pair protects the shell bracket patterns from m4 quoting.
dnl NOTE(review): when $GCC is "yes" the value is still "no" at the link
dnl test, so "no" is what gets appended to $CC -- confirm this is intended.
AC_DEFUN([AC_SYS_EXTRA_LARGEFILE_FLAGS],
[AC_CACHE_CHECK([for $1 value to request large file support],
ac_cv_sys_largefile_$1,
[ac_cv_sys_largefile_$1=`($GETCONF LFS_$1) 2>/dev/null` || {
ac_cv_sys_largefile_$1=no
ifelse($1, CFLAGS,
[case "$host_os" in
# IRIX 6.2 and later require cc -n32.
changequote(, )dnl
irix6.[2-9]* | irix6.1[0-9]* | irix[7-9].* | irix[1-9][0-9]*)
changequote([, ])dnl
if test "$GCC" != yes; then
ac_cv_sys_largefile_CFLAGS=-n32
fi
ac_save_CC="$CC"
CC="$CC $ac_cv_sys_largefile_CFLAGS"
AC_TRY_LINK(, , , ac_cv_sys_largefile_CFLAGS=no)
CC="$ac_save_CC"
esac])
}])])
dnl Internal subroutine of AC_SYS_EXTRA_LARGEFILE.
dnl AC_SYS_EXTRA_LARGEFILE_SPACE_APPEND(VAR, VAL)
dnl
dnl Appends VAL to the shell variable named VAR.  Nothing happens when VAL
dnl is "no" or empty.  If VAR is currently empty it is set to VAL; otherwise
dnl VAL is added after a single separating space.
dnl "[$]$1" emits a literal "$" followed by the variable name, so the shell
dnl (not m4) expands the variable's current value.
AC_DEFUN([AC_SYS_EXTRA_LARGEFILE_SPACE_APPEND],
[case $2 in
no) ;;
?*)
case "[$]$1" in
'') $1=$2 ;;
*) $1=[$]$1' '$2 ;;
esac ;;
esac])
dnl Internal subroutine of AC_SYS_EXTRA_LARGEFILE.
dnl AC_SYS_EXTRA_LARGEFILE_MACRO_VALUE(C-MACRO, CACHE-VAR, COMMENT, CODE-TO-SET-DEFAULT)
dnl
dnl Works out the value C-MACRO should be defined to and caches it in
dnl CACHE-VAR:
dnl   1. CACHE-VAR starts as "no".
dnl   2. CODE-TO-SET-DEFAULT (shell code) runs and may assign CACHE-VAR.
dnl   3. $ac_cv_sys_largefile_CFLAGS is scanned: a word "-DC-MACRO" sets
dnl      CACHE-VAR to 1, and "-DC-MACRO=value" sets it to the text after
dnl      the first "=" (extracted with expr).  A later match overrides an
dnl      earlier one and the default from step 2.
dnl If CACHE-VAR is not "no" afterwards, C-MACRO is defined to that value
dnl via AC_DEFINE_UNQUOTED with COMMENT as its description.
dnl m4 quoting is switched off around steps 2-3 so that shell/expr bracket
dnl expressions pass through unchanged.
AC_DEFUN([AC_SYS_EXTRA_LARGEFILE_MACRO_VALUE],
[AC_CACHE_CHECK([for $1], $2,
[$2=no
changequote(, )dnl
$4
for ac_flag in $ac_cv_sys_largefile_CFLAGS no; do
case "$ac_flag" in
-D$1)
$2=1 ;;
-D$1=*)
$2=`expr " $ac_flag" : '[^=]*=\(.*\)'` ;;
esac
done
changequote([, ])dnl
])
if test "[$]$2" != no; then
AC_DEFINE_UNQUOTED([$1], [$]$2, [$3])
fi])
dnl AC_SYS_EXTRA_LARGEFILE
dnl ----------------------
dnl Enables large-file support unless --disable-largefile is given.
dnl   1. Locates getconf and queries the LFS CFLAGS, LDFLAGS and LIBS values
dnl      through AC_SYS_EXTRA_LARGEFILE_FLAGS.
dnl   2. Distributes the CFLAGS words: -D/-I words go to CPPFLAGS, every
dnl      other word to CFLAGS.  The _FILE_OFFSET_BITS, _LARGEFILE_SOURCE and
dnl      _LARGE_FILES definitions are skipped here because step 4 turns them
dnl      into config-header defines.
dnl   3. Appends the LDFLAGS and LIBS values to the variables of those names.
dnl   4. Defines _FILE_OFFSET_BITS, _LARGEFILE_SOURCE and _LARGE_FILES through
dnl      AC_SYS_EXTRA_LARGEFILE_MACRO_VALUE, each with a host-specific
dnl      default (HP-UX, HP-UX, AIX respectively).
dnl The HP-UX default for _FILE_OFFSET_BITS is passed as the fourth argument
dnl so that it runs before the cache check decides the value, in the same
dnl way as the other two calls.
AC_DEFUN([AC_SYS_EXTRA_LARGEFILE],
[AC_REQUIRE([AC_CANONICAL_HOST])
AC_ARG_ENABLE(largefile,
[ --disable-largefile omit support for large files])
if test "$enable_largefile" != no; then
  AC_CHECK_TOOL(GETCONF, getconf)
  AC_SYS_EXTRA_LARGEFILE_FLAGS(CFLAGS)
  AC_SYS_EXTRA_LARGEFILE_FLAGS(LDFLAGS)
  AC_SYS_EXTRA_LARGEFILE_FLAGS(LIBS)
  for ac_flag in $ac_cv_sys_largefile_CFLAGS no; do
    case "$ac_flag" in
    no) ;;
    -D_FILE_OFFSET_BITS=*) ;;
    -D_LARGEFILE_SOURCE | -D_LARGEFILE_SOURCE=*) ;;
    -D_LARGE_FILES | -D_LARGE_FILES=*) ;;
    -D?* | -I?*)
      AC_SYS_EXTRA_LARGEFILE_SPACE_APPEND(CPPFLAGS, "$ac_flag") ;;
    *)
      AC_SYS_EXTRA_LARGEFILE_SPACE_APPEND(CFLAGS, "$ac_flag") ;;
    esac
  done
  AC_SYS_EXTRA_LARGEFILE_SPACE_APPEND(LDFLAGS, "$ac_cv_sys_largefile_LDFLAGS")
  AC_SYS_EXTRA_LARGEFILE_SPACE_APPEND(LIBS, "$ac_cv_sys_largefile_LIBS")
  AC_SYS_EXTRA_LARGEFILE_MACRO_VALUE(_FILE_OFFSET_BITS,
    ac_cv_sys_file_offset_bits,
    [Number of bits in a file offset, on hosts where this is settable.],
    [case "$host_os" in
     # HP-UX 10.20 and later
     hpux10.[2-9][0-9]* | hpux1[1-9]* | hpux[2-9][0-9]*)
       ac_cv_sys_file_offset_bits=64 ;;
     esac])
  AC_SYS_EXTRA_LARGEFILE_MACRO_VALUE(_LARGEFILE_SOURCE,
    ac_cv_sys_largefile_source,
    [Define to make fseeko etc. visible, on some hosts.],
    [case "$host_os" in
     # HP-UX 10.20 and later
     hpux10.[2-9][0-9]* | hpux1[1-9]* | hpux[2-9][0-9]*)
       ac_cv_sys_largefile_source=1 ;;
     esac])
  AC_SYS_EXTRA_LARGEFILE_MACRO_VALUE(_LARGE_FILES,
    ac_cv_sys_large_files,
    [Define for large files, on AIX-style hosts.],
    [case "$host_os" in
     # AIX 4.2 and later
     aix4.[2-9]* | aix4.1[0-9]* | aix[5-9].* | aix[1-9][0-9]*)
       ac_cv_sys_large_files=1 ;;
     esac])
fi
])
cmph-2.0.2/CONCEPTS.t2t 0000644 0001750 0001750 00000005200 13411542035 013650 0 ustar joseph joseph Minimal Perfect Hash Functions - Introduction
%!includeconf: CONFIG.t2t
----------------------------------------
==Basic Concepts==
Suppose [figs/img14.png] is a universe of //keys//.
Let [figs/img15.png] be a //hash function// that maps the keys from [figs/img14.png] to a given interval of integers [figs/img16.png].
Let [figs/img17.png] be a set of [figs/img8.png] keys from [figs/img14.png].
Given a key [figs/img18.png], the hash function [figs/img7.png] computes an
integer in [figs/img19.png] for the storage or retrieval of [figs/img11.png] in
a //hash table//.
Hashing methods for //non-static sets// of keys can be used to construct
data structures storing [figs/img20.png] and supporting membership queries
"[figs/img18.png]?" in expected time [figs/img21.png].
However, they involve a certain amount of wasted space owing to unused
locations in the table and wasted time to resolve collisions when
two keys are hashed to the same table location.
For //static sets// of keys it is possible to compute a function
to find any key in a table in one probe; such hash functions are called
//perfect//.
More precisely, given a set of keys [figs/img20.png], we shall say that a
hash function [figs/img15.png] is a //perfect hash function//
for [figs/img20.png] if [figs/img7.png] is an injection on [figs/img20.png],
that is, there are no //collisions// among the keys in [figs/img20.png]:
if [figs/img11.png] and [figs/img22.png] are in [figs/img20.png] and [figs/img23.png],
then [figs/img24.png].
Figure 1(a) illustrates a perfect hash function.
Since no collisions occur, each key can be retrieved from the table
with a single probe.
If [figs/img25.png], that is, the table has the same size as [figs/img20.png],
then we say that [figs/img7.png] is a //minimal perfect hash function//
for [figs/img20.png].
Figure 1(b) illustrates a minimal perfect hash function.
Minimal perfect hash functions totally avoid the problem of wasted
space and time. A perfect hash function [figs/img7.png] is //order preserving//
if the keys in [figs/img20.png] are arranged in some given order
and [figs/img7.png] preserves this order in the hash table.
| [figs/img26.png]
| **Figure 1:** (a) Perfect hash function. (b) Minimal perfect hash function.
Minimal perfect hash functions are widely used for memory efficient
storage and fast retrieval of items from static sets, such as words in natural
languages, reserved words in programming languages or interactive systems,
uniform resource locators (URLs) in Web search engines, or item sets in
data mining techniques.
%!include: ALGORITHMS.t2t
%!include: FOOTER.t2t
%!include(html): ''GOOGLEANALYTICS.t2t'' cmph-2.0.2/MPL-1.1 0000644 0001750 0001750 00000062232 13411542035 012657 0 ustar joseph joseph MOZILLA PUBLIC LICENSE
Version 1.1
---------------
1. Definitions.
1.0.1. "Commercial Use" means distribution or otherwise making the
Covered Code available to a third party.
1.1. "Contributor" means each entity that creates or contributes to
the creation of Modifications.
1.2. "Contributor Version" means the combination of the Original
Code, prior Modifications used by a Contributor, and the Modifications
made by that particular Contributor.
1.3. "Covered Code" means the Original Code or Modifications or the
combination of the Original Code and Modifications, in each case
including portions thereof.
1.4. "Electronic Distribution Mechanism" means a mechanism generally
accepted in the software development community for the electronic
transfer of data.
1.5. "Executable" means Covered Code in any form other than Source
Code.
1.6. "Initial Developer" means the individual or entity identified
as the Initial Developer in the Source Code notice required by Exhibit
A.
1.7. "Larger Work" means a work which combines Covered Code or
portions thereof with code not governed by the terms of this License.
1.8. "License" means this document.
1.8.1. "Licensable" means having the right to grant, to the maximum
extent possible, whether at the time of the initial grant or
subsequently acquired, any and all of the rights conveyed herein.
1.9. "Modifications" means any addition to or deletion from the
substance or structure of either the Original Code or any previous
Modifications. When Covered Code is released as a series of files, a
Modification is:
A. Any addition to or deletion from the contents of a file
containing Original Code or previous Modifications.
B. Any new file that contains any part of the Original Code or
previous Modifications.
1.10. "Original Code" means Source Code of computer software code
which is described in the Source Code notice required by Exhibit A as
Original Code, and which, at the time of its release under this
License is not already Covered Code governed by this License.
1.10.1. "Patent Claims" means any patent claim(s), now owned or
hereafter acquired, including without limitation, method, process,
and apparatus claims, in any patent Licensable by grantor.
1.11. "Source Code" means the preferred form of the Covered Code for
making modifications to it, including all modules it contains, plus
any associated interface definition files, scripts used to control
compilation and installation of an Executable, or source code
differential comparisons against either the Original Code or another
well known, available Covered Code of the Contributor's choice. The
Source Code can be in a compressed or archival form, provided the
appropriate decompression or de-archiving software is widely available
for no charge.
1.12. "You" (or "Your") means an individual or a legal entity
exercising rights under, and complying with all of the terms of, this
License or a future version of this License issued under Section 6.1.
For legal entities, "You" includes any entity which controls, is
controlled by, or is under common control with You. For purposes of
this definition, "control" means (a) the power, direct or indirect,
to cause the direction or management of such entity, whether by
contract or otherwise, or (b) ownership of more than fifty percent
(50%) of the outstanding shares or beneficial ownership of such
entity.
2. Source Code License.
2.1. The Initial Developer Grant.
The Initial Developer hereby grants You a world-wide, royalty-free,
non-exclusive license, subject to third party intellectual property
claims:
(a) under intellectual property rights (other than patent or
trademark) Licensable by Initial Developer to use, reproduce,
modify, display, perform, sublicense and distribute the Original
Code (or portions thereof) with or without Modifications, and/or
as part of a Larger Work; and
(b) under Patents Claims infringed by the making, using or
selling of Original Code, to make, have made, use, practice,
sell, and offer for sale, and/or otherwise dispose of the
Original Code (or portions thereof).
(c) the licenses granted in this Section 2.1(a) and (b) are
effective on the date Initial Developer first distributes
Original Code under the terms of this License.
(d) Notwithstanding Section 2.1(b) above, no patent license is
granted: 1) for code that You delete from the Original Code; 2)
separate from the Original Code; or 3) for infringements caused
by: i) the modification of the Original Code or ii) the
combination of the Original Code with other software or devices.
2.2. Contributor Grant.
Subject to third party intellectual property claims, each Contributor
hereby grants You a world-wide, royalty-free, non-exclusive license
(a) under intellectual property rights (other than patent or
trademark) Licensable by Contributor, to use, reproduce, modify,
display, perform, sublicense and distribute the Modifications
created by such Contributor (or portions thereof) either on an
unmodified basis, with other Modifications, as Covered Code
and/or as part of a Larger Work; and
(b) under Patent Claims infringed by the making, using, or
selling of Modifications made by that Contributor either alone
and/or in combination with its Contributor Version (or portions
of such combination), to make, use, sell, offer for sale, have
made, and/or otherwise dispose of: 1) Modifications made by that
Contributor (or portions thereof); and 2) the combination of
Modifications made by that Contributor with its Contributor
Version (or portions of such combination).
(c) the licenses granted in Sections 2.2(a) and 2.2(b) are
effective on the date Contributor first makes Commercial Use of
the Covered Code.
(d) Notwithstanding Section 2.2(b) above, no patent license is
granted: 1) for any code that Contributor has deleted from the
Contributor Version; 2) separate from the Contributor Version;
3) for infringements caused by: i) third party modifications of
Contributor Version or ii) the combination of Modifications made
by that Contributor with other software (except as part of the
Contributor Version) or other devices; or 4) under Patent Claims
infringed by Covered Code in the absence of Modifications made by
that Contributor.
3. Distribution Obligations.
3.1. Application of License.
The Modifications which You create or to which You contribute are
governed by the terms of this License, including without limitation
Section 2.2. The Source Code version of Covered Code may be
distributed only under the terms of this License or a future version
of this License released under Section 6.1, and You must include a
copy of this License with every copy of the Source Code You
distribute. You may not offer or impose any terms on any Source Code
version that alters or restricts the applicable version of this
License or the recipients' rights hereunder. However, You may include
an additional document offering the additional rights described in
Section 3.5.
3.2. Availability of Source Code.
Any Modification which You create or to which You contribute must be
made available in Source Code form under the terms of this License
either on the same media as an Executable version or via an accepted
Electronic Distribution Mechanism to anyone to whom you made an
Executable version available; and if made available via Electronic
Distribution Mechanism, must remain available for at least twelve (12)
months after the date it initially became available, or at least six
(6) months after a subsequent version of that particular Modification
has been made available to such recipients. You are responsible for
ensuring that the Source Code version remains available even if the
Electronic Distribution Mechanism is maintained by a third party.
3.3. Description of Modifications.
You must cause all Covered Code to which You contribute to contain a
file documenting the changes You made to create that Covered Code and
the date of any change. You must include a prominent statement that
the Modification is derived, directly or indirectly, from Original
Code provided by the Initial Developer and including the name of the
Initial Developer in (a) the Source Code, and (b) in any notice in an
Executable version or related documentation in which You describe the
origin or ownership of the Covered Code.
3.4. Intellectual Property Matters
(a) Third Party Claims.
If Contributor has knowledge that a license under a third party's
intellectual property rights is required to exercise the rights
granted by such Contributor under Sections 2.1 or 2.2,
Contributor must include a text file with the Source Code
distribution titled "LEGAL" which describes the claim and the
party making the claim in sufficient detail that a recipient will
know whom to contact. If Contributor obtains such knowledge after
the Modification is made available as described in Section 3.2,
Contributor shall promptly modify the LEGAL file in all copies
Contributor makes available thereafter and shall take other steps
(such as notifying appropriate mailing lists or newsgroups)
reasonably calculated to inform those who received the Covered
Code that new knowledge has been obtained.
(b) Contributor APIs.
If Contributor's Modifications include an application programming
interface and Contributor has knowledge of patent licenses which
are reasonably necessary to implement that API, Contributor must
also include this information in the LEGAL file.
(c) Representations.
Contributor represents that, except as disclosed pursuant to
Section 3.4(a) above, Contributor believes that Contributor's
Modifications are Contributor's original creation(s) and/or
Contributor has sufficient rights to grant the rights conveyed by
this License.
3.5. Required Notices.
You must duplicate the notice in Exhibit A in each file of the Source
Code. If it is not possible to put such notice in a particular Source
Code file due to its structure, then You must include such notice in a
location (such as a relevant directory) where a user would be likely
to look for such a notice. If You created one or more Modification(s)
You may add your name as a Contributor to the notice described in
Exhibit A. You must also duplicate this License in any documentation
for the Source Code where You describe recipients' rights or ownership
rights relating to Covered Code. You may choose to offer, and to
charge a fee for, warranty, support, indemnity or liability
obligations to one or more recipients of Covered Code. However, You
may do so only on Your own behalf, and not on behalf of the Initial
Developer or any Contributor. You must make it absolutely clear than
any such warranty, support, indemnity or liability obligation is
offered by You alone, and You hereby agree to indemnify the Initial
Developer and every Contributor for any liability incurred by the
Initial Developer or such Contributor as a result of warranty,
support, indemnity or liability terms You offer.
3.6. Distribution of Executable Versions.
You may distribute Covered Code in Executable form only if the
requirements of Section 3.1-3.5 have been met for that Covered Code,
and if You include a notice stating that the Source Code version of
the Covered Code is available under the terms of this License,
including a description of how and where You have fulfilled the
obligations of Section 3.2. The notice must be conspicuously included
in any notice in an Executable version, related documentation or
collateral in which You describe recipients' rights relating to the
Covered Code. You may distribute the Executable version of Covered
Code or ownership rights under a license of Your choice, which may
contain terms different from this License, provided that You are in
compliance with the terms of this License and that the license for the
Executable version does not attempt to limit or alter the recipient's
rights in the Source Code version from the rights set forth in this
License. If You distribute the Executable version under a different
license You must make it absolutely clear that any terms which differ
from this License are offered by You alone, not by the Initial
Developer or any Contributor. You hereby agree to indemnify the
Initial Developer and every Contributor for any liability incurred by
the Initial Developer or such Contributor as a result of any such
terms You offer.
3.7. Larger Works.
You may create a Larger Work by combining Covered Code with other code
not governed by the terms of this License and distribute the Larger
Work as a single product. In such a case, You must make sure the
requirements of this License are fulfilled for the Covered Code.
4. Inability to Comply Due to Statute or Regulation.
If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Code due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description
must be included in the LEGAL file described in Section 3.4 and must
be included with all distributions of the Source Code. Except to the
extent prohibited by statute or regulation, such description must be
sufficiently detailed for a recipient of ordinary skill to be able to
understand it.
5. Application of this License.
This License applies to code to which the Initial Developer has
attached the notice in Exhibit A and to related Covered Code.
6. Versions of the License.
6.1. New Versions.
Netscape Communications Corporation ("Netscape") may publish revised
and/or new versions of the License from time to time. Each version
will be given a distinguishing version number.
6.2. Effect of New Versions.
Once Covered Code has been published under a particular version of the
License, You may always continue to use it under the terms of that
version. You may also choose to use such Covered Code under the terms
of any subsequent version of the License published by Netscape. No one
other than Netscape has the right to modify the terms applicable to
Covered Code created under this License.
6.3. Derivative Works.
If You create or use a modified version of this License (which you may
only do in order to apply it to code which is not already Covered Code
governed by this License), You must (a) rename Your license so that
the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape",
"MPL", "NPL" or any confusingly similar phrase do not appear in your
license (except to note that your license differs from this License)
and (b) otherwise make it clear that Your version of the license
contains terms which differ from the Mozilla Public License and
Netscape Public License. (Filling in the name of the Initial
Developer, Original Code or Contributor in the notice described in
Exhibit A shall not of themselves be deemed to be modifications of
this License.)
7. DISCLAIMER OF WARRANTY.
COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS,
WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF
DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING.
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE
IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT,
YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE
COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER
OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF
ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER.
8. TERMINATION.
8.1. This License and the rights granted hereunder will terminate
automatically if You fail to comply with terms herein and fail to cure
such breach within 30 days of becoming aware of the breach. All
sublicenses to the Covered Code which are properly granted shall
survive any termination of this License. Provisions which, by their
nature, must remain in effect beyond the termination of this License
shall survive.
8.2. If You initiate litigation by asserting a patent infringement
claim (excluding declatory judgment actions) against Initial Developer
or a Contributor (the Initial Developer or Contributor against whom
You file such action is referred to as "Participant") alleging that:
(a) such Participant's Contributor Version directly or indirectly
infringes any patent, then any and all rights granted by such
Participant to You under Sections 2.1 and/or 2.2 of this License
shall, upon 60 days notice from Participant terminate prospectively,
unless if within 60 days after receipt of notice You either: (i)
agree in writing to pay Participant a mutually agreeable reasonable
royalty for Your past and future use of Modifications made by such
Participant, or (ii) withdraw Your litigation claim with respect to
the Contributor Version against such Participant. If within 60 days
of notice, a reasonable royalty and payment arrangement are not
mutually agreed upon in writing by the parties or the litigation claim
is not withdrawn, the rights granted by Participant to You under
Sections 2.1 and/or 2.2 automatically terminate at the expiration of
the 60 day notice period specified above.
(b) any software, hardware, or device, other than such Participant's
Contributor Version, directly or indirectly infringes any patent, then
any rights granted to You by such Participant under Sections 2.1(b)
and 2.2(b) are revoked effective as of the date You first made, used,
sold, distributed, or had made, Modifications made by that
Participant.
8.3. If You assert a patent infringement claim against Participant
alleging that such Participant's Contributor Version directly or
indirectly infringes any patent where such claim is resolved (such as
by license or settlement) prior to the initiation of patent
infringement litigation, then the reasonable value of the licenses
granted by such Participant under Sections 2.1 or 2.2 shall be taken
into account in determining the amount or value of any payment or
license.
8.4. In the event of termination under Sections 8.1 or 8.2 above,
all end user license agreements (excluding distributors and resellers)
which have been validly granted by You or any distributor hereunder
prior to termination shall survive termination.
9. LIMITATION OF LIABILITY.
UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT
(INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL
DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE,
OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR
ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY
CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL,
WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER
COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN
INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF
LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY
RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW
PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE
EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO
THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
10. U.S. GOVERNMENT END USERS.
The Covered Code is a "commercial item," as that term is defined in
48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer
software" and "commercial computer software documentation," as such
terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48
C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995),
all U.S. Government End Users acquire Covered Code with only those
rights set forth herein.
11. MISCELLANEOUS.
This License represents the complete agreement concerning subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. This License shall be governed by
California law provisions (except to the extent applicable law, if
any, provides otherwise), excluding its conflict-of-law provisions.
With respect to disputes in which at least one party is a citizen of,
or an entity chartered or registered to do business in the United
States of America, any litigation relating to this License shall be
subject to the jurisdiction of the Federal Courts of the Northern
District of California, with venue lying in Santa Clara County,
California, with the losing party responsible for costs, including
without limitation, court costs and reasonable attorneys' fees and
expenses. The application of the United Nations Convention on
Contracts for the International Sale of Goods is expressly excluded.
Any law or regulation which provides that the language of a contract
shall be construed against the drafter shall not apply to this
License.
12. RESPONSIBILITY FOR CLAIMS.
As between Initial Developer and the Contributors, each party is
responsible for claims and damages arising, directly or indirectly,
out of its utilization of rights under this License and You agree to
work with Initial Developer and Contributors to distribute such
responsibility on an equitable basis. Nothing herein is intended or
shall be deemed to constitute any admission of liability.
13. MULTIPLE-LICENSED CODE.
Initial Developer may designate portions of the Covered Code as
"Multiple-Licensed". "Multiple-Licensed" means that the Initial
Developer permits you to utilize portions of the Covered Code under
Your choice of the NPL or the alternative licenses, if any, specified
by the Initial Developer in the file described in Exhibit A.
EXHIBIT A -Mozilla Public License.
``The contents of this file are subject to the Mozilla Public License
Version 1.1 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL/
Software distributed under the License is distributed on an "AS IS"
basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific language governing rights and limitations
under the License.
The Original Code is ______________________________________.
The Initial Developer of the Original Code is ________________________.
Portions created by ______________________ are Copyright (C) ______
_______________________. All Rights Reserved.
Contributor(s): ______________________________________.
Alternatively, the contents of this file may be used under the terms
of the _____ license (the "[___] License"), in which case the
provisions of [______] License are applicable instead of those
above. If you wish to allow use of your version of this file only
under the terms of the [____] License and not to allow others to use
your version of this file under the MPL, indicate your decision by
deleting the provisions above and replace them with the notice and
other provisions required by the [___] License. If you do not delete
the provisions above, a recipient may use your version of this file
under either the MPL or the [___] License."
[NOTE: The text of this Exhibit A may differ slightly from the text of
the notices in the Source Code files of the Original Code. You should
use the text of this Exhibit A rather than the text found in the
Original Code Source Code for Your Modifications.]
cmph-2.0.2/ChangeLog 0000644 0001750 0001750 00000035260 13411542035 013562 0 ustar joseph joseph 2005-08-08 18:34 fc_botelho
* INSTALL, examples/Makefile, examples/Makefile.in,
examples/.deps/file_adapter_ex2.Po,
examples/.deps/vector_adapter_ex1.Po, src/brz.c: [no log message]
2005-08-07 22:00 fc_botelho
* src/: brz.c, brz.h, brz_structs.h, cmph.c, cmph.h, main.c:
temporary directory passed by command line
2005-08-07 20:22 fc_botelho
* src/brz.c: stable version of BRZ
2005-08-06 22:09 fc_botelho
* src/bmz.c: no message
2005-08-06 22:02 fc_botelho
* src/bmz.c: no message
2005-08-06 21:45 fc_botelho
* src/brz.c: fastest version of BRZ
2005-08-06 17:20 fc_botelho
* src/: bmz.c, brz.c, main.c: [no log message]
2005-07-29 16:43 fc_botelho
* src/brz.c: BRZ algorithm is almost stable
2005-07-29 15:29 fc_botelho
* src/: bmz.c, brz.c, brz_structs.h, cmph_types.h: BRZ algorithm is
almost stable
2005-07-29 00:09 fc_botelho
* src/: brz.c, djb2_hash.c, djb2_hash.h, fnv_hash.c, fnv_hash.h,
hash.c, hash.h, jenkins_hash.c, jenkins_hash.h, sdbm_hash.c,
sdbm_hash.h: it was fixed more mistakes in BRZ algorithm
2005-07-28 21:00 fc_botelho
* src/: bmz.c, brz.c, cmph.c: fixed some mistakes in BRZ algorithm
2005-07-27 19:13 fc_botelho
* src/brz.c: algorithm BRZ included
2005-07-27 18:16 fc_botelho
* src/: bmz_structs.h, brz.c, brz.h, brz_structs.h: Algorithm BRZ
included
2005-07-27 18:13 fc_botelho
* src/: Makefile.am, bmz.c, chm.c, cmph.c, cmph.h, cmph_types.h:
Algorithm BRZ included
2005-07-25 19:18 fc_botelho
* README, README.t2t, scpscript: it was included an examples
directory
2005-07-25 18:26 fc_botelho
* INSTALL, Makefile.am, configure.ac, examples/Makefile,
examples/Makefile.am, examples/Makefile.in,
examples/file_adapter_ex2.c, examples/keys.txt,
examples/vector_adapter_ex1.c, examples/.deps/file_adapter_ex2.Po,
examples/.deps/vector_adapter_ex1.Po, src/cmph.c, src/cmph.h: it
was included a examples directory
2005-03-03 02:07 davi
* src/: bmz.c, chm.c, chm.h, chm_structs.h, cmph.c, cmph.h,
graph.c, graph.h, jenkins_hash.c, jenkins_hash.h, main.c (xgraph):
New f*cking cool algorithm works. Roughly implemented in chm.c
2005-03-02 20:55 davi
* src/xgraph.c (xgraph): xchmr working nice, but a bit slow
2005-03-02 02:01 davi
* src/xchmr.h: file xchmr.h was initially added on branch xgraph.
2005-03-02 02:01 davi
* src/xchmr_structs.h: file xchmr_structs.h was initially added on
branch xgraph.
2005-03-02 02:01 davi
* src/xchmr.c: file xchmr.c was initially added on branch xgraph.
2005-03-02 02:01 davi
* src/: Makefile.am, cmph.c, cmph_types.h, xchmr.c, xchmr.h,
xchmr_structs.h, xgraph.c, xgraph.h (xgraph): xchmr working fine
except for false positives on cyclic detection.
2005-03-02 00:05 davi
* src/: Makefile.am, xgraph.c, xgraph.h (xgraph): Added external
graph functionality in branch xgraph.
2005-03-02 00:05 davi
* src/xgraph.c: file xgraph.c was initially added on branch xgraph.
2005-03-02 00:05 davi
* src/xgraph.h: file xgraph.h was initially added on branch xgraph.
2005-02-28 19:53 davi
* src/chm.c: Fixed off by one bug in chm.
2005-02-17 16:20 fc_botelho
* LOGO.html, README, README.t2t, gendocs: The way of calling the
function cmph_search was fixed in the file README.t2t
2005-01-31 17:13 fc_botelho
* README.t2t: Heuristic BMZ memory consumption was updated
2005-01-31 17:09 fc_botelho
* BMZ.t2t: DJB2, SDBM, FNV and Jenkins hash link were added
2005-01-31 16:50 fc_botelho
* BMZ.t2t, CHM.t2t, COMPARISON.t2t, CONCEPTS.t2t, CONFIG.t2t,
FAQ.t2t, GPERF.t2t, LOGO.t2t, README.t2t, TABLE1.t2t, TABLE4.t2t,
TABLE5.t2t, DOC.css: BMZ documentation was finished
2005-01-28 18:12 fc_botelho
* figs/img1.png, figs/img10.png, figs/img100.png, figs/img101.png,
figs/img102.png, figs/img103.png, figs/img104.png, figs/img105.png,
figs/img106.png, figs/img107.png, figs/img108.png, figs/img109.png,
papers/bmz_tr004_04.ps, papers/bmz_wea2005.ps, papers/chm92.pdf,
figs/img11.png, figs/img110.png, figs/img111.png, figs/img112.png,
figs/img113.png, figs/img114.png, figs/img115.png, figs/img116.png,
figs/img117.png, figs/img118.png, figs/img119.png, figs/img12.png,
figs/img120.png, figs/img121.png, figs/img122.png, figs/img123.png,
figs/img124.png, figs/img125.png, figs/img126.png, figs/img127.png,
figs/img128.png, figs/img129.png, figs/img13.png, figs/img130.png,
figs/img131.png, figs/img132.png, figs/img133.png, figs/img134.png,
figs/img135.png, figs/img136.png, figs/img137.png, figs/img138.png,
figs/img139.png, figs/img14.png, figs/img140.png, figs/img141.png,
figs/img142.png, figs/img143.png, figs/img144.png, figs/img145.png,
figs/img146.png, figs/img147.png, figs/img148.png, figs/img149.png,
figs/img15.png, figs/img150.png, figs/img151.png, figs/img152.png,
figs/img153.png, figs/img154.png, figs/img155.png, figs/img156.png,
figs/img157.png, figs/img158.png, figs/img159.png, figs/img16.png,
figs/img160.png, figs/img161.png, figs/img162.png, figs/img163.png,
figs/img164.png, figs/img165.png, figs/img166.png, figs/img167.png,
figs/img168.png, figs/img169.png, figs/img17.png, figs/img170.png,
figs/img171.png, figs/img172.png, figs/img173.png, figs/img174.png,
figs/img175.png, figs/img176.png, figs/img177.png, figs/img178.png,
figs/img179.png, figs/img18.png, figs/img180.png, figs/img181.png,
figs/img182.png, figs/img183.png, figs/img184.png, figs/img185.png,
figs/img186.png, figs/img187.png, figs/img188.png, figs/img189.png,
figs/img19.png, figs/img190.png, figs/img191.png, figs/img192.png,
figs/img193.png, figs/img194.png, figs/img195.png, figs/img196.png,
figs/img197.png, figs/img198.png, figs/img199.png, figs/img2.png,
figs/img20.png, figs/img200.png, figs/img201.png, figs/img202.png,
figs/img203.png, figs/img204.png, figs/img205.png, figs/img206.png,
figs/img207.png, figs/img208.png, figs/img209.png, figs/img21.png,
figs/img210.png, figs/img211.png, figs/img212.png, figs/img213.png,
figs/img214.png, figs/img215.png, figs/img216.png, figs/img217.png,
figs/img218.png, figs/img219.png, figs/img22.png, figs/img220.png,
figs/img221.png, figs/img222.png, figs/img223.png, figs/img224.png,
figs/img225.png, figs/img226.png, figs/img227.png, figs/img228.png,
figs/img229.png, figs/img23.png, figs/img230.png, figs/img231.png,
figs/img232.png, figs/img233.png, figs/img234.png, figs/img235.png,
figs/img236.png, figs/img237.png, figs/img238.png, figs/img239.png,
figs/img24.png, figs/img240.png, figs/img241.png, figs/img242.png,
figs/img243.png, figs/img244.png, figs/img245.png, figs/img246.png,
figs/img247.png, figs/img248.png, figs/img249.png, figs/img25.png,
figs/img250.png, figs/img251.png, figs/img252.png, figs/img253.png,
figs/img26.png, figs/img27.png, figs/img28.png, figs/img29.png,
figs/img3.png, figs/img30.png, figs/img31.png, figs/img32.png,
figs/img33.png, figs/img34.png, figs/img35.png, figs/img36.png,
figs/img37.png, figs/img38.png, figs/img39.png, figs/img4.png,
figs/img40.png, figs/img41.png, figs/img42.png, figs/img43.png,
figs/img44.png, figs/img45.png, figs/img46.png, figs/img47.png,
figs/img48.png, figs/img49.png, figs/img5.png, figs/img50.png,
figs/img51.png, figs/img52.png, figs/img53.png, figs/img54.png,
figs/img55.png, figs/img56.png, figs/img57.png, figs/img58.png,
figs/img59.png, figs/img6.png, figs/img60.png, figs/img61.png,
figs/img62.png, figs/img63.png, figs/img64.png, figs/img65.png,
figs/img66.png, figs/img67.png, figs/img68.png, figs/img69.png,
figs/img7.png, figs/img70.png, figs/img71.png, figs/img72.png,
figs/img73.png, figs/img74.png, figs/img75.png, figs/img76.png,
figs/img77.png, figs/img78.png, figs/img79.png, figs/img8.png,
figs/img80.png, figs/img81.png, figs/img82.png, figs/img83.png,
figs/img84.png, figs/img85.png, figs/img86.png, figs/img87.png,
figs/img88.png, figs/img89.png, figs/img9.png, figs/img90.png,
figs/img91.png, figs/img92.png, figs/img93.png, figs/img94.png,
figs/img95.png, figs/img96.png, figs/img97.png, figs/img98.png,
figs/img99.png: Initial version
2005-01-28 18:07 fc_botelho
* BMZ.t2t, CHM.t2t, COMPARISON.t2t, CONFIG.t2t, README.t2t: It was
improved the documentation of BMZ and CHM algorithms
2005-01-27 18:07 fc_botelho
* BMZ.t2t, CHM.t2t, FAQ.t2t: history of BMZ algorithm is available
2005-01-27 14:23 fc_botelho
* AUTHORS: It was added the authors' email
2005-01-27 14:21 fc_botelho
* BMZ.t2t, CHM.t2t, COMPARISON.t2t, FAQ.t2t, FOOTER.t2t, GPERF.t2t,
README.t2t: It was added FOOTER.t2t file
2005-01-27 12:16 fc_botelho
* src/cmph_types.h: It was removed pjw and glib functions from
cmph_hash_names vector
2005-01-27 12:12 fc_botelho
* src/hash.c: It was removed pjw and glib functions from
cmph_hash_names vector
2005-01-27 11:01 davi
* FAQ.t2t, README, README.t2t, gendocs, src/bmz.c, src/bmz.h,
src/chm.c, src/chm.h, src/cmph.c, src/cmph_structs.c, src/debug.h,
src/main.c: Fix to alternate hash functions code. Removed htonl
stuff from chm algorithm. Added faq.
2005-01-27 09:14 fc_botelho
* README.t2t: It was corrected some formatting mistakes
2005-01-26 22:04 davi
* BMZ.t2t, CHM.t2t, COMPARISON.t2t, GPERF.t2t, README, README.t2t,
gendocs: Added gperf notes.
2005-01-25 19:10 fc_botelho
* INSTALL: generated in version 0.3
2005-01-25 19:09 fc_botelho
* src/: czech.c, czech.h, czech_structs.h: The czech.h,
czech_structs.h and czech.c files were removed
2005-01-25 19:06 fc_botelho
* src/: chm.c, chm.h, chm_structs.h, cmph.c, cmph_types.h, main.c,
Makefile.am: It was changed the prefix czech by chm
2005-01-25 18:50 fc_botelho
* gendocs: script to generate the documentation and the README file
2005-01-25 18:47 fc_botelho
* README: README was updated
2005-01-25 18:44 fc_botelho
* configure.ac: Version was updated
2005-01-25 18:42 fc_botelho
* src/cmph.h: Vector adapter commented
2005-01-25 18:40 fc_botelho
* CHM.t2t, CONFIG.t2t, LOGO.html: It was included the PreProc macro
through the CONFIG.t2t file and the LOGO through the LOGO.html file
2005-01-25 18:33 fc_botelho
* README.t2t, BMZ.t2t, COMPARISON.t2t, CZECH.t2t: It was included
the PreProc macro through the CONFIG.t2t file and the LOGO through
the LOGO.html file
2005-01-24 18:25 fc_botelho
* src/: bmz.c, bmz.h, cmph_structs.c, cmph_structs.h, czech.c,
cmph.c, czech.h, main.c, cmph.h: The file adpater was implemented.
2005-01-24 17:20 fc_botelho
* README.t2t: the memory consumption to create a mphf using bmz
with a heuristic was fixed.
2005-01-24 17:11 fc_botelho
* src/: cmph_types.h, main.c: The algorithms and hash functions
were put in alphabetical order
2005-01-24 16:15 fc_botelho
* BMZ.t2t, COMPARISON.t2t, CZECH.t2t, README.t2t: It was fixed some
English mistakes and It was included the files BMZ.t2t, CZECH.t2t
and COMPARISON.t2t
2005-01-21 19:19 davi
* ChangeLog, Doxyfile: Added Doxyfile.
2005-01-21 19:14 davi
* README.t2t, wingetopt.c, src/cmph.h, tests/graph_tests.c: Fixed
wingetopt.c
2005-01-21 18:44 fc_botelho
* src/Makefile.am: included files bitbool.h and bitbool.c
2005-01-21 18:42 fc_botelho
* src/: bmz.c, bmz.h, bmz_structs.h, cmph.c, cmph.h,
cmph_structs.c, cmph_structs.h, czech.c, czech.h, czech_structs.h,
djb2_hash.c, djb2_hash.h, fnv_hash.c, fnv_hash.h, graph.c, graph.h,
hash.c, hash.h, hash_state.h, jenkins_hash.c, jenkins_hash.h,
main.c, sdbm_hash.c, sdbm_hash.h, vqueue.c, vqueue.h, vstack.c,
vstack.h: Only public symbols were prefixed with cmph, and the API
was changed to agree with the initial txt2html documentation
2005-01-21 18:30 fc_botelho
* src/: bitbool.c, bitbool.h: mask to represent a boolean value
using only 1 bit
2005-01-20 10:28 davi
* ChangeLog, README, README.t2t, wingetopt.h, src/main.c: Added
initial txt2tags documentation.
2005-01-19 10:40 davi
* acinclude.m4, configure.ac: Added macros for large file support.
2005-01-18 19:06 fc_botelho
* src/: bmz.c, bmz.h, bmz_structs.h, cmph.c, cmph.h,
cmph_structs.c, cmph_structs.h, cmph_types.h, czech.c, czech.h,
czech_structs.h, djb2_hash.c, djb2_hash.h, fnv_hash.c, fnv_hash.h,
graph.c, graph.h, hash.c, hash.h, hash_state.h, jenkins_hash.c,
jenkins_hash.h, main.c, sdbm_hash.c, sdbm_hash.h, vqueue.c,
vqueue.h, vstack.c, vstack.h: version with cmph prefix
2005-01-18 15:10 davi
* ChangeLog, cmph.vcproj, cmphapp.vcproj, wingetopt.c, wingetopt.h:
Added missing files.
2005-01-18 14:25 fc_botelho
* aclocal.m4: initial version
2005-01-18 14:16 fc_botelho
* aclocal.m4: initial version
2005-01-18 13:58 fc_botelho
* src/czech.c: using bit mask to represent boolean values
2005-01-18 13:56 fc_botelho
* src/czech.c: no message
2005-01-18 10:18 davi
* COPYING, INSTALL, src/Makefile.am, src/bmz.c, src/bmz.h,
src/cmph.c, src/cmph.h, src/cmph_structs.c, src/cmph_structs.h,
src/czech.c, src/czech.h, src/debug.h, src/djb2_hash.c,
src/graph.c, src/graph.h, src/hash.c, src/jenkins_hash.c,
src/main.c, src/sdbm_hash.c, src/vqueue.c: Fixed a lot of warnings.
Added visual studio project. Make needed changes to work with
windows.
2005-01-17 16:01 fc_botelho
* src/main.c: stable version
2005-01-17 15:58 fc_botelho
* src/: bmz.c, cmph.c, cmph.h, graph.c: stable version
2005-01-13 21:56 davi
* src/czech.c: Better error handling in czech.c.
2005-01-05 18:45 fc_botelho
* src/cmph_structs.c: included option -k to specify the number of
keys to use
2005-01-05 17:48 fc_botelho
* src/: cmph.c, main.c: included option -k to specify the number of
keys to use
2005-01-03 19:38 fc_botelho
* src/bmz.c: using less memory
2005-01-03 18:47 fc_botelho
* src/: bmz.c, graph.c: using less space to store the used_edges
and critical_nodes arrays
2004-12-23 11:16 davi
* INSTALL, COPYING, AUTHORS, ChangeLog, Makefile.am, NEWS, README,
cmph.spec, configure.ac, src/graph.c, tests/Makefile.am,
tests/graph_tests.c, src/bmz.c, src/cmph_types.h,
src/czech_structs.h, src/hash_state.h, src/jenkins_hash.c,
src/bmz_structs.h, src/cmph.c, src/cmph.h, src/cmph_structs.h,
src/czech.c, src/debug.h, src/djb2_hash.c, src/djb2_hash.h,
src/fnv_hash.c, src/fnv_hash.h, src/graph.h, src/hash.c,
src/hash.h, src/jenkins_hash.h, src/sdbm_hash.c, src/vstack.h,
src/Makefile.am, src/bmz.h, src/cmph_structs.c, src/czech.h,
src/main.c, src/sdbm_hash.h, src/vqueue.c, src/vqueue.h,
src/vstack.c: Initial release.
2004-12-23 11:16 davi
* INSTALL, COPYING, AUTHORS, ChangeLog, Makefile.am, NEWS, README,
cmph.spec, configure.ac, src/graph.c, tests/Makefile.am,
tests/graph_tests.c, src/bmz.c, src/cmph_types.h,
src/czech_structs.h, src/hash_state.h, src/jenkins_hash.c,
src/bmz_structs.h, src/cmph.c, src/cmph.h, src/cmph_structs.h,
src/czech.c, src/debug.h, src/djb2_hash.c, src/djb2_hash.h,
src/fnv_hash.c, src/fnv_hash.h, src/graph.h, src/hash.c,
src/hash.h, src/jenkins_hash.h, src/sdbm_hash.c, src/vstack.h,
src/Makefile.am, src/bmz.h, src/cmph_structs.c, src/czech.h,
src/main.c, src/sdbm_hash.h, src/vqueue.c, src/vqueue.h,
src/vstack.c: Initial revision
cmph-2.0.2/ALGORITHMS.t2t 0000644 0001750 0001750 00000000320 13411542035 014101 0 ustar joseph joseph
----------------------------------------
| [Home index.html] | [CHD chd.html] | [BDZ bdz.html] | [BMZ bmz.html] | [CHM chm.html] | [BRZ brz.html] | [FCH fch.html]
----------------------------------------
cmph-2.0.2/LOGO.t2t 0000644 0001750 0001750 00000000255 13411542035 013177 0 ustar joseph joseph
cmph-2.0.2/GPERF.t2t 0000644 0001750 0001750 00000002314 13411542035 013300 0 ustar joseph joseph GPERF versus CMPH
%!includeconf: CONFIG.t2t
You might ask why cmph if [gperf http://www.gnu.org/software/gperf/gperf.html]
already works perfectly. Actually, gperf and cmph have different goals.
Basically, these are the requirements for each of them:
- GPERF
- Create very fast hash functions for **small** sets
- Create **perfect** hash functions
- CMPH
- Create very fast hash function for **very large** sets
- Create **minimal perfect** hash functions
As a result, cmph can be used to create hash functions where gperf would run
forever without finding a perfect hash function, because of the running
time of the algorithm and the large memory usage.
On the other side, functions created by cmph are about 2x slower than those
created by gperf.
So, if you have large sets, or memory usage is a key restriction for you, stick
to cmph. If you have small sets, and do not care about memory usage, go with
gperf. The first problem is common in the information retrieval field (e.g.
assigning ids to millions of documents), while the latter is usually found in
the compiler programming area (e.g. detecting reserved keywords).
%!include: ALGORITHMS.t2t
%!include: FOOTER.t2t
%!include(html): ''GOOGLEANALYTICS.t2t'' cmph-2.0.2/configure.ac 0000644 0001750 0001750 00000005172 13411542035 014275 0 ustar joseph joseph dnl Process this file with autoconf to produce a configure script.
dnl Package identity: name and version of the generated configure script.
AC_INIT([cmph], [2.0.2])
dnl Makefile.am is the file used to verify that configure runs from the source tree.
AC_CONFIG_SRCDIR([Makefile.am])
AM_INIT_AUTOMAKE
AC_CONFIG_HEADERS([config.h])
dnl Additional macro files are looked up in the m4 directory.
AC_CONFIG_MACRO_DIR([m4])
dnl Checks for programs.
AC_PROG_AWK
AC_PROG_CC
AC_PROG_INSTALL
AC_PROG_LN_S
LT_INIT
dnl Large file support. AC_SYS_EXTRA_LARGEFILE is defined outside this file
dnl - presumably in the project's acinclude.m4; TODO confirm. The three tests
dnl below turn a cached answer of "no" into an empty string so that the values
dnl can be spliced into the flag variables unconditionally.
AC_SYS_EXTRA_LARGEFILE
if test "x$ac_cv_sys_largefile_CFLAGS" = "xno" ; then
ac_cv_sys_largefile_CFLAGS=""
fi
if test "x$ac_cv_sys_largefile_LDFLAGS" = "xno" ; then
ac_cv_sys_largefile_LDFLAGS=""
fi
if test "x$ac_cv_sys_largefile_LIBS" = "xno" ; then
ac_cv_sys_largefile_LIBS=""
fi
dnl Large file compiler and linker flags are placed before the user's flags,
dnl the large file libraries after the existing LIBS.
CFLAGS="$ac_cv_sys_largefile_CFLAGS $CFLAGS"
LDFLAGS="$ac_cv_sys_largefile_LDFLAGS $LDFLAGS"
LIBS="$LIBS $ac_cv_sys_largefile_LIBS"
dnl Checks for headers
AC_CHECK_HEADERS([getopt.h math.h])
dnl Checks for libraries.
LT_LIB_M
dnl NOTE(review): the library lists LIBS and LIBM are folded into LDFLAGS here
dnl instead of being kept in LIBS - confirm that this is intentional.
LDFLAGS="$LIBS $LIBM $LDFLAGS"
CFLAGS="-Wall $CFLAGS"
AC_PROG_CXX
dnl Fixed C++ flags come first; user supplied CXXFLAGS are appended after them.
CXXFLAGS="-Wall -Wno-unused-function -DNDEBUG -O3 -fomit-frame-pointer $CXXFLAGS"
dnl AC_ENABLE_CXXMPH is defined outside this file; presumably it sets the
dnl shell variable cxxmph to "true" when the C++ library is requested - verify.
AC_ENABLE_CXXMPH
if test x$cxxmph = xtrue; then
dnl AC_COMPILE_STDCXX_0X fills the ac_cv_cxx_compile_* cache variables tested
dnl below. When the compiler does not accept the C++0x test program natively,
dnl the first -std flag that works is appended to CXXFLAGS, tried in the order
dnl c++11, c++0x, gnu++0x; configure aborts when none of them works.
AC_COMPILE_STDCXX_0X
if test x$ac_cv_cxx_compile_cxx0x_native = "xno"; then
if test x$ac_cv_cxx_compile_cxx11_cxx = "xyes"; then
CXXFLAGS="$CXXFLAGS -std=c++11"
elif test x$ac_cv_cxx_compile_cxx0x_cxx = "xyes"; then
CXXFLAGS="$CXXFLAGS -std=c++0x"
elif test x$ac_cv_cxx_compile_cxx0x_gxx = "xyes"; then
CXXFLAGS="$CXXFLAGS -std=gnu++0x"
else
AC_MSG_ERROR("cxxmph demands a working c++0x compiler.")
fi
fi
AC_SUBST([CXXMPH], "cxxmph")
fi
dnl Automake conditional that is true only when cxxmph equals "true".
AM_CONDITIONAL([USE_CXXMPH], [test "$cxxmph" = true])
dnl AC_ENABLE_BENCHMARKS is defined outside this file; presumably it sets the
dnl shell variable benchmarks - verify. The header probe runs with C++ selected.
AC_ENABLE_BENCHMARKS
if test x$benchmarks = xtrue; then
AC_LANG_PUSH([C++])
AC_CHECK_HEADERS([hopscotch_map.h])
AC_LANG_POP([C++])
fi
AM_CONDITIONAL([USE_BENCHMARKS], [test "$benchmarks" = true])
# Unit tests based on the check library. Disabled by default.
# We do not use pkg-config because it is inconvenient for all developers to
# have check library installed.
AC_ARG_ENABLE(check, AS_HELP_STRING(
[--enable-check],
[Build unit tests depending on check library (default: disabled)]))
dnl With --enable-check both the library (symbol tcase_create) and the header
dnl check.h must be found, otherwise configure stops with an error.
AS_IF([test "x$enable_check" = "xyes"],
[ AC_CHECK_LIB([check], [tcase_create])
AS_IF([test "$ac_cv_lib_check_tcase_create" = yes], [CHECK_LIBS="-lcheck"],
[AC_MSG_ERROR("Failed to find check library (http://check.sf.net).")])
AC_CHECK_HEADER(check.h,[],
[AC_MSG_ERROR("Failed to find check library header (http://check.sf.net).")])
])
dnl The cache variable is only set by the probe above, so this conditional is
dnl false whenever --enable-check was not given.
AM_CONDITIONAL([USE_LIBCHECK], [test "$ac_cv_lib_check_tcase_create" = yes])
AC_SUBST(CHECK_LIBS)
dnl NOTE(review): CHECK_CFLAGS is never assigned in this file; it is
dnl substituted with whatever value it already has - confirm.
AC_SUBST(CHECK_CFLAGS)
dnl AC_CHECK_SPOON is defined outside this file; its purpose is not visible here.
AC_CHECK_SPOON
dnl Files generated from their .in templates when configure finishes.
AC_CONFIG_FILES([Makefile src/Makefile cxxmph/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc cxxmph.pc])
AC_OUTPUT
cmph-2.0.2/BMZ.t2t 0000644 0001750 0001750 00000052445 13411542035 013077 0 ustar joseph joseph BMZ Algorithm
%!includeconf: CONFIG.t2t
----------------------------------------
==History==
At the end of 2003, professor [Nivio Ziviani http://www.dcc.ufmg.br/~nivio] was
finishing the second edition of his [book http://www.dcc.ufmg.br/algoritmos/].
During the [book http://www.dcc.ufmg.br/algoritmos/] writing,
professor [Nivio Ziviani http://www.dcc.ufmg.br/~nivio] studied the problem of generating
[minimal perfect hash functions concepts.html]
(if you are not familiar with this problem, see [[1 #papers]][[2 #papers]]).
Professor [Nivio Ziviani http://www.dcc.ufmg.br/~nivio] coded a modified version of
the [CHM algorithm chm.html], which was proposed by
Czech, Havas and Majewski, and put it in his [book http://www.dcc.ufmg.br/algoritmos/].
The [CHM algorithm chm.html] is based on acyclic random graphs to generate
[order preserving minimal perfect hash functions concepts.html] in linear time.
Professor [Nivio Ziviani http://www.dcc.ufmg.br/~nivio]
asked himself: why must the random graph
be acyclic? In the modified version available in his [book http://www.dcc.ufmg.br/algoritmos/] he got rid of this restriction.
The modification presented a problem: it was impossible to generate minimal perfect hash functions
for sets with more than 1000 keys.
At the same time, [Fabiano C. Botelho http://www.dcc.ufmg.br/~fbotelho],
a master's degree student at the [Department of Computer Science http://www.dcc.ufmg.br] of the
[Federal University of Minas Gerais http://www.ufmg.br],
started to be advised by [Nivio Ziviani http://www.dcc.ufmg.br/~nivio] who presented the problem
to [Fabiano http://www.dcc.ufmg.br/~fbotelho].
During the master's program, [Fabiano http://www.dcc.ufmg.br/~fbotelho] and
[Nivio Ziviani http://www.dcc.ufmg.br/~nivio] faced lots of problems.
In April 2004, [Fabiano http://www.dcc.ufmg.br/~fbotelho] was talking with a
friend of his (David Menoti) about the problems
and many ideas appeared.
The ideas were implemented and a very fast algorithm to generate
minimal perfect hash functions was designed.
We refer to the algorithm as **BMZ**, because it was conceived by Fabiano C. **B**otelho,
David **M**enoti and Nivio **Z**iviani. The algorithm is described in [[1 #papers]].
To analyse BMZ algorithm we needed some results from the random graph theory, so
we invited professor [Yoshiharu Kohayakawa http://www.ime.usp.br/~yoshi] to help us.
The final description and analysis of BMZ algorithm is presented in [[2 #papers]].
----------------------------------------
==The Algorithm==
The BMZ algorithm shares several features with the [CHM algorithm chm.html].
In particular, BMZ algorithm is also
based on the generation of random graphs [figs/img27.png], where [figs/img28.png] is in
one-to-one correspondence with the key set [figs/img20.png] for which we wish to
generate a [minimal perfect hash function concepts.html].
The two main differences between BMZ algorithm and CHM algorithm
are as follows: (//i//) BMZ algorithm generates random
graphs [figs/img27.png] with [figs/img29.png] and [figs/img30.png], where [figs/img31.png],
and hence [figs/img32.png] necessarily contains cycles,
while CHM algorithm generates //acyclic// random
graphs [figs/img27.png] with [figs/img29.png] and [figs/img30.png],
with a greater number of vertices: [figs/img33.png];
(//ii//) CHM algorithm generates [order preserving minimal perfect hash functions concepts.html]
while BMZ algorithm does not preserve order. Thus, BMZ algorithm improves
the space requirement at the expense of generating functions that are not
order preserving.
Suppose [figs/img14.png] is a universe of //keys//.
Let [figs/img17.png] be a set of [figs/img8.png] keys from [figs/img14.png].
Let us show how the BMZ algorithm constructs a minimal perfect hash function [figs/img7.png].
We make use of two auxiliary random functions [figs/img41.png] and [figs/img55.png],
where [figs/img56.png] for some suitably chosen integer [figs/img57.png],
where [figs/img58.png]. We build a random graph [figs/img59.png] on [figs/img60.png],
whose edge set is [figs/img61.png]. There is an edge in [figs/img32.png] for each
key in the set of keys [figs/img20.png].
In what follows, we shall be interested in the //2-core// of
the random graph [figs/img32.png], that is, the maximal subgraph
of [figs/img32.png] with minimal degree at
least 2 (see [[2 #papers]] for details).
Because of its importance in our context, we call the 2-core the
//critical// subgraph of [figs/img32.png] and denote it by [figs/img63.png].
The vertices and edges in [figs/img63.png] are said to be //critical//.
We let [figs/img64.png] and [figs/img65.png].
Moreover, we let [figs/img66.png] be the set of //non-critical//
vertices in [figs/img32.png].
We also let [figs/img67.png] be the set of all critical
vertices that have at least one non-critical vertex as a neighbour.
Let [figs/img68.png] be the set of //non-critical// edges in [figs/img32.png].
Finally, we let [figs/img69.png] be the //non-critical// subgraph
of [figs/img32.png].
The non-critical subgraph [figs/img70.png] corresponds to the //acyclic part//
of [figs/img32.png].
We have [figs/img71.png].
We then construct a suitable labelling [figs/img72.png] of the vertices
of [figs/img32.png]: we choose [figs/img73.png] for each [figs/img74.png] in such
a way that [figs/img75.png] ([figs/img18.png]) is a
minimal perfect hash function for [figs/img20.png].
This labelling [figs/img37.png] can be found in linear time
if the number of edges in [figs/img63.png] is at most [figs/img76.png] (see [[2 #papers]]
for details).
Figure 1 presents a pseudo code for the BMZ algorithm.
The procedure BMZ ([figs/img20.png], [figs/img37.png]) receives as input the set of
keys [figs/img20.png] and produces the labelling [figs/img37.png].
The method uses a mapping, ordering and searching approach.
We now describe each step.
| procedure BMZ ([figs/img20.png], [figs/img37.png])
| Mapping ([figs/img20.png], [figs/img32.png]);
| Ordering ([figs/img32.png], [figs/img63.png], [figs/img70.png]);
| Searching ([figs/img32.png], [figs/img63.png], [figs/img70.png], [figs/img37.png]);
| **Figure 1**: Main steps of BMZ algorithm for constructing a minimal perfect hash function
----------------------------------------
===Mapping Step===
The procedure Mapping ([figs/img20.png], [figs/img32.png]) receives as input the set
of keys [figs/img20.png] and generates the random graph [figs/img59.png], by generating
two auxiliary functions [figs/img41.png], [figs/img78.png].
The functions [figs/img41.png] and [figs/img42.png] are constructed as follows.
We impose some upper bound [figs/img79.png] on the lengths of the keys in [figs/img20.png].
To define [figs/img80.png] ([figs/img81.png], [figs/img62.png]), we generate
an [figs/img82.png] table of random integers [figs/img83.png].
For a key [figs/img18.png] of length [figs/img84.png] and [figs/img85.png], we let
| [figs/img86.png]
The random graph [figs/img59.png] has vertex set [figs/img56.png] and
edge set [figs/img61.png]. We need [figs/img32.png] to be
simple, i.e., [figs/img32.png] should have neither loops nor multiple edges.
A loop occurs when [figs/img87.png] for some [figs/img18.png].
We solve this in an ad hoc manner: we simply let [figs/img88.png] in this case.
If we still find a loop after this, we generate another pair [figs/img89.png].
When a multiple edge occurs we abort and generate a new pair [figs/img89.png].
Although the function above causes [collisions concepts.html] with probability //1/t//,
in [cmph library index.html] we use faster hash
functions ([DJB2 hash http://www.cs.yorku.ca/~oz/hash.html], [FNV hash http://www.isthe.com/chongo/tech/comp/fnv/],
[Jenkins hash http://burtleburtle.net/bob/hash/doobs.html] and [SDBM hash http://www.cs.yorku.ca/~oz/hash.html])
in which we do not need to impose any upper bound [figs/img79.png] on the lengths of the keys in [figs/img20.png].
As mentioned before, for us to find the labelling [figs/img72.png] of the
vertices of [figs/img59.png] in linear time,
we require that [figs/img108.png].
The crucial step now is to determine the value
of [figs/img1.png] (in [figs/img57.png]) to obtain a random
graph [figs/img71.png] with [figs/img109.png].
Botelho, Menoti and Ziviani determined empirically in [[1 #papers]] that
the value of [figs/img1.png] is //1.15//. This value is remarkably
close to the theoretical value determined in [[2 #papers]],
which is around [figs/img112.png].
----------------------------------------
===Ordering Step===
The procedure Ordering ([figs/img32.png], [figs/img63.png], [figs/img70.png]) receives
as input the graph [figs/img32.png] and partitions [figs/img32.png] into the two
subgraphs [figs/img63.png] and [figs/img70.png], so that [figs/img71.png].
Figure 2 presents a sample graph with 9 vertices
and 8 edges, where the degree of a vertex is shown beside each vertex.
Initially, all vertices with degree 1 are added to a queue [figs/img136.png].
For the example shown in Figure 2(a), [figs/img137.png] after the initialization step.
| [figs/img138.png]
| **Figure 2:** Ordering step for a graph with 9 vertices and 8 edges.
Next, we remove one vertex [figs/img139.png] from the queue, decrement its degree and
the degree of the vertices with degree greater than 0 in the adjacency
list of [figs/img139.png], as depicted in Figure 2(b) for [figs/img140.png].
At this point, the adjacencies of [figs/img139.png] with degree 1 are
inserted into the queue, such as vertex 1.
This process is repeated until the queue becomes empty.
All vertices with degree 0 are non-critical vertices and the others are
critical vertices, as depicted in Figure 2(c).
Finally, to determine the vertices in [figs/img141.png] we collect all
vertices [figs/img142.png] with at least one vertex [figs/img143.png] that
is in Adj[figs/img144.png] and in [figs/img145.png], as the vertex 8 in Figure 2(c).
----------------------------------------
===Searching Step===
In the searching step, the key part is
the //perfect assignment problem//: find [figs/img153.png] such that
the function [figs/img154.png] defined by
| [figs/img155.png]
is a bijection from [figs/img156.png] to [figs/img157.png] (recall [figs/img158.png]).
We are interested in a labelling [figs/img72.png] of
the vertices of the graph [figs/img59.png] with
the property that if [figs/img11.png] and [figs/img22.png] are keys
in [figs/img20.png], then [figs/img159.png]; that is, if we associate
to each edge the sum of the labels on its endpoints, then these values
should be all distinct.
Moreover, we require that all the sums [figs/img160.png] ([figs/img18.png])
fall between [figs/img115.png] and [figs/img161.png], and thus we have a bijection
between [figs/img20.png] and [figs/img157.png].
The procedure Searching ([figs/img32.png], [figs/img63.png], [figs/img70.png], [figs/img37.png])
receives as input [figs/img32.png], [figs/img63.png], [figs/img70.png] and finds a
suitable [figs/img162.png] bit value for each vertex [figs/img74.png], stored in the
array [figs/img37.png].
This step is first performed for the vertices in the
critical subgraph [figs/img63.png] of [figs/img32.png] (the 2-core of [figs/img32.png])
and then it is performed for the vertices in [figs/img70.png] (the non-critical subgraph
of [figs/img32.png] that contains the "acyclic part" of [figs/img32.png]).
The reason the assignment of the [figs/img37.png] values is first
performed on the vertices in [figs/img63.png] is to resolve reassignments
as early as possible (such reassignments are consequences of the cycles
in [figs/img63.png] and are depicted hereinafter).
----------------------------------------
====Assignment of Values to Critical Vertices====
The labels [figs/img73.png] ([figs/img142.png])
are assigned in increasing order following a greedy
strategy where the critical vertices [figs/img139.png] are considered one at a time,
according to a breadth-first search on [figs/img63.png].
If a candidate value [figs/img11.png] for [figs/img73.png] is forbidden
because setting [figs/img163.png] would create two edges with the same sum,
we try [figs/img164.png] for [figs/img73.png]. This fact is referred to
as a //reassignment//.
Let [figs/img165.png] be the set of addresses assigned to edges in [figs/img166.png].
Initially [figs/img167.png].
Let [figs/img11.png] be a candidate value for [figs/img73.png].
Initially [figs/img168.png].
Considering the subgraph [figs/img63.png] in Figure 2(c),
a step by step example of the assignment of values to vertices in [figs/img63.png] is
presented in Figure 3.
Initially, a vertex [figs/img139.png] is chosen, the assignment [figs/img163.png] is made
and [figs/img11.png] is set to [figs/img164.png].
For example, suppose that vertex [figs/img169.png] in Figure 3(a) is
chosen, the assignment [figs/img170.png] is made and [figs/img11.png] is set to [figs/img96.png].
| [figs/img171.png]
| **Figure 3:** Example of the assignment of values to critical vertices.
In Figure 3(b), following the adjacency list of vertex [figs/img169.png],
the unassigned vertex [figs/img115.png] is reached.
At this point, we collect in the temporary variable [figs/img172.png] all adjacencies
of vertex [figs/img115.png] that have been assigned an [figs/img11.png] value,
and [figs/img173.png].
Next, for all [figs/img174.png], we check if [figs/img175.png].
Since [figs/img176.png], then [figs/img177.png] is set
to [figs/img96.png], [figs/img11.png] is incremented
by 1 (now [figs/img178.png]) and [figs/img179.png].
Next, vertex [figs/img180.png] is reached, [figs/img181.png] is set
to [figs/img62.png], [figs/img11.png] is set to [figs/img180.png] and [figs/img182.png].
Next, vertex [figs/img183.png] is reached and [figs/img184.png].
Since [figs/img185.png] and [figs/img186.png], then [figs/img187.png] is
set to [figs/img180.png], [figs/img11.png] is set to [figs/img183.png] and [figs/img188.png].
Finally, vertex [figs/img189.png] is reached and [figs/img190.png].
Since [figs/img191.png], [figs/img11.png] is incremented by 1 and set to 5, as depicted in
Figure 3(c).
Since [figs/img192.png], [figs/img11.png] is again incremented by 1 and set to 6,
as depicted in Figure 3(d).
These two reassignments are indicated by the arrows in Figure 3.
Since [figs/img193.png] and [figs/img194.png], then [figs/img195.png] is set
to [figs/img196.png] and [figs/img197.png]. This finishes the algorithm.
----------------------------------------
====Assignment of Values to Non-Critical Vertices====
As [figs/img70.png] is acyclic, we can impose the order in which addresses are
associated with edges in [figs/img70.png], making this step simple to solve
by a standard depth first search algorithm.
Therefore, in the assignment of values to vertices in [figs/img70.png] we
benefit from the unused addresses in the gaps left by the assignment of values
to vertices in [figs/img63.png].
For that, we start the depth-first search from the vertices in [figs/img141.png] because
the [figs/img37.png] values for these critical vertices were already assigned
and cannot be changed.
Considering the subgraph [figs/img70.png] in Figure 2(c),
a step by step example of the assignment of values to vertices in [figs/img70.png] is
presented in Figure 4.
Figure 4(a) presents the initial state of the algorithm.
The critical vertex 8 is the only one that has non-critical vertices as
adjacent.
In the example presented in Figure 3, the addresses [figs/img198.png] were not used.
So, taking the first unused address [figs/img115.png] and the vertex [figs/img96.png],
which is reached from the vertex [figs/img169.png], [figs/img199.png] is set
to [figs/img200.png], as shown in Figure 4(b).
The only vertex that is reached from vertex [figs/img96.png] is vertex [figs/img62.png], so
taking the unused address [figs/img183.png] we set [figs/img201.png] to [figs/img202.png],
as shown in Figure 4(c).
This process is repeated until the UnAssignedAddresses list becomes empty.
| [figs/img203.png]
| **Figure 4:** Example of the assignment of values to non-critical vertices.
----------------------------------------
==The Heuristic==[heuristic]
We now present a heuristic for the BMZ algorithm that
reduces the value of [figs/img1.png] to any given value between //1.15// and //0.93//.
This reduces the space requirement to store the resulting function
to any given value between [figs/img12.png] words and [figs/img13.png] words.
The heuristic reuses, when possible, the set
of [figs/img11.png] values that caused reassignments, just before
trying [figs/img164.png].
Decreasing the value of [figs/img1.png] leads to an increase in the number of
iterations to generate [figs/img32.png].
For example, for [figs/img244.png] and [figs/img6.png], the analytical expected numbers
of iterations are [figs/img245.png] and [figs/img246.png], respectively (see [[2 #papers]]
for details),
while for [figs/img128.png] the same value is around //2.13//.
----------------------------------------
==Memory Consumption==
Now we detail the memory consumption to generate and to store minimal perfect hash functions
using the BMZ algorithm. The structures responsible for memory consumption are the
following:
- Graph:
+ **first**: is a vector that stores //cn// integer numbers, each one representing
the first edge (index in the vector edges) in the list of
edges of each vertex.
The integer numbers are 4 bytes long. Therefore,
the vector first is stored in //4cn// bytes.
+ **edges**: is a vector to represent the edges of the graph. As each edge
is composed of a pair of vertices, each entry stores two integer numbers
of 4 bytes that represent the vertices. As there are //n// edges, the
vector edges is stored in //8n// bytes.
+ **next**: given a vertex [figs/img139.png], we can discover the edges that
contain [figs/img139.png] following its list of edges,
which starts on first[[figs/img139.png]] and the next
edges are given by next[...first[[figs/img139.png]]...]. Therefore, the vectors first and next represent
the linked lists of edges of each vertex. As there are two vertices for each edge,
when an edge is inserted in the graph, it must be inserted in the two linked lists
of the vertices in its composition. Therefore, there are //2n// entries of integer
numbers in the vector next, so it is stored in //4*2n = 8n// bytes.
+ **critical vertices(critical_nodes vector)**: is a vector of //cn// bits,
where each bit indicates if a vertex is critical (1) or non-critical (0).
Therefore, the critical and non-critical vertices are represented in //cn/8// bytes.
+ **critical edges (used_edges vector)**: is a vector of //n// bits, where each
bit indicates if an edge is critical (1) or non-critical (0). Therefore, the
critical and non-critical edges are represented in //n/8// bytes.
- Other auxiliary structures
+ **queue**: is a queue of integer numbers used in the breadth-first search of the
assignment of values to critical vertices. There is an entry in the queue for
each two critical vertices. Let [figs/img110.png] be the expected number of critical
vertices. Therefore, the queue is stored in //4*0.5*[figs/img110.png]=2[figs/img110.png]// bytes.
+ **visited**: is a vector of //cn// bits, where each bit indicates if the g value of
a given vertex was already defined. Therefore, the vector visited is stored
in //cn/8// bytes.
+ **function //g//**: is represented by a vector of //cn// integer numbers.
As each integer number is 4 bytes long, the function //g// is stored in
//4cn// bytes.
Thus, the total memory consumption of BMZ algorithm for generating a minimal
perfect hash function (MPHF) is: //(8.25c + 16.125)n +2[figs/img110.png] + O(1)// bytes.
As the value of constant //c// may be 0.93 or 1.15, we have:
|| //c// | [figs/img110.png] | Memory consumption to generate a MPHF |
| 0.93 | //0.497n// | //24.80n + O(1)// |
| 1.15 | //0.401n// | //26.42n + O(1)// |
| **Table 1:** Memory consumption to generate a MPHF using the BMZ algorithm.
The values of [figs/img110.png] were calculated using Eq.(1) presented in [[2 #papers]].
Now we present the memory consumption to store the resulting function.
We only need to store the //g// function. Thus, we need //4cn// bytes.
Again we have:
|| //c// | Memory consumption to store a MPHF |
| 0.93 | //3.72n// |
| 1.15 | //4.60n// |
| **Table 2:** Memory consumption to store a MPHF generated by the BMZ algorithm.
----------------------------------------
==Experimental Results==
[CHM x BMZ comparison.html]
----------------------------------------
==Papers==[papers]
+ [F. C. Botelho http://www.dcc.ufmg.br/~fbotelho], D. Menoti, [N. Ziviani http://www.dcc.ufmg.br/~nivio]. [A New algorithm for constructing minimal perfect hash functions papers/bmz_tr004_04.ps], Technical Report TR004/04, Department of Computer Science, Federal University of Minas Gerais, 2004.
+ [F. C. Botelho http://www.dcc.ufmg.br/~fbotelho], Y. Kohayakawa, and [N. Ziviani http://www.dcc.ufmg.br/~nivio]. [A Practical Minimal Perfect Hashing Method papers/wea05.pdf]. //4th International Workshop on Efficient and Experimental Algorithms (WEA05),// Springer-Verlag Lecture Notes in Computer Science, vol. 3505, Santorini Island, Greece, May 2005, 488-500.
%!include: ALGORITHMS.t2t
%!include: FOOTER.t2t
%!include(html): ''GOOGLEANALYTICS.t2t'' cmph-2.0.2/cmph.pc.in 0000644 0001750 0001750 00000000375 13411542035 013667 0 ustar joseph joseph url=http://cmph.sourceforge.net/
prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
Name: cmph
Description: minimal perfect hashing library
Version: @VERSION@
Libs: -L${libdir} -lcmph
Cflags: -I${includedir}
URL: ${url}
cmph-2.0.2/FAQ.t2t 0000644 0001750 0001750 00000003177 13411542035 013054 0 ustar joseph joseph CMPH FAQ
%!includeconf: CONFIG.t2t
- How do I define the ids of the keys?
- You don't. The ids will be assigned by the algorithm creating the minimal
perfect hash function. If the algorithm creates an **ordered** minimal
perfect hash function, the ids will be the indices of the keys in the
input. Otherwise, you have no guarantee of the distribution of the ids.
- Why do I always get the error "Unable to create minimum perfect hashing function"?
- The algorithms do not guarantee that a minimal perfect hash function can
be created. In practice, it will always work if your input
is big enough (>100 keys).
The error is probably because you have duplicated
keys in the input. You must guarantee that the keys are unique in the
input. If you are using a UN*X based OS, try doing
``` #sort input.txt | uniq > input_uniq.txt
and run cmph with input_uniq.txt
- Why is the default (jenkins) hash function still executed after I change the
hash function using the cmph_config_set_hashfuncs function?
- Probably you are using the cmph_config_set_algo function after
the cmph_config_set_hashfuncs. Therefore, the default hash function
is reset when you call the cmph_config_set_algo function.
- What do I do when I get the following error?
- Error: **error while loading shared libraries: libcmph.so.0: cannot open shared object file: No such file or directory**
- Solution: type **export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/** at the shell or put that shell command
in your .profile file or in the /etc/profile file.
%!include: ALGORITHMS.t2t
%!include: FOOTER.t2t
%!include(html): ''GOOGLEANALYTICS.t2t'' cmph-2.0.2/CONFIG.t2t 0000644 0001750 0001750 00000011115 13411542035 013401 0 ustar joseph joseph %! style(html): DOC.css
%! PreProc(html): '^%html% ' ''
%! PreProc(txt): '^%txt% ' ''
%! PostProc(html): "&amp;" "&"
%! PostProc(txt): " " " "
%! PostProc(html): 'ALIGN="middle" SRC="figs/img7.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img7.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img57.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img57.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img32.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img32.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img20.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img20.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img60.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img60.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img62.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img62.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img79.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img79.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img139.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img139.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img140.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img140.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img143.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img143.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img115.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img115.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img11.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img11.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img169.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img169.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img96.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img96.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img178.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img178.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img180.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img180.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img183.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img183.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img189.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img189.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img196.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img196.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img172.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img172.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img8.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img8.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img1.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img1.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img14.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img14.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img128.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img128.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img112.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img112.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img12.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img12.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img13.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img13.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img244.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img244.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img245.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img245.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img246.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img246.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img15.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img15.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img25.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img25.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img168.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img168.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img6.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img6.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img5.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img5.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img28.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img28.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img237.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img237.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img248.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img248.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img248.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img248.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img249.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img249.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/img250.png"(.*?)>' 'ALIGN="bottom" SRC="figs/img250.png"\1>'
%! PostProc(html): 'ALIGN="middle" SRC="figs/bdz/img8.png"(.*?)>' 'ALIGN="bottom" SRC="figs/bdz/img8.png"\1>'
% The ^ needs to be escaped by \
%!postproc(html): \^\^(.*?)\^\^ \1
%!postproc(html): ,,(.*?),, \1
cmph-2.0.2/LGPL-2 0000644 0001750 0001750 00000064212 13411542035 012627 0 ustar joseph joseph Most components of the "acl" package are licensed under
Version 2.1 of the GNU Lesser General Public License (see below).
below.
Some components (as annotated in the source) are licensed
under Version 2 of the GNU General Public License (see COPYING).
----------------------------------------------------------------------
GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control compilation
and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply,
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License may add
an explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms of the
ordinary General Public License).
To apply these terms, attach the following notices to the library. It is
safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.
<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!
cmph-2.0.2/NEWSLOG.t2t 0000644 0001750 0001750 00000007470 13411542035 013563 0 ustar joseph joseph News Log
%!includeconf: CONFIG.t2t
----------------------------------------
==News for version 1.1==
Fixed a bug in the chd_pc algorithm and reorganized tests.
==News for version 1.0==
This is a bugfix only version, after which a revamp of the cmph code and
algorithms will be done.
----------------------------------------
==News for version 0.9==
- [The CHD algorithm chd.html], which is an algorithm that can be tuned to generate MPHFs that require approximately 2.07 bits per key to be stored. The algorithm outperforms [the BDZ algorithm bdz.html] and therefore is the fastest one available in the literature for sets that can be treated in internal memory.
- [The CHD_PH algorithm chd.html], which is an algorithm to generate PHFs with load factor up to //99 %//. It is actually the CHD algorithm without the ranking step. If we set the load factor to //81 %//, which is the maximum that can be obtained with [the BDZ algorithm bdz.html], the resulting functions can be stored in //1.40// bits per key. The space requirement increases with the load factor.
- All reported bugs and suggestions have been corrected and included as well.
----------------------------------------
==News for version 0.8==
- [An algorithm to generate MPHFs that require around 2.6 bits per key to be stored bdz.html], which is referred to as BDZ algorithm. The algorithm is the fastest one available in the literature for sets that can be treated in internal memory.
- [An algorithm to generate PHFs with range m = cn, for c > 1.22 bdz.html], which is referred to as BDZ_PH algorithm. It is actually the BDZ algorithm without the ranking step. The resulting functions can be stored in 1.95 bits per key for //c = 1.23// and are considerably faster than the MPHFs generated by the BDZ algorithm.
- An adapter to support a vector of struct as the source of keys has been added.
- An API to support the ability of packing a perfect hash function into a preallocated contiguous memory space. The computation of a packed function is still faster and can be easily mmapped.
- The hash functions djb2, fnv and sdbm were removed because they do not use random seeds and therefore are not useful for MPHFs algorithms.
- All reported bugs and suggestions have been corrected and included as well.
----------------------------------------
==News for version 0.7==
- Added man pages and a pkgconfig file.
----------------------------------------
==News for version 0.6==
- [An algorithm to generate MPHFs that require less than 4 bits per key to be stored fch.html], which is referred to as FCH algorithm. The algorithm is only efficient for small sets.
- The FCH algorithm is integrated with [BRZ algorithm brz.html] so that you will be able to efficiently generate space-efficient MPHFs for sets on the order of billions of keys.
- All reported bugs and suggestions have been corrected and included as well.
----------------------------------------
==News for version 0.5==
- A thread safe vector adapter has been added.
- [A new algorithm for sets on the order of billions of keys that requires approximately 8.1 bits per key to store the resulting MPHFs. brz.html]
- All reported bugs and suggestions have been corrected and included as well.
----------------------------------------
==News for version 0.4==
- Vector Adapter has been added.
- An optimized version of bmz (bmz8) for small sets of keys (at most 256 keys) has been added.
- All reported bugs and suggestions have been corrected and included as well.
----------------------------------------
==News for version 0.3==
- A new heuristic added to the bmz algorithm makes it possible to generate an MPHF using only
//24.80n + O(1)// bytes. The resulting function can be stored in //3.72n// bytes.
%html% [click here bmz.html#heuristic] for details.
%!include: ALGORITHMS.t2t
%!include: FOOTER.t2t
%!include(html): ''GOOGLEANALYTICS.t2t''
cmph-2.0.2/examples/ 0000755 0001750 0001750 00000000000 13411542035 013620 5 ustar joseph joseph cmph-2.0.2/examples/keys.txt 0000644 0001750 0001750 00000000156 13411542035 015336 0 ustar joseph joseph aaaaaaaaaa
bbbbbbbbbb
cccccccccc
dddddddddd
eeeeeeeeee
ffffffffff
gggggggggg
hhhhhhhhhh
iiiiiiiiii
jjjjjjjjjj
cmph-2.0.2/examples/struct_vector_adapter_ex3.c 0000644 0001750 0001750 00000003146 13411542035 021155 0 ustar joseph joseph #include
#include
// Create minimal perfect hash function from in-memory vector
// Record stored in the in-memory vector handed to the struct-vector adapter.
// The layout is byte-packed so that `key` starts exactly sizeof(cmph_uint32)
// bytes into the record, with no compiler-inserted padding.
// push/pop is used instead of pack(1)/pack(0): 0 is not a valid packing value
// for every compiler, whereas pop restores whatever packing was in effect.
#pragma pack(push, 1)
typedef struct {
    cmph_uint32 id;    // record identifier
    char key[11];      // 10 key characters plus the terminating NUL
    cmph_uint32 year;  // payload that is not part of the key
} rec_t;
#pragma pack(pop)
// Builds a BDZ minimal perfect hash over the `key` field of an in-memory
// array of rec_t, dumps it to "temp_struct_vector.mph", reloads it from that
// file and prints the hash value of every key to stderr.
// Returns 0 on success and 1 if the file cannot be opened or the hash
// function cannot be built or reloaded.
int main(int argc, char **argv)
{
    // Creating a filled vector
    unsigned int i = 0;
    rec_t vector[10] = {{1, "aaaaaaaaaa", 1999}, {2, "bbbbbbbbbb", 2000}, {3, "cccccccccc", 2001},
                        {4, "dddddddddd", 2002}, {5, "eeeeeeeeee", 2003}, {6, "ffffffffff", 2004},
                        {7, "gggggggggg", 2005}, {8, "hhhhhhhhhh", 2006}, {9, "iiiiiiiiii", 2007},
                        {10,"jjjjjjjjjj", 2008}};
    unsigned int nkeys = 10;
    FILE *mphf_fd = NULL;
    cmph_io_adapter_t *source = NULL;
    cmph_config_t *config = NULL;
    cmph_t *hash = NULL;

    (void)argc;
    (void)argv;

    mphf_fd = fopen("temp_struct_vector.mph", "wb");
    if (mphf_fd == NULL)
    {
        fprintf(stderr, "Unable to create \"temp_struct_vector.mph\"\n");
        return 1;
    }

    // Source of keys: each key is the 11-byte `key` field (NUL included),
    // located sizeof(cmph_uint32) bytes into every record.
    source = cmph_io_struct_vector_adapter(vector, (cmph_uint32)sizeof(rec_t), (cmph_uint32)sizeof(cmph_uint32), 11, nkeys);

    //Create minimal perfect hash function using the BDZ algorithm.
    config = cmph_config_new(source);
    cmph_config_set_algo(config, CMPH_BDZ);
    cmph_config_set_mphf_fd(config, mphf_fd);
    hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL)
    {
        // Without this check cmph_dump() would be handed a NULL hash.
        fprintf(stderr, "Unable to create minimal perfect hash function\n");
        cmph_io_vector_adapter_destroy(source);
        fclose(mphf_fd);
        return 1;
    }
    cmph_dump(hash, mphf_fd);
    cmph_destroy(hash);
    fclose(mphf_fd);

    //Find key
    mphf_fd = fopen("temp_struct_vector.mph", "rb");
    if (mphf_fd == NULL)
    {
        fprintf(stderr, "Unable to reopen \"temp_struct_vector.mph\"\n");
        cmph_io_vector_adapter_destroy(source);
        return 1;
    }
    hash = cmph_load(mphf_fd);
    if (hash == NULL)
    {
        fprintf(stderr, "Unable to load minimal perfect hash function\n");
        cmph_io_vector_adapter_destroy(source);
        fclose(mphf_fd);
        return 1;
    }
    while (i < nkeys) {
        const char *key = vector[i].key;
        // Same key length (11) as the one given to the adapter above.
        unsigned int id = cmph_search(hash, key, 11);
        fprintf(stderr, "key:%s -- hash:%u\n", key, id);
        i++;
    }

    //Destroy hash
    cmph_destroy(hash);
    cmph_io_vector_adapter_destroy(source);
    fclose(mphf_fd);
    return 0;
}
cmph-2.0.2/examples/file_adapter_ex2.c 0000644 0001750 0001750 00000001552 13411542035 017164 0 ustar joseph joseph #include
#include
#include
// Create minimal perfect hash function from in-disk keys using BDZ algorithm
// Builds a BDZ minimal perfect hash from the newline-separated keys in
// "keys.txt" and prints the hash value of the key "jjjjjjjjjj" to stderr.
// Returns 0 on success and 1 if the key file cannot be opened or the hash
// function cannot be built.
int main(int argc, char **argv)
{
    //Open file with newline separated list of keys
    FILE * keys_fd = fopen("keys.txt", "r");
    cmph_t *hash = NULL;
    cmph_io_adapter_t *source = NULL;
    cmph_config_t *config = NULL;
    const char *key = "jjjjjjjjjj";
    unsigned int id;

    (void)argc;
    (void)argv;

    if (keys_fd == NULL)
    {
        fprintf(stderr, "File \"keys.txt\" not found\n");
        return 1;
    }

    // Source of keys
    source = cmph_io_nlfile_adapter(keys_fd);

    config = cmph_config_new(source);
    cmph_config_set_algo(config, CMPH_BDZ);
    hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL)
    {
        // Without this check cmph_search() would be handed a NULL hash.
        fprintf(stderr, "Unable to create minimal perfect hash function\n");
        cmph_io_nlfile_adapter_destroy(source);
        fclose(keys_fd);
        return 1;
    }

    //Find key
    id = cmph_search(hash, key, (cmph_uint32)strlen(key));
    fprintf(stderr, "Id:%u\n", id);

    //Destroy hash
    cmph_destroy(hash);
    cmph_io_nlfile_adapter_destroy(source);
    fclose(keys_fd);
    return 0;
}
cmph-2.0.2/examples/vector_adapter_ex1.c 0000755 0001750 0001750 00000002464 13411542035 017554 0 ustar joseph joseph #include
#include
// Create minimal perfect hash function from in-memory vector
// Builds a BRZ minimal perfect hash over an in-memory vector of C strings,
// dumps it to "temp.mph", reloads it from that file and prints the hash value
// of every key to stderr.
// Returns 0 on success and 1 if the file cannot be opened or the hash
// function cannot be built or reloaded.
int main(int argc, char **argv)
{
    // Creating a filled vector
    unsigned int i = 0;
    const char *vector[] = {"aaaaaaaaaa", "bbbbbbbbbb", "cccccccccc", "dddddddddd", "eeeeeeeeee",
                            "ffffffffff", "gggggggggg", "hhhhhhhhhh", "iiiiiiiiii", "jjjjjjjjjj"};
    unsigned int nkeys = 10;
    FILE *mphf_fd = NULL;
    cmph_io_adapter_t *source = NULL;
    cmph_config_t *config = NULL;
    cmph_t *hash = NULL;

    (void)argc;
    (void)argv;

    mphf_fd = fopen("temp.mph", "wb");
    if (mphf_fd == NULL)
    {
        fprintf(stderr, "Unable to create \"temp.mph\"\n");
        return 1;
    }

    // Source of keys
    source = cmph_io_vector_adapter((char **)vector, nkeys);

    //Create minimal perfect hash function using the brz algorithm.
    config = cmph_config_new(source);
    cmph_config_set_algo(config, CMPH_BRZ);
    cmph_config_set_mphf_fd(config, mphf_fd);
    hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL)
    {
        // Without this check cmph_dump() would be handed a NULL hash.
        fprintf(stderr, "Unable to create minimal perfect hash function\n");
        cmph_io_vector_adapter_destroy(source);
        fclose(mphf_fd);
        return 1;
    }
    cmph_dump(hash, mphf_fd);
    cmph_destroy(hash);
    fclose(mphf_fd);

    //Find key
    mphf_fd = fopen("temp.mph", "rb");
    if (mphf_fd == NULL)
    {
        fprintf(stderr, "Unable to reopen \"temp.mph\"\n");
        cmph_io_vector_adapter_destroy(source);
        return 1;
    }
    hash = cmph_load(mphf_fd);
    if (hash == NULL)
    {
        fprintf(stderr, "Unable to load minimal perfect hash function\n");
        cmph_io_vector_adapter_destroy(source);
        fclose(mphf_fd);
        return 1;
    }
    while (i < nkeys) {
        const char *key = vector[i];
        unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key));
        fprintf(stderr, "key:%s -- hash:%u\n", key, id);
        i++;
    }

    //Destroy hash
    cmph_destroy(hash);
    cmph_io_vector_adapter_destroy(source);
    fclose(mphf_fd);
    return 0;
}
cmph-2.0.2/examples/Makefile.am 0000755 0001750 0001750 00000001013 13411542035 015652 0 ustar joseph joseph noinst_PROGRAMS = vector_adapter_ex1 file_adapter_ex2 struct_vector_adapter_ex3 small_set_ex4
# Let the examples find the library headers in the sibling src/ directory.
AM_CPPFLAGS = -I../src/
# Every example below is built from a single C file and linked against the
# libtool library ../src/libcmph.la.
vector_adapter_ex1_LDADD = ../src/libcmph.la
vector_adapter_ex1_SOURCES = vector_adapter_ex1.c
file_adapter_ex2_LDADD = ../src/libcmph.la
file_adapter_ex2_SOURCES = file_adapter_ex2.c
struct_vector_adapter_ex3_LDADD = ../src/libcmph.la
struct_vector_adapter_ex3_SOURCES = struct_vector_adapter_ex3.c
small_set_ex4_LDADD = ../src/libcmph.la
small_set_ex4_SOURCES = small_set_ex4.c
cmph-2.0.2/examples/small_set_ex4.c 0000644 0001750 0001750 00000006005 13411542035 016530 0 ustar joseph joseph #include
// Builds a hash function over `items_len` 32-bit integers with algorithm
// `alg_n`, prints its packed size and the hash value of every item.
// For CMPH_BRZ the function is additionally dumped to "<algo>_<len>.mph" and
// reloaded from that file before being queried.
// Returns 0 on success and 1 when a file cannot be opened or the hash
// function cannot be created or reloaded.
int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n)
{
    cmph_t *hash;
    cmph_config_t *config;
    cmph_io_adapter_t *source;
    cmph_uint32 i;
    char filename[256];
    FILE* mphf_fd = NULL;

    printf("%s (%u)\n", cmph_names[alg_n], alg_n);

    // Each key is one whole cmph_uint32: offset 0, length sizeof(cmph_uint32).
    source = cmph_io_struct_vector_adapter(items_to_hash,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           0,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           items_len);
    config = cmph_config_new(source);
    cmph_config_set_algo(config, alg_n);
    if (alg_n == CMPH_BRZ) {
        sprintf(filename, "%s_%u.mph", cmph_names[alg_n], items_len);
        // Binary mode, matching how the other examples open their .mph files.
        mphf_fd = fopen(filename, "wb");
        if (mphf_fd == NULL) {
            fprintf(stderr, "Unable to create \"%s\"\n", filename);
            cmph_config_destroy(config);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
        cmph_config_set_mphf_fd(config, mphf_fd);
    }
    hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL) {
        // Without this check the calls below would be handed a NULL hash.
        fprintf(stderr, "Unable to create hash function\n");
        cmph_io_vector_adapter_destroy(source);
        if (mphf_fd != NULL) fclose(mphf_fd);
        return 1;
    }

    if (alg_n == CMPH_BRZ) {
        cmph_dump(hash, mphf_fd);
        cmph_destroy(hash);
        fclose(mphf_fd);
        mphf_fd = fopen(filename, "rb");
        if (mphf_fd == NULL) {
            fprintf(stderr, "Unable to reopen \"%s\"\n", filename);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
        hash = cmph_load(mphf_fd);
        if (hash == NULL) {
            fprintf(stderr, "Unable to load hash function\n");
            cmph_io_vector_adapter_destroy(source);
            fclose(mphf_fd);
            return 1;
        }
    }

    printf("packed_size %u\n",cmph_packed_size(hash));

    // NOTE(review): the loop header and the start of the format string were
    // truncated in this copy of the file; they are reconstructed here, and the
    // conversion used for the item (%u) should be confirmed against upstream.
    for (i = 0; i < items_len; ++i)
        printf("%u -> %u\n",
               items_to_hash[i],
               cmph_search(hash,
                           (char*)(items_to_hash+i),
                           (cmph_uint32)sizeof(cmph_uint32)));
    printf("\n");

    cmph_io_vector_adapter_destroy(source);
    cmph_destroy(hash);

    if (alg_n == CMPH_BRZ) {
        fclose(mphf_fd);
    }
    return 0;
}
// Runs test() for every algorithm index below CMPH_COUNT on four small
// integer key sets, announcing the size of each set before its run.
int main (void)
{
    cmph_uint32 vec1[] = {1,2,3,4,5};
    cmph_uint32 vec2[] = {7576423, 7554496}; //CMPH_FCH, CMPH_BDZ, CMPH_BDZ_PH (4,5,6)
    cmph_uint32 vec3[] = {2184764, 1882984, 1170551}; // CMPH_CHD_PH, CMPH_CHD (7,8)
    cmph_uint32 vec4[] = {2184764}; // CMPH_CHD_PH, CMPH_CHD (7,8)

    // Key sets and their sizes, in the order they are exercised.
    cmph_uint32 *key_sets[4];
    cmph_uint32 key_counts[4];
    cmph_uint32 set;
    cmph_uint32 algo;

    key_sets[0] = vec1; key_counts[0] = 5;
    key_sets[1] = vec2; key_counts[1] = 2;
    key_sets[2] = vec3; key_counts[2] = 3;
    key_sets[3] = vec4; key_counts[3] = 1;

    for (set = 0; set < 4; set++)
    {
        printf("TESTING VECTOR WITH %u INTEGERS\n", key_counts[set]);
        for (algo = 0; algo < CMPH_COUNT; algo++)
        {
            test(key_sets[set], key_counts[set], algo);
        }
    }
    return 0;
}
cmph-2.0.2/src/ 0000755 0001750 0001750 00000000000 13411542035 012571 5 ustar joseph joseph cmph-2.0.2/src/buffer_manager.c 0000644 0001750 0001750 00000004674 13411542035 015713 0 ustar joseph joseph #include "buffer_manager.h"
#include "buffer_entry.h"
#include
#include
#include
/*
 * Bookkeeping for a group of buffer entries that share one memory budget.
 * memory_avail_list is used as a stack: buffer_manager_read_key() pushes an
 * entry's capacity when a read returns NULL and pops one value (adding it to
 * the capacity of the entry being read) at the start of a later call.
 */
struct __buffer_manager_t
{
cmph_uint32 memory_avail; // total memory budget given to buffer_manager_new(); divided among the entries
buffer_entry_t ** buffer_entries; // array of nentries buffer entries managed by this object
cmph_uint32 nentries; // number of entries to be managed
cmph_uint32 *memory_avail_list; // stack of capacities waiting to be handed to another entry (nentries slots)
int pos_avail_list; // index of the top of memory_avail_list; -1 means the stack is empty
};
/*
 * Creates a buffer manager with `nentries` buffer entries that share
 * `memory_avail` units of memory; every entry starts with a capacity of
 * memory_avail / nentries + 1.
 *
 * Returns the new manager, or NULL when nentries is 0 (the per-entry share
 * would require a division by zero) or when an allocation fails. On failure
 * everything allocated so far is released.
 */
buffer_manager_t * buffer_manager_new(cmph_uint32 memory_avail, cmph_uint32 nentries)
{
    cmph_uint32 memory_avail_entry, i;
    buffer_manager_t *buff_manager;

    if (nentries == 0) return NULL;

    buff_manager = (buffer_manager_t *)malloc(sizeof(buffer_manager_t));
    if (!buff_manager) return NULL;

    buff_manager->memory_avail = memory_avail;
    buff_manager->buffer_entries = (buffer_entry_t **)calloc((size_t)nentries, sizeof(buffer_entry_t *));
    buff_manager->memory_avail_list = (cmph_uint32 *)calloc((size_t)nentries, sizeof(cmph_uint32));
    buff_manager->pos_avail_list = -1; // the stack of recoverable capacities starts empty
    buff_manager->nentries = nentries;
    if (!buff_manager->buffer_entries || !buff_manager->memory_avail_list)
    {
        free(buff_manager->memory_avail_list);
        free(buff_manager->buffer_entries);
        free(buff_manager);
        return NULL;
    }

    memory_avail_entry = buff_manager->memory_avail/buff_manager->nentries + 1;
    for(i = 0; i < buff_manager->nentries; i++)
    {
        buff_manager->buffer_entries[i] = buffer_entry_new(memory_avail_entry);
        // NOTE(review): assumes buffer_entry_new() reports failure by
        // returning NULL -- confirm against buffer_entry.c.
        if (!buff_manager->buffer_entries[i])
        {
            while (i > 0)
            {
                buffer_entry_destroy(buff_manager->buffer_entries[--i]);
            }
            free(buff_manager->memory_avail_list);
            free(buff_manager->buffer_entries);
            free(buff_manager);
            return NULL;
        }
    }
    return buff_manager;
}
/* Opens `filename` on the buffer entry stored at position `index`. */
void buffer_manager_open(buffer_manager_t * buffer_manager, cmph_uint32 index, char * filename)
{
    buffer_entry_t *entry = buffer_manager->buffer_entries[index];

    buffer_entry_open(entry, filename);
}
/*
 * Reads the next key from the buffer entry at position `index`; the key
 * length is reported through `keylen` by buffer_entry_read_key().
 *
 * Before reading, if a capacity is waiting on the manager's stack it is
 * popped and added to this entry's capacity. After reading, if no key was
 * returned, this entry's capacity is pushed on the stack so that a later
 * call can hand it to another entry.
 *
 * Returns whatever buffer_entry_read_key() returned (NULL included).
 */
cmph_uint8 * buffer_manager_read_key(buffer_manager_t * buffer_manager, cmph_uint32 index, cmph_uint32 * keylen)
{
    buffer_entry_t *entry = buffer_manager->buffer_entries[index];
    cmph_uint8 *key;

    if (buffer_manager->pos_avail_list >= 0) // recovering memory
    {
        cmph_uint32 recovered = buffer_manager->memory_avail_list[buffer_manager->pos_avail_list];
        cmph_uint32 new_capacity = buffer_entry_get_capacity(entry) + recovered;

        buffer_manager->pos_avail_list--;
        buffer_entry_set_capacity(entry, new_capacity);
    }

    key = buffer_entry_read_key(entry, keylen);
    if (key != NULL)
    {
        return key;
    }

    // storing memory to be recovered
    buffer_manager->pos_avail_list++;
    buffer_manager->memory_avail_list[buffer_manager->pos_avail_list] = buffer_entry_get_capacity(entry);
    return NULL;
}
/*
 * Destroys every buffer entry in index order, then frees the manager's two
 * arrays and the manager itself.
 */
void buffer_manager_destroy(buffer_manager_t * buffer_manager)
{
    cmph_uint32 idx = 0;

    while (idx < buffer_manager->nentries)
    {
        buffer_entry_destroy(buffer_manager->buffer_entries[idx]);
        idx++;
    }

    free(buffer_manager->memory_avail_list);
    free(buffer_manager->buffer_entries);
    free(buffer_manager);
}
cmph-2.0.2/src/miller_rabin.c 0000644 0001750 0001750 00000002274 13411542035 015401 0 ustar joseph joseph #include "miller_rabin.h"
/*
 * Modular exponentiation by repeated squaring: returns (a^d) mod n.
 * The products are formed in 64 bits before the reduction, so the result is
 * only exact while those products do not overflow (e.g. a < n <= 2^32).
 */
static inline cmph_uint64 int_pow(cmph_uint64 a, cmph_uint64 d, cmph_uint64 n)
{
    cmph_uint64 square = a; /* a^(2^k) mod n for the bit currently examined */
    cmph_uint64 acc = 1;    /* product of the squares selected by the set bits of d */

    for (; d > 0; d >>= 1)
    {
        if (d & 1)
            acc = (acc * square) % n;
        square = (square * square) % n;
    }
    return acc;
}
/*
 * Miller-Rabin round for one base, given a_exp_d = a^d mod n where
 * n - 1 = 2^s * d.
 * Returns 1 when a_exp_d is 1 or n - 1, or when one of the next s - 1
 * successive squarings modulo n yields n - 1; returns 0 otherwise.
 * The squarings are computed in 64 bits before the reduction.
 */
static inline cmph_uint8 check_witness(cmph_uint64 a_exp_d, cmph_uint64 n, cmph_uint64 s)
{
    const cmph_uint64 minus_one = n - 1;
    cmph_uint64 x = a_exp_d;
    cmph_uint64 round;

    if (x == 1 || x == minus_one)
        return 1;

    for (round = 1; round < s; round++)
    {
        x = (x * x) % n;
        if (x == minus_one)
            return 1;
    }
    return 0;
}
/*
 * Miller-Rabin primality test with the fixed bases 2, 7 and 61.
 * Returns 1 when n passes (treated as prime) and 0 otherwise.
 *
 * Compared with a bare application of the three bases this version
 *  - returns 0 for n < 2 (n == 1 would otherwise never leave the
 *    decomposition loop below, because d would start at 0),
 *  - reports 2, 3, 5 and 7 themselves as prime instead of rejecting them in
 *    the trial-division step,
 *  - skips a base that is a multiple of n (only 61 for n == 61 can reach that
 *    point), since a^d mod n would be 0 and a prime would be rejected.
 *
 * NOTE(review): the base set {2, 7, 61} is the published deterministic set
 * for n < 4,759,123,141, and the helpers multiply in 64 bits before reducing,
 * so callers are presumably expected to pass values that fit in 32 bits.
 */
cmph_uint8 check_primality(cmph_uint64 n)
{
    static const cmph_uint64 small_primes[] = {2, 3, 5, 7};
    static const cmph_uint64 bases[] = {2, 7, 61};
    cmph_uint64 d, s;
    unsigned int k;

    if (n < 2)
        return 0;

    /* Trial division; a hit is prime only when n is that small prime itself. */
    for (k = 0; k < sizeof(small_primes)/sizeof(small_primes[0]); k++)
    {
        if ((n % small_primes[k]) == 0)
            return (cmph_uint8)(n == small_primes[k]);
    }

    /* we decompose the number n - 1 into 2^s*d (n is odd here, so n - 1 is even and nonzero) */
    s = 0;
    d = n - 1;
    do
    {
        s++;
        d /= 2;
    } while ((d % 2) == 0);

    for (k = 0; k < sizeof(bases)/sizeof(bases[0]); k++)
    {
        cmph_uint64 a = bases[k];

        if ((a % n) == 0)
            continue; /* base carries no information modulo n */
        if (check_witness(int_pow(a, d, n), n, s) == 0)
            return 0;
    }
    return 1;
}
cmph-2.0.2/src/bmz.c 0000644 0001750 0001750 00000050032 13411542035 013525 0 ustar joseph joseph #include "graph.h"
#include "bmz.h"
#include "cmph_structs.h"
#include "bmz_structs.h"
#include "hash.h"
#include "vqueue.h"
#include "bitbool.h"
#include
#include
#include
#include
#include
// #define DEBUG
#include "debug.h"
static int bmz_gen_edges(cmph_config_t *mph);
static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited);
static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited);
static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint8 * visited);
/*
 * Allocates a zero-initialised BMZ configuration whose two hash functions
 * both default to CMPH_HASH_JENKINS and whose g, graph and hashes pointers
 * are NULL. Returns NULL when the allocation fails.
 */
bmz_config_data_t *bmz_config_new(void)
{
    bmz_config_data_t *cfg = (bmz_config_data_t *)calloc((size_t)1, sizeof(bmz_config_data_t));

    if (cfg == NULL) return NULL;

    cfg->hashfuncs[0] = CMPH_HASH_JENKINS;
    cfg->hashfuncs[1] = CMPH_HASH_JENKINS;
    cfg->g = NULL;
    cfg->graph = NULL;
    cfg->hashes = NULL;
    return cfg;
}
/* Frees the BMZ-specific configuration block stored in mph->data. */
void bmz_config_destroy(cmph_config_t *mph)
{
    DEBUGP("Destroying algorithm dependent data\n");
    free(mph->data);
}
/*
 * Copies up to two hash function identifiers from `hashfuncs` into the BMZ
 * configuration. Copying stops at the first CMPH_HASH_COUNT entry or after
 * two entries, whichever comes first.
 *
 * The two-entry limit is tested before the array is read, so a caller may
 * pass exactly two identifiers without a CMPH_HASH_COUNT terminator; the
 * third slot is never touched.
 */
void bmz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
    bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data;
    cmph_uint32 i;

    //bmz only uses two hash functions
    for (i = 0; i < 2 && hashfuncs[i] != CMPH_HASH_COUNT; ++i)
    {
        bmz->hashfuncs[i] = hashfuncs[i];
    }
}
cmph_t *bmz_new(cmph_config_t *mph, double c)
{
cmph_t *mphf = NULL;
bmz_data_t *bmzf = NULL;
cmph_uint32 i;
cmph_uint32 iterations;
cmph_uint32 iterations_map = 20;
cmph_uint8 *used_edges = NULL;
cmph_uint8 restart_mapping = 0;
cmph_uint8 * visited = NULL;
bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data;
if (c == 0) c = 1.15; // validating restrictions over parameter c.
DEBUGP("c: %f\n", c);
bmz->m = mph->key_source->nkeys;
bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys);
if (bmz->n < 5) // workaround for small key sets
{
bmz->n = 5;
}
DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz->m, bmz->n, c);
bmz->graph = graph_new(bmz->n, bmz->m);
DEBUGP("Created graph\n");
bmz->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
for(i = 0; i < 3; ++i) bmz->hashes[i] = NULL;
do
{
// Mapping step
cmph_uint32 biggest_g_value = 0;
cmph_uint32 biggest_edge_value = 1;
iterations = 100;
if (mph->verbosity)
{
fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bmz->m, bmz->n);
}
while(1)
{
int ok;
DEBUGP("hash function 1\n");
bmz->hashes[0] = hash_state_new(bmz->hashfuncs[0], bmz->n);
DEBUGP("hash function 2\n");
bmz->hashes[1] = hash_state_new(bmz->hashfuncs[1], bmz->n);
DEBUGP("Generating edges\n");
ok = bmz_gen_edges(mph);
if (!ok)
{
--iterations;
hash_state_destroy(bmz->hashes[0]);
bmz->hashes[0] = NULL;
hash_state_destroy(bmz->hashes[1]);
bmz->hashes[1] = NULL;
DEBUGP("%u iterations remaining\n", iterations);
if (mph->verbosity)
{
fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations);
}
if (iterations == 0) break;
}
else break;
}
if (iterations == 0)
{
graph_destroy(bmz->graph);
return NULL;
}
// Ordering step
if (mph->verbosity)
{
fprintf(stderr, "Starting ordering step\n");
}
graph_obtain_critical_nodes(bmz->graph);
// Searching step
if (mph->verbosity)
{
fprintf(stderr, "Starting Searching step.\n");
fprintf(stderr, "\tTraversing critical vertices.\n");
}
DEBUGP("Searching step\n");
visited = (cmph_uint8 *)malloc((size_t)bmz->n/8 + 1);
memset(visited, 0, (size_t)bmz->n/8 + 1);
used_edges = (cmph_uint8 *)malloc((size_t)bmz->m/8 + 1);
memset(used_edges, 0, (size_t)bmz->m/8 + 1);
free(bmz->g);
bmz->g = (cmph_uint32 *)calloc((size_t)bmz->n, sizeof(cmph_uint32));
assert(bmz->g);
for (i = 0; i < bmz->n; ++i) // critical nodes
{
if (graph_node_is_critical(bmz->graph, i) && (!GETBIT(visited,i)))
{
if(c > 1.14) restart_mapping = bmz_traverse_critical_nodes(bmz, i, &biggest_g_value, &biggest_edge_value, used_edges, visited);
else restart_mapping = bmz_traverse_critical_nodes_heuristic(bmz, i, &biggest_g_value, &biggest_edge_value, used_edges, visited);
if(restart_mapping) break;
}
}
if(!restart_mapping)
{
if (mph->verbosity)
{
fprintf(stderr, "\tTraversing non critical vertices.\n");
}
bmz_traverse_non_critical_nodes(bmz, used_edges, visited); // non_critical_nodes
}
else
{
iterations_map--;
if (mph->verbosity) fprintf(stderr, "Restarting mapping step. %u iterations remaining.\n", iterations_map);
}
free(used_edges);
free(visited);
} while(restart_mapping && iterations_map > 0);
graph_destroy(bmz->graph);
bmz->graph = NULL;
if (iterations_map == 0)
{
return NULL;
}
#ifdef DEBUG
fprintf(stderr, "G: ");
for (i = 0; i < bmz->n; ++i) fprintf(stderr, "%u ", bmz->g[i]);
fprintf(stderr, "\n");
#endif
mphf = (cmph_t *)malloc(sizeof(cmph_t));
mphf->algo = mph->algo;
bmzf = (bmz_data_t *)malloc(sizeof(bmz_data_t));
bmzf->g = bmz->g;
bmz->g = NULL; //transfer memory ownership
bmzf->hashes = bmz->hashes;
bmz->hashes = NULL; //transfer memory ownership
bmzf->n = bmz->n;
bmzf->m = bmz->m;
mphf->data = bmzf;
mphf->size = bmz->m;
DEBUGP("Successfully generated minimal perfect hash\n");
if (mph->verbosity)
{
fprintf(stderr, "Successfully generated minimal perfect hash function\n");
}
return mphf;
}
/*
 * Breadth-first labelling of the critical vertices reachable from `v`.
 *
 * The start vertex receives ceil(*biggest_edge_value / 2) - 1. Every
 * unvisited critical neighbour then gets a candidate label of
 * *biggest_g_value + 1, retried until none of the edge values
 * (candidate + g of an already visited critical neighbour) is marked in
 * `used_edges`; *biggest_g_value is raised on every try. The accepted edge
 * values are marked in `used_edges`, *biggest_edge_value is raised to the
 * largest of them, and the vertex is marked in `visited` and queued.
 *
 * Returns 0 when the traversal completes, or 1 as soon as a candidate edge
 * value reaches bmz->m, which asks the caller to restart the mapping step.
 */
static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited)
{
    cmph_uint32 candidate_g;
    cmph_uint32 neighbor;   /* vertex adjacent to the one taken from the queue */
    cmph_uint32 labelled;   /* lookahead vertex adjacent to `neighbor` */
    cmph_uint32 edge_value;
    cmph_uint8 clash;
    graph_iterator_t outer_it, inner_it;
    vqueue_t *pending = vqueue_new((cmph_uint32)(graph_ncritical_nodes(bmz->graph)) + 1);

    DEBUGP("Labelling critical vertices\n");
    bmz->g[v] = (cmph_uint32)ceil ((double)(*biggest_edge_value)/2) - 1;
    SETBIT(visited, v);
    /* This value is never read: the lookahead loop below assigns
       candidate_g again before every use. */
    candidate_g = (cmph_uint32)floor((double)(*biggest_edge_value/2));
    vqueue_insert(pending, v);

    while (!vqueue_is_empty(pending))
    {
        v = vqueue_remove(pending);
        outer_it = graph_neighbors_it(bmz->graph, v);
        for (;;)
        {
            neighbor = graph_next_neighbor(bmz->graph, &outer_it);
            if (neighbor == GRAPH_NO_NEIGHBOR) break;

            /* Only critical vertices that have no label yet are handled here. */
            if (!graph_node_is_critical(bmz->graph, neighbor) || GETBIT(visited, neighbor))
                continue;

            /* lookahead to resolve collisions */
            do
            {
                candidate_g = *biggest_g_value + 1;
                inner_it = graph_neighbors_it(bmz->graph, neighbor);
                clash = 0;
                while ((labelled = graph_next_neighbor(bmz->graph, &inner_it)) != GRAPH_NO_NEIGHBOR)
                {
                    if (!(graph_node_is_critical(bmz->graph, labelled) && GETBIT(visited, labelled)))
                        continue;

                    edge_value = candidate_g + bmz->g[labelled];
                    if (edge_value >= bmz->m)
                    {
                        vqueue_destroy(pending);
                        return 1; /* restart mapping step. */
                    }
                    if (GETBIT(used_edges, edge_value))
                    {
                        clash = 1;
                        break;
                    }
                }
                if (candidate_g > *biggest_g_value) *biggest_g_value = candidate_g;
            } while (clash);

            /* Marking used edges... */
            inner_it = graph_neighbors_it(bmz->graph, neighbor);
            while ((labelled = graph_next_neighbor(bmz->graph, &inner_it)) != GRAPH_NO_NEIGHBOR)
            {
                if (!(graph_node_is_critical(bmz->graph, labelled) && GETBIT(visited, labelled)))
                    continue;

                edge_value = candidate_g + bmz->g[labelled];
                SETBIT(used_edges, edge_value);
                if (edge_value > *biggest_edge_value) *biggest_edge_value = edge_value;
            }

            bmz->g[neighbor] = candidate_g; /* Labelling the vertex. */
            SETBIT(visited, neighbor);
            vqueue_insert(pending, neighbor);
        }
    }

    vqueue_destroy(pending);
    return 0;
}
/*
 * Label the critical vertices reachable from v breadth-first, reusing g
 * values that were rejected for earlier vertices before generating new ones.
 *
 * Rejected fresh g values are kept in the growable array unused_g_values;
 * each new vertex first tries the values in that pool (in order) and only
 * then falls back to *biggest_g_value + 1.
 *
 * Updates bmz->g, visited, used_edges, *biggest_g_value and
 * *biggest_edge_value in place.
 *
 * Returns 0 on success, or 1 as soon as a candidate edge value reaches
 * bmz->m (the caller then restarts the mapping step).
 */
static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited)
{
cmph_uint32 next_g;
cmph_uint32 u; /* Auxiliary vertex */
cmph_uint32 lav; /* lookahead vertex */
cmph_uint8 collision;
/* Pool of g values that were tried and rejected; grown in BUFSIZ-element steps. */
cmph_uint32 * unused_g_values = NULL;
cmph_uint32 unused_g_values_capacity = 0;
cmph_uint32 nunused_g_values = 0;
vqueue_t * q = vqueue_new((cmph_uint32)(0.5*graph_ncritical_nodes(bmz->graph))+1);
graph_iterator_t it, it1;
DEBUGP("Labelling critical vertices\n");
bmz->g[v] = (cmph_uint32)ceil ((double)(*biggest_edge_value)/2) - 1;
SETBIT(visited, v);
next_g = (cmph_uint32)floor((double)(*biggest_edge_value/2)); /* initial value only: next_g is reassigned inside the collision loop before it is read */
vqueue_insert(q, v);
while(!vqueue_is_empty(q))
{
v = vqueue_remove(q);
it = graph_neighbors_it(bmz->graph, v);
while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz->graph, u) && (!GETBIT(visited,u)))
{
/* Index of the next pool entry to try for this vertex. */
cmph_uint32 next_g_index = 0;
collision = 1;
while(collision) // lookahead to resolve collisions
{
if (next_g_index < nunused_g_values)
{
/* Try a previously rejected value first. */
next_g = unused_g_values[next_g_index++];
}
else
{
/* Pool exhausted: generate a fresh value. UINT_MAX is a sentinel so that
   the pool branch is not taken again and the removal after the loop is skipped. */
next_g = *biggest_g_value + 1;
next_g_index = UINT_MAX;
}
it1 = graph_neighbors_it(bmz->graph, u);
collision = 0;
while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz->graph, lav) && GETBIT(visited,lav))
{
if(next_g + bmz->g[lav] >= bmz->m)
{
vqueue_destroy(q);
free(unused_g_values);
return 1; // restart mapping step.
}
if (GETBIT(used_edges, (next_g + bmz->g[lav])))
{
collision = 1;
break;
}
}
}
if(collision && (next_g > *biggest_g_value)) // saving the current g value stored in next_g.
{
if(nunused_g_values == unused_g_values_capacity)
{
/* NOTE(review): the realloc result is not checked; a failed allocation would be dereferenced below. */
unused_g_values = (cmph_uint32 *)realloc(unused_g_values, (unused_g_values_capacity + BUFSIZ)*sizeof(cmph_uint32));
unused_g_values_capacity += BUFSIZ;
}
unused_g_values[nunused_g_values++] = next_g;
}
if (next_g > *biggest_g_value) *biggest_g_value = next_g;
}
/* Step back to the pool slot that supplied the accepted value (if any) and
   remove it by overwriting it with the last pool entry. */
next_g_index--;
if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values];
// Marking used edges...
it1 = graph_neighbors_it(bmz->graph, u);
while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz->graph, lav) && GETBIT(visited, lav))
{
SETBIT(used_edges,(next_g + bmz->g[lav]));
if(next_g + bmz->g[lav] > *biggest_edge_value) *biggest_edge_value = next_g + bmz->g[lav];
}
}
bmz->g[u] = next_g; // Labelling vertex u.
SETBIT(visited, u);
vqueue_insert(q, u);
}
}
}
vqueue_destroy(q);
free(unused_g_values);
return 0;
}
/*
 * Return the first index >= unused_edge_index whose bit is clear in
 * used_edges. Each probed index is asserted to be below bmz->m.
 */
static cmph_uint32 next_unused_edge(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint32 unused_edge_index)
{
	for (;;)
	{
		assert(unused_edge_index < bmz->m);
		if (!GETBIT(used_edges, unused_edge_index)) return unused_edge_index;
		++unused_edge_index;
	}
}
/*
 * Depth-first labelling starting at the already-labelled vertex v.
 *
 * Every unvisited neighbour receives the g value that makes the connecting
 * edge map to the next unused edge index, is marked visited, and is then
 * explored recursively. *unused_edge_index is advanced past each index
 * that gets consumed.
 */
static void bmz_traverse(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint32 v, cmph_uint32 * unused_edge_index, cmph_uint8 * visited)
{
	cmph_uint32 child;
	graph_iterator_t walker = graph_neighbors_it(bmz->graph, v);

	for (child = graph_next_neighbor(bmz->graph, &walker);
	     child != GRAPH_NO_NEIGHBOR;
	     child = graph_next_neighbor(bmz->graph, &walker))
	{
		if (!GETBIT(visited, child))
		{
			cmph_uint32 edge_id = next_unused_edge(bmz, used_edges, *unused_edge_index);
			bmz->g[child] = edge_id - bmz->g[v];
			SETBIT(visited, child);
			*unused_edge_index = edge_id + 1;
			bmz_traverse(bmz, used_edges, child, unused_edge_index, visited);
		}
	}
}
/*
 * Label every vertex that the critical-node pass left unvisited.
 *
 * First pass: for each edge with exactly one visited endpoint, traverse
 * from that endpoint. Second pass: any vertex still unvisited gets g = 0
 * and becomes the root of a new traversal.
 */
static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint8 * visited)
{
	cmph_uint32 edge, vertex, src, dst;
	cmph_uint32 unused_edge_index = 0;

	DEBUGP("Labelling non critical vertices\n");
	for (edge = 0; edge < bmz->m; edge++)
	{
		src = graph_vertex_id(bmz->graph, edge, 0);
		dst = graph_vertex_id(bmz->graph, edge, 1);
		if (GETBIT(visited, src))
		{
			/* Both endpoints labelled: nothing to do for this edge. */
			if (!GETBIT(visited, dst)) bmz_traverse(bmz, used_edges, src, &unused_edge_index, visited);
		}
		else if (GETBIT(visited, dst))
		{
			bmz_traverse(bmz, used_edges, dst, &unused_edge_index, visited);
		}
	}

	for (vertex = 0; vertex < bmz->n; vertex++)
	{
		if (GETBIT(visited, vertex)) continue;
		bmz->g[vertex] = 0;
		SETBIT(visited, vertex);
		bmz_traverse(bmz, used_edges, vertex, &unused_edge_index, visited);
	}
}
/*
 * Rebuild the graph from the key source: each key contributes the edge
 * (h1, h2) obtained from the two hash functions modulo bmz->n.
 *
 * Returns 1 when every key produced a distinct, non-loop edge, and 0 as
 * soon as a self loop or a repeated edge is found.
 */
static int bmz_gen_edges(cmph_config_t *mph)
{
	bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data;
	cmph_uint32 key_idx;

	DEBUGP("Generating edges for %u vertices\n", bmz->n);
	graph_clear_edges(bmz->graph);
	mph->key_source->rewind(mph->key_source->data);

	for (key_idx = 0; key_idx < mph->key_source->nkeys; ++key_idx)
	{
		cmph_uint32 h1, h2;
		cmph_uint32 keylen;
		char *key = NULL;

		mph->key_source->read(mph->key_source->data, &key, &keylen);
		h1 = hash(bmz->hashes[0], key, keylen) % bmz->n;
		h2 = hash(bmz->hashes[1], key, keylen) % bmz->n;
		if (h1 == h2)
		{
			/* Nudge h2 to the next vertex, wrapping around at n. */
			++h2;
			if (h2 >= bmz->n) h2 = 0;
		}
		DEBUGP("key: %.*s h1: %u h2: %u\n", keylen, key, h1, h2);
		if (h1 == h2)
		{
			if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", key_idx);
			mph->key_source->dispose(mph->key_source->data, key, keylen);
			return 0;
		}
		DEBUGP("Adding edge: %u -> %u for key %.*s\n", h1, h2, keylen, key);
		mph->key_source->dispose(mph->key_source->data, key, keylen);

		/* Enforce the simple-graph (no multiple edges) restriction. */
		if (graph_contains_edge(bmz->graph, h1, h2))
		{
			if (mph->verbosity) fprintf(stderr, "A non simple graph was generated\n");
			return 0;
		}
		graph_add_edge(bmz->graph, h1, h2);
	}
	return 1;
}
/*
 * Write one object of `size` bytes to fd.
 * Returns 1 on success (a zero-length write counts as success and does not
 * touch the stream), 0 when fwrite reports a short write.
 */
static int bmz_dump_write(const void *src, size_t size, FILE *fd)
{
	if (size == 0) return 1;
	return fwrite(src, size, (size_t)1, fd) == 1;
}

/*
 * Serialize a bmz mphf to fd.
 *
 * Layout written after the common header emitted by __cmph_dump():
 *   number of hash functions (always 2),
 *   for each hash: state length followed by the state bytes,
 *   n, m, and the n entries of g.
 *
 * \param mphf function to dump; mphf->data must be a bmz_data_t
 * \param fd   output stream
 * \return 1 on success, 0 if any write to fd failed
 */
int bmz_dump(cmph_t *mphf, FILE *fd)
{
	char *buf = NULL;
	cmph_uint32 buflen = 0;
	cmph_uint32 two = 2; //number of hash functions
	cmph_uint32 i;
	int ok;
	bmz_data_t *data = (bmz_data_t *)mphf->data;

	__cmph_dump(mphf, fd);
	ok = bmz_dump_write(&two, sizeof(cmph_uint32), fd);

	for (i = 0; i < two; ++i)
	{
		buf = NULL;
		buflen = 0;
		hash_state_dump(data->hashes[i], &buf, &buflen);
		DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
		ok = ok && bmz_dump_write(&buflen, sizeof(cmph_uint32), fd);
		/* Some hash states are empty (buflen == 0, buf == NULL). */
		ok = ok && bmz_dump_write(buf, (size_t)buflen, fd);
		free(buf);
	}

	ok = ok && bmz_dump_write(&(data->n), sizeof(cmph_uint32), fd);
	ok = ok && bmz_dump_write(&(data->m), sizeof(cmph_uint32), fd);
	ok = ok && bmz_dump_write(data->g, sizeof(cmph_uint32)*(data->n), fd);
#ifdef DEBUG
	fprintf(stderr, "G: ");
	for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
	fprintf(stderr, "\n");
#endif
	return ok;
}
/*
 * Read a bmz mphf from f and attach it to mphf->data.
 *
 * Expects: hash count, then per hash a state length plus state bytes,
 * then n, m and the n entries of g. The hashes array is allocated with one
 * extra slot that is set to NULL.
 *
 * NOTE(review): neither the allocations nor the fread results are checked.
 */
void bmz_load(FILE *f, cmph_t *mphf)
{
	cmph_uint32 hash_count;
	cmph_uint32 state_len;
	cmph_uint32 k;
	char *state_buf = NULL;
	register size_t nbytes;
	bmz_data_t *bmz = (bmz_data_t *)malloc(sizeof(bmz_data_t));

	DEBUGP("Loading bmz mphf\n");
	mphf->data = bmz;

	nbytes = fread(&hash_count, sizeof(cmph_uint32), (size_t)1, f);
	bmz->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(hash_count + 1));
	bmz->hashes[hash_count] = NULL;
	DEBUGP("Reading %u hashes\n", hash_count);

	for (k = 0; k < hash_count; ++k)
	{
		nbytes = fread(&state_len, sizeof(cmph_uint32), (size_t)1, f);
		DEBUGP("Hash state has %u bytes\n", state_len);
		state_buf = (char *)malloc((size_t)state_len);
		nbytes = fread(state_buf, (size_t)state_len, (size_t)1, f);
		bmz->hashes[k] = hash_state_load(state_buf, state_len);
		free(state_buf);
	}

	DEBUGP("Reading m and n\n");
	nbytes = fread(&(bmz->n), sizeof(cmph_uint32), (size_t)1, f);
	nbytes = fread(&(bmz->m), sizeof(cmph_uint32), (size_t)1, f);

	bmz->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*bmz->n);
	nbytes = fread(bmz->g, bmz->n*sizeof(cmph_uint32), (size_t)1, f);
#ifdef DEBUG
	fprintf(stderr, "G: ");
	for (k = 0; k < bmz->n; ++k) fprintf(stderr, "%u ", bmz->g[k]);
	fprintf(stderr, "\n");
#endif
}
/*
 * Evaluate the bmz mphf for a key: g[h1] + g[h2], where h1 and h2 are the
 * two hash values modulo n (h2 is moved to the next vertex, wrapping at n,
 * when it coincides with h1).
 */
cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	bmz_data_t *bmz = (bmz_data_t *)mphf->data;
	cmph_uint32 h1, h2;

	h1 = hash(bmz->hashes[0], key, keylen) % bmz->n;
	h2 = hash(bmz->hashes[1], key, keylen) % bmz->n;
	DEBUGP("key: %.*s h1: %u h2: %u\n", keylen, key, h1, h2);
	if (h1 == h2)
	{
		++h2;
		if (h2 >= bmz->n) h2 = 0;
	}
	DEBUGP("key: %.*s g[h1]: %u g[h2]: %u edges: %u\n", keylen, key, bmz->g[h1], bmz->g[h2], bmz->m);
	return bmz->g[h1] + bmz->g[h2];
}
/*
 * Release a bmz mphf: the g table, both hash states, the hashes array,
 * the bmz data record and finally the cmph_t handle itself.
 */
void bmz_destroy(cmph_t *mphf)
{
	bmz_data_t *data = (bmz_data_t *)mphf->data;
	cmph_uint32 k;

	free(data->g);
	for (k = 0; k < 2; ++k)
	{
		hash_state_destroy(data->hashes[k]);
	}
	free(data->hashes);
	free(data);
	free(mphf);
}
/** \fn void bmz_pack(cmph_t *mphf, void *packed_mphf);
 * \brief Pack a perfect hash function into a preallocated contiguous memory area.
 *
 * Layout written: for each of the two hash functions its type (32 bits)
 * followed by its packed state, then n (32 bits), then the n entries of g.
 *
 * \param mphf pointer to the resulting mphf
 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
 */
void bmz_pack(cmph_t *mphf, void *packed_mphf)
{
	bmz_data_t *data = (bmz_data_t *)mphf->data;
	cmph_uint8 *cursor = (cmph_uint8 *)packed_mphf;
	cmph_uint32 k;

	/* Hash type followed by hash state, once per hash function. */
	for (k = 0; k < 2; ++k)
	{
		CMPH_HASH type = hash_get_type(data->hashes[k]);
		*((cmph_uint32 *) cursor) = type;
		cursor += sizeof(cmph_uint32);
		hash_state_pack(data->hashes[k], cursor);
		cursor += hash_state_packed_size(type);
	}

	/* Vertex count. */
	*((cmph_uint32 *) cursor) = data->n;
	cursor += sizeof(data->n);

	/* The g table. */
	memcpy(cursor, data->g, sizeof(cmph_uint32)*data->n);
}
/** \fn cmph_uint32 bmz_packed_size(cmph_t *mphf);
 * \brief Return the amount of space needed to pack mphf.
 *
 * The total is sizeof(CMPH_ALGO), the packed sizes of both hash states,
 * three 32-bit words and the n entries of g.
 *
 * \param mphf pointer to a mphf
 * \return the size of the packed function or zero for failures
 */
cmph_uint32 bmz_packed_size(cmph_t *mphf)
{
	bmz_data_t *data = (bmz_data_t *)mphf->data;
	CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
	CMPH_HASH h2_type = hash_get_type(data->hashes[1]);
	size_t total = sizeof(CMPH_ALGO);

	total += hash_state_packed_size(h1_type);
	total += hash_state_packed_size(h2_type);
	total += 3*sizeof(cmph_uint32);
	total += sizeof(cmph_uint32)*data->n;
	return (cmph_uint32)total;
}
/** \fn cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
 * \brief Use the packed mphf to do a search.
 *
 * Reads the packed area in order: h1 type, h1 state, h2 type, h2 state,
 * n, then the g table.
 *
 * \param packed_mphf pointer to the packed mphf
 * \param key key to be hashed
 * \param keylen key length in bytes
 * \return The mphf value
 */
cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	cmph_uint8 *cursor = (cmph_uint8 *)packed_mphf;
	cmph_uint8 *h1_state, *h2_state;
	CMPH_HASH h1_type, h2_type;
	cmph_uint32 *g;
	cmph_uint32 n, h1, h2;

	h1_type = (CMPH_HASH)(*((cmph_uint32 *)cursor));
	h1_state = cursor + 4;

	cursor = h1_state + hash_state_packed_size(h1_type);
	h2_type = (CMPH_HASH)(*((cmph_uint32 *)cursor));
	h2_state = cursor + 4;

	g = (cmph_uint32 *)(h2_state + hash_state_packed_size(h2_type));
	n = *g++;

	h1 = hash_packed(h1_state, h1_type, key, keylen) % n;
	h2 = hash_packed(h2_state, h2_type, key, keylen) % n;
	if (h1 == h2)
	{
		++h2;
		if (h2 >= n) h2 = 0;
	}
	return (g[h1] + g[h2]);
}
cmph-2.0.2/src/fnv_hash.c 0000644 0001750 0001750 00000002312 13411542035 014527 0 ustar joseph joseph #include "fnv_hash.h"
#include
/* Allocate an FNV hash state tagged CMPH_HASH_FNV; returns NULL if malloc fails. */
fnv_state_t *fnv_state_new()
{
	fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t));
	if (state != NULL)
	{
		state->hashfunc = CMPH_HASH_FNV;
	}
	return state;
}
/* Release an FNV hash state obtained from fnv_state_new/copy/load. */
void fnv_state_destroy(fnv_state_t *state)
{
	free(state);
}
/*
 * FNV-style hash of the keylen bytes at k (multiply by the 32-bit FNV prime
 * 0x01000193, then XOR in the next byte), starting from 0.
 *
 * The accumulator is a plain local: it used to be `static`, which made the
 * result of one call depend on all previous calls and made the function
 * unsafe to call from more than one thread.
 *
 * \param state unused; FNV carries no per-instance state
 * \param k pointer to the key bytes
 * \param keylen number of bytes in the key
 * \return the 32-bit hash value
 */
cmph_uint32 fnv_hash(fnv_state_t *state, const char *k, cmph_uint32 keylen)
{
	const unsigned char *bp = (const unsigned char *)k;
	const unsigned char *be = bp + keylen;
	unsigned int hval = 0;

	(void)state;
	while (bp < be)
	{
		//hval *= 0x01000193; good for non-gcc compiler
		hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); //good for gcc
		hval ^= *bp++;
	}
	return hval;
}
/*
 * Serialize an FNV state. Nothing is stored for it, so the output is an
 * empty buffer: *buf is set to NULL and *buflen to 0.
 */
void fnv_state_dump(fnv_state_t *state, char **buf, cmph_uint32 *buflen)
{
	*buflen = 0;
	*buf = NULL;
}
/* Return a newly allocated copy of src_state, or NULL if malloc fails. */
fnv_state_t * fnv_state_copy(fnv_state_t *src_state)
{
	fnv_state_t *clone = (fnv_state_t *)malloc(sizeof(fnv_state_t));
	if (clone != NULL)
	{
		clone->hashfunc = src_state->hashfunc;
	}
	return clone;
}
/*
 * Recreate an FNV state from a dumped buffer. The dump carries no data, so
 * buf and buflen are ignored and a fresh state tagged CMPH_HASH_FNV is
 * returned.
 *
 * \return the new state, or NULL if the allocation fails
 */
fnv_state_t *fnv_state_load(const char *buf, cmph_uint32 buflen)
{
	fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t));

	(void)buf;
	(void)buflen;
	if (!state) return NULL; /* same allocation check as fnv_state_new/fnv_state_copy */
	state->hashfunc = CMPH_HASH_FNV;
	return state;
}
cmph-2.0.2/src/compressed_seq.h 0000644 0001750 0001750 00000007326 13411542035 015766 0 ustar joseph joseph #ifndef __CMPH_COMPRESSED_SEQ_H__
#define __CMPH_COMPRESSED_SEQ_H__
#include"select.h"
// Compressed representation of a sequence of n integer values.
struct _compressed_seq_t
{
cmph_uint32 n; // number of values stored in store_table
// The length in bits of each value is decomposed into two components: the lg(n) MSBs are stored in rank_select data structure
// the remaining LSBs are stored in a table of n cells, each one of rem_r bits.
cmph_uint32 rem_r;
cmph_uint32 total_length; // total length in bits of store_table
select_t sel; // presumably the rank/select structure holding the MSB part of each length (TODO confirm in compressed_seq.c)
cmph_uint32 * length_rems; // presumably the table of n rem_r-bit remainders described above (TODO confirm)
cmph_uint32 * store_table; // the stored values, total_length bits in all
};
typedef struct _compressed_seq_t compressed_seq_t;
/** \fn void compressed_seq_init(compressed_seq_t * cs);
* \brief Initialize a compressed sequence structure.
* \param cs points to the compressed sequence structure to be initialized
*/
void compressed_seq_init(compressed_seq_t * cs);
/** \fn void compressed_seq_destroy(compressed_seq_t * cs);
* \brief Destroy a compressed sequence given as input.
* \param cs points to the compressed sequence structure to be destroyed
*/
void compressed_seq_destroy(compressed_seq_t * cs);
/** \fn void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n);
* \brief Generate a compressed sequence from an input array with n values.
* \param cs points to the compressed sequence structure
* \param vals_table pointer to the array given as input
* \param n number of values in @see vals_table
*/
void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n);
/** \fn cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx);
* \brief Returns the value stored at index @see idx of the compressed sequence structure.
* \param cs points to the compressed sequence structure
* \param idx index to retrieve the value from
* \return the value stored at index @see idx of the compressed sequence structure
*/
cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx);
/** \fn cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs);
* \brief Returns amount of space (in bits) to store the compressed sequence.
* \param cs points to the compressed sequence structure
* \return the amount of space (in bits) to store @see cs
*/
cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs);
void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen);
void compressed_seq_load(compressed_seq_t * cs, const char * buf, cmph_uint32 buflen);
/** \fn void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed);
* \brief Support the ability to pack a compressed sequence structure into a preallocated contiguous memory space pointed by cs_packed.
* \param cs points to the compressed sequence structure
* \param cs_packed pointer to the contiguous memory area used to store the compressed sequence structure. The size of cs_packed must be at least @see compressed_seq_packed_size
*/
void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed);
/** \fn cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs);
* \brief Return the amount of space needed to pack a compressed sequence structure.
* \return the size of the packed compressed sequence structure or zero for failures
*/
cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs);
/** \fn cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx);
* \brief Returns the value stored at index @see idx of the packed compressed sequence structure.
* \param cs_packed is a pointer to a contiguous memory area
* \param idx is the index to retrieve the value from
* \return the value stored at index @see idx of the packed compressed sequence structure
*/
cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx);
#endif
cmph-2.0.2/src/select.h 0000644 0001750 0001750 00000004236 13411542035 014226 0 ustar joseph joseph #ifndef __CMPH_SELECT_H__
#define __CMPH_SELECT_H__
#include "cmph_types.h"
// State of a select structure built by select_generate() and queried by select_query().
struct _select_t
{
cmph_uint32 n,m; // presumably the n and m passed to select_generate() (TODO confirm in select.c)
cmph_uint32 * bits_vec; // NOTE(review): looks like the underlying bit vector; confirm
cmph_uint32 * select_table; // NOTE(review): looks like a lookup table used to answer select queries; confirm
};
typedef struct _select_t select_t;
void select_init(select_t * sel);
void select_destroy(select_t * sel);
void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph_uint32 m);
cmph_uint32 select_query(select_t * sel, cmph_uint32 one_idx);
cmph_uint32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx);
cmph_uint32 select_get_space_usage(select_t * sel);
void select_dump(select_t *sel, char **buf, cmph_uint32 *buflen);
void select_load(select_t * sel, const char *buf, cmph_uint32 buflen);
/** \fn void select_pack(select_t *sel, void *sel_packed);
* \brief Support the ability to pack a select structure into a preallocated contiguous memory space pointed by sel_packed.
* \param sel points to the select structure
* \param sel_packed pointer to the contiguous memory area used to store the select structure. The size of sel_packed must be at least @see select_packed_size
*/
void select_pack(select_t *sel, void *sel_packed);
/** \fn cmph_uint32 select_packed_size(select_t *sel);
* \brief Return the amount of space needed to pack a select structure.
* \return the size of the packed select structure or zero for failures
*/
cmph_uint32 select_packed_size(select_t *sel);
/** \fn cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx);
* \param sel_packed is a pointer to a contiguous memory area
* \param one_idx is the rank for which we want to calculate the inverse function select
* \return an integer that represents the select value of rank idx.
*/
cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx);
/** \fn cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx);
* \param sel_packed is a pointer to a contiguous memory area
* \param vec_bit_idx is a value prior computed by @see select_query_packed
* \return an integer that represents the next select value greater than @see vec_bit_idx.
*/
cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx);
#endif
cmph-2.0.2/src/chd.h 0000644 0001750 0001750 00000004473 13411542035 013510 0 ustar joseph joseph #ifndef _CMPH_CHD_H__
#define _CMPH_CHD_H__
#include "cmph.h"
typedef struct __chd_data_t chd_data_t;
typedef struct __chd_config_data_t chd_config_data_t;
/* Config API */
chd_config_data_t *chd_config_new(cmph_config_t * mph);
void chd_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
/** \fn void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
* \brief Allows to set the number of keys per bin.
* \param mph pointer to the configuration structure
* \param keys_per_bin value for the number of keys per bin
*/
void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
/** \fn void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket);
* \brief Allows to set the number of keys per bucket.
* \param mph pointer to the configuration structure
* \param keys_per_bucket value for the number of keys per bucket
*/
void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket);
void chd_config_destroy(cmph_config_t *mph);
/* Chd algorithm API */
cmph_t *chd_new(cmph_config_t *mph, double c);
void chd_load(FILE *fd, cmph_t *mphf);
int chd_dump(cmph_t *mphf, FILE *fd);
void chd_destroy(cmph_t *mphf);
cmph_uint32 chd_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void chd_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void chd_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 chd_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 chd_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/vqueue.h 0000644 0001750 0001750 00000000565 13411542035 014262 0 ustar joseph joseph #ifndef __CMPH_VQUEUE_H__
#define __CMPH_VQUEUE_H__
#include "cmph_types.h"
typedef struct __vqueue_t vqueue_t;
vqueue_t * vqueue_new(cmph_uint32 capacity);
cmph_uint8 vqueue_is_empty(vqueue_t * q);
void vqueue_insert(vqueue_t * q, cmph_uint32 val);
cmph_uint32 vqueue_remove(vqueue_t * q);
void vqueue_print(vqueue_t * q);
void vqueue_destroy(vqueue_t * q);
#endif
cmph-2.0.2/src/chm_structs.h 0000644 0001750 0001750 00000000640 13411542035 015300 0 ustar joseph joseph #ifndef __CMPH_CHM_STRUCTS_H__
#define __CMPH_CHM_STRUCTS_H__
#include "hash_state.h"
// Data of a chm function: graph dimensions, the g table and the hash states.
struct __chm_data_t
{
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
cmph_uint32 *g; // presumably one g value per vertex (n entries); TODO confirm in chm.c
hash_state_t **hashes; // hash function states; presumably two, matching hashfuncs[2] in the config struct (TODO confirm)
};
// Build-time configuration and working state for the chm algorithm.
struct __chm_config_data_t
{
CMPH_HASH hashfuncs[2]; // types of the two hash functions to use
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
graph_t *graph; // graph built during construction
cmph_uint32 *g; // presumably one g value per vertex (n entries); TODO confirm in chm.c
hash_state_t **hashes; // hash function states
};
#endif
cmph-2.0.2/src/bdz_ph.c 0000755 0001750 0001750 00000053060 13411542035 014212 0 ustar joseph joseph #include "bdz_ph.h"
#include "cmph_structs.h"
#include "bdz_structs_ph.h"
#include "hash.h"
#include "bitbool.h"
#include
#include
#include
#include
#include
//#define DEBUG
#include "debug.h"
#define UNASSIGNED 3
#define NULL_EDGE 0xffffffff
// Powers of three: pow3_table[i] == 3^i for i = 0..4.
static cmph_uint8 pow3_table[5] = {1,3,9,27,81};
static cmph_uint8 lookup_table[5][256] = {
{0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
{0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
};
// One hyperedge of the 3-graph.
typedef struct
{
cmph_uint32 vertices[3]; // the three vertices joined by this edge
cmph_uint32 next_edges[3]; // next_edges[k] links to the next edge in the incidence list of vertices[k]; NULL_EDGE ends the list
}bdz_ph_edge_t;
typedef cmph_uint32 * bdz_ph_queue_t;
/* Allocate an edge queue with room for nedges edge indices and store it in *queuep. */
static void bdz_ph_alloc_queue(bdz_ph_queue_t * queuep, cmph_uint32 nedges)
{
	cmph_uint32 *slots = (cmph_uint32 *)malloc(nedges*sizeof(cmph_uint32));
	*queuep = slots;
}
/* Release the storage of the edge queue that *queue points to. */
static void bdz_ph_free_queue(bdz_ph_queue_t * queue)
{
	cmph_uint32 *slots = *queue;
	free(slots);
}
// 3-graph (hypergraph whose edges join three vertices) stored as per-vertex incidence lists.
typedef struct
{
cmph_uint32 nedges; // number of edges added so far
bdz_ph_edge_t * edges; // edge records, indexed by edge id
cmph_uint32 * first_edge; // per vertex: head of its incidence list, or NULL_EDGE when empty
cmph_uint8 * vert_degree; // per vertex: number of edges currently incident to it
}bdz_ph_graph3_t;
/*
 * Allocate the three arrays of a 3-graph: nedges edge records, and one
 * list head plus one degree counter per vertex. Contents are left
 * uninitialized.
 */
static void bdz_ph_alloc_graph3(bdz_ph_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices)
{
	bdz_ph_edge_t *edge_store = (bdz_ph_edge_t *)malloc(nedges*sizeof(bdz_ph_edge_t));
	cmph_uint32 *heads = (cmph_uint32 *)malloc(nvertices*sizeof(cmph_uint32));
	cmph_uint8 *degrees = (cmph_uint8 *)malloc((size_t)nvertices);

	graph3->edges = edge_store;
	graph3->first_edge = heads;
	graph3->vert_degree = degrees;
}
/*
 * Reset a 3-graph to the empty state: no edges, every vertex degree 0 and
 * every list head set to all-ones bytes (i.e. NULL_EDGE).
 */
static void bdz_ph_init_graph3(bdz_ph_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices)
{
	graph3->nedges = 0;
	memset(graph3->vert_degree, 0, (size_t)nvertices);
	memset(graph3->first_edge, 0xff, nvertices*sizeof(cmph_uint32));
}
/* Release all three arrays owned by the 3-graph. */
static void bdz_ph_free_graph3(bdz_ph_graph3_t *graph3)
{
	free(graph3->vert_degree);
	free(graph3->first_edge);
	free(graph3->edges);
}
/*
 * Release the per-vertex arrays (list heads and degrees) and clear their
 * pointers; the edge records are kept.
 */
static void bdz_ph_partial_free_graph3(bdz_ph_graph3_t *graph3)
{
	free(graph3->first_edge);
	graph3->first_edge = NULL;

	free(graph3->vert_degree);
	graph3->vert_degree = NULL;
}
/*
 * Append the edge (v0, v1, v2) to the 3-graph: the new edge is pushed onto
 * the front of the incidence list of each of its vertices and their
 * degrees are incremented.
 */
static void bdz_ph_add_edge(bdz_ph_graph3_t * graph3, cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2)
{
	const cmph_uint32 id = graph3->nedges;
	bdz_ph_edge_t *e = &graph3->edges[id];

	e->vertices[0] = v0;
	e->vertices[1] = v1;
	e->vertices[2] = v2;

	/* Capture the current list heads before any of them is overwritten. */
	e->next_edges[0] = graph3->first_edge[v0];
	e->next_edges[1] = graph3->first_edge[v1];
	e->next_edges[2] = graph3->first_edge[v2];

	graph3->first_edge[v2] = id;
	graph3->first_edge[v1] = id;
	graph3->first_edge[v0] = id;

	graph3->vert_degree[v0]++;
	graph3->vert_degree[v1]++;
	graph3->vert_degree[v2]++;

	graph3->nedges = id + 1;
}
/*
 * Debug helper: print every edge (its three vertices and its three
 * next-edge links) followed by the incidence-list head of every vertex.
 *
 * NOTE(review): the two loop headers and the leading printf calls were lost
 * in the copy of the source under review; they are reconstructed here from
 * the surviving argument lists. Confirm the exact format strings against a
 * pristine copy of bdz_ph.c.
 *
 * \param graph3 graph to print
 * \param nedges number of edge records to print
 * \param nvertices number of vertices whose list head is printed
 */
static void bdz_ph_dump_graph(bdz_ph_graph3_t* graph3, cmph_uint32 nedges, cmph_uint32 nvertices)
{
	cmph_uint32 i;
	for (i = 0; i < nedges; i++)
	{
		printf("\nedge %u %u %u %u ", i, graph3->edges[i].vertices[0],
			graph3->edges[i].vertices[1], graph3->edges[i].vertices[2]);
		printf(" nexts %u %u %u", graph3->edges[i].next_edges[0],
			graph3->edges[i].next_edges[1], graph3->edges[i].next_edges[2]);
	}
	for (i = 0; i < nvertices; i++)
	{
		printf("\nfirst for vertice %u %u ", i, graph3->first_edge[i]);
	}
}
/*
 * Unlink curr_edge from the incidence list of each of its three vertices
 * and decrement their degrees.
 *
 * If curr_edge is not found in one of the lists the graph is dumped to
 * stdout and the process exits with status -1.
 */
static void bdz_ph_remove_edge(bdz_ph_graph3_t * graph3, cmph_uint32 curr_edge)
{
	cmph_uint32 slot;
	cmph_uint32 prev_slot = 0; /* position of the vertex inside the predecessor edge */

	for (slot = 0; slot < 3; slot++)
	{
		cmph_uint32 vertex = graph3->edges[curr_edge].vertices[slot];
		cmph_uint32 cursor = graph3->first_edge[vertex];
		cmph_uint32 prev = NULL_EDGE;

		/* Walk the vertex's list until curr_edge (or the end) is reached. */
		while (cursor != curr_edge && cursor != NULL_EDGE)
		{
			const bdz_ph_edge_t *e = &graph3->edges[cursor];
			prev = cursor;
			if (e->vertices[0] == vertex) prev_slot = 0;
			else if (e->vertices[1] == vertex) prev_slot = 1;
			else prev_slot = 2;
			cursor = e->next_edges[prev_slot];
		}

		if (cursor == NULL_EDGE)
		{
			printf("\nerror remove edge %d dump graph",curr_edge);
			bdz_ph_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4);
			exit(-1);
		}

		/* Splice curr_edge out: patch either the predecessor or the list head. */
		if (prev == NULL_EDGE)
		{
			graph3->first_edge[vertex] = graph3->edges[cursor].next_edges[slot];
		}
		else
		{
			graph3->edges[prev].next_edges[prev_slot] = graph3->edges[cursor].next_edges[slot];
		}
		graph3->vert_degree[vertex]--;
	}
}
/*
 * Peel the 3-graph: repeatedly remove edges that have a vertex of degree 1
 * and record the removal order in queue.
 *
 * Phase 1 enqueues every edge that already has a degree-1 vertex.
 * Phase 2 pops edges, removes them from the graph and enqueues the last
 * remaining edge of any vertex whose degree has dropped to 1.
 *
 * NOTE(review): the header of the phase-1 loop was lost in the copy of the
 * source under review and is reconstructed here as a loop over all nedges
 * edges; confirm against a pristine copy of bdz_ph.c.
 *
 * \param nedges number of edges in graph3 (and capacity of queue)
 * \param nvertices number of vertices (not used by this function)
 * \param queue output array receiving the edge ids in removal order
 * \param graph3 graph to peel; its lists and degrees are modified
 * \return 0 if every edge was removed, otherwise a negative number
 */
static int bdz_ph_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_ph_queue_t queue, bdz_ph_graph3_t* graph3)
{
	cmph_uint32 i, k, v0, v1, v2;
	cmph_uint32 queue_head = 0, queue_tail = 0;
	cmph_uint32 curr_edge;
	cmph_uint32 tmp_edge;
	/* One bit per edge: set once the edge has been enqueued. */
	cmph_uint8 * marked_edge = (cmph_uint8 *)malloc((size_t)(nedges >> 3) + 1);
	memset(marked_edge, 0, (size_t)(nedges >> 3) + 1);

	for (i = 0; i < nedges; i++)
	{
		v0 = graph3->edges[i].vertices[0];
		v1 = graph3->edges[i].vertices[1];
		v2 = graph3->edges[i].vertices[2];
		if (graph3->vert_degree[v0] == 1 ||
		    graph3->vert_degree[v1] == 1 ||
		    graph3->vert_degree[v2] == 1)
		{
			if (!GETBIT(marked_edge, i))
			{
				queue[queue_head++] = i;
				SETBIT(marked_edge, i);
			}
		}
	}

	while (queue_tail != queue_head)
	{
		curr_edge = queue[queue_tail++];
		bdz_ph_remove_edge(graph3, curr_edge);
		/* Removing the edge may have left some of its vertices with one edge. */
		for (k = 0; k < 3; k++)
		{
			cmph_uint32 vertex = graph3->edges[curr_edge].vertices[k];
			if (graph3->vert_degree[vertex] != 1) continue;
			tmp_edge = graph3->first_edge[vertex];
			if (!GETBIT(marked_edge, tmp_edge))
			{
				queue[queue_head++] = tmp_edge;
				SETBIT(marked_edge, tmp_edge);
			}
		}
	}

	free(marked_edge);
	return (int)queue_head - (int)nedges; /* returns 0 if successful, otherwise a negative number */
}
static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue);
static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue);
static void bdz_ph_optimization(bdz_ph_config_data_t *bdz_ph);
/*
 * Create a zero-initialized bdz_ph configuration whose hash function
 * defaults to CMPH_HASH_JENKINS. Allocation failure trips an assert.
 */
bdz_ph_config_data_t *bdz_ph_config_new(void)
{
	bdz_ph_config_data_t *cfg = (bdz_ph_config_data_t *)malloc(sizeof(bdz_ph_config_data_t));

	assert(cfg);
	memset(cfg, 0, sizeof(bdz_ph_config_data_t));
	cfg->hl = NULL;
	cfg->g = NULL;
	cfg->hashfunc = CMPH_HASH_JENKINS;
	return cfg;
}
/* Free the bdz_ph-specific configuration record attached to mph->data. */
void bdz_ph_config_destroy(cmph_config_t *mph)
{
	bdz_ph_config_data_t *cfg = (bdz_ph_config_data_t *)mph->data;

	DEBUGP("Destroying algorithm dependent data\n");
	free(cfg);
}
/**
 * Select the hash function used by bdz_ph.
 *
 * \param mph       configuration whose data field is a bdz_ph_config_data_t
 * \param hashfuncs list terminated by CMPH_HASH_COUNT; bdz_ph only uses one
 *                  linear hash function, so only the first entry is taken and
 *                  an empty list leaves the current choice untouched
 */
void bdz_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	bdz_ph_config_data_t *config = (bdz_ph_config_data_t *)mph->data;

	if (hashfuncs[0] != CMPH_HASH_COUNT)
	{
		config->hashfunc = hashfuncs[0];
	}
}
/**
 * Build a bdz_ph perfect hash function over the keys of mph->key_source.
 *
 * The construction runs three steps: mapping (create a hash state and build
 * the 3-hypergraph, retried up to 100 times until bdz_ph_mapping succeeds),
 * assigning (fill the g array) and optimization (repack g).
 *
 * \param mph configuration; mph->data must point to a bdz_ph_config_data_t
 * \param c   graph size factor: r = ceil(c * nkeys / 3) and n = 3 * r;
 *            passing 0 selects 1.23
 * \return a newly allocated cmph_t that owns g and the hash state, or NULL
 *         when no mapping attempt succeeded or the result structures could
 *         not be allocated
 */
cmph_t *bdz_ph_new(cmph_config_t *mph, double c)
{
	cmph_t *mphf = NULL;
	bdz_ph_data_t *bdz_phf = NULL;
	cmph_uint32 iterations;
	bdz_ph_queue_t edges;
	bdz_ph_graph3_t graph3;
	bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data;
#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
#endif

	if (c == 0) c = 1.23; // validating restrictions over parameter c.
	DEBUGP("c: %f\n", c);
	bdz_ph->m = mph->key_source->nkeys;
	bdz_ph->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3);
	if ((bdz_ph->r % 2) == 0) bdz_ph->r += 1; // r is kept odd
	if (bdz_ph->r == 1) { // workaround for small key sets
		bdz_ph->r = 3;
	}
	bdz_ph->n = 3*bdz_ph->r;

	bdz_ph_alloc_graph3(&graph3, bdz_ph->m, bdz_ph->n);
	bdz_ph_alloc_queue(&edges,bdz_ph->m);
	DEBUGP("Created hypergraph\n");
	DEBUGP("m (edges): %u n (vertices): %u r: %u c: %f \n", bdz_ph->m, bdz_ph->n, bdz_ph->r, c);

	// Mapping step
	iterations = 100;
	if (mph->verbosity)
	{
		fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bdz_ph->m, bdz_ph->n);
	}
	while(1)
	{
		int ok;
		DEBUGP("linear hash function \n");
		bdz_ph->hl = hash_state_new(bdz_ph->hashfunc, 15);
		ok = bdz_ph_mapping(mph, &graph3, edges);
		if (!ok)
		{
			// Discard the failed hash state; a fresh one is created on the next attempt.
			--iterations;
			hash_state_destroy(bdz_ph->hl);
			bdz_ph->hl = NULL;
			DEBUGP("%u iterations remaining\n", iterations);
			if (mph->verbosity)
			{
				fprintf(stderr, "acyclic graph creation failure - %u iterations remaining\n", iterations);
			}
			if (iterations == 0) break;
		}
		else break;
	}
	if (iterations == 0)
	{
		// Every attempt failed: nothing but the graph and the queue is left to release.
		bdz_ph_free_queue(&edges);
		bdz_ph_free_graph3(&graph3);
		return NULL;
	}
	bdz_ph_partial_free_graph3(&graph3);

	// Assigning step
	if (mph->verbosity)
	{
		fprintf(stderr, "Entering assigning step for mph creation of %u keys with graph sized %u\n", bdz_ph->m, bdz_ph->n);
	}
	assigning(bdz_ph, &graph3, edges);
	bdz_ph_free_queue(&edges);
	bdz_ph_free_graph3(&graph3);

	if (mph->verbosity)
	{
		fprintf(stderr, "Starting optimization step\n");
	}
	bdz_ph_optimization(bdz_ph);
#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&construction_time);
#endif

	mphf = (cmph_t *)malloc(sizeof(cmph_t));
	bdz_phf = (bdz_ph_data_t *)malloc(sizeof(bdz_ph_data_t));
	if (mphf == NULL || bdz_phf == NULL)
	{
		// Out of memory: drop everything built so far instead of dereferencing NULL.
		free(mphf);
		free(bdz_phf);
		free(bdz_ph->g);
		bdz_ph->g = NULL;
		hash_state_destroy(bdz_ph->hl);
		bdz_ph->hl = NULL;
		return NULL;
	}
	mphf->algo = mph->algo;
	bdz_phf->g = bdz_ph->g;
	bdz_ph->g = NULL; //transfer memory ownership
	bdz_phf->hl = bdz_ph->hl;
	bdz_ph->hl = NULL; //transfer memory ownership
	bdz_phf->n = bdz_ph->n;
	bdz_phf->m = bdz_ph->m;
	bdz_phf->r = bdz_ph->r;
	mphf->data = bdz_phf;
	mphf->size = bdz_ph->n;
	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
#ifdef CMPH_TIMING
	register cmph_uint32 space_usage = bdz_ph_packed_size(mphf)*8;
	register cmph_uint32 keys_per_bucket = 1;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz_ph->m, bdz_ph->m/(double)bdz_ph->n, keys_per_bucket, construction_time, space_usage/(double)bdz_ph->m);
#endif
	return mphf;
}
/**
 * Mapping step: turn every key into one edge of the 3-hypergraph and try to
 * order the edges into the queue.
 *
 * Each key is hashed into three values; value k is reduced modulo r and
 * shifted by k*r, so the three vertices of an edge fall into three disjoint
 * ranges of width r.
 *
 * \return non-zero when bdz_ph_generate_queue reports 0, zero otherwise
 */
static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue)
{
	bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data;
	cmph_uint32 hashes[3];
	cmph_uint32 key_idx;

	bdz_ph_init_graph3(graph3, bdz_ph->m, bdz_ph->n);
	mph->key_source->rewind(mph->key_source->data);

	for (key_idx = 0; key_idx < mph->key_source->nkeys; ++key_idx)
	{
		char *key = NULL;
		cmph_uint32 keylen;
		cmph_uint32 v0, v1, v2;

		mph->key_source->read(mph->key_source->data, &key, &keylen);
		hash_vector(bdz_ph->hl, key, keylen, hashes);
		v0 = hashes[0] % bdz_ph->r;
		v1 = hashes[1] % bdz_ph->r + bdz_ph->r;
		v2 = hashes[2] % bdz_ph->r + (bdz_ph->r << 1);
		mph->key_source->dispose(mph->key_source->data, key, keylen);

		bdz_ph_add_edge(graph3, v0, v1, v2);
	}

	return bdz_ph_generate_queue(bdz_ph->m, bdz_ph->n, queue, graph3) == 0;
}
/**
 * Assigning step: allocate bdz_ph->g (ceil(n/4) zeroed bytes, accessed
 * through GETVALUE / SETVALUE0) and give one vertex of every edge a value.
 *
 * Edges are taken from the queue in reverse order. For each edge the first
 * of v0, v1, v2 that is not yet marked receives a value chosen so that the
 * sum of the edge's three g values is congruent to that vertex's position
 * (0, 1 or 2) modulo 3 -- the same sum bdz_ph_search uses to pick a vertex.
 */
static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue)
{
	cmph_uint32 remaining;
	cmph_uint32 edge;
	cmph_uint32 v0, v1, v2;
	cmph_uint32 g_bytes = (cmph_uint32)ceil(bdz_ph->n/4.0);
	/* One bit per vertex, all clear. */
	cmph_uint8 *visited = (cmph_uint8 *)calloc((size_t)(bdz_ph->n >> 3) + 1, sizeof(cmph_uint8));

	bdz_ph->g = (cmph_uint8 *)calloc((size_t)g_bytes, sizeof(cmph_uint8));

	/* Walk the queue from entry nedges - 1 down to entry 0. */
	for (remaining = graph3->nedges; remaining > 0; remaining--)
	{
		edge = queue[remaining - 1];
		v0 = graph3->edges[edge].vertices[0];
		v1 = graph3->edges[edge].vertices[1];
		v2 = graph3->edges[edge].vertices[2];
		DEBUGP("B:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz_ph->g, v0), GETVALUE(bdz_ph->g, v1), GETVALUE(bdz_ph->g, v2));

		if (!GETBIT(visited, v0))
		{
			/* v0 is free: mark the other two, then make the sum == 0 (mod 3). */
			if (!GETBIT(visited, v1))
			{
				SETBIT(visited, v1);
			}
			if (!GETBIT(visited, v2))
			{
				SETBIT(visited, v2);
			}
			SETVALUE0(bdz_ph->g, v0, (6-(GETVALUE(bdz_ph->g, v1) + GETVALUE(bdz_ph->g,v2)))%3);
			SETBIT(visited, v0);
		}
		else if (!GETBIT(visited, v1))
		{
			/* v1 is free: mark v2, then make the sum == 1 (mod 3). */
			if (!GETBIT(visited, v2))
			{
				SETBIT(visited, v2);
			}
			SETVALUE0(bdz_ph->g, v1, (7 - (GETVALUE(bdz_ph->g, v0)+GETVALUE(bdz_ph->g, v2)))%3);
			SETBIT(visited, v1);
		}
		else
		{
			/* Only v2 is left: make the sum == 2 (mod 3). */
			SETVALUE0(bdz_ph->g, v2, (8-(GETVALUE(bdz_ph->g,v0)+GETVALUE(bdz_ph->g, v1)))%3);
			SETBIT(visited, v2);
		}
		DEBUGP("A:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz_ph->g, v0), GETVALUE(bdz_ph->g, v1), GETVALUE(bdz_ph->g, v2));
	}

	free(visited);
}
/**
 * Optimization step: repack g from the GETVALUE layout (ceil(n/4) bytes)
 * into ceil(n/5) bytes.
 *
 * Vertex v contributes GETVALUE(g, v) * pow3_table[v % 5] to byte v / 5 of
 * the new array (pow3_table presumably holds powers of 3 -- confirm at its
 * definition). The old array is freed and replaced.
 */
static void bdz_ph_optimization(bdz_ph_config_data_t *bdz_ph)
{
	cmph_uint32 packed_bytes = (cmph_uint32)ceil(bdz_ph->n/5.0);
	cmph_uint8 *packed = (cmph_uint8 *)calloc((size_t)packed_bytes, sizeof(cmph_uint8));
	cmph_uint32 v;

	for (v = 0; v < bdz_ph->n; v++)
	{
		cmph_uint8 digit = GETVALUE(bdz_ph->g, v);
		cmph_uint8 *slot = &packed[v/5];

		*slot = (cmph_uint8) (*slot + digit*pow3_table[v%5U]);
	}

	free(bdz_ph->g);
	bdz_ph->g = packed;
}
/**
 * Write a bdz_ph function to fd.
 *
 * After the common header emitted by __cmph_dump, the layout is: the length
 * of the dumped hash state (cmph_uint32), the hash state bytes, n, m and r
 * (cmph_uint32 each), then the ceil(n/5) bytes of g.
 *
 * \param mphf function to dump; mphf->data must be a bdz_ph_data_t
 * \param fd   stream opened for writing
 * \return 1 on success, 0 if the stream reports a write error
 */
int bdz_ph_dump(cmph_t *mphf, FILE *fd)
{
	char *buf = NULL;
	cmph_uint32 buflen;
	cmph_uint32 sizeg = 0;
	register size_t nbytes;
	bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data;

	__cmph_dump(mphf, fd);

	hash_state_dump(data->hl, &buf, &buflen);
	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
	nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
	nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
	free(buf);

	nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
	nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd);
	nbytes = fwrite(&(data->r), sizeof(cmph_uint32), (size_t)1, fd);

	sizeg = (cmph_uint32)ceil(data->n/5.0);
	nbytes = fwrite(data->g, sizeof(cmph_uint8)*sizeg, (size_t)1, fd);
	/* A zero-sized item makes fwrite return 0 without any error, so the
	 * per-call counts are not a reliable failure signal; the stream's error
	 * indicator is checked once below instead. */
	(void)nbytes;
#ifdef DEBUG
	cmph_uint32 i;
	fprintf(stderr, "G: ");
	for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", GETVALUE(data->g, i));
	fprintf(stderr, "\n");
#endif
	return ferror(fd) ? 0 : 1;
}
/**
 * Read a bdz_ph function from f and attach it to mphf->data.
 *
 * The fields are read in this order: hash state length (cmph_uint32), hash
 * state bytes, n, m, r (cmph_uint32 each), then ceil(n/5) bytes of g.
 *
 * NOTE(review): the results of fread, malloc and calloc are never inspected,
 * and buflen / n come straight from the file and size the allocations below;
 * a truncated or corrupt file is not detected here.
 */
void bdz_ph_load(FILE *f, cmph_t *mphf)
{
char *buf = NULL;
cmph_uint32 buflen;
cmph_uint32 sizeg = 0;
register size_t nbytes;
bdz_ph_data_t *bdz_ph = (bdz_ph_data_t *)malloc(sizeof(bdz_ph_data_t));
DEBUGP("Loading bdz_ph mphf\n");
mphf->data = bdz_ph;
/* Length-prefixed hash state: read it into a scratch buffer and rebuild the state from it. */
nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
DEBUGP("Hash state has %u bytes\n", buflen);
buf = (char *)malloc((size_t)buflen);
nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
bdz_ph->hl = hash_state_load(buf, buflen);
free(buf);
DEBUGP("Reading m and n\n");
/* The file stores n first, then m, then r. */
nbytes = fread(&(bdz_ph->n), sizeof(cmph_uint32), (size_t)1, f);
nbytes = fread(&(bdz_ph->m), sizeof(cmph_uint32), (size_t)1, f);
nbytes = fread(&(bdz_ph->r), sizeof(cmph_uint32), (size_t)1, f);
/* g occupies ceil(n/5) bytes. */
sizeg = (cmph_uint32)ceil(bdz_ph->n/5.0);
bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8));
nbytes = fread(bdz_ph->g, sizeg*sizeof(cmph_uint8), (size_t)1, f);
return;
}
/**
 * Evaluate the function for one key.
 *
 * The key is hashed into three values; value k becomes the vertex
 * hl[k] % r + k*r. For each vertex the byte g[vertex / 5] is decoded through
 * lookup_table[vertex % 5], and the sum of the three decoded values modulo 3
 * selects which of the three vertices is returned.
 *
 * \param mphf   function whose data field is a bdz_ph_data_t
 * \param key    key bytes
 * \param keylen key length in bytes
 * \return the selected vertex
 */
cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	register bdz_ph_data_t *bdz_ph = (bdz_ph_data_t *)mphf->data;
	cmph_uint32 hl[3];
	register cmph_uint32 digit_sum = 0;
	register cmph_uint32 k;

	hash_vector(bdz_ph->hl, key, keylen,hl);

	for (k = 0; k < 3; k++)
	{
		register cmph_uint8 packed_byte;

		hl[k] = hl[k] % bdz_ph->r + k * bdz_ph->r;
		packed_byte = bdz_ph->g[hl[k]/5];
		digit_sum += (cmph_uint8)lookup_table[hl[k]%5U][packed_byte];
	}

	return hl[digit_sum%3];
}
/**
 * Release a bdz_ph function: its hash state, its g array, the bdz_ph_data_t
 * record and finally the cmph_t itself.
 */
void bdz_ph_destroy(cmph_t *mphf)
{
	bdz_ph_data_t *bdz_ph = (bdz_ph_data_t *)mphf->data;

	hash_state_destroy(bdz_ph->hl);
	free(bdz_ph->g);
	free(bdz_ph);
	free(mphf);
}
/** \fn void bdz_ph_pack(cmph_t *mphf, void *packed_mphf);
 * \brief Pack a perfect hash function into a preallocated contiguous memory area pointed to by packed_mphf.
 *
 * Layout written: hash type (cmph_uint32), packed hash state
 * (hash_state_packed_size(type) bytes), r (cmph_uint32), then the ceil(n/5)
 * bytes of g.
 *
 * \param mphf pointer to the resulting mphf
 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
 */
void bdz_ph_pack(cmph_t *mphf, void *packed_mphf)
{
	bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data;
	cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf;
	CMPH_HASH hl_type = hash_get_type(data->hl);
	cmph_uint32 hl_type_word = (cmph_uint32)hl_type;
	cmph_uint32 r_word = data->r;
	cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/5.0);

	// The 32-bit fields are stored with memcpy rather than through a
	// cmph_uint32* cast: the destination is a caller-supplied byte buffer
	// whose alignment at these offsets is not guaranteed.

	// packing hl type
	memcpy(ptr, &hl_type_word, sizeof(hl_type_word));
	ptr += sizeof(cmph_uint32);

	// packing hl
	hash_state_pack(data->hl, ptr);
	ptr += hash_state_packed_size(hl_type);

	// packing r
	memcpy(ptr, &r_word, sizeof(r_word));
	ptr += sizeof(data->r);

	// packing g
	memcpy(ptr, data->g, sizeof(cmph_uint8)*sizeg);
}
/** \fn cmph_uint32 bdz_ph_packed_size(cmph_t *mphf);
 * \brief Return the amount of space needed to pack mphf.
 *
 * The total is sizeof(CMPH_ALGO), the packed hash state, two cmph_uint32
 * words and the ceil(n/5) bytes of g.
 *
 * \param mphf pointer to a mphf
 * \return the size of the packed function or zero for failures
 */
cmph_uint32 bdz_ph_packed_size(cmph_t *mphf)
{
	bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data;
	size_t total = sizeof(CMPH_ALGO);

	total += hash_state_packed_size(hash_get_type(data->hl));
	total += 2*sizeof(cmph_uint32);
	total += sizeof(cmph_uint8)*(cmph_uint32)ceil(data->n/5.0);
	return (cmph_uint32) total;
}
/** \fn cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
 * \brief Use the packed mphf to do a search.
 *
 * Expected layout of packed_mphf: hash type (cmph_uint32), packed hash state
 * (hash_state_packed_size(type) bytes), r (cmph_uint32), then the bytes of g.
 *
 * \param packed_mphf pointer to the packed mphf
 * \param key key to be hashed
 * \param keylen key length in bytes
 * \return The mphf value
 */
cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + sizeof(cmph_uint32);
	register cmph_uint8 * ptr;
	register cmph_uint8 * g;
	register CMPH_HASH hl_type;
	cmph_uint32 hl_type_word;
	cmph_uint32 r;
	cmph_uint32 hl[3];
	register cmph_uint8 byte0, byte1, byte2;
	register cmph_uint32 vertex;

	// The 32-bit fields are fetched with memcpy rather than through a
	// cmph_uint32* cast: the packed area is a byte buffer whose alignment at
	// these offsets is not guaranteed.
	memcpy(&hl_type_word, packed_mphf, sizeof(hl_type_word));
	hl_type = (CMPH_HASH)hl_type_word;
	ptr = hl_ptr + hash_state_packed_size(hl_type);
	memcpy(&r, ptr, sizeof(r));
	g = ptr + sizeof(cmph_uint32);

	hash_vector_packed(hl_ptr, hl_type, key, keylen, hl);
	hl[0] = hl[0] % r;
	hl[1] = hl[1] % r + r;
	hl[2] = hl[2] % r + (r << 1);

	// Each byte of g encodes five vertices; lookup_table decodes one of them.
	byte0 = g[hl[0]/5];
	byte1 = g[hl[1]/5];
	byte2 = g[hl[2]/5];
	byte0 = lookup_table[hl[0]%5][byte0];
	byte1 = lookup_table[hl[1]%5][byte1];
	byte2 = lookup_table[hl[2]%5][byte2];

	vertex = hl[(byte0 + byte1 + byte2)%3];
	return vertex;
}
cmph-2.0.2/src/compressed_rank.c 0000644 0001750 0001750 00000017143 13411542035 016122 0 ustar joseph joseph #include
#include
#include
#include
#include"compressed_rank.h"
#include"bitbool.h"
// #define DEBUG
#include"debug.h"
/* Return floor(log2(x)) for x >= 1; both 0 and 1 yield 0. */
static inline cmph_uint32 compressed_rank_i_log2(cmph_uint32 x)
{
	register cmph_uint32 shifts;

	for (shifts = 0; x > 1; x >>= 1)
	{
		shifts++;
	}
	return shifts;
}
/* Put cr into the empty state: zero counters, initialized select structure, no remainder table. */
void compressed_rank_init(compressed_rank_t * cr)
{
	cr->rem_r = 0;
	cr->n = 0;
	cr->max_val = 0;
	select_init(&cr->sel);
	cr->vals_rems = NULL;
}
/* Free the remainder table (leaving the pointer cleared) and destroy the select structure. */
void compressed_rank_destroy(compressed_rank_t * cr)
{
	cmph_uint32 *rems = cr->vals_rems;

	cr->vals_rems = 0;
	free(rems);
	select_destroy(&cr->sel);
}
/**
 * Build the compressed rank structure for vals_table[0..n-1].
 *
 * Every value is split into a quotient (value >> rem_r) and a remainder (the
 * low rem_r bits). Remainders go into the bit-packed vals_rems table; the
 * quotients are summarized in a select structure built from select_vec,
 * where entry q-1 holds the index of the first value whose quotient is at
 * least q. The inner scan assumes vals_table is in non-decreasing order
 * (presumably guaranteed by the caller -- confirm).
 *
 * n must be at least 1: vals_table[n - 1] is read and max_val is divided by n.
 */
void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n)
{
	register cmph_uint32 q, first_idx, k;
	register cmph_uint32 rems_mask;
	register cmph_uint32 nquots;
	register cmph_uint32 * select_vec = 0;

	cr->n = n;
	cr->max_val = vals_table[cr->n - 1];

	/* Remainder width: floor(log2(max_val / n)), but never less than one bit. */
	cr->rem_r = compressed_rank_i_log2(cr->max_val/cr->n);
	if(cr->rem_r == 0)
	{
		cr->rem_r = 1;
	}
	nquots = cr->max_val >> cr->rem_r;

	select_vec = (cmph_uint32 *) calloc(nquots, sizeof(cmph_uint32));
	cr->vals_rems = (cmph_uint32 *) calloc(BITS_TABLE_SIZE(cr->n, cr->rem_r), sizeof(cmph_uint32));
	rems_mask = (1U << cr->rem_r) - 1U;

	for(k = 0; k < cr->n; k++)
	{
		set_bits_value(cr->vals_rems, k, vals_table[k] & rems_mask, cr->rem_r, rems_mask);
	}

	first_idx = 0;
	for(q = 0; q < nquots; q++)
	{
		/* Skip the values whose quotient is still below q + 1. */
		while((vals_table[first_idx] >> cr->rem_r) <= q)
		{
			first_idx++;
		}
		select_vec[q] = first_idx;
	}

	// FABIANO: before it was (cr->total_length >> cr->rem_r) + 1. But I wiped out the + 1 because
	// I changed the select structure to work up to m, instead of up to m - 1.
	select_generate(&cr->sel, select_vec, nquots, cr->n);

	free(select_vec);
}
/**
 * Rank query on the compressed structure.
 *
 * idx is split into a quotient and a remainder. The select structure gives
 * the starting position for that quotient; from there the stored remainders
 * are scanned until the select bit vector has a set bit at the current
 * position or a stored remainder is at least the remainder of idx.
 *
 * \return cr->n when idx > cr->max_val, otherwise the rank reached by the scan
 */
cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx)
{
	register cmph_uint32 rems_mask;
	register cmph_uint32 val_quot, val_rem;
	register cmph_uint32 sel_res = 0;
	register cmph_uint32 rank = 0;

	if(idx > cr->max_val)
	{
		return cr->n;
	}

	rems_mask = (1U << cr->rem_r) - 1U;
	val_quot = idx >> cr->rem_r;
	val_rem = idx & rems_mask;

	if(val_quot != 0)
	{
		sel_res = select_query(&cr->sel, val_quot - 1) + 1;
		rank = sel_res - val_quot;
	}

	while(!GETBIT32(cr->sel.bits_vec, sel_res)
	      && get_bits_value(cr->vals_rems, rank, cr->rem_r, rems_mask) < val_rem)
	{
		sel_res++;
		rank++;
	}

	return rank;
}
/* Space used by cr in bits: the select structure, the remainder table and three cmph_uint32 words. */
cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr)
{
	register cmph_uint32 word_bits = (cmph_uint32)sizeof(cmph_uint32)*8;
	register cmph_uint32 total_bits = select_get_space_usage(&cr->sel);

	total_bits += BITS_TABLE_SIZE(cr->n, cr->rem_r)*word_bits;
	total_bits += 3*word_bits;
	return total_bits;
}
/**
 * Serialize cr into a newly allocated buffer.
 *
 * Layout: max_val, n, rem_r, length of the select dump (cmph_uint32 each),
 * the select dump itself, then the remainder table.
 *
 * \param cr     structure to serialize
 * \param buf    receives the allocated buffer (to be freed by the caller),
 *               or NULL on failure
 * \param buflen receives the buffer length in bytes, or UINT_MAX on failure
 */
void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen)
{
	register cmph_uint32 sel_size = select_packed_size(&(cr->sel));
	register cmph_uint32 vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r) * (cmph_uint32)sizeof(cmph_uint32);
	register cmph_uint32 pos = 0;
	char * buf_sel = 0;
	cmph_uint32 buflen_sel = 0;

	*buflen = 4*(cmph_uint32)sizeof(cmph_uint32) + sel_size + vals_rems_size;
	DEBUGP("sel_size = %u\n", sel_size);
	DEBUGP("vals_rems_size = %u\n", vals_rems_size);

	*buf = (char *)calloc(*buflen, sizeof(char));
	if (!*buf)
	{
		*buflen = UINT_MAX;
		return;
	}

	// dumping max_val, n and rem_r
	memcpy(*buf, &(cr->max_val), sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("max_val = %u\n", cr->max_val);

	memcpy(*buf + pos, &(cr->n), sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("n = %u\n", cr->n);

	memcpy(*buf + pos, &(cr->rem_r), sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("rem_r = %u\n", cr->rem_r);

	// dumping sel
	select_dump(&cr->sel, &buf_sel, &buflen_sel);
	// *buf only reserves sel_size bytes for the select dump: refuse a dump
	// that is missing or larger than that instead of writing past the
	// buffer, and report it the same way as the allocation failure above.
	if (buflen_sel > sel_size || (buflen_sel != 0 && !buf_sel))
	{
		free(buf_sel);
		free(*buf);
		*buf = NULL;
		*buflen = UINT_MAX;
		return;
	}
	memcpy(*buf + pos, &buflen_sel, sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("buflen_sel = %u\n", buflen_sel);

	memcpy(*buf + pos, buf_sel, buflen_sel);

#ifdef DEBUG
	cmph_uint32 i = 0;
	for(i = 0; i < buflen_sel; i++)
	{
		DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(*buf + pos + i));
	}
#endif
	pos += buflen_sel;

	free(buf_sel);

	// dumping vals_rems
	memcpy(*buf + pos, cr->vals_rems, vals_rems_size);
#ifdef DEBUG
	for(i = 0; i < vals_rems_size; i++)
	{
		DEBUGP("pos = %u -- vals_rems_size = %u -- vals_rems[%u] = %u\n", pos, vals_rems_size, i, *(*buf + pos + i));
	}
#endif
	pos += vals_rems_size;

	DEBUGP("Dumped compressed rank structure with size %u bytes\n", *buflen);
}
/**
 * Rebuild cr from a buffer laid out as: max_val, n, rem_r, length of the
 * select dump (cmph_uint32 each), the select dump, then the remainder table.
 *
 * Any remainder table already held by cr is freed first. buflen is only
 * reported in the debug trace; the reads are not bounded by it.
 */
void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen)
{
	register cmph_uint32 pos = 0;
	register cmph_uint32 vals_rems_size = 0;
	cmph_uint32 buflen_sel = 0;
#ifdef DEBUG
	cmph_uint32 i = 0;
#endif

	// loading max_val, n, and rem_r
	memcpy(&(cr->max_val), buf + pos, sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("max_val = %u\n", cr->max_val);

	memcpy(&(cr->n), buf + pos, sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("n = %u\n", cr->n);

	memcpy(&(cr->rem_r), buf + pos, sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("rem_r = %u\n", cr->rem_r);

	// loading sel
	memcpy(&buflen_sel, buf + pos, sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("buflen_sel = %u\n", buflen_sel);

	select_load(&cr->sel, buf + pos, buflen_sel);
#ifdef DEBUG
	for(i = 0; i < buflen_sel; i++)
	{
		DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(buf + pos + i));
	}
#endif
	pos += buflen_sel;

	// loading vals_rems; free(NULL) is a no-op, so no guard is needed
	free(cr->vals_rems);
	vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r);	// size in words
	cr->vals_rems = (cmph_uint32 *) calloc(vals_rems_size, sizeof(cmph_uint32));
	vals_rems_size *= 4;	// size in bytes from here on
	memcpy(cr->vals_rems, buf + pos, vals_rems_size);

#ifdef DEBUG
	for(i = 0; i < vals_rems_size; i++)
	{
		DEBUGP("pos = %u -- vals_rems_size = %u -- vals_rems[%u] = %u\n", pos, vals_rems_size, i, *(buf + pos + i));
	}
#endif
	pos += vals_rems_size;

	DEBUGP("Loaded compressed rank structure with size %u bytes\n", buflen);
}
/**
 * Copy the serialized form of cr into the caller-provided area cr_packed.
 *
 * Does nothing when either pointer is NULL. When compressed_rank_dump fails
 * (it then leaves the buffer NULL and the length at UINT_MAX) nothing is
 * copied and cr_packed is left untouched.
 */
void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed)
{
	char *buf = NULL;
	cmph_uint32 buflen = 0;

	if (!cr || !cr_packed)
	{
		return;
	}

	compressed_rank_dump(cr, &buf, &buflen);
	if (buf && buflen != UINT_MAX)
	{
		memcpy(cr_packed, buf, buflen);
	}
	free(buf);
}
/* Bytes needed to pack cr: four cmph_uint32 header words, the packed select structure and the remainder table. */
cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr)
{
	register cmph_uint32 word_size = (cmph_uint32)sizeof(cmph_uint32);
	register cmph_uint32 sel_bytes = select_packed_size(&cr->sel);
	register cmph_uint32 rems_bytes = BITS_TABLE_SIZE(cr->n, cr->rem_r) * word_size;

	return 4 * word_size + sel_bytes + rems_bytes;
}
/**
 * Rank query evaluated directly on the packed representation.
 *
 * The packed area is read as cmph_uint32 words: max_val, n, rem_r and the
 * byte length of the packed select structure, followed by the select
 * structure and then the remainder table. The scan is the same as in
 * compressed_rank_query.
 *
 * \return n when idx > max_val, otherwise the rank reached by the scan
 */
cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx)
{
	// unpacking cr_packed
	register cmph_uint32 *words = (cmph_uint32 *)cr_packed;
	register cmph_uint32 max_val = words[0];
	register cmph_uint32 n = words[1];
	register cmph_uint32 rem_r = words[2];
	register cmph_uint32 buflen_sel = words[3];
	register cmph_uint32 * sel_packed = words + 4;
	register cmph_uint32 * bits_vec = sel_packed + 2; // skipping n and m
	register cmph_uint32 * vals_rems = sel_packed + (buflen_sel >> 2);

	// compressed sequence query computation
	register cmph_uint32 rems_mask;
	register cmph_uint32 val_quot, val_rem;
	register cmph_uint32 sel_res = 0;
	register cmph_uint32 rank = 0;

	if(idx > max_val)
	{
		return n;
	}

	rems_mask = (1U << rem_r) - 1U;
	val_quot = idx >> rem_r;
	val_rem = idx & rems_mask;

	if(val_quot != 0)
	{
		sel_res = select_query_packed(sel_packed, val_quot - 1) + 1;
		rank = sel_res - val_quot;
	}

	while(!GETBIT32(bits_vec, sel_res)
	      && get_bits_value(vals_rems, rank, rem_r, rems_mask) < val_rem)
	{
		sel_res++;
		rank++;
	}

	return rank;
}
cmph-2.0.2/src/select_lookup_tables.h 0000644 0001750 0001750 00000031666 13411542035 017160 0 ustar joseph joseph #ifndef SELECT_LOOKUP_TABLES
#define SELECT_LOOKUP_TABLES
#include "cmph_types.h"
/*
rank_lookup_table[i] gives the number of bits set to one in the byte of value i
(the population count of i).
For example, if i = 01010101 in binary, then:
rank_lookup_table[i] = 4
NOTE(review): nothing in this header writes to the table; it could probably be
declared const -- confirm against the files that include it.
*/
static cmph_uint8 rank_lookup_table[256] ={
0 , 1 , 1 , 2 , 1 , 2 , 2 , 3 , 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4
, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5
, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5
, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6
, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5
, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6
, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6
, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7
, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5
, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6
, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6
, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7
, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6
, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7
, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7
, 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 , 5 , 6 , 6 , 7 , 6 , 7 , 7 , 8
};
/*
select_lookup_table[i][j] gives the position of the (j+1)-th bit set to one in the byte of value i,
with positions counted from the least significant bit (position 0).
For example, if i = 01010101 in binary, then:
select_lookup_table[i][0] = 0, the first bit set to one is at position 0
select_lookup_table[i][1] = 2, the second bit set to one is at position 2
select_lookup_table[i][2] = 4, the third bit set to one is at position 4
select_lookup_table[i][3] = 6, the fourth bit set to one is at position 6
select_lookup_table[i][4] = 255, there are no more than 4 bits set to one in i, so the escape value 255 is returned.
*/
static cmph_uint8 select_lookup_table[256][8]={
{ 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 255 , 255 , 255 , 255 , 255 , 255 } ,
{ 2 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 255 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 255 , 255 , 255 , 255 , 255 } ,
{ 3 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 255 , 255 , 255 , 255 , 255 } ,
{ 2 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 255 , 255 , 255 , 255 } ,
{ 4 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 255 , 255 , 255 , 255 , 255 } ,
{ 2 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 4 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 255 , 255 , 255 , 255 } ,
{ 3 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 4 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 255 , 255 , 255 , 255 } ,
{ 2 , 3 , 4 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 4 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 255 , 255 , 255 } ,
{ 5 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 255 , 255 , 255 , 255 , 255 } ,
{ 2 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 255 , 255 , 255 , 255 } ,
{ 3 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 255 , 255 , 255 , 255 } ,
{ 2 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 255 , 255 , 255 } ,
{ 4 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 255 , 255 , 255 , 255 } ,
{ 2 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 255 , 255 , 255 } ,
{ 3 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 255 , 255 , 255 } ,
{ 2 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 4 , 5 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 255 , 255 } ,
{ 6 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 6 , 255 , 255 , 255 , 255 , 255 } ,
{ 2 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 6 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 6 , 255 , 255 , 255 , 255 } ,
{ 3 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 6 , 255 , 255 , 255 , 255 } ,
{ 2 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 6 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 6 , 255 , 255 , 255 } ,
{ 4 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 6 , 255 , 255 , 255 , 255 } ,
{ 2 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 6 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 6 , 255 , 255 , 255 } ,
{ 3 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 6 , 255 , 255 , 255 } ,
{ 2 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 6 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 4 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 6 , 255 , 255 } ,
{ 5 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 6 , 255 , 255 , 255 , 255 } ,
{ 2 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 6 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 6 , 255 , 255 , 255 } ,
{ 3 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 6 , 255 , 255 , 255 } ,
{ 2 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 6 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 6 , 255 , 255 } ,
{ 4 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } ,
{ 1 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 6 , 255 , 255 , 255 } ,
{ 2 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 6 , 255 , 255 , 255 } ,
{ 1 , 2 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 6 , 255 , 255 } ,
{ 3 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } ,
{ 1 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 6 , 255 , 255 } ,
{ 2 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 6 , 255 , 255 } ,
{ 1 , 2 , 3 , 4 , 5 , 6 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 6 , 255 } ,
{ 7 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 7 , 255 , 255 , 255 , 255 , 255 } ,
{ 2 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 7 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 7 , 255 , 255 , 255 , 255 } ,
{ 3 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 7 , 255 , 255 , 255 , 255 } ,
{ 2 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 7 , 255 , 255 , 255 } ,
{ 4 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 7 , 255 , 255 , 255 , 255 } ,
{ 2 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 7 , 255 , 255 , 255 } ,
{ 3 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 7 , 255 , 255 , 255 } ,
{ 2 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 7 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 4 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 7 , 255 , 255 } ,
{ 5 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 7 , 255 , 255 , 255 , 255 } ,
{ 2 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 7 , 255 , 255 , 255 } ,
{ 3 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 7 , 255 , 255 , 255 } ,
{ 2 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 7 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 7 , 255 , 255 } ,
{ 4 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 7 , 255 , 255 , 255 } ,
{ 2 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 7 , 255 , 255 , 255 } ,
{ 1 , 2 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 7 , 255 , 255 } ,
{ 3 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } ,
{ 1 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 7 , 255 , 255 } ,
{ 2 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 7 , 255 , 255 } ,
{ 1 , 2 , 3 , 4 , 5 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 7 , 255 } ,
{ 6 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } ,
{ 1 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 6 , 7 , 255 , 255 , 255 , 255 } ,
{ 2 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 6 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 2 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 6 , 7 , 255 , 255 , 255 } ,
{ 3 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 6 , 7 , 255 , 255 , 255 } ,
{ 2 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 6 , 7 , 255 , 255 , 255 } ,
{ 1 , 2 , 3 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 6 , 7 , 255 , 255 } ,
{ 4 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 6 , 7 , 255 , 255 , 255 } ,
{ 2 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 6 , 7 , 255 , 255 , 255 } ,
{ 1 , 2 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 6 , 7 , 255 , 255 } ,
{ 3 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } ,
{ 1 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 6 , 7 , 255 , 255 } ,
{ 2 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 6 , 7 , 255 , 255 } ,
{ 1 , 2 , 3 , 4 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 6 , 7 , 255 } ,
{ 5 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } ,
{ 1 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 6 , 7 , 255 , 255 , 255 } ,
{ 2 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 6 , 7 , 255 , 255 , 255 } ,
{ 1 , 2 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 6 , 7 , 255 , 255 } ,
{ 3 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } ,
{ 1 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 6 , 7 , 255 , 255 } ,
{ 2 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 6 , 7 , 255 , 255 } ,
{ 1 , 2 , 3 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 6 , 7 , 255 } ,
{ 4 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } ,
{ 1 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 6 , 7 , 255 , 255 } ,
{ 2 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 6 , 7 , 255 , 255 } ,
{ 1 , 2 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 6 , 7 , 255 } ,
{ 3 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } ,
{ 1 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 6 , 7 , 255 } ,
{ 2 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 6 , 7 , 255 } ,
{ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 } };
#endif
cmph-2.0.2/src/bmz8.h 0000644 0001750 0001750 00000003157 13411542035 013630 0 ustar joseph joseph #ifndef __CMPH_BMZ8_H__
#define __CMPH_BMZ8_H__
#include "cmph.h"
typedef struct __bmz8_data_t bmz8_data_t;
typedef struct __bmz8_config_data_t bmz8_config_data_t;
bmz8_config_data_t *bmz8_config_new(void);
void bmz8_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void bmz8_config_destroy(cmph_config_t *mph);
cmph_t *bmz8_new(cmph_config_t *mph, double c);
void bmz8_load(FILE *f, cmph_t *mphf);
int bmz8_dump(cmph_t *mphf, FILE *f);
void bmz8_destroy(cmph_t *mphf);
cmph_uint8 bmz8_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void bmz8_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void bmz8_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 bmz8_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 bmz8_packed_size(cmph_t *mphf);
/** \fn cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/chm.c 0000644 0001750 0001750 00000026740 13411542035 013515 0 ustar joseph joseph #include "graph.h"
#include "chm.h"
#include "cmph_structs.h"
#include "chm_structs.h"
#include "hash.h"
#include "bitbool.h"
#include
#include
#include
#include
#include
//#define DEBUG
#include "debug.h"
static int chm_gen_edges(cmph_config_t *mph);
static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v);
/**
 * Allocate a CHM configuration object with default settings.
 * The structure is zero-filled, both hash function slots are set to
 * CMPH_HASH_JENKINS, and the g, graph and hashes pointers start as NULL.
 * \return the new configuration, or NULL if the allocation fails
 */
chm_config_data_t *chm_config_new(void)
{
    chm_config_data_t *cfg = (chm_config_data_t *)malloc(sizeof(chm_config_data_t));
    if (cfg == NULL)
    {
        return NULL;
    }
    memset(cfg, 0, sizeof(chm_config_data_t));
    cfg->hashfuncs[1] = CMPH_HASH_JENKINS;
    cfg->hashfuncs[0] = CMPH_HASH_JENKINS;
    cfg->hashes = NULL;
    cfg->graph = NULL;
    cfg->g = NULL;
    return cfg;
}
/**
 * Release the CHM-specific configuration stored in mph->data.
 * Only the configuration structure itself is freed here.
 */
void chm_config_destroy(cmph_config_t *mph)
{
    chm_config_data_t *cfg = (chm_config_data_t *)mph->data;

    DEBUGP("Destroying algorithm dependent data\n");
    free(cfg);
}
/**
 * Copy up to two hash function identifiers from hashfuncs into the CHM
 * configuration. Copying stops at the CMPH_HASH_COUNT terminator or after
 * two entries, whichever comes first (CHM only uses two hash functions).
 */
void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
    chm_config_data_t *cfg = (chm_config_data_t *)mph->data;
    cmph_uint32 slot;

    for (slot = 0; slot < 2 && hashfuncs[slot] != CMPH_HASH_COUNT; ++slot)
    {
        cfg->hashfuncs[slot] = hashfuncs[slot];
    }
}
/**
 * Build a CHM minimal perfect hash function for the keys of mph->key_source.
 *
 * Mapping step: up to 20 attempts are made, each with two fresh hash states,
 * to generate an acyclic graph (see chm_gen_edges). Assignment step: every
 * connected component is traversed (see chm_traverse) to fill the g array.
 *
 * \param mph configuration; mph->data must be a chm_config_data_t
 * \param c   ratio of vertices to keys; 0 selects the default 2.09
 * \return the new function, or NULL if no acyclic graph was found in 20
 *         attempts or if a memory allocation failed. On failure every
 *         intermediate resource owned by the configuration is released.
 */
cmph_t *chm_new(cmph_config_t *mph, double c)
{
    cmph_t *mphf = NULL;
    chm_data_t *chmf = NULL;
    cmph_uint32 i;
    cmph_uint32 iterations = 20;
    cmph_uint8 *visited = NULL;
    chm_config_data_t *chm = (chm_config_data_t *)mph->data;

    chm->m = mph->key_source->nkeys;
    if (c == 0) c = 2.09;
    chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys);
    DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c);
    chm->graph = graph_new(chm->n, chm->m);
    DEBUGP("Created graph\n");

    chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
    if (!chm->hashes) goto fail;
    for(i = 0; i < 3; ++i) chm->hashes[i] = NULL;

    //Mapping step
    if (mph->verbosity)
    {
        fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", chm->m, chm->n);
    }
    while(1)
    {
        int ok;
        chm->hashes[0] = hash_state_new(chm->hashfuncs[0], chm->n);
        chm->hashes[1] = hash_state_new(chm->hashfuncs[1], chm->n);
        ok = chm_gen_edges(mph);
        if (!ok)
        {
            --iterations;
            hash_state_destroy(chm->hashes[0]);
            chm->hashes[0] = NULL;
            hash_state_destroy(chm->hashes[1]);
            chm->hashes[1] = NULL;
            DEBUGP("%u iterations remaining\n", iterations);
            if (mph->verbosity)
            {
                fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations);
            }
            if (iterations == 0) break;
        }
        else break;
    }
    // All attempts failed: release the graph AND the hash-state array.
    if (iterations == 0) goto fail;

    //Assignment step
    if (mph->verbosity)
    {
        fprintf(stderr, "Starting assignment step\n");
    }
    DEBUGP("Assignment step\n");
    // One bit per vertex.
    visited = (cmph_uint8 *)malloc((size_t)(chm->n/8 + 1));
    if (!visited) goto fail;
    memset(visited, 0, (size_t)(chm->n/8 + 1));
    free(chm->g);
    chm->g = (cmph_uint32 *)malloc(chm->n * sizeof(cmph_uint32));
    if (!chm->g) goto fail;
    for (i = 0; i < chm->n; ++i)
    {
        if (!GETBIT(visited,i))
        {
            // Root of a not yet visited component: anchor it at 0.
            chm->g[i] = 0;
            chm_traverse(chm, visited, i);
        }
    }
    graph_destroy(chm->graph);
    chm->graph = NULL;
    free(visited);
    visited = NULL;

    mphf = (cmph_t *)malloc(sizeof(cmph_t));
    chmf = (chm_data_t *)malloc(sizeof(chm_data_t));
    if (!mphf || !chmf)
    {
        free(mphf);
        free(chmf);
        goto fail;
    }
    mphf->algo = mph->algo;
    chmf->g = chm->g;
    chm->g = NULL; //transfer memory ownership
    chmf->hashes = chm->hashes;
    chm->hashes = NULL; //transfer memory ownership
    chmf->n = chm->n;
    chmf->m = chm->m;
    mphf->data = chmf;
    mphf->size = chm->m;
    DEBUGP("Successfully generated minimal perfect hash\n");
    if (mph->verbosity)
    {
        fprintf(stderr, "Successfully generated minimal perfect hash function\n");
    }
    return mphf;

fail:
    // Release everything still owned by the configuration.
    free(visited);
    free(chm->g);
    chm->g = NULL;
    if (chm->graph)
    {
        graph_destroy(chm->graph);
        chm->graph = NULL;
    }
    if (chm->hashes)
    {
        for (i = 0; i < 2; ++i)
        {
            if (chm->hashes[i]) hash_state_destroy(chm->hashes[i]);
        }
        free(chm->hashes);
        chm->hashes = NULL;
    }
    return NULL;
}
/**
 * Depth-first traversal starting at vertex v, which must already have a
 * value in chm->g. Each unvisited neighbor receives
 * g[neighbor] = edge_id(v, neighbor) - g[v] and is then traversed in turn.
 */
static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v)
{
    graph_iterator_t nbr_it = graph_neighbors_it(chm->graph, v);
    cmph_uint32 next;

    SETBIT(visited,v);
    DEBUGP("Visiting vertex %u\n", v);
    for (next = graph_next_neighbor(chm->graph, &nbr_it);
         next != GRAPH_NO_NEIGHBOR;
         next = graph_next_neighbor(chm->graph, &nbr_it))
    {
        DEBUGP("Visiting neighbor %u\n", next);
        if (!GETBIT(visited,next))
        {
            DEBUGP("Visiting neighbor %u\n", next);
            DEBUGP("Visiting edge %u->%u with id %u\n", v, next, graph_edge_id(chm->graph, v, next));
            chm->g[next] = graph_edge_id(chm->graph, v, next) - chm->g[v];
            DEBUGP("g is %u (%u - %u mod %u)\n", chm->g[next], graph_edge_id(chm->graph, v, next), chm->g[v], chm->m);
            chm_traverse(chm, visited, next);
        }
    }
}
/**
 * Rebuild the graph edges from the key source using the two current hash
 * states: each key becomes the edge (h1 mod n, h2 mod n). When both ends
 * coincide the second end is moved to the next vertex (wrapping to 0).
 * \return 1 if the resulting graph is acyclic, 0 on a self loop or a cycle
 */
static int chm_gen_edges(cmph_config_t *mph)
{
    chm_config_data_t *chm = (chm_config_data_t *)mph->data;
    cmph_uint32 idx;
    int cyclic = 0;

    DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", chm->n, cmph_hash_names[chm->hashfuncs[0]], cmph_hash_names[chm->hashfuncs[1]]);
    graph_clear_edges(chm->graph);
    mph->key_source->rewind(mph->key_source->data);
    for (idx = 0; idx < mph->key_source->nkeys; ++idx)
    {
        char *key;
        cmph_uint32 keylen;
        cmph_uint32 u, w;

        mph->key_source->read(mph->key_source->data, &key, &keylen);
        u = hash(chm->hashes[0], key, keylen) % chm->n;
        w = hash(chm->hashes[1], key, keylen) % chm->n;
        if (u == w)
        {
            // Try the following vertex, wrapping around at n.
            ++w;
            if (w >= chm->n) w = 0;
        }
        if (u == w)
        {
            // Still identical (only possible when the wrap lands on u).
            if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", idx);
            mph->key_source->dispose(mph->key_source->data, key, keylen);
            return 0;
        }
        DEBUGP("Adding edge: %u -> %u for key %s\n", u, w, key);
        mph->key_source->dispose(mph->key_source->data, key, keylen);
        graph_add_edge(chm->graph, u, w);
    }
    cyclic = graph_is_cyclic(chm->graph);
    if (mph->verbosity && cyclic) fprintf(stderr, "Cyclic graph generated\n");
    DEBUGP("Looking for cycles: %u\n", cyclic);
    return cyclic ? 0 : 1;
}
/**
 * Serialize a CHM function to fd. Layout, after the common header written
 * by __cmph_dump: the number of hash functions (2); for each hash function
 * its state length followed by the state bytes; n; m; the n entries of g.
 * Write results are not checked.
 * \return always 1
 */
int chm_dump(cmph_t *mphf, FILE *fd)
{
    chm_data_t *chm = (chm_data_t *)mphf->data;
    cmph_uint32 nhashes = 2; //number of hash functions
    cmph_uint32 which;
    register size_t nbytes;

    __cmph_dump(mphf, fd);
    nbytes = fwrite(&nhashes, sizeof(cmph_uint32), (size_t)1, fd);
    for (which = 0; which < 2; ++which)
    {
        char *state_buf = NULL;
        cmph_uint32 state_len;

        hash_state_dump(chm->hashes[which], &state_buf, &state_len);
        DEBUGP("Dumping hash state with %u bytes to disk\n", state_len);
        nbytes = fwrite(&state_len, sizeof(cmph_uint32), (size_t)1, fd);
        nbytes = fwrite(state_buf, (size_t)state_len, (size_t)1, fd);
        free(state_buf);
    }
    nbytes = fwrite(&(chm->n), sizeof(cmph_uint32), (size_t)1, fd);
    nbytes = fwrite(&(chm->m), sizeof(cmph_uint32), (size_t)1, fd);
    nbytes = fwrite(chm->g, sizeof(cmph_uint32)*chm->n, (size_t)1, fd);
    return 1;
}
/**
 * Read a CHM function from f into mphf->data. Expects the layout written by
 * chm_dump after the common header: hash count, then per hash a length and
 * the state bytes, then n, m and the n entries of g.
 * The hashes array gets one extra slot holding a NULL terminator.
 * Read results and allocations are not checked.
 */
void chm_load(FILE *f, cmph_t *mphf)
{
    chm_data_t *chm = (chm_data_t *)malloc(sizeof(chm_data_t));
    cmph_uint32 nhashes;
    cmph_uint32 i;
    register size_t nbytes;

    DEBUGP("Loading chm mphf\n");
    mphf->data = chm;
    nbytes = fread(&nhashes, sizeof(cmph_uint32), (size_t)1, f);
    chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1));
    chm->hashes[nhashes] = NULL;
    DEBUGP("Reading %u hashes\n", nhashes);
    for (i = 0; i < nhashes; ++i)
    {
        cmph_uint32 state_len;
        char *state_buf;

        nbytes = fread(&state_len, sizeof(cmph_uint32), (size_t)1, f);
        DEBUGP("Hash state has %u bytes\n", state_len);
        state_buf = (char *)malloc((size_t)state_len);
        nbytes = fread(state_buf, (size_t)state_len, (size_t)1, f);
        chm->hashes[i] = hash_state_load(state_buf, state_len);
        free(state_buf);
    }
    DEBUGP("Reading m and n\n");
    nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f);
    nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f);
    chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n);
    nbytes = fread(chm->g, chm->n*sizeof(cmph_uint32), (size_t)1, f);
#ifdef DEBUG
    fprintf(stderr, "G: ");
    for (i = 0; i < chm->n; ++i) fprintf(stderr, "%u ", chm->g[i]);
    fprintf(stderr, "\n");
#endif
}
/**
 * Evaluate the CHM function for key: (g[h1] + g[h2]) mod m, where h1 and h2
 * are the two hash values reduced mod n. If h1 equals h2, h2 is moved to the
 * next vertex (wrapping to 0), mirroring the rule used during construction.
 */
cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
    chm_data_t *chm = (chm_data_t *)mphf->data;
    cmph_uint32 u = hash(chm->hashes[0], key, keylen) % chm->n;
    cmph_uint32 w = hash(chm->hashes[1], key, keylen) % chm->n;

    DEBUGP("key: %s h1: %u h2: %u\n", key, u, w);
    if (u == w)
    {
        ++w;
        if (w >= chm->n) w = 0;
    }
    DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, chm->g[u], chm->g[w], chm->m);
    return (chm->g[u] + chm->g[w]) % chm->m;
}
/**
 * Free a CHM function: its two hash states, the hash-state array, the g
 * array, the CHM data block and finally the cmph_t wrapper itself.
 */
void chm_destroy(cmph_t *mphf)
{
    chm_data_t *chm = (chm_data_t *)mphf->data;

    hash_state_destroy(chm->hashes[0]);
    hash_state_destroy(chm->hashes[1]);
    free(chm->hashes);
    free(chm->g);
    free(chm);
    free(mphf);
}
/** \fn void chm_pack(cmph_t *mphf, void *packed_mphf);
 * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
 * \param mphf pointer to the resulting mphf
 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
 *
 * Layout written: h1 type (32 bits), packed h1 state, h2 type (32 bits),
 * packed h2 state, n, m, then the n 32-bit entries of g.
 * The 32-bit fields are stored with memcpy rather than through a cast
 * pointer, so the destination does not have to be 4-byte aligned.
 */
void chm_pack(cmph_t *mphf, void *packed_mphf)
{
    chm_data_t *data = (chm_data_t *)mphf->data;
    cmph_uint8 *ptr = (cmph_uint8 *)packed_mphf;
    CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
    CMPH_HASH h2_type = hash_get_type(data->hashes[1]);
    cmph_uint32 word;

    // packing h1 type
    word = (cmph_uint32)h1_type;
    memcpy(ptr, &word, sizeof(word));
    ptr += sizeof(cmph_uint32);

    // packing h1
    hash_state_pack(data->hashes[0], ptr);
    ptr += hash_state_packed_size(h1_type);

    // packing h2 type
    word = (cmph_uint32)h2_type;
    memcpy(ptr, &word, sizeof(word));
    ptr += sizeof(cmph_uint32);

    // packing h2
    hash_state_pack(data->hashes[1], ptr);
    ptr += hash_state_packed_size(h2_type);

    // packing n
    word = data->n;
    memcpy(ptr, &word, sizeof(word));
    ptr += sizeof(data->n);

    // packing m
    word = data->m;
    memcpy(ptr, &word, sizeof(word));
    ptr += sizeof(data->m);

    // packing g
    memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n);
}
/** \fn cmph_uint32 chm_packed_size(cmph_t *mphf);
 * \brief Return the amount of space needed to pack mphf.
 * \param mphf pointer to a mphf
 * \return the size of the packed function or zero for failures
 *
 * The total is sizeof(CMPH_ALGO), the packed sizes of both hash states,
 * four 32-bit words and the n 32-bit entries of g.
 */
cmph_uint32 chm_packed_size(cmph_t *mphf)
{
    chm_data_t *chm = (chm_data_t *)mphf->data;
    CMPH_HASH first_type = hash_get_type(chm->hashes[0]);
    CMPH_HASH second_type = hash_get_type(chm->hashes[1]);
    size_t total = sizeof(CMPH_ALGO);

    total += hash_state_packed_size(first_type);
    total += hash_state_packed_size(second_type);
    total += 4*sizeof(cmph_uint32);
    total += sizeof(cmph_uint32)*chm->n;
    return (cmph_uint32)total;
}
/** \fn cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
 * \brief Use the packed mphf to do a search.
 * \param packed_mphf pointer to the packed mphf
 * \param key key to be hashed
 * \param keylen key length in bytes
 * \return The mphf value
 *
 * Reads the layout produced by chm_pack: h1 type, h1 state, h2 type,
 * h2 state, n, m, g[0..n-1].
 */
cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
    register cmph_uint8 *cursor = (cmph_uint8 *)packed_mphf;
    register cmph_uint8 *h1_state;
    register cmph_uint8 *h2_state;
    register CMPH_HASH h1_type;
    register CMPH_HASH h2_type;
    register cmph_uint32 *words;
    register cmph_uint32 n, m, u, w;

    h1_type = (CMPH_HASH)(*((cmph_uint32 *)cursor));
    cursor += 4;
    h1_state = cursor;
    cursor += hash_state_packed_size(h1_type);

    h2_type = (CMPH_HASH)(*((cmph_uint32 *)cursor));
    cursor += 4;
    h2_state = cursor;
    cursor += hash_state_packed_size(h2_type);

    // n and m precede the g array.
    words = (cmph_uint32 *)cursor;
    n = words[0];
    m = words[1];
    words += 2;

    u = hash_packed(h1_state, h1_type, key, keylen) % n;
    w = hash_packed(h2_state, h2_type, key, keylen) % n;
    DEBUGP("key: %s h1: %u h2: %u\n", key, u, w);
    if (u == w)
    {
        ++w;
        if (w >= n) w = 0;
    }
    DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, words[u], words[w], m);
    return (words[u] + words[w]) % m;
}
cmph-2.0.2/src/bdz_structs.h 0000755 0001750 0001750 00000001660 13411542035 015316 0 ustar joseph joseph #ifndef __CMPH_BDZ_STRUCTS_H__
#define __CMPH_BDZ_STRUCTS_H__
#include "hash_state.h"
/* State of a BDZ function. The field set is the same as in
 * __bdz_config_data_t minus the hashfunc selector. */
struct __bdz_data_t
{
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
cmph_uint32 r; //partition vertex count
// NOTE(review): g is not described in this header; presumably the per-vertex
// value array of the function - confirm against bdz.c.
cmph_uint8 *g;
hash_state_t *hl; // linear hashing
cmph_uint32 k; //kth index in ranktable, $k = log_2(n=3r)/\varepsilon$
cmph_uint8 b; // number of bits of k
cmph_uint32 ranktablesize; //number of entries in ranktable, $n/k +1$
cmph_uint32 *ranktable; // rank table
};
/* Configuration-side state for BDZ. The field set is the same as in
 * __bdz_data_t plus the hash function selector. */
struct __bdz_config_data_t
{
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
cmph_uint32 r; //partition vertex count
// NOTE(review): g is not described in this header; presumably the per-vertex
// value array of the function - confirm against bdz.c.
cmph_uint8 *g;
hash_state_t *hl; // linear hashing
cmph_uint32 k; //kth index in ranktable, $k = log_2(n=3r)/\varepsilon$
cmph_uint8 b; // number of bits of k
cmph_uint32 ranktablesize; //number of entries in ranktable, $n/k +1$
cmph_uint32 *ranktable; // rank table
CMPH_HASH hashfunc; // hash function identifier selected for this configuration
};
#endif
cmph-2.0.2/src/hashtree.h 0000644 0001750 0001750 00000001301 13411542035 014540 0 ustar joseph joseph #ifndef __CMPH_HASHTREE_H__
#define __CMPH_HASHTREE_H__
#include "cmph.h"
typedef struct __hashtree_data_t hashtree_data_t;
typedef struct __hashtree_config_data_t hashtree_config_data_t;
hashtree_config_data_t *hashtree_config_new();
void hashtree_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void hashtree_config_set_leaf_algo(cmph_config_t *mph, CMPH_ALGO leaf_algo);
void hashtree_config_destroy(cmph_config_t *mph);
cmph_t *hashtree_new(cmph_config_t *mph, double c);
void hashtree_load(FILE *f, cmph_t *mphf);
int hashtree_dump(cmph_t *mphf, FILE *f);
void hashtree_destroy(cmph_t *mphf);
cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/sdbm_hash.c 0000644 0001750 0001750 00000002102 13411542035 014660 0 ustar joseph joseph #include "sdbm_hash.h"
#include
/* Allocate an SDBM hash state tagged with CMPH_HASH_SDBM.
 * Returns NULL if the allocation fails. */
sdbm_state_t *sdbm_state_new()
{
    sdbm_state_t *fresh = (sdbm_state_t *)malloc(sizeof(sdbm_state_t));

    if (fresh != NULL)
    {
        fresh->hashfunc = CMPH_HASH_SDBM;
    }
    return fresh;
}
/* Release an SDBM hash state; a NULL state is accepted. */
void sdbm_state_destroy(sdbm_state_t *state)
{
    if (state != NULL)
    {
        free(state);
    }
}
/* SDBM hash of the keylen bytes at k. For every byte b the accumulator is
 * updated as acc = b + (acc << 6) + (acc << 16) - acc, starting from 0.
 * The state argument is not used by the computation. */
cmph_uint32 sdbm_hash(sdbm_state_t *state, const char *k, cmph_uint32 keylen)
{
    const unsigned char *bytes = (const unsigned char *)k;
    register cmph_uint32 acc = 0;
    cmph_uint32 pos;

    for (pos = 0; pos < keylen; ++pos)
    {
        acc = bytes[pos] + (acc << 6) + (acc << 16) - acc;
    }
    return acc;
}
/* SDBM keeps no serializable state: reports an empty dump by setting
 * *buflen to 0 and *buf to NULL. */
void sdbm_state_dump(sdbm_state_t *state, char **buf, cmph_uint32 *buflen)
{
    *buflen = 0;
    *buf = NULL;
}
/* Return a newly allocated copy of src_state (only the hashfunc tag is
 * copied). Returns NULL if the allocation fails, matching sdbm_state_new. */
sdbm_state_t *sdbm_state_copy(sdbm_state_t *src_state)
{
    sdbm_state_t *dest_state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t));
    if (!dest_state) return NULL;
    dest_state->hashfunc = src_state->hashfunc;
    return dest_state;
}
/* Recreate an SDBM state from a dump. The dump carries no data, so buf and
 * buflen are ignored and a fresh state tagged CMPH_HASH_SDBM is returned.
 * Returns NULL if the allocation fails, matching sdbm_state_new. */
sdbm_state_t *sdbm_state_load(const char *buf, cmph_uint32 buflen)
{
    sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t));
    if (!state) return NULL;
    state->hashfunc = CMPH_HASH_SDBM;
    return state;
}
cmph-2.0.2/src/brz.c 0000755 0001750 0001750 00000074474 13411542035 013555 0 ustar joseph joseph #include "graph.h"
#include "fch.h"
#include "fch_structs.h"
#include "bmz8.h"
#include "bmz8_structs.h"
#include "brz.h"
#include "cmph_structs.h"
#include "brz_structs.h"
#include "buffer_manager.h"
#include "cmph.h"
#include "hash.h"
#include "bitbool.h"
#include
#include
#include
#include
#include
#define MAX_BUCKET_SIZE 255
//#define DEBUG
#include "debug.h"
static int brz_gen_mphf(cmph_config_t *mph);
static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n);
static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys);
static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen);
static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen);
brz_config_data_t *brz_config_new(void)
{
brz_config_data_t *brz = NULL;
brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t));
if (!brz) return NULL;
brz->algo = CMPH_FCH;
brz->b = 128;
brz->hashfuncs[0] = CMPH_HASH_JENKINS;
brz->hashfuncs[1] = CMPH_HASH_JENKINS;
brz->hashfuncs[2] = CMPH_HASH_JENKINS;
brz->size = NULL;
brz->offset = NULL;
brz->g = NULL;
brz->h1 = NULL;
brz->h2 = NULL;
brz->h0 = NULL;
brz->memory_availability = 1024*1024;
brz->tmp_dir = (cmph_uint8 *)calloc((size_t)10, sizeof(cmph_uint8));
brz->mphf_fd = NULL;
strcpy((char *)(brz->tmp_dir), "/var/tmp/");
assert(brz);
return brz;
}
/**
 * Release the BRZ configuration stored in mph->data: first its tmp_dir
 * string, then the configuration structure itself.
 */
void brz_config_destroy(cmph_config_t *mph)
{
    brz_config_data_t *cfg = (brz_config_data_t *)mph->data;

    free(cfg->tmp_dir);
    DEBUGP("Destroying algorithm dependent data\n");
    free(cfg);
}
/**
 * Copy up to three hash function identifiers from hashfuncs into the BRZ
 * configuration. Copying stops at the CMPH_HASH_COUNT terminator or after
 * three entries, whichever comes first (BRZ only uses three hash functions).
 */
void brz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
    brz_config_data_t *cfg = (brz_config_data_t *)mph->data;
    cmph_uint32 slot;

    for (slot = 0; slot < 3 && hashfuncs[slot] != CMPH_HASH_COUNT; ++slot)
    {
        cfg->hashfuncs[slot] = hashfuncs[slot];
    }
}
/**
 * Set the amount of working memory, given in megabytes.
 * A value of 0 leaves the current setting untouched. The byte count is kept
 * in a 32-bit field, so requests that would overflow it are clamped to the
 * largest whole number of megabytes that fits (4095) instead of wrapping
 * around (4096 used to wrap to 0 bytes).
 */
void brz_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability)
{
    const cmph_uint32 bytes_per_mb = 1024U*1024U;
    const cmph_uint32 max_mb = ((cmph_uint32)~(cmph_uint32)0) / bytes_per_mb;
    brz_config_data_t *brz = (brz_config_data_t *)mph->data;

    if (memory_availability == 0) return;
    if (memory_availability > max_mb) memory_availability = max_mb;
    brz->memory_availability = memory_availability * bytes_per_mb;
}
/**
 * Set the directory used for temporary files. The stored copy always ends
 * with '/', which is appended when missing.
 * A NULL or empty tmp_dir leaves the current setting unchanged (an empty
 * string used to read one byte before the buffer). The new string is built
 * before the old one is released, so an allocation failure keeps the
 * previous directory and tmp_dir may alias the current setting.
 */
void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir)
{
    brz_config_data_t *brz = (brz_config_data_t *)mph->data;
    cmph_uint8 *copy = NULL;
    size_t len;
    int needs_slash;

    if (!tmp_dir) return;
    len = strlen((char *)tmp_dir);
    if (len == 0) return;

    needs_slash = (tmp_dir[len-1] != '/');
    // calloc zero-fills, which also provides the terminating NUL.
    copy = (cmph_uint8 *)calloc(len + (needs_slash ? 2 : 1), sizeof(cmph_uint8));
    if (!copy) return;
    memcpy(copy, tmp_dir, len);
    if (needs_slash) copy[len] = '/';

    free(brz->tmp_dir);
    brz->tmp_dir = copy;
}
/**
 * Register the stream into which the function is written while it is
 * being built. The stream must not be NULL (checked with assert).
 */
void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd)
{
    brz_config_data_t *cfg = (brz_config_data_t *)mph->data;

    cfg->mphf_fd = mphf_fd;
    assert(cfg->mphf_fd);
}
/**
 * Set the parameter b. Only values strictly between 64 and 175 are
 * accepted; anything else falls back to 128.
 */
void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b)
{
    brz_config_data_t *cfg = (brz_config_data_t *)mph->data;
    cmph_uint32 accepted = (b > 64 && b < 175) ? b : 128;

    cfg->b = (cmph_uint8)accepted;
}
/**
 * Select the algorithm used for the individual buckets. Only CMPH_BMZ8 and
 * CMPH_FCH are supported; any other value is ignored.
 */
void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo)
{
    brz_config_data_t *cfg;

    if (algo != CMPH_BMZ8 && algo != CMPH_FCH)
    {
        return; // unsupported algorithm: keep the current one
    }
    cfg = (brz_config_data_t *)mph->data;
    cfg->algo = algo;
}
/**
 * Build a BRZ function for the keys of mph->key_source.
 *
 * The keys are split into k = ceil(m / b) buckets with hash function h0 and
 * a small function is generated per bucket by brz_gen_mphf, which writes its
 * output to brz->mphf_fd as it goes. The caller must therefore set that
 * stream (brz_config_set_mphf_fd) before calling; otherwise NULL is returned.
 * Up to 20 attempts with a fresh h0 are made.
 *
 * \param mph configuration; mph->data must be a brz_config_data_t
 * \param c   graph size parameter; for BMZ8 values of 0 or >= 2.0 become 1,
 *            for FCH values <= 2.0 become 2.6
 * \return the new function, or NULL when no stream was set, all attempts
 *         failed, or a memory allocation failed
 */
cmph_t *brz_new(cmph_config_t *mph, double c)
{
    cmph_t *mphf = NULL;
    brz_data_t *brzf = NULL;
    cmph_uint32 i;
    cmph_uint32 iterations = 20;
    brz_config_data_t *brz = (brz_config_data_t *)mph->data;

    DEBUGP("c: %f\n", c);
    // Since we keep dumping partial pieces of the MPHF as it gets created
    // the caller must set the file to store the resulting MPHF before calling
    // this function.
    if (brz->mphf_fd == NULL)
    {
        return NULL;
    }
    switch(brz->algo) // validating restrictions over parameter c.
    {
        case CMPH_BMZ8:
            if (c == 0 || c >= 2.0) c = 1;
            break;
        case CMPH_FCH:
            if (c <= 2.0) c = 2.6;
            break;
        default:
            assert(0);
    }
    brz->c = c;
    brz->m = mph->key_source->nkeys;
    if (brz->m < 5)
    {
        brz->c = 5;
    }
    DEBUGP("m: %u\n", brz->m);
    brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b));
    DEBUGP("k: %u\n", brz->k);
    brz->size = (cmph_uint8 *) calloc((size_t)brz->k, sizeof(cmph_uint8));
    // calloc(0, ...) may legitimately return NULL, so only k > 0 is an error.
    if (brz->size == NULL && brz->k > 0)
    {
        return NULL;
    }

    // Clustering the keys by graph id.
    if (mph->verbosity)
    {
        fprintf(stderr, "Partitioning the set of keys.\n");
    }
    while(1)
    {
        int ok;
        // brz_gen_mphf increments the per-bucket counters; counts left over
        // from a failed attempt must not leak into the next one.
        if (brz->k > 0) memset(brz->size, 0, (size_t)brz->k * sizeof(cmph_uint8));
        DEBUGP("hash function 3\n");
        brz->h0 = hash_state_new(brz->hashfuncs[2], brz->k);
        DEBUGP("Generating graphs\n");
        // NOTE(review): a failure late in brz_gen_mphf happens after data was
        // already written to mphf_fd; the stream is not rewound before the
        // retry - confirm whether callers rely on that.
        ok = brz_gen_mphf(mph);
        if (!ok)
        {
            --iterations;
            hash_state_destroy(brz->h0);
            brz->h0 = NULL;
            DEBUGP("%u iterations remaining to create the graphs in a external file\n", iterations);
            if (mph->verbosity)
            {
                fprintf(stderr, "Failure: A graph with more than 255 keys was created - %u iterations remaining\n", iterations);
            }
            if (iterations == 0) break;
        }
        else break;
    }
    if (iterations == 0)
    {
        DEBUGP("Graphs with more than 255 keys were created in all 20 iterations\n");
        free(brz->size);
        brz->size = NULL;
        return NULL;
    }
    DEBUGP("Graphs generated\n");

    // offset[i] = number of keys in buckets 0 .. i-1
    brz->offset = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32));
    mphf = (cmph_t *)malloc(sizeof(cmph_t));
    brzf = (brz_data_t *)malloc(sizeof(brz_data_t));
    if ((brz->offset == NULL && brz->k > 0) || mphf == NULL || brzf == NULL)
    {
        free(brzf);
        free(mphf);
        free(brz->offset);
        brz->offset = NULL;
        free(brz->size);
        brz->size = NULL;
        hash_state_destroy(brz->h0);
        brz->h0 = NULL;
        return NULL;
    }
    for (i = 1; i < brz->k; ++i)
    {
        brz->offset[i] = brz->size[i-1] + brz->offset[i-1];
    }

    // Generating a mphf
    mphf->algo = mph->algo;
    brzf->g = brz->g;
    brz->g = NULL; //transfer memory ownership
    brzf->h1 = brz->h1;
    brz->h1 = NULL; //transfer memory ownership
    brzf->h2 = brz->h2;
    brz->h2 = NULL; //transfer memory ownership
    brzf->h0 = brz->h0;
    brz->h0 = NULL; //transfer memory ownership
    brzf->size = brz->size;
    brz->size = NULL; //transfer memory ownership
    brzf->offset = brz->offset;
    brz->offset = NULL; //transfer memory ownership
    brzf->k = brz->k;
    brzf->c = brz->c;
    brzf->m = brz->m;
    brzf->algo = brz->algo;
    mphf->data = brzf;
    mphf->size = brz->m;
    DEBUGP("Successfully generated minimal perfect hash\n");
    if (mph->verbosity)
    {
        fprintf(stderr, "Successfully generated minimal perfect hash function\n");
    }
    return mphf;
}
static int brz_gen_mphf(cmph_config_t *mph)
{
cmph_uint32 i, e, error;
brz_config_data_t *brz = (brz_config_data_t *)mph->data;
cmph_uint32 memory_usage = 0;
cmph_uint32 nkeys_in_buffer = 0;
cmph_uint8 *buffer = (cmph_uint8 *)malloc((size_t)brz->memory_availability);
cmph_uint32 *buckets_size = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32));
cmph_uint32 *keys_index = NULL;
cmph_uint8 **buffer_merge = NULL;
cmph_uint32 *buffer_h0 = NULL;
cmph_uint32 nflushes = 0;
cmph_uint32 h0;
register size_t nbytes;
FILE * tmp_fd = NULL;
buffer_manager_t * buff_manager = NULL;
char *filename = NULL;
char *key = NULL;
cmph_uint32 keylen;
cmph_uint32 cur_bucket = 0;
cmph_uint8 nkeys_vd = 0;
cmph_uint8 ** keys_vd = NULL;
mph->key_source->rewind(mph->key_source->data);
DEBUGP("Generating graphs from %u keys\n", brz->m);
// Partitioning
for (e = 0; e < brz->m; ++e)
{
mph->key_source->read(mph->key_source->data, &key, &keylen);
/* Buffers management */
if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers
{
if(mph->verbosity)
{
fprintf(stderr, "Flushing %u\n", nkeys_in_buffer);
}
cmph_uint32 value = buckets_size[0];
cmph_uint32 sum = 0;
cmph_uint32 keylen1 = 0;
buckets_size[0] = 0;
for(i = 1; i < brz->k; i++)
{
if(buckets_size[i] == 0) continue;
sum += value;
value = buckets_size[i];
buckets_size[i] = sum;
}
memory_usage = 0;
keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32));
for(i = 0; i < nkeys_in_buffer; i++)
{
memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1));
h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k;
keys_index[buckets_size[h0]] = memory_usage;
buckets_size[h0]++;
memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1);
}
filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes);
tmp_fd = fopen(filename, "wb");
free(filename);
filename = NULL;
for(i = 0; i < nkeys_in_buffer; i++)
{
memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1));
nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd);
}
nkeys_in_buffer = 0;
memory_usage = 0;
memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32));
nflushes++;
free(keys_index);
fclose(tmp_fd);
}
memcpy(buffer + memory_usage, &keylen, sizeof(keylen));
memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen);
memory_usage += keylen + (cmph_uint32)sizeof(keylen);
h0 = hash(brz->h0, key, keylen) % brz->k;
if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0])))
{
free(buffer);
free(buckets_size);
return 0;
}
brz->size[h0] = (cmph_uint8)(brz->size[h0] + 1U);
buckets_size[h0] ++;
nkeys_in_buffer++;
mph->key_source->dispose(mph->key_source->data, key, keylen);
}
if (memory_usage != 0) // flush buffers
{
if(mph->verbosity)
{
fprintf(stderr, "Flushing %u\n", nkeys_in_buffer);
}
cmph_uint32 value = buckets_size[0];
cmph_uint32 sum = 0;
cmph_uint32 keylen1 = 0;
buckets_size[0] = 0;
for(i = 1; i < brz->k; i++)
{
if(buckets_size[i] == 0) continue;
sum += value;
value = buckets_size[i];
buckets_size[i] = sum;
}
memory_usage = 0;
keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32));
for(i = 0; i < nkeys_in_buffer; i++)
{
memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1));
h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k;
keys_index[buckets_size[h0]] = memory_usage;
buckets_size[h0]++;
memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1);
}
filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes);
tmp_fd = fopen(filename, "wb");
free(filename);
filename = NULL;
for(i = 0; i < nkeys_in_buffer; i++)
{
memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1));
nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd);
}
nkeys_in_buffer = 0;
memory_usage = 0;
memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32));
nflushes++;
free(keys_index);
fclose(tmp_fd);
}
free(buffer);
free(buckets_size);
if(nflushes > 1024) return 0; // Too many files generated.
// mphf generation
if(mph->verbosity)
{
fprintf(stderr, "\nMPHF generation \n");
}
/* Starting to dump to disk the resulting MPHF: __cmph_dump function */
nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd);
nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd);
nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd);
nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd);
nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs
nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd);
//tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *));
buff_manager = buffer_manager_new(brz->memory_availability, nflushes);
buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *));
buffer_h0 = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32));
memory_usage = 0;
for(i = 0; i < nflushes; i++)
{
filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
sprintf(filename, "%s%u.cmph",brz->tmp_dir, i);
buffer_manager_open(buff_manager, i, filename);
free(filename);
filename = NULL;
key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k;
buffer_h0[i] = h0;
buffer_merge[i] = (cmph_uint8 *)key;
key = NULL; //transfer memory ownership
}
e = 0;
keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *));
nkeys_vd = 0;
error = 0;
while(e < brz->m)
{
i = brz_min_index(buffer_h0, nflushes);
cur_bucket = buffer_h0[i];
key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
if(key)
{
while(key)
{
//keylen = strlen(key);
h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k;
if (h0 != buffer_h0[i]) break;
keys_vd[nkeys_vd++] = (cmph_uint8 *)key;
key = NULL; //transfer memory ownership
e++;
key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
}
if (key)
{
assert(nkeys_vd < brz->size[cur_bucket]);
keys_vd[nkeys_vd++] = buffer_merge[i];
buffer_merge[i] = NULL; //transfer memory ownership
e++;
buffer_h0[i] = h0;
buffer_merge[i] = (cmph_uint8 *)key;
}
}
if(!key)
{
assert(nkeys_vd < brz->size[cur_bucket]);
keys_vd[nkeys_vd++] = buffer_merge[i];
buffer_merge[i] = NULL; //transfer memory ownership
e++;
buffer_h0[i] = UINT_MAX;
}
if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket.
{
cmph_io_adapter_t *source = NULL;
cmph_config_t *config = NULL;
cmph_t *mphf_tmp = NULL;
char *bufmphf = NULL;
cmph_uint32 buflenmphf = 0;
// Source of keys
source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd);
config = cmph_config_new(source);
cmph_config_set_algo(config, brz->algo);
cmph_config_set_hashfuncs(config, brz->hashfuncs);
cmph_config_set_graphsize(config, brz->c);
mphf_tmp = cmph_new(config);
if (mphf_tmp == NULL)
{
if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k);
error = 1;
cmph_config_destroy(config);
brz_destroy_keys_vd(keys_vd, nkeys_vd);
cmph_io_byte_vector_adapter_destroy(source);
break;
}
if(mph->verbosity)
{
if (cur_bucket % 1000 == 0)
{
fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k);
}
}
switch(brz->algo)
{
case CMPH_FCH:
{
fch_data_t * fchf = NULL;
fchf = (fch_data_t *)mphf_tmp->data;
bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf);
}
break;
case CMPH_BMZ8:
{
bmz8_data_t * bmzf = NULL;
bmzf = (bmz8_data_t *)mphf_tmp->data;
bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket, &buflenmphf);
}
break;
default: assert(0);
}
nbytes = fwrite(bufmphf, (size_t)buflenmphf, (size_t)1, brz->mphf_fd);
free(bufmphf);
bufmphf = NULL;
cmph_config_destroy(config);
brz_destroy_keys_vd(keys_vd, nkeys_vd);
cmph_destroy(mphf_tmp);
cmph_io_byte_vector_adapter_destroy(source);
nkeys_vd = 0;
}
}
buffer_manager_destroy(buff_manager);
free(keys_vd);
free(buffer_merge);
free(buffer_h0);
if (error) return 0;
return 1;
}
/**
 * Return the index of the smallest of the n values in vector. Ties resolve
 * to the lowest index; 0 is returned when n is 0 or 1.
 */
static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n)
{
    cmph_uint32 best = 0;
    cmph_uint32 pos = 1;

    while (pos < n)
    {
        if (vector[best] > vector[pos])
        {
            best = pos;
        }
        ++pos;
    }
    return best;
}
/**
 * Free the first nkeys entries of keys_vd and reset them to NULL. The array
 * itself is not freed.
 * The loop counter has the same width as nkeys; the previous 8-bit counter
 * would wrap around and never terminate for nkeys > 255.
 */
static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys)
{
    cmph_uint32 i;
    for(i = 0; i < nkeys; i++)
    {
        free(keys_vd[i]);
        keys_vd[i] = NULL;
    }
}
/**
 * Serialize the FCH function of one bucket into a newly allocated buffer:
 * length of the h1 state, h1 state, length of the h2 state, h2 state, then
 * fchf->b bytes taken from g (the first byte of each of the first b
 * elements of g).
 * The brz and index parameters are not used.
 * \param buflen receives the size of the returned buffer
 * \return the buffer; the caller frees it
 */
static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen)
{
    cmph_uint32 h1_len = 0;
    cmph_uint32 h2_len = 0;
    char *h1_dump = NULL;
    char *h2_dump = NULL;
    char *out = NULL;
    char *cursor = NULL;
    cmph_uint32 pos;
    cmph_uint32 g_count = fchf->b;//brz->size[index];

    hash_state_dump(fchf->h1, &h1_dump, &h1_len);
    hash_state_dump(fchf->h2, &h2_dump, &h2_len);
    *buflen = h1_len + h2_len + g_count + 2U * (cmph_uint32)sizeof(cmph_uint32);
    out = (char *)malloc((size_t)(*buflen));

    cursor = out;
    memcpy(cursor, &h1_len, sizeof(cmph_uint32));
    cursor += sizeof(cmph_uint32);
    memcpy(cursor, h1_dump, (size_t)h1_len);
    cursor += h1_len;
    memcpy(cursor, &h2_len, sizeof(cmph_uint32));
    cursor += sizeof(cmph_uint32);
    memcpy(cursor, h2_dump, (size_t)h2_len);
    cursor += h2_len;
    // One byte per element of g, as in the on-disk format.
    for (pos = 0; pos < g_count; pos++)
    {
        memcpy(cursor + pos, (fchf->g + pos), (size_t)1);
    }

    free(h1_dump);
    free(h2_dump);
    return out;
}
/**
 * Serialize the BMZ8 function of bucket index into a newly allocated
 * buffer: length of the first hash state, that state, length of the second
 * hash state, that state, then n = ceil(c * size[index]) bytes of g.
 * \param buflen receives the size of the returned buffer
 * \return the buffer; the caller frees it
 */
static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen)
{
    cmph_uint32 h1_len = 0;
    cmph_uint32 h2_len = 0;
    char *h1_dump = NULL;
    char *h2_dump = NULL;
    char *out = NULL;
    char *cursor = NULL;
    cmph_uint32 g_bytes = (cmph_uint32)ceil(brz->c * brz->size[index]);

    hash_state_dump(bmzf->hashes[0], &h1_dump, &h1_len);
    hash_state_dump(bmzf->hashes[1], &h2_dump, &h2_len);
    *buflen = h1_len + h2_len + g_bytes + 2U * (cmph_uint32)sizeof(cmph_uint32);
    out = (char *)malloc((size_t)(*buflen));

    cursor = out;
    memcpy(cursor, &h1_len, sizeof(cmph_uint32));
    cursor += sizeof(cmph_uint32);
    memcpy(cursor, h1_dump, (size_t)h1_len);
    cursor += h1_len;
    memcpy(cursor, &h2_len, sizeof(cmph_uint32));
    cursor += sizeof(cmph_uint32);
    memcpy(cursor, h2_dump, (size_t)h2_len);
    cursor += h2_len;
    memcpy(cursor, bmzf->g, (size_t)g_bytes);

    free(h1_dump);
    free(h2_dump);
    return out;
}
// Writes the trailing part of a BRZ function to fd, in this order:
//   [len(h0) : cmph_uint32][h0 dump][m : cmph_uint32][offset : k * cmph_uint32]
// The leading part of the MPHF has already been written during construction.
// Returns 1 when every write succeeded and 0 as soon as one fails (nothing
// further is written after a failure).
int brz_dump(cmph_t *mphf, FILE *fd)
{
	brz_data_t *data = (brz_data_t *)mphf->data;
	char *buf = NULL;
	cmph_uint32 buflen;
	int ok = 1;
	DEBUGP("Dumping brzf\n");
	// Dumping h0
	hash_state_dump(data->h0, &buf, &buflen);
	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
	if (fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd) != 1)
	{
		ok = 0;
	}
	// A zero-sized item makes fwrite return 0 without being an error, so
	// empty payloads are skipped rather than checked.
	if (ok && buflen > 0 && fwrite(buf, (size_t)buflen, (size_t)1, fd) != 1)
	{
		ok = 0;
	}
	free(buf);
	// Dumping m and the vector offset.
	if (ok && fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd) != 1)
	{
		ok = 0;
	}
	if (ok && data->k > 0 && fwrite(data->offset, sizeof(cmph_uint32)*(data->k), (size_t)1, fd) != 1)
	{
		ok = 0;
	}
	return ok;
}
// Reads a BRZ function from f into a newly allocated brz_data_t and stores
// it in mphf->data. The stream is consumed in this order:
//   c (double), algo, k (cmph_uint32), size[k] (one byte per bucket),
//   then for each bucket i: [len][h1 dump] [len][h2 dump] [g_i bytes],
//   then [len][h0 dump], m (cmph_uint32), offset[k] (cmph_uint32 each).
// NOTE(review): allocation and fread results are not checked here; a short
// or corrupt file is not detected by this function.
void brz_load(FILE *f, cmph_t *mphf)
{
	char *buf = NULL;
	cmph_uint32 buflen;
	register size_t nbytes;
	cmph_uint32 i, n;
	brz_data_t *brz = (brz_data_t *)malloc(sizeof(brz_data_t));
	DEBUGP("Loading brz mphf\n");
	mphf->data = brz;
	nbytes = fread(&(brz->c), sizeof(double), (size_t)1, f);
	nbytes = fread(&(brz->algo), sizeof(brz->algo), (size_t)1, f); // Reading algo.
	nbytes = fread(&(brz->k), sizeof(cmph_uint32), (size_t)1, f);
	brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k);
	nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f);
	brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k);
	brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k);
	brz->g = (cmph_uint8 **) calloc((size_t)brz->k, sizeof(cmph_uint8 *));
	DEBUGP("Reading c = %f k = %u algo = %u \n", brz->c, brz->k, brz->algo);
	//loading h_i1, h_i2 and g_i.
	for(i = 0; i < brz->k; i++)
	{
		// h1
		nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
		DEBUGP("Hash state 1 has %u bytes\n", buflen);
		buf = (char *)malloc((size_t)buflen);
		nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
		brz->h1[i] = hash_state_load(buf, buflen);
		free(buf);
		//h2
		nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
		DEBUGP("Hash state 2 has %u bytes\n", buflen);
		buf = (char *)malloc((size_t)buflen);
		nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
		brz->h2[i] = hash_state_load(buf, buflen);
		free(buf);
		// Start from a defined value: when assertions are compiled out
		// the default branch below falls through without assigning n.
		n = 0;
		switch(brz->algo)
		{
			case CMPH_FCH:
				n = fch_calc_b(brz->c, brz->size[i]);
				break;
			case CMPH_BMZ8:
				n = (cmph_uint32)ceil(brz->c * brz->size[i]);
				break;
			default: assert(0);
		}
		DEBUGP("g_i has %u bytes\n", n);
		brz->g[i] = (cmph_uint8 *)calloc((size_t)n, sizeof(cmph_uint8));
		nbytes = fread(brz->g[i], sizeof(cmph_uint8)*n, (size_t)1, f);
	}
	//loading h0
	nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f);
	DEBUGP("Hash state has %u bytes\n", buflen);
	buf = (char *)malloc((size_t)buflen);
	nbytes = fread(buf, (size_t)buflen, (size_t)1, f);
	brz->h0 = hash_state_load(buf, buflen);
	free(buf);
	//loading m and the vector offset (c was read at the top of the file).
	nbytes = fread(&(brz->m), sizeof(cmph_uint32), (size_t)1, f);
	brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k);
	nbytes = fread(brz->offset, sizeof(cmph_uint32)*(brz->k), (size_t)1, f);
	(void)nbytes; // results are intentionally only captured, not inspected
	return;
}
// Looks a key up in a BRZ function whose buckets use BMZ8.
// fingerprint[2] (filled by hash_vector with h0) selects the bucket; inside
// the bucket the two per-bucket hashes pick two positions of g, whose 8-bit
// sum is the key's index within the bucket. The bucket's offset is added to
// obtain the final value.
static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
{
	cmph_uint32 bucket;
	cmph_uint32 bucket_keys;
	cmph_uint32 nverts;
	cmph_uint32 v1;
	cmph_uint32 v2;
	cmph_uint8 local_index;

	hash_vector(brz->h0, key, keylen, fingerprint);
	bucket = fingerprint[2] % brz->k;

	bucket_keys = brz->size[bucket];
	nverts = (cmph_uint32)ceil(brz->c * bucket_keys);
	v1 = hash(brz->h1[bucket], key, keylen) % nverts;
	v2 = hash(brz->h2[bucket], key, keylen) % nverts;

	// The two positions must differ: on a clash move v2 to the next
	// position, wrapping to 0 at the end.
	if (v1 == v2)
	{
		v2++;
		if (v2 >= nverts)
		{
			v2 = 0;
		}
	}

	local_index = (cmph_uint8)(brz->g[bucket][v1] + brz->g[bucket][v2]);
	DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, v1, v2, bucket);
	DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, brz->g[bucket][v1], brz->g[bucket][v2], brz->offset[bucket], brz->m);
	DEBUGP("Address: %u\n", local_index + brz->offset[bucket]);
	return (local_index + brz->offset[bucket]);
}
// Looks a key up in a BRZ function whose buckets use FCH.
// fingerprint[2] (filled by hash_vector with h0) selects the bucket; the
// FCH parameters b, p1 and p2 are recomputed from c and the bucket size.
// The result is the 8-bit in-bucket index plus the bucket's offset.
static cmph_uint32 brz_fch_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
{
	cmph_uint32 bucket;
	cmph_uint32 bucket_keys;
	cmph_uint32 b;
	double p1;
	double p2;
	cmph_uint32 h1;
	cmph_uint32 h2;
	cmph_uint8 local_index = 0;

	hash_vector(brz->h0, key, keylen, fingerprint);
	bucket = fingerprint[2] % brz->k;

	bucket_keys = brz->size[bucket];
	b = fch_calc_b(brz->c, bucket_keys);
	p1 = fch_calc_p1(bucket_keys);
	p2 = fch_calc_p2(b);

	h1 = hash(brz->h1[bucket], key, keylen) % bucket_keys;
	h2 = hash(brz->h2[bucket], key, keylen) % bucket_keys;
	h1 = mixh10h11h12(b, p1, p2, h1);

	local_index = (cmph_uint8)((h2 + brz->g[bucket][h1]) % bucket_keys);
	return (local_index + brz->offset[bucket]);
}
// Evaluates the BRZ function for key, dispatching on the per-bucket
// algorithm stored in the function. An unrecognized algorithm trips the
// assertion; with assertions disabled 0 is returned.
cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	cmph_uint32 fingerprint[3];
	brz_data_t *brz = (brz_data_t *)mphf->data;

	if (brz->algo == CMPH_FCH)
	{
		return brz_fch_search(brz, key, keylen, fingerprint);
	}
	if (brz->algo == CMPH_BMZ8)
	{
		return brz_bmz8_search(brz, key, keylen, fingerprint);
	}
	assert(0);
	return 0;
}
// Releases everything owned by a BRZ function: the per-bucket g arrays and
// hash states (only when the g table exists), h0, the size and offset
// vectors, the brz_data_t itself and finally the cmph_t wrapper.
void brz_destroy(cmph_t *mphf)
{
	brz_data_t *brz = (brz_data_t *)mphf->data;

	if (brz->g != NULL)
	{
		cmph_uint32 bucket = 0;
		while (bucket < brz->k)
		{
			free(brz->g[bucket]);
			hash_state_destroy(brz->h1[bucket]);
			hash_state_destroy(brz->h2[bucket]);
			bucket++;
		}
		free(brz->g);
		free(brz->h1);
		free(brz->h2);
	}

	hash_state_destroy(brz->h0);
	free(brz->size);
	free(brz->offset);
	free(brz);
	free(mphf);
}
/** \fn void brz_pack(cmph_t *mphf, void *packed_mphf);
 * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
 * \param mphf pointer to the resulting mphf
 * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
 *
 * Packed layout, in order:
 *   algo | h0 type | packed h0 | k | c (raw double bytes) | h1 type | h2 type |
 *   size[k] (1 byte each) | offset[k] (cmph_uint32 each) |
 *   table of k addresses, one per bucket | per bucket: packed h1, packed h2, g bytes
 * The address table stores absolute pointers into packed_mphf, so the image
 * is only usable at the address it was packed to.
 * Nothing is written when the function has no per-bucket hash states.
 */
void brz_pack(cmph_t *mphf, void *packed_mphf)
{
	brz_data_t *data = (brz_data_t *)mphf->data;
	cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf;
	cmph_uint32 i,n;
	// This assumes that if one function pointer is NULL,
	// all the others will be as well.
	if (data->h1 == NULL)
	{
		return;
	}
	// packing internal algo type
	memcpy(ptr, &(data->algo), sizeof(data->algo));
	ptr += sizeof(data->algo);
	// packing h0 type
	CMPH_HASH h0_type = hash_get_type(data->h0);
	memcpy(ptr, &h0_type, sizeof(h0_type));
	ptr += sizeof(h0_type);
	// packing h0
	hash_state_pack(data->h0, ptr);
	ptr += hash_state_packed_size(h0_type);
	// packing k
	memcpy(ptr, &(data->k), sizeof(data->k));
	ptr += sizeof(data->k);
	// packing c: copy the double's bytes verbatim. Converting it to an
	// integer would drop the fractional part (e.g. 1.15 -> 1), and the
	// packed search would then size each bucket differently from the g
	// arrays stored below.
	memcpy(ptr, &(data->c), sizeof(data->c));
	ptr += sizeof(data->c);
	// packing h1 type
	CMPH_HASH h1_type = hash_get_type(data->h1[0]);
	memcpy(ptr, &h1_type, sizeof(h1_type));
	ptr += sizeof(h1_type);
	// packing h2 type
	CMPH_HASH h2_type = hash_get_type(data->h2[0]);
	memcpy(ptr, &h2_type, sizeof(h2_type));
	ptr += sizeof(h2_type);
	// packing size
	memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k);
	ptr += data->k;
	// packing offset
	memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k);
	ptr += sizeof(cmph_uint32)*data->k;
	// Table of per-bucket addresses, followed by the bucket payloads.
	#if defined (__ia64) || defined (__x86_64__)
	cmph_uint64 * g_is_ptr = (cmph_uint64 *)ptr;
	#else
	cmph_uint32 * g_is_ptr = (cmph_uint32 *)ptr;
	#endif
	cmph_uint8 * g_i = (cmph_uint8 *) (g_is_ptr + data->k);
	for(i = 0; i < data->k; i++)
	{
		#if defined (__ia64) || defined (__x86_64__)
		*g_is_ptr++ = (cmph_uint64)g_i;
		#else
		*g_is_ptr++ = (cmph_uint32)g_i;
		#endif
		// packing h1[i]
		hash_state_pack(data->h1[i], g_i);
		g_i += hash_state_packed_size(h1_type);
		// packing h2[i]
		hash_state_pack(data->h2[i], g_i);
		g_i += hash_state_packed_size(h2_type);
		// packing g_i
		n = 0; // defined value if the assertion below is compiled out
		switch(data->algo)
		{
			case CMPH_FCH:
				n = fch_calc_b(data->c, data->size[i]);
				break;
			case CMPH_BMZ8:
				n = (cmph_uint32)ceil(data->c * data->size[i]);
				break;
			default: assert(0);
		}
		memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n);
		g_i += n;
	}
}
/** \fn cmph_uint32 brz_packed_size(cmph_t *mphf);
 * \brief Return the amount of space needed to pack mphf.
 * \param mphf pointer to a mphf
 * \return the size of the packed function or zero for failures
 *
 * The total counts two CMPH_ALGO values although brz_pack writes only one.
 * NOTE(review): the second is presumably the algorithm tag the generic
 * packing layer writes in front of this image -- confirm against the caller.
 */
cmph_uint32 brz_packed_size(cmph_t *mphf)
{
	cmph_uint32 i;
	cmph_uint32 size = 0;
	brz_data_t *data = (brz_data_t *)mphf->data;
	CMPH_HASH h0_type;
	CMPH_HASH h1_type;
	CMPH_HASH h2_type;
	// This assumes that if one function pointer is NULL,
	// all the others will be as well.
	if (data->h1 == NULL)
	{
		return 0U;
	}
	h0_type = hash_get_type(data->h0);
	h1_type = hash_get_type(data->h1[0]);
	h2_type = hash_get_type(data->h2[0]);
	size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) +
			sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k);
	// pointers to g_is
	#if defined (__ia64) || defined (__x86_64__)
	size += (cmph_uint32) sizeof(cmph_uint64)*data->k;
	#else
	size += (cmph_uint32) sizeof(cmph_uint32)*data->k;
	#endif
	size += hash_state_packed_size(h1_type) * data->k;
	size += hash_state_packed_size(h2_type) * data->k;
	cmph_uint32 n = 0;
	for(i = 0; i < data->k; i++)
	{
		n = 0; // defined value if the assertion below is compiled out
		switch(data->algo)
		{
			case CMPH_FCH:
				n = fch_calc_b(data->c, data->size[i]);
				break;
			case CMPH_BMZ8:
				n = (cmph_uint32)ceil(data->c * data->size[i]);
				break;
			default: assert(0);
		}
		size += n;
	}
	return size;
}
// Packed-image lookup for BMZ8 buckets. packed_mphf points just past the
// internal algo tag, i.e. at the h0 type (see the layout in brz_pack).
static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
{
	register CMPH_HASH h0_type = (CMPH_HASH)*packed_mphf++;
	register cmph_uint32 *h0_ptr = packed_mphf;
	packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type));
	register cmph_uint32 k = *packed_mphf++;
	// c is stored as the raw bytes of a double; copy it out rather than
	// reinterpreting the storage as an integer.
	double c;
	memcpy(&c, packed_mphf, sizeof(c));
	packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + sizeof(c));
	register CMPH_HASH h1_type = (CMPH_HASH)*packed_mphf++;
	register CMPH_HASH h2_type = (CMPH_HASH)*packed_mphf++;
	register cmph_uint8 * size = (cmph_uint8 *) packed_mphf;
	packed_mphf = (cmph_uint32 *)(size + k);
	register cmph_uint32 * offset = packed_mphf;
	packed_mphf += k;
	register cmph_uint32 h0;
	hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint);
	h0 = fingerprint[2] % k;
	register cmph_uint32 m = size[h0];
	register cmph_uint32 n = (cmph_uint32)ceil(c * m);
	#if defined (__ia64) || defined (__x86_64__)
	register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf;
	#else
	register cmph_uint32 * g_is_ptr = packed_mphf;
	#endif
	// The table entry is the address of the bucket payload: h1, h2, then g.
	register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0];
	register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
	register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type);
	register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n;
	register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n;
	register cmph_uint8 mphf_bucket;
	if (h1 == h2 && ++h2 >= n) h2 = 0;
	mphf_bucket = (cmph_uint8)(g[h1] + g[h2]);
	DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0);
	DEBUGP("Address: %u\n", mphf_bucket + offset[h0]);
	return (mphf_bucket + offset[h0]);
}
// Packed-image lookup for FCH buckets. packed_mphf points just past the
// internal algo tag, i.e. at the h0 type (see the layout in brz_pack).
static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint)
{
	register CMPH_HASH h0_type = (CMPH_HASH)*packed_mphf++;
	register cmph_uint32 *h0_ptr = packed_mphf;
	packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type));
	register cmph_uint32 k = *packed_mphf++;
	// c is stored as the raw bytes of a double; copy it out rather than
	// reinterpreting the storage as an integer.
	double c;
	memcpy(&c, packed_mphf, sizeof(c));
	packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + sizeof(c));
	register CMPH_HASH h1_type = (CMPH_HASH)*packed_mphf++;
	register CMPH_HASH h2_type = (CMPH_HASH)*packed_mphf++;
	register cmph_uint8 * size = (cmph_uint8 *) packed_mphf;
	packed_mphf = (cmph_uint32 *)(size + k);
	register cmph_uint32 * offset = packed_mphf;
	packed_mphf += k;
	register cmph_uint32 h0;
	hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint);
	h0 = fingerprint[2] % k;
	register cmph_uint32 m = size[h0];
	register cmph_uint32 b = fch_calc_b(c, m);
	register double p1 = fch_calc_p1(m);
	register double p2 = fch_calc_p2(b);
	#if defined (__ia64) || defined (__x86_64__)
	register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf;
	#else
	register cmph_uint32 * g_is_ptr = packed_mphf;
	#endif
	// The table entry is the address of the bucket payload: h1, h2, then g.
	register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0];
	register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
	register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type);
	register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m;
	register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m;
	register cmph_uint8 mphf_bucket = 0;
	h1 = mixh10h11h12(b, p1, p2, h1);
	mphf_bucket = (cmph_uint8)((h2 + g[h1]) % m);
	return (mphf_bucket + offset[h0]);
}
/** \fn cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
 * \brief Use the packed mphf to do a search.
 * \param packed_mphf pointer to the packed mphf
 * \param key key to be hashed
 * \param keylen key length in bytes
 * \return The mphf value; 0 when the stored algorithm tag is not recognized and assertions are disabled
 */
cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf;
	// The first word of the image is the per-bucket algorithm tag.
	register CMPH_ALGO algo = (CMPH_ALGO)*ptr++;
	cmph_uint32 fingerprint[3];
	switch(algo)
	{
		case CMPH_FCH:
			return brz_fch_search_packed(ptr, key, keylen, fingerprint);
		case CMPH_BMZ8:
			return brz_bmz8_search_packed(ptr, key, keylen, fingerprint);
		default: assert(0);
	}
	// Reached only with assertions disabled; mirrors brz_search so the
	// function never runs off its end without a return value.
	return 0;
}
cmph-2.0.2/src/select.c 0000644 0001750 0001750 00000023111 13411542035 014212 0 ustar joseph joseph #include
#include
#include
#include
#include
#include "select_lookup_tables.h"
#include "select.h"
//#define DEBUG
#include "debug.h"
// Sampling parameters of the select table. The three values describe the
// same step and must be overridden together: STEP = 2^NBITS and
// MASK = STEP - 1, so that x >> NBITS == x / STEP and x & MASK == x % STEP.
// One select_table entry is stored for every STEP-th one bit.
#ifndef STEP_SELECT_TABLE
#define STEP_SELECT_TABLE 128
#endif
// log2(STEP_SELECT_TABLE)
#ifndef NBITS_STEP_SELECT_TABLE
#define NBITS_STEP_SELECT_TABLE 7
#endif
// STEP_SELECT_TABLE - 1
#ifndef MASK_STEP_SELECT_TABLE
#define MASK_STEP_SELECT_TABLE 0x7f // 0x7f = 127
#endif
// Pushes a 0 bit into the accumulator: existing bits move one position
// towards bit 0 and the top bit becomes 0.
static inline void select_insert_0(cmph_uint32 * buffer)
{
	*buffer = *buffer >> 1;
}
// Pushes a 1 bit into the accumulator: existing bits move one position
// towards bit 0 and the top bit (bit 31) is set.
static inline void select_insert_1(cmph_uint32 * buffer)
{
	*buffer = (*buffer >> 1) | 0x80000000;
}
// Puts sel into the empty state: no keys, no range, no allocated vectors.
// Nothing is freed here, so this is meant for a structure that does not yet
// own memory.
void select_init(select_t * sel)
{
	sel->bits_vec = NULL;
	sel->select_table = NULL;
	sel->m = 0;
	sel->n = 0;
}
// Returns the size of the select structure in bits: the two counters n and
// m, the bit vector of n + m bits rounded up to whole 32-bit words, and one
// 32-bit select_table entry per STEP_SELECT_TABLE keys plus one.
cmph_uint32 select_get_space_usage(select_t * sel)
{
	const cmph_uint32 word_bits = (cmph_uint32)sizeof(cmph_uint32) * 8;
	cmph_uint32 total_bits = sel->n + sel->m;
	cmph_uint32 bit_words = (total_bits + 31) >> 5;                      // ceil(total_bits / 32)
	cmph_uint32 table_words = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1;   // n / STEP_SELECT_TABLE + 1
	cmph_uint32 usage = 2 * word_bits;                                   // n and m

	usage += bit_words * word_bits;
	usage += table_words * word_bits;
	return usage;
}
// Frees the bit vector and the select table and clears both pointers, so a
// second call is harmless. The select_t object itself is not freed.
void select_destroy(select_t * sel)
{
	free(sel->bits_vec);
	sel->bits_vec = NULL;
	free(sel->select_table);
	sel->select_table = NULL;
}
// Fills sel->select_table with the bit position of every
// STEP_SELECT_TABLE-th one bit (the 0th, 128th, 256th, ... with the default
// step) of sel->bits_vec. The vector is scanned byte by byte using the rank
// and select lookup tables; the scan position carries over from one sample
// to the next.
static inline void select_generate_sel_table(select_t * sel)
{
	cmph_uint8 * bytes = (cmph_uint8 *)sel->bits_vec;
	cmph_uint32 ones_seen = 0;      // one bits in bytes[0 .. next_byte-1]
	cmph_uint32 ones_before = 0;    // same count, excluding the last byte scanned
	cmph_uint32 next_byte = 0;
	cmph_uint32 slot = 0;
	cmph_uint32 target;

	for (target = 0; target < sel->n; target += STEP_SELECT_TABLE)
	{
		// Advance until the byte holding the target-th one has been consumed.
		do
		{
			ones_before = ones_seen;
			ones_seen += rank_lookup_table[bytes[next_byte]];
			next_byte++;
		} while (ones_seen <= target);

		// position inside that byte + 8 * index of that byte
		sel->select_table[slot] = select_lookup_table[bytes[next_byte - 1]][target - ones_before] + ((next_byte - 1) << 3);
		slot++;
	}
}
// Builds the select structure for n keys taken from keys_vec, with values in
// the range [0, m-1].
// The bit vector holds n + m bits: walking a counter i upwards from 0, each
// key equal to i contributes a 1 bit and each step of i contributes a 0 bit.
// Bits are pushed in at the top of a 32-bit buffer, so after 32 insertions
// the first inserted bit sits in bit 0 of the stored word.
// Any previously held vectors are freed and replaced.
// NOTE(review): the encoding loop only advances when keys_vec[j] >= i, so it
// presumably requires keys_vec to be sorted in non-decreasing order --
// confirm against the callers.
void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph_uint32 m)
{
	register cmph_uint32 i, j, idx;
	cmph_uint32 buffer = 0;
	register cmph_uint32 nbits;
	register cmph_uint32 vec_size;
	register cmph_uint32 sel_table_size;
	sel->n = n;
	sel->m = m; // n values in the range [0,m-1]
	nbits = sel->n + sel->m;
	vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32
	sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE)
	if(sel->bits_vec)
	{
		free(sel->bits_vec);
	}
	sel->bits_vec = (cmph_uint32 *)calloc(vec_size, sizeof(cmph_uint32));
	if(sel->select_table)
	{
		free(sel->select_table);
	}
	sel->select_table = (cmph_uint32 *)calloc(sel_table_size, sizeof(cmph_uint32));
	idx = i = j = 0;
	// With no keys there is nothing to encode: the zero-filled vectors are
	// already the result, and keys_vec[0] must not be read.
	if(sel->n == 0)
		goto loop_end;
	for(;;)
	{
		while(keys_vec[j]==i)
		{
			select_insert_1(&buffer);
			idx++;
			if((idx & 0x1f) == 0 ) // (idx & 0x1f) = idx % 32
				sel->bits_vec[(idx >> 5) - 1] = buffer; // (idx >> 5) = idx/32
			j++;
			if(j == sel->n)
				goto loop_end;
			//assert(keys_vec[j] < keys_vec[j-1]);
		}
		if(i == sel->m)
			break;
		while(keys_vec[j] > i)
		{
			select_insert_0(&buffer);
			idx++;
			if((idx & 0x1f) == 0 ) // (idx & 0x1f) = idx % 32
				sel->bits_vec[(idx >> 5) - 1] = buffer; // (idx >> 5) = idx/32
			i++;
		}
	}
loop_end:
	// Flush a partially filled word: shift the bits inserted so far down so
	// that the first of them lands in bit 0.
	if((idx & 0x1f) != 0 ) // (idx & 0x1f) = idx % 32
	{
		buffer >>= 32 - (idx & 0x1f);
		sel->bits_vec[ (idx - 1) >> 5 ] = buffer;
	}
	select_generate_sel_table(sel);
}
// Returns the bit position of the one_idx-th one bit (counting from 0) in
// bits_table. The search starts at the sampled position stored in
// select_table for one_idx / STEP_SELECT_TABLE and then walks forward one
// byte at a time.
static inline cmph_uint32 _select_query(cmph_uint8 * bits_table, cmph_uint32 * select_table, cmph_uint32 one_idx)
{
	cmph_uint32 start_bit = select_table[one_idx >> NBITS_STEP_SELECT_TABLE]; // sample for one_idx / STEP_SELECT_TABLE
	cmph_uint32 byte_pos = start_bit >> 3;                                     // start_bit / 8
	// Ones still to skip, counted from the start of the starting byte:
	// the remainder one_idx % STEP_SELECT_TABLE plus the ones that precede
	// the sampled bit inside its own byte.
	cmph_uint32 remaining = (one_idx & MASK_STEP_SELECT_TABLE)
		+ rank_lookup_table[bits_table[byte_pos] & ((1 << (start_bit & 0x7)) - 1)];
	cmph_uint32 ones_here = rank_lookup_table[bits_table[byte_pos]];

	// Skip whole bytes while the wanted one lies beyond the current byte.
	while (ones_here <= remaining)
	{
		remaining -= ones_here;
		byte_pos++;
		ones_here = rank_lookup_table[bits_table[byte_pos]];
	}
	return select_lookup_table[bits_table[byte_pos]][remaining] + (byte_pos << 3);
}
// Returns the bit position of the one_idx-th one bit of sel's bit vector.
cmph_uint32 select_query(select_t * sel, cmph_uint32 one_idx)
{
	cmph_uint8 * bytes = (cmph_uint8 *)sel->bits_vec;
	return _select_query(bytes, sel->select_table, one_idx);
}
// Given vec_bit_idx, a bit position in bits_table, returns the position of
// the next one bit after it. The ones located before vec_bit_idx in its own
// byte are counted, one more is added to step past the bit at vec_bit_idx,
// and the search then walks forward one byte at a time.
// NOTE(review): the "+ 1" treats the bit at vec_bit_idx as a one, so the
// argument is presumably the position of a one bit -- confirm with callers.
static inline cmph_uint32 _select_next_query(cmph_uint8 * bits_table, cmph_uint32 vec_bit_idx)
{
	cmph_uint32 byte_pos = vec_bit_idx >> 3;
	cmph_uint32 remaining = rank_lookup_table[bits_table[byte_pos] & ((1U << (vec_bit_idx & 0x7)) - 1U)] + 1U;
	cmph_uint32 ones_here = rank_lookup_table[bits_table[byte_pos]];

	// Skip whole bytes while the wanted one lies beyond the current byte.
	while (ones_here <= remaining)
	{
		remaining -= ones_here;
		byte_pos++;
		ones_here = rank_lookup_table[bits_table[byte_pos]];
	}
	return select_lookup_table[bits_table[byte_pos]][remaining] + (byte_pos << 3);
}
// Returns the position of the next one bit after vec_bit_idx in sel's bit
// vector.
cmph_uint32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx)
{
	cmph_uint8 * bytes = (cmph_uint8 *)sel->bits_vec;
	return _select_next_query(bytes, vec_bit_idx);
}
// Serializes sel into a newly allocated buffer laid out as
//   [n][m][bit vector words][select table words]
// *buf receives the buffer (owned by the caller) and *buflen its size in
// bytes. On allocation failure *buf is NULL and *buflen is set to UINT_MAX.
void select_dump(select_t *sel, char **buf, cmph_uint32 *buflen)
{
	const cmph_uint32 word_bytes = (cmph_uint32)sizeof(cmph_uint32);
	cmph_uint32 total_bits = sel->n + sel->m;
	cmph_uint32 bits_bytes = ((total_bits + 31) >> 5) * word_bytes;                        // ceil(total_bits / 32) words
	cmph_uint32 table_bytes = ((sel->n >> NBITS_STEP_SELECT_TABLE) + 1) * word_bytes;     // n / STEP_SELECT_TABLE + 1 words
	char *cursor;

	*buflen = 2 * word_bytes + bits_bytes + table_bytes;
	*buf = (char *)calloc(*buflen, sizeof(char));
	if (*buf == NULL)
	{
		*buflen = UINT_MAX;
		return;
	}

	cursor = *buf;
	memcpy(cursor, &(sel->n), sizeof(cmph_uint32));
	cursor += word_bytes;
	memcpy(cursor, &(sel->m), sizeof(cmph_uint32));
	cursor += word_bytes;
	memcpy(cursor, sel->bits_vec, bits_bytes);
	cursor += bits_bytes;
	memcpy(cursor, sel->select_table, table_bytes);
	DEBUGP("Dumped select structure with size %u bytes\n", *buflen);
}
// Rebuilds sel from a buffer laid out as
//   [n][m][bit vector words][select table words]
// Vectors already held by sel are freed and replaced by fresh copies.
// buflen is only reported in the debug message; the sizes actually read are
// derived from the n and m stored in the buffer.
void select_load(select_t * sel, const char *buf, cmph_uint32 buflen)
{
	const cmph_uint32 word_bytes = (cmph_uint32)sizeof(cmph_uint32);
	const char *cursor = buf;
	cmph_uint32 bit_words;
	cmph_uint32 table_words;

	memcpy(&(sel->n), cursor, sizeof(cmph_uint32));
	cursor += word_bytes;
	memcpy(&(sel->m), cursor, sizeof(cmph_uint32));
	cursor += word_bytes;

	bit_words = (sel->n + sel->m + 31) >> 5;                   // ceil((n + m) / 32)
	table_words = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1;     // n / STEP_SELECT_TABLE + 1

	if (sel->bits_vec != NULL)
	{
		free(sel->bits_vec);
	}
	sel->bits_vec = (cmph_uint32 *)calloc(bit_words, sizeof(cmph_uint32));

	if (sel->select_table != NULL)
	{
		free(sel->select_table);
	}
	sel->select_table = (cmph_uint32 *)calloc(table_words, sizeof(cmph_uint32));

	memcpy(sel->bits_vec, cursor, bit_words * word_bytes);
	cursor += bit_words * word_bytes;
	memcpy(sel->select_table, cursor, table_words * word_bytes);
	DEBUGP("Loaded select structure with size %u bytes\n", buflen);
}
/** \fn void select_pack(select_t *sel, void *sel_packed);
 * \brief Support the ability to pack a select structure function into a preallocated contiguous memory space pointed by sel_packed.
 * \param sel points to the select structure
 * \param sel_packed pointer to the contiguous memory area used to store the select structure. The size of sel_packed must be at least @see select_packed_size
 *
 * Nothing is written when either argument is NULL or when the temporary
 * dump buffer cannot be allocated.
 */
void select_pack(select_t *sel, void *sel_packed)
{
	if (sel && sel_packed)
	{
		char *buf = NULL;
		cmph_uint32 buflen = 0;
		select_dump(sel, &buf, &buflen);
		// select_dump reports an allocation failure with a NULL buffer and
		// buflen == UINT_MAX; copying in that state would read from NULL.
		if (buf != NULL)
		{
			memcpy(sel_packed, buf, buflen);
			free(buf);
		}
	}
}
/** \fn cmph_uint32 select_packed_size(select_t *sel);
 * \brief Return the amount of space needed to pack a select structure.
 * \param sel points to the select structure
 * \return the size in bytes of the packed select structure: the two counters n and m, the bit vector and the select table
 */
cmph_uint32 select_packed_size(select_t *sel)
{
	const cmph_uint32 word_bytes = (cmph_uint32)sizeof(cmph_uint32);
	cmph_uint32 total_bits = sel->n + sel->m;
	cmph_uint32 bits_bytes = ((total_bits + 31) >> 5) * word_bytes;                      // ceil(total_bits / 32) words
	cmph_uint32 table_bytes = ((sel->n >> NBITS_STEP_SELECT_TABLE) + 1) * word_bytes;   // n / STEP_SELECT_TABLE + 1 words

	return 2 * word_bytes + bits_bytes + table_bytes;
}
// Same query as select_query, but on a packed image laid out as
//   [n][m][bit vector words][select table words]
// The select table is located by skipping the ceil((n + m) / 32) words of
// the bit vector.
cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx)
{
	cmph_uint32 *words = (cmph_uint32 *)sel_packed;
	cmph_uint32 n = words[0];
	cmph_uint32 m = words[1];
	cmph_uint32 *bits = words + 2;
	cmph_uint32 bit_words = (n + m + 31) >> 5;

	return _select_query((cmph_uint8 *)bits, bits + bit_words, one_idx);
}
// Same query as select_next_query, but on a packed image: the bit vector
// starts 8 bytes into the image, after the stored n and m.
cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx)
{
	cmph_uint8 * image = (cmph_uint8 *)sel_packed;
	return _select_next_query(image + 8, vec_bit_idx);
}
cmph-2.0.2/src/bmz8_structs.h 0000644 0001750 0001750 00000000637 13411542035 015417 0 ustar joseph joseph #ifndef __CMPH_BMZ8_STRUCTS_H__
#define __CMPH_BMZ8_STRUCTS_H__
#include "hash_state.h"
// Query-time state of a BMZ8 function. Both counters are 8-bit, so a
// function of this kind has at most 255 edges and 255 vertices.
struct __bmz8_data_t
{
cmph_uint8 m; // edge (word) count
cmph_uint8 n; // vertex count
cmph_uint8 *g; // g array, one byte per entry
hash_state_t **hashes; // array of hash state pointers
};
// Construction-time state of a BMZ8 function: the fields of the query-time
// structure plus the chosen hash function types and the working graph.
struct __bmz8_config_data_t
{
CMPH_HASH hashfuncs[2]; // types of the two hash functions to use
cmph_uint8 m; // edge (word) count
cmph_uint8 n; // vertex count
graph_t *graph; // graph used while building (graph_t is declared elsewhere)
cmph_uint8 *g; // g array, one byte per entry
hash_state_t **hashes; // array of hash state pointers
};
#endif
cmph-2.0.2/src/vstack.h 0000644 0001750 0001750 00000000676 13411542035 014246 0 ustar joseph joseph #ifndef __CMPH_VSTACK_H__
#define __CMPH_VSTACK_H__
#include "cmph_types.h"
// Opaque stack type: only the struct tag is declared here, so users can hold
// vstack_t pointers but cannot access its fields.
// NOTE(review): the implementations are not visible from this header; the
// notes below are inferred from the signatures only -- confirm against the
// source file.
typedef struct __vstack_t vstack_t;
// Declared with an empty parameter list (no prototype).
vstack_t *vstack_new();
void vstack_destroy(vstack_t *stack);
// Element values are cmph_uint32.
void vstack_push(vstack_t *stack, cmph_uint32 val);
cmph_uint32 vstack_top(vstack_t *stack);
// Returns nothing: presumably discards the top element, which must be read
// with vstack_top first -- TODO confirm.
void vstack_pop(vstack_t *stack);
int vstack_empty(vstack_t *stack);
cmph_uint32 vstack_size(vstack_t *stack);
void vstack_reserve(vstack_t *stack, cmph_uint32 size);
#endif
cmph-2.0.2/src/bdz.h 0000755 0001750 0001750 00000003227 13411542035 013530 0 ustar joseph joseph #ifndef __CMPH_BDZ_H__
#define __CMPH_BDZ_H__
#include "cmph.h"
typedef struct __bdz_data_t bdz_data_t;
typedef struct __bdz_config_data_t bdz_config_data_t;
bdz_config_data_t *bdz_config_new(void);
void bdz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void bdz_config_destroy(cmph_config_t *mph);
void bdz_config_set_b(cmph_config_t *mph, cmph_uint32 b);
cmph_t *bdz_new(cmph_config_t *mph, double c);
void bdz_load(FILE *f, cmph_t *mphf);
int bdz_dump(cmph_t *mphf, FILE *f);
void bdz_destroy(cmph_t *mphf);
cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void bdz_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void bdz_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 bdz_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 bdz_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/wingetopt.c 0000644 0001750 0001750 00000020435 13411542035 014761 0 ustar joseph joseph #ifdef WIN32
/*****************************************************************************
*
* MODULE NAME : GETOPT.C
*
* COPYRIGHTS:
* This module contains code made available by IBM
* Corporation on an AS IS basis. Any one receiving the
* module is considered to be licensed under IBM copyrights
* to use the IBM-provided source code in any way he or she
* deems fit, including copying it, compiling it, modifying
* it, and redistributing it, with or without
* modifications. No license under any IBM patents or
* patent applications is to be implied from this copyright
* license.
*
* A user of the module should understand that IBM cannot
* provide technical support for the module and will not be
* responsible for any consequences of use of the program.
*
* Any notices, including this one, are not to be removed
* from the module without the prior written consent of
* IBM.
*
* AUTHOR: Original author:
* G. R. Blair (BOBBLAIR at AUSVM1)
* Internet: bobblair@bobblair.austin.ibm.com
*
* Extensively revised by:
* John Q. Walker II, Ph.D. (JOHHQ at RALVM6)
* Internet: johnq@ralvm6.vnet.ibm.com
*
*****************************************************************************/
/******************************************************************************
* getopt()
*
* The getopt() function is a command line parser. It returns the next
* option character in argv that matches an option character in opstring.
*
* The argv argument points to an array of argc+1 elements containing argc
* pointers to character strings followed by a null pointer.
*
* The opstring argument points to a string of option characters; if an
* option character is followed by a colon, the option is expected to have
* an argument that may or may not be separated from it by white space.
* The external variable optarg is set to point to the start of the option
* argument on return from getopt().
*
* The getopt() function places in optind the argv index of the next argument
* to be processed. The system initializes the external variable optind to
* 1 before the first call to getopt().
*
* When all options have been processed (that is, up to the first nonoption
* argument), getopt() returns EOF. The special option "--" may be used to
* delimit the end of the options; EOF will be returned, and "--" will be
* skipped.
*
* The getopt() function returns a question mark (?) when it encounters an
* option character not included in opstring. This error message can be
* disabled by setting opterr to zero. Otherwise, it returns the option
* character that was detected.
*
* If the special option "--" is detected, or all options have been
* processed, EOF is returned.
*
* Options are marked by either a minus sign (-) or a slash (/).
*
* No errors are defined.
*****************************************************************************/
#include /* for EOF */
#include /* for strchr() */
/* static (global) variables that are specified as exported by getopt() */
extern char *optarg; /* pointer to the start of the option argument */
extern int optind; /* number of the next argv[] to be evaluated */
extern int opterr; /* non-zero if a question mark should be returned
                      when a non-valid option character is detected */
/* handle possible future character set concerns by putting this in a macro */
#define _next_char(string) (char)(*(string+1))
/*
 * Returns the next option letter found in argv, '?' (or the offending
 * character when opterr is zero) for an invalid option or a missing
 * argument, and EOF when the options are exhausted.
 *
 * Options may be clustered ("-ab"); the position inside the current argv
 * string is remembered between calls in a static pointer, so a scan must be
 * carried through to EOF before a new argv is parsed.
 *
 * A lone "/" is treated as a non-option argument: EOF is returned and optind
 * is left pointing at it.
 */
int getopt(int argc, char *argv[], char *opstring)
{
    static char *pIndexPosition = NULL; /* place inside current argv string */
    char *pArgString = NULL;            /* where to start from next */
    char *pOptString;                   /* the string in our program */

    if (pIndexPosition != NULL) {
        /* we last left off inside an argv string */
        if (*(++pIndexPosition)) {
            /* there is more to come in the most recent argv */
            pArgString = pIndexPosition;
        }
    }
    if (pArgString == NULL) {
        /* we didn't leave off in the middle of an argv string */
        if (optind >= argc) {
            /* no command-line arguments are left to examine */
            pIndexPosition = NULL; /* not in the middle of anything */
            return EOF;            /* used up all command-line arguments */
        }
        /*---------------------------------------------------------------------
         * If the next argv[] is not an option, there can be no more options.
         *-------------------------------------------------------------------*/
        pArgString = argv[optind++]; /* set this to the next argument ptr */
        if (('/' != *pArgString) && /* doesn't start with a slash or a dash? */
            ('-' != *pArgString)) {
            --optind;              /* point to current arg once we're done */
            optarg = NULL;         /* no argument follows the option */
            pIndexPosition = NULL; /* not in the middle of anything */
            return EOF;            /* used up all the command-line flags */
        }
        /* check for special end-of-flags markers */
        if ((strcmp(pArgString, "-") == 0) ||
            (strcmp(pArgString, "--") == 0)) {
            optarg = NULL;         /* no argument follows the option */
            pIndexPosition = NULL; /* not in the middle of anything */
            return EOF;            /* encountered the special flag */
        }
        /*---------------------------------------------------------------------
         * A slash with nothing after it carries no option letter. Letting it
         * through would make strchr() below match the terminator of
         * opstring, and the code would then read one byte past the end of
         * opstring and, on the following call, past the end of this argv
         * string. Hand it back as a non-option argument instead.
         *-------------------------------------------------------------------*/
        if (strcmp(pArgString, "/") == 0) {
            --optind;              /* point to current arg once we're done */
            optarg = NULL;         /* no argument follows the option */
            pIndexPosition = NULL; /* not in the middle of anything */
            return EOF;            /* used up all the command-line flags */
        }
        pArgString++; /* look past the / or - */
    }
    if (':' == *pArgString) { /* is it a colon? */
        /*---------------------------------------------------------------------
         * Rare case: if opterr is non-zero, return a question mark;
         * otherwise, just return the colon we're on.
         *-------------------------------------------------------------------*/
        return (opterr ? (int)'?' : (int)':');
    }
    else if ((pOptString = strchr(opstring, *pArgString)) == 0) {
        /*---------------------------------------------------------------------
         * The letter on the command-line wasn't any good.
         *-------------------------------------------------------------------*/
        optarg = NULL;         /* no argument follows the option */
        pIndexPosition = NULL; /* not in the middle of anything */
        return (opterr ? (int)'?' : (int)*pArgString);
    }
    else {
        /*---------------------------------------------------------------------
         * The letter on the command-line matches one we expect to see
         *-------------------------------------------------------------------*/
        if (':' == _next_char(pOptString)) { /* is the next letter a colon? */
            /* It is a colon. Look for an argument string. */
            if ('\0' != _next_char(pArgString)) { /* argument in this argv? */
                optarg = &pArgString[1]; /* Yes, it is */
            }
            else {
                /*-------------------------------------------------------------
                 * The argument string must be in the next argv.
                 * But, what if there is none (bad input from the user)?
                 * In that case, return the letter, and optarg as NULL.
                 *-----------------------------------------------------------*/
                if (optind < argc)
                    optarg = argv[optind++];
                else {
                    optarg = NULL;
                    return (opterr ? (int)'?' : (int)*pArgString);
                }
            }
            pIndexPosition = NULL; /* not in the middle of anything */
        }
        else {
            /* it's not a colon, so just return the letter */
            optarg = NULL;               /* no argument follows the option */
            pIndexPosition = pArgString; /* point to the letter we're on */
        }
        return (int)*pArgString; /* return the letter that matched */
    }
}
#endif //WIN32
cmph-2.0.2/src/cmph_structs.h 0000644 0001750 0001750 00000001241 13411542035 015456 0 ustar joseph joseph #ifndef __CMPH_STRUCTS_H__
#define __CMPH_STRUCTS_H__
#include "cmph.h"
/** Hash generation algorithm data.
 * Configuration record shared by all algorithms; the algorithm-specific
 * part is reached through `data`.
 */
struct __config_t
{
CMPH_ALGO algo; // algorithm this configuration is currently set up for
cmph_io_adapter_t *key_source; // adapter used to read, rewind and dispose of keys
cmph_uint32 verbosity; // verbosity level requested by the caller
double c; // NOTE(review): numeric tuning parameter; its meaning is not visible in this header -- confirm per algorithm
void *data; // algorithm dependent data
};
/** Hash querying algorithm data.
 * Algorithm-independent part of a hash function object; the
 * algorithm-specific part is reached through `data`.
 */
struct __cmph_t
{
CMPH_ALGO algo; // algorithm the function belongs to
cmph_uint32 size; // NOTE(review): presumably the number of keys -- confirm against the code that fills it
cmph_io_adapter_t *key_source; // key adapter associated with the function
void *data; // algorithm dependent data
};
cmph_config_t *__config_new(cmph_io_adapter_t *key_source);
void __config_destroy(cmph_config_t*);
void __cmph_dump(cmph_t *mphf, FILE *);
cmph_t *__cmph_load(FILE *f);
#endif
cmph-2.0.2/src/vqueue.c 0000644 0001750 0001750 00000002230 13411542035 014244 0 ustar joseph joseph #include "vqueue.h"
#include
#include
#include
/* Fixed-size circular FIFO of cmph_uint32 values.
 * `values` has `capacity` slots, where `capacity` is the requested capacity
 * plus one: one slot always stays unused so that beg == end means "empty".
 */
struct __vqueue_t
{
cmph_uint32 * values; // backing array of `capacity` slots
cmph_uint32 beg, end, capacity; // beg: slot just before the oldest value; end: slot of the newest value; capacity: slot count
};
/* Creates an empty queue able to hold `capacity` values.
 *
 * One extra slot is allocated because the ring buffer keeps a slot unused to
 * tell "full" apart from "empty".
 *
 * Returns the new queue, or NULL when `capacity + 1` does not fit in a
 * cmph_uint32 or when memory cannot be allocated.
 */
vqueue_t * vqueue_new(cmph_uint32 capacity)
{
	cmph_uint32 slots = capacity + 1;
	vqueue_t *q;

	/* capacity + 1 wrapped to 0: a zero slot count would later be used as a modulus. */
	if (slots == 0) return NULL;

	q = (vqueue_t *)malloc(sizeof(vqueue_t));
	if (!q) return NULL;

	q->values = (cmph_uint32 *)calloc((size_t)slots, sizeof(cmph_uint32));
	if (!q->values)
	{
		/* do not hand out a queue without storage, and do not leak the header */
		free(q);
		return NULL;
	}
	q->beg = q->end = 0;
	q->capacity = slots;
	return q;
}
/* Returns 1 when the queue holds no values, 0 otherwise. */
cmph_uint8 vqueue_is_empty(vqueue_t * q)
{
	/* both cursors coincide only when nothing is stored */
	if (q->beg == q->end)
	{
		return (cmph_uint8)1;
	}
	return (cmph_uint8)0;
}
/* Appends `val` at the tail of the queue.
 * The queue must not be full; this is only checked by assert().
 */
void vqueue_insert(vqueue_t * q, cmph_uint32 val)
{
	cmph_uint32 tail;
	assert((q->end + 1)%q->capacity != q->beg); // Is queue full?
	tail = (q->end + 1) % q->capacity; /* advance with wrap-around */
	q->end = tail;
	q->values[tail] = val;
}
/* Removes and returns the oldest value.
 * The queue must not be empty; this is only checked by assert().
 */
cmph_uint32 vqueue_remove(vqueue_t * q)
{
	cmph_uint32 head;
	assert(!vqueue_is_empty(q)); // Is queue empty?
	head = (q->beg + 1) % q->capacity; /* the oldest value sits one slot after beg */
	q->beg = head;
	return q->values[head];
}
/* Writes every queued value to stderr, oldest first, one per line. */
void vqueue_print(vqueue_t * q)
{
	cmph_uint32 cursor = q->beg;
	while (cursor != q->end)
	{
		/* step to the next stored slot, then print it */
		cursor = (cursor + 1) % q->capacity;
		fprintf(stderr, "%u\n", q->values[cursor]);
	}
}
/* Releases a queue created by vqueue_new().
 * Passing NULL is allowed and does nothing, which lets callers clean up
 * unconditionally even when vqueue_new() failed.
 */
void vqueue_destroy(vqueue_t *q)
{
	if (!q) return;
	free(q->values);
	q->values = NULL;
	free(q);
}
cmph-2.0.2/src/fch_buckets.c 0000644 0001750 0001750 00000013166 13411542035 015224 0 ustar joseph joseph #include "vqueue.h"
#include "fch_buckets.h"
#include "cmph_structs.h"
#include
#include
#include
//#define DEBUG
#include "debug.h"
/* One key stored in a bucket. */
typedef struct __fch_bucket_entry_t
{
char * value; // key bytes; handed back to the key source's dispose() when the bucket is destroyed
cmph_uint32 length; // key length as passed to fch_bucket_insert()
} fch_bucket_entry_t;
/* Growable array of keys that fall into the same bucket. */
typedef struct __fch_bucket_t
{
fch_bucket_entry_t * entries; // heap array of `capacity` entries (NULL while capacity is 0)
cmph_uint32 capacity, size; // capacity: allocated entries; size: entries in use
} fch_bucket_t;
/* Puts `bucket` into the empty state: no storage, no entries. */
static void fch_bucket_new(fch_bucket_t *bucket)
{
	assert(bucket);
	bucket->entries = NULL;
	bucket->capacity = 0;
	bucket->size = 0;
}
/* Hands every stored key back to the key source of `mph` and frees the
 * entry array. The bucket structure itself is not freed.
 */
static void fch_bucket_destroy(fch_bucket_t *bucket, cmph_config_t *mph)
{
	cmph_uint32 idx = 0;
	assert(bucket);
	while (idx < bucket->size)
	{
		fch_bucket_entry_t *entry = &bucket->entries[idx];
		mph->key_source->dispose(mph->key_source->data, entry->value, entry->length);
		++idx;
	}
	free(bucket->entries);
}
/* Makes sure `bucket` has room for at least `size` entries.
 *
 * The new capacity starts at capacity + 1 and is doubled until it reaches
 * `size`. Allocation failure is fatal: assert() fires in debug builds and
 * abort() is called otherwise, so callers may rely on the storage being
 * present on return.
 */
static void fch_bucket_reserve(fch_bucket_t *bucket, cmph_uint32 size)
{
	cmph_uint32 new_capacity;
	fch_bucket_entry_t *grown;

	assert(bucket);
	if (bucket->capacity >= size) return;

	/* capacity < size here, so capacity + 1 cannot wrap */
	new_capacity = bucket->capacity + 1;
	DEBUGP("Increasing current capacity %u to %u\n", bucket->capacity, size);
	while (new_capacity < size)
	{
		if (new_capacity > ((cmph_uint32)-1) / 2)
		{
			/* doubling would wrap around (possibly to 0, looping forever);
			 * settle for exactly the requested size instead */
			new_capacity = size;
			break;
		}
		new_capacity *= 2;
	}

	/* guard the byte count against size_t overflow on narrow platforms */
	if ((size_t)new_capacity > ((size_t)-1) / sizeof(fch_bucket_entry_t))
	{
		assert(0);
		abort();
	}

	/* keep the old pointer until realloc is known to have succeeded */
	grown = (fch_bucket_entry_t *)realloc(bucket->entries, sizeof(fch_bucket_entry_t)*new_capacity);
	assert(grown);
	if (!grown) abort(); /* still fatal when assert() is compiled out */

	bucket->entries = grown;
	bucket->capacity = new_capacity;
	DEBUGP("Increased\n");
}
/* Appends the key (`val`, `val_length`) to `bucket`, growing it as needed.
 * The bucket stores the pointer itself; the key bytes are not copied.
 */
static void fch_bucket_insert(fch_bucket_t *bucket, char *val, cmph_uint32 val_length)
{
	fch_bucket_entry_t *slot;
	assert(bucket);
	fch_bucket_reserve(bucket, bucket->size + 1);
	slot = &bucket->entries[bucket->size]; /* first unused entry */
	slot->value = val;
	slot->length = val_length;
	bucket->size++;
}
/* Returns 1 when `bucket` holds no keys, 0 otherwise. */
static cmph_uint8 fch_bucket_is_empty(fch_bucket_t *bucket)
{
	assert(bucket);
	if (bucket->size != 0)
	{
		return (cmph_uint8)0;
	}
	return (cmph_uint8)1;
}
/* Returns the number of keys currently stored in `bucket`. */
static cmph_uint32 fch_bucket_size(fch_bucket_t *bucket)
{
	cmph_uint32 stored;
	assert(bucket);
	stored = bucket->size;
	return stored;
}
/* Returns the key pointer stored at position `index_key` of `bucket`.
 * `index_key` must be below the bucket size (checked by assert() only).
 */
static char * fch_bucket_get_key(fch_bucket_t *bucket, cmph_uint32 index_key)
{
	assert(bucket);
	assert(index_key < bucket->size);
	return bucket->entries[index_key].value;
}
/* Returns the length of the key stored at position `index_key` of `bucket`.
 * `index_key` must be below the bucket size (checked by assert() only).
 */
static cmph_uint32 fch_bucket_get_length(fch_bucket_t *bucket, cmph_uint32 index_key)
{
	assert(bucket);
	assert(index_key < bucket->size);
	return bucket->entries[index_key].length;
}
/* Dumps the keys of `bucket` to stderr; `index` is only used in the heading.
 * Keys are printed with %s, so they are expected to be NUL-terminated.
 */
static void fch_bucket_print(fch_bucket_t * bucket, cmph_uint32 index)
{
	cmph_uint32 pos = 0;
	assert(bucket);
	fprintf(stderr, "Printing bucket %u ...\n", index);
	while (pos < bucket->size)
	{
		fprintf(stderr, " key: %s\n", bucket->entries[pos].value);
		pos++;
	}
}
//////////////////////////////////////////////////////////////////////////////////////
/* Collection of buckets addressed by index. */
struct __fch_buckets_t
{
fch_bucket_t * values; // array of `nbuckets` buckets
cmph_uint32 nbuckets, max_size; // nbuckets: number of buckets; max_size: size of the largest bucket seen so far
};
/* Creates a collection of `nbuckets` empty buckets.
 *
 * Returns the new collection, or NULL when memory cannot be allocated.
 * The bucket array is validated before it is initialised, so an allocation
 * failure can no longer be dereferenced.
 */
fch_buckets_t * fch_buckets_new(cmph_uint32 nbuckets)
{
	cmph_uint32 i;
	fch_buckets_t *buckets = (fch_buckets_t *)malloc(sizeof(fch_buckets_t));
	if (!buckets) return NULL;

	buckets->values = (fch_bucket_t *)calloc((size_t)nbuckets, sizeof(fch_bucket_t));
	/* calloc(0, ...) may legitimately return NULL, so only a non-empty request can fail */
	if (!buckets->values && nbuckets > 0)
	{
		free(buckets);
		return NULL;
	}
	for (i = 0; i < nbuckets; i++) fch_bucket_new(buckets->values + i);

	buckets->nbuckets = nbuckets;
	buckets->max_size = 0;
	return buckets;
}
/* Returns 1 when bucket `index` holds no keys, 0 otherwise.
 * `index` must be below the number of buckets (checked by assert() only).
 */
cmph_uint8 fch_buckets_is_empty(fch_buckets_t * buckets, cmph_uint32 index)
{
	fch_bucket_t *bucket;
	assert(index < buckets->nbuckets);
	bucket = &buckets->values[index];
	return fch_bucket_is_empty(bucket);
}
/* Adds the key (`key`, `length`) to bucket `index` and keeps the recorded
 * maximum bucket size up to date.
 */
void fch_buckets_insert(fch_buckets_t * buckets, cmph_uint32 index, char * key, cmph_uint32 length)
{
	fch_bucket_t *bucket;
	cmph_uint32 filled;
	assert(index < buckets->nbuckets);
	bucket = &buckets->values[index];
	fch_bucket_insert(bucket, key, length);
	filled = fch_bucket_size(bucket);
	if (filled > buckets->max_size)
	{
		buckets->max_size = filled;
	}
}
/* Returns the number of keys stored in bucket `index`. */
cmph_uint32 fch_buckets_get_size(fch_buckets_t * buckets, cmph_uint32 index)
{
	fch_bucket_t *bucket;
	assert(index < buckets->nbuckets);
	bucket = &buckets->values[index];
	return fch_bucket_size(bucket);
}
/* Returns the key pointer at position `index_key` inside bucket `index`. */
char * fch_buckets_get_key(fch_buckets_t * buckets, cmph_uint32 index, cmph_uint32 index_key)
{
	fch_bucket_t *bucket;
	assert(index < buckets->nbuckets);
	bucket = &buckets->values[index];
	return fch_bucket_get_key(bucket, index_key);
}
/* Returns the length of the key at position `index_key` inside bucket `index`. */
cmph_uint32 fch_buckets_get_keylength(fch_buckets_t * buckets, cmph_uint32 index, cmph_uint32 index_key)
{
	fch_bucket_t *bucket;
	assert(index < buckets->nbuckets);
	bucket = &buckets->values[index];
	return fch_bucket_get_length(bucket, index_key);
}
/* Returns the size of the largest bucket recorded so far. */
cmph_uint32 fch_buckets_get_max_size(fch_buckets_t * buckets)
{
	cmph_uint32 largest = buckets->max_size;
	return largest;
}
/* Returns the number of buckets in the collection. */
cmph_uint32 fch_buckets_get_nbuckets(fch_buckets_t * buckets)
{
	cmph_uint32 count = buckets->nbuckets;
	return count;
}
/* Returns a newly allocated array with the indexes of all buckets, ordered
 * by decreasing bucket size (counting sort; buckets of equal size keep their
 * index order).
 *
 * The caller owns the result and releases it with free(). Returns NULL when
 * memory cannot be allocated (or, for zero buckets, whatever calloc(0, ...)
 * yields).
 */
cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets)
{
	cmph_uint32 i;
	cmph_uint32 running = 0;
	size_t s;
	/* offsets[s] first counts the buckets of size s, then becomes the output
	 * position of the next bucket of size s */
	cmph_uint32 *offsets = (cmph_uint32 *) calloc((size_t)buckets->max_size + 1, sizeof(cmph_uint32));
	cmph_uint32 *sorted_indexes = (cmph_uint32 *) calloc((size_t)buckets->nbuckets, sizeof(cmph_uint32));

	if (!offsets || (!sorted_indexes && buckets->nbuckets > 0))
	{
		free(offsets);
		free(sorted_indexes);
		return NULL;
	}

	// collect how many buckets for each size.
	for (i = 0; i < buckets->nbuckets; i++)
	{
		offsets[fch_bucket_size(buckets->values + i)]++;
	}

	// calculating offset considering a decreasing order of buckets size:
	// a bucket of size s starts after all buckets that are larger than s.
	for (s = (size_t)buckets->max_size + 1; s-- > 0; )
	{
		cmph_uint32 count = offsets[s];
		offsets[s] = running;
		running += count;
	}

	for (i = 0; i < buckets->nbuckets; i++)
	{
		cmph_uint32 size = fch_bucket_size(buckets->values + i);
		sorted_indexes[offsets[size]] = i;
		offsets[size]++;
	}

	free(offsets);
	return sorted_indexes;
}
/* Dumps every bucket, in index order, to stderr. */
void fch_buckets_print(fch_buckets_t * buckets)
{
	cmph_uint32 idx = 0;
	while (idx < buckets->nbuckets)
	{
		fch_bucket_print(&buckets->values[idx], idx);
		idx++;
	}
}
/* Destroys every bucket (disposing of its keys through the key source of
 * `mph`), then frees the bucket array and the collection itself.
 */
void fch_buckets_destroy(fch_buckets_t * buckets, cmph_config_t *mph)
{
	cmph_uint32 idx = 0;
	while (idx < buckets->nbuckets)
	{
		fch_bucket_destroy(&buckets->values[idx], mph);
		idx++;
	}
	free(buckets->values);
	free(buckets);
}
cmph-2.0.2/src/fch_buckets.h 0000644 0001750 0001750 00000002067 13411542035 015227 0 ustar joseph joseph #ifndef __CMPH_FCH_BUCKETS_H__
#define __CMPH_FCH_BUCKETS_H__
#include "cmph_types.h"
#include "cmph.h"
// opaque collection of key buckets; the layout is private to fch_buckets.c
typedef struct __fch_buckets_t fch_buckets_t;
// creates `nbuckets` empty buckets; returns NULL if the collection cannot be allocated.
fch_buckets_t * fch_buckets_new(cmph_uint32 nbuckets);
// returns non-zero when bucket `index` holds no keys.
cmph_uint8 fch_buckets_is_empty(fch_buckets_t * buckets, cmph_uint32 index);
// appends a key to bucket `index`; the pointer is stored, the bytes are not copied.
void fch_buckets_insert(fch_buckets_t * buckets, cmph_uint32 index, char * key, cmph_uint32 length);
// returns the number of keys in bucket `index`.
cmph_uint32 fch_buckets_get_size(fch_buckets_t * buckets, cmph_uint32 index);
// returns the key stored at position `index_key` of bucket `index`.
char * fch_buckets_get_key(fch_buckets_t * buckets, cmph_uint32 index, cmph_uint32 index_key);
// returns the length of the key stored at position `index_key` of bucket `index`.
cmph_uint32 fch_buckets_get_keylength(fch_buckets_t * buckets, cmph_uint32 index, cmph_uint32 index_key);
// returns the size of the biggest bucket.
cmph_uint32 fch_buckets_get_max_size(fch_buckets_t * buckets);
// returns the number of buckets.
cmph_uint32 fch_buckets_get_nbuckets(fch_buckets_t * buckets);
// returns a heap-allocated array of bucket indexes ordered by decreasing bucket size; the caller frees it.
cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets);
// prints every bucket to stderr.
void fch_buckets_print(fch_buckets_t * buckets);
// disposes of all stored keys through mph's key source and frees the collection.
void fch_buckets_destroy(fch_buckets_t * buckets, cmph_config_t* mph);
#endif
cmph-2.0.2/src/chd_structs_ph.h 0000644 0001750 0001750 00000001634 13411542035 015762 0 ustar joseph joseph #ifndef __CMPH_CHD_PH_STRUCTS_H__
#define __CMPH_CHD_PH_STRUCTS_H__
#include "hash_state.h"
#include "compressed_seq.h"
/* Data of a CHD_PH function.
 * NOTE(review): presumably the query-time counterpart of
 * __chd_ph_config_data_t -- confirm against chd_ph.c.
 */
struct __chd_ph_data_t
{
compressed_seq_t * cs; // compressed displacement values
cmph_uint32 nbuckets; // number of buckets
cmph_uint32 n; // number of bins
hash_state_t *hl; // linear hash function
};
/* Configuration data for the CHD_PH algorithm: the fields of
 * __chd_ph_data_t plus parameters and working state.
 */
struct __chd_ph_config_data_t
{
CMPH_HASH hashfunc; // linear hash function to be used
compressed_seq_t * cs; // compressed displacement values
cmph_uint32 nbuckets; // number of buckets
cmph_uint32 n; // number of bins
hash_state_t *hl; // linear hash function
cmph_uint32 m; // number of keys
cmph_uint8 use_h; // flag to indicate the use of a heuristic (use_h = 1)
cmph_uint32 keys_per_bin;// maximum number of keys per bin
cmph_uint32 keys_per_bucket; // average number of keys per bucket
cmph_uint8 *occup_table; // table that indicates occupied positions
};
#endif
cmph-2.0.2/src/sdbm_hash.h 0000644 0001750 0001750 00000000767 13411542035 014704 0 ustar joseph joseph #ifndef __SDBM_HASH_H__
#define __SDBM_HASH_H__
#include "hash.h"
/* State of the sdbm hash function; it carries nothing but the function tag. */
typedef struct __sdbm_state_t
{
CMPH_HASH hashfunc; // identifies which hash function this state belongs to
} sdbm_state_t;
sdbm_state_t *sdbm_state_new();
cmph_uint32 sdbm_hash(sdbm_state_t *state, const char *k, cmph_uint32 keylen);
void sdbm_state_dump(sdbm_state_t *state, char **buf, cmph_uint32 *buflen);
sdbm_state_t *sdbm_state_copy(sdbm_state_t *src_state);
sdbm_state_t *sdbm_state_load(const char *buf, cmph_uint32 buflen);
void sdbm_state_destroy(sdbm_state_t *state);
#endif
cmph-2.0.2/src/buffer_manage.h 0000644 0001750 0001750 00000000761 13411542035 015527 0 ustar joseph joseph #ifndef __CMPH_BUFFER_MANAGE_H__
#define __CMPH_BUFFER_MANAGE_H__
#include "cmph_types.h"
#include
// opaque manager for a set of buffer entries; the layout is private to buffer_manage.c
typedef struct __buffer_manage_t buffer_manage_t;
// creates a manager for `nentries` entries, each given memory_avail/nentries + 1 as its capacity.
buffer_manage_t * buffer_manage_new(cmph_uint32 memory_avail, cmph_uint32 nentries);
// opens `filename` on entry `index`.
void buffer_manage_open(buffer_manage_t * buffer_manage, cmph_uint32 index, char * filename);
// reads the next key from entry `index`; NULL when the entry yields no key.
cmph_uint8 * buffer_manage_read_key(buffer_manage_t * buffer_manage, cmph_uint32 index);
// destroys every entry and frees the manager.
void buffer_manage_destroy(buffer_manage_t * buffer_manage);
#endif
cmph-2.0.2/src/miller_rabin.h 0000644 0001750 0001750 00000000212 13411542035 015374 0 ustar joseph joseph #ifndef _CMPH_MILLER_RABIN_H__
#define _CMPH_MILLER_RABIN_H__
#include "cmph_types.h"
cmph_uint8 check_primality(cmph_uint64 n);
#endif
cmph-2.0.2/src/hash_state.h 0000644 0001750 0001750 00000000260 13411542035 015063 0 ustar joseph joseph #ifndef __HASH_STATE_H__
#define __HASH_STATE_H__
#include "hash.h"
#include "jenkins_hash.h"
/* State of any supported hash function.
 * NOTE(review): `hashfunc` presumably aliases a leading CMPH_HASH tag inside
 * every concrete state so the active member can be identified -- confirm
 * against jenkins_hash.h.
 */
union __hash_state_t
{
CMPH_HASH hashfunc; // tag naming the hash function
jenkins_state_t jenkins; // state of the Jenkins hash function
};
#endif
cmph-2.0.2/src/chd_structs.h 0000644 0001750 0001750 00000000734 13411542035 015273 0 ustar joseph joseph #ifndef __CMPH_CHD_STRUCTS_H__
#define __CMPH_CHD_STRUCTS_H__
#include "chd_structs_ph.h"
#include "chd_ph.h"
#include "compressed_rank.h"
/* Data of a CHD function: two packed byte blobs and their sizes. */
struct __chd_data_t
{
cmph_uint32 packed_cr_size; // NOTE(review): presumably the byte size of packed_cr -- confirm
cmph_uint8 * packed_cr; // packed compressed rank structure to control the number of zeros in a bit vector
cmph_uint32 packed_chd_phf_size; // NOTE(review): presumably the byte size of packed_chd_phf -- confirm
cmph_uint8 * packed_chd_phf; // NOTE(review): presumably the packed CHD_PH function -- confirm
};
/* Configuration data for the CHD algorithm; it wraps a CHD_PH configuration. */
struct __chd_config_data_t
{
cmph_config_t *chd_ph; // chd_ph algorithm must be used here
};
#endif
cmph-2.0.2/src/buffer_manage.c 0000644 0001750 0001750 00000004730 13411542035 015522 0 ustar joseph joseph #include "buffer_manage.h"
#include "buffer_entry.h"
#include
#include
#include
/* Manager that splits a memory budget among several buffer entries and
 * passes the capacity of exhausted entries on to entries that are still read.
 */
struct __buffer_manage_t
{
cmph_uint32 memory_avail; // memory available
buffer_entry_t ** buffer_entries; // buffer entries to be managed
cmph_uint32 nentries; // number of entries to be managed
cmph_uint32 *memory_avail_list; // memory available list: stack of capacities released by entries that returned no key
int pos_avail_list; // current position in memory available list; -1 when the list is empty
};
/* Creates a manager for `nentries` buffer entries sharing `memory_avail`.
 *
 * Each entry is created with capacity memory_avail / nentries + 1.
 * Returns NULL when `nentries` is 0 (the per-entry share would divide by
 * zero) or when memory cannot be allocated.
 */
buffer_manage_t * buffer_manage_new(cmph_uint32 memory_avail, cmph_uint32 nentries)
{
	cmph_uint32 memory_avail_entry, i;
	buffer_manage_t *buff_manage;

	if (nentries == 0) return NULL;

	buff_manage = (buffer_manage_t *)malloc(sizeof(buffer_manage_t));
	if (!buff_manage) return NULL;

	buff_manage->memory_avail = memory_avail;
	buff_manage->buffer_entries = (buffer_entry_t **)calloc((size_t)nentries, sizeof(buffer_entry_t *));
	buff_manage->memory_avail_list = (cmph_uint32 *)calloc((size_t)nentries, sizeof(cmph_uint32));
	if (!buff_manage->buffer_entries || !buff_manage->memory_avail_list)
	{
		/* release whatever was obtained; free(NULL) is a no-op */
		free(buff_manage->memory_avail_list);
		free(buff_manage->buffer_entries);
		free(buff_manage);
		return NULL;
	}
	buff_manage->pos_avail_list = -1; /* released-capacity stack starts empty */
	buff_manage->nentries = nentries;

	memory_avail_entry = buff_manage->memory_avail/buff_manage->nentries + 1;
	for(i = 0; i < buff_manage->nentries; i++)
	{
		buff_manage->buffer_entries[i] = buffer_entry_new(memory_avail_entry);
	}
	return buff_manage;
}
/* Opens `filename` on the buffer entry at position `index`. */
void buffer_manage_open(buffer_manage_t * buffer_manage, cmph_uint32 index, char * filename)
{
	buffer_entry_t *entry = buffer_manage->buffer_entries[index];
	buffer_entry_open(entry, filename);
}
/* Reads the next key from the buffer entry at position `index`.
 *
 * If a previously exhausted entry left its capacity on the released list,
 * that capacity is first added to this entry. When the entry returns no key,
 * its own capacity is pushed on the list and NULL is returned.
 * NOTE(review): nothing here bounds the released list to `nentries` slots;
 * confirm callers stop reading an entry once it has returned NULL.
 */
cmph_uint8 * buffer_manage_read_key(buffer_manage_t * buffer_manage, cmph_uint32 index)
{
	buffer_entry_t *entry = buffer_manage->buffer_entries[index];
	cmph_uint8 *key;

	if (buffer_manage->pos_avail_list >= 0) // recovering memory
	{
		cmph_uint32 reclaimed = buffer_manage->memory_avail_list[buffer_manage->pos_avail_list];
		buffer_manage->pos_avail_list--;
		buffer_entry_set_capacity(entry, buffer_entry_get_capacity(entry) + reclaimed);
	}

	key = buffer_entry_read_key(entry);
	if (key != NULL)
	{
		return key;
	}

	// storing memory to be recovered
	buffer_manage->pos_avail_list++;
	buffer_manage->memory_avail_list[buffer_manage->pos_avail_list] = buffer_entry_get_capacity(entry);
	return NULL;
}
/* Destroys every managed buffer entry, then frees the manager's own arrays
 * and the manager itself.
 */
void buffer_manage_destroy(buffer_manage_t * buffer_manage)
{
	cmph_uint32 idx = 0;
	while (idx < buffer_manage->nentries)
	{
		buffer_entry_destroy(buffer_manage->buffer_entries[idx]);
		idx++;
	}
	free(buffer_manage->memory_avail_list);
	free(buffer_manage->buffer_entries);
	free(buffer_manage);
}
cmph-2.0.2/src/brz_structs.h 0000755 0001750 0001750 00000002462 13411542035 015335 0 ustar joseph joseph #ifndef __CMPH_BRZ_STRUCTS_H__
#define __CMPH_BRZ_STRUCTS_H__
#include "hash_state.h"
/* Data of a BRZ function: per-component tables plus three groups of hash functions. */
struct __brz_data_t
{
CMPH_ALGO algo; // CMPH algo for generating the MPHFs for the buckets (Just CMPH_FCH and CMPH_BMZ8)
cmph_uint32 m; // edges (words) count
double c; // constant c
cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...].
cmph_uint32 *offset; // offset[i] stores the sum: size[0] + size[1] + ... size[i-1].
cmph_uint8 **g; // g function.
cmph_uint32 k; // number of components
hash_state_t **h1; // NOTE(review): presumably one hash state per component -- confirm in brz.c
hash_state_t **h2; // NOTE(review): presumably one hash state per component -- confirm in brz.c
hash_state_t * h0; // NOTE(review): presumably the hash that selects the component -- confirm in brz.c
};
/* Configuration data for the BRZ algorithm: the fields of __brz_data_t plus
 * construction parameters (hash choices, b, memory budget, temporary
 * directory, output file).
 */
struct __brz_config_data_t
{
CMPH_HASH hashfuncs[3]; // hash functions to be used (three slots)
CMPH_ALGO algo; // CMPH algo for generating the MPHFs for the buckets (Just CMPH_FCH and CMPH_BMZ8)
double c; // constant c
cmph_uint32 m; // edges (words) count
cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...].
cmph_uint32 *offset; // offset[i] stores the sum: size[0] + size[1] + ... size[i-1].
cmph_uint8 **g; // g function.
cmph_uint8 b; // parameter b.
cmph_uint32 k; // number of components
hash_state_t **h1; // NOTE(review): presumably one hash state per component -- confirm in brz.c
hash_state_t **h2; // NOTE(review): presumably one hash state per component -- confirm in brz.c
hash_state_t * h0; // NOTE(review): presumably the hash that selects the component -- confirm in brz.c
cmph_uint32 memory_availability; // memory budget for the construction; NOTE(review): unit not visible here -- confirm
cmph_uint8 * tmp_dir; // temporary directory
FILE * mphf_fd; // mphf file
};
#endif
cmph-2.0.2/src/bitbool.h 0000644 0001750 0001750 00000015354 13411542035 014404 0 ustar joseph joseph #ifndef _CMPH_BITBOOL_H__
#define _CMPH_BITBOOL_H__
#include "cmph_types.h"
/* bitmask[k] has only bit k set (k = 0..7); used by the byte-array bit macros. */
static const cmph_uint8 bitmask[] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
/* bitmask32[k] has only bit k set (k = 0..31); used by the 32-bit-word bit macros.
 * The last entry is written as 1U << 31 because 1 << 31 does not fit in a signed int.
 */
static const cmph_uint32 bitmask32[] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7,
1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,
1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 23,
1 << 24, 1 << 25, 1 << 26, 1 << 27, 1 << 28, 1 << 29, 1 << 30, 1U << 31
};
/* valuemask[k] has every bit set except the two bits of 2-bit field k (k = 0..3) of a byte. */
static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f};
/** \def GETBIT(array, i)
 * \brief get the value of a 1-bit integer stored in an array.
 * \param array to get 1-bit integer values from
 * \param i is the index in array to get the 1-bit integer value from
 *
 * GETBIT(array, i) is a macro that gets the value (0 or 1) of a 1-bit integer stored in array.
 * The arguments are parenthesized in the expansion, so any expression may be passed;
 * i is evaluated more than once and must therefore be free of side effects.
 */
#define GETBIT(array, i) ((((array)[(i) >> 3]) & bitmask[(i) & 0x00000007]) >> ((i) & 0x00000007))
/** \def SETBIT(array, i)
 * \brief set 1 to a 1-bit integer stored in an array.
 * \param array to store 1-bit integer values
 * \param i is the index in array to set the bit to 1
 *
 * SETBIT(array, i) is a macro that sets 1 to a 1-bit integer stored in an array.
 * The arguments are parenthesized in the expansion, so any expression may be passed;
 * i is evaluated more than once and must therefore be free of side effects.
 */
#define SETBIT(array, i) ((array)[(i) >> 3] |= bitmask[(i) & 0x00000007])
//#define GETBIT(array, i) (array[(i) / 8] & bitmask[(i) % 8])
//#define SETBIT(array, i) (array[(i) / 8] |= bitmask[(i) % 8])
//#define UNSETBIT(array, i) (array[(i) / 8] ^= ((bitmask[(i) % 8])))
/** \def SETVALUE1(array, i, v)
 * \brief set a value for a 2-bit integer stored in an array initialized with 1s.
 * \param array to store 2-bit integer values
 * \param i is the index in array to set the value v
 * \param v is the value to be set
 *
 * SETVALUE1(array, i, v) is a macro that sets a value for a 2-bit integer stored in an array.
 * The array should be initialized with all bits set to 1. For example:
 * memset(array, 0xff, arraySize);
 * The arguments are parenthesized in the expansion, so any expression may be passed;
 * i is evaluated more than once and must therefore be free of side effects.
 */
#define SETVALUE1(array, i, v) ((array)[(i) >> 2] &= (cmph_uint8)(((v) << (((i) & 0x00000003) << 1)) | valuemask[(i) & 0x00000003]))
/** \def SETVALUE0(array, i, v)
 * \brief set a value for a 2-bit integer stored in an array initialized with 0s.
 * \param array to store 2-bit integer values
 * \param i is the index in array to set the value v
 * \param v is the value to be set
 *
 * SETVALUE0(array, i, v) is a macro that sets a value for a 2-bit integer stored in an array.
 * The array should be initialized with all bits set to 0. For example:
 * memset(array, 0, arraySize);
 * The arguments are parenthesized in the expansion, so any expression may be passed;
 * i is evaluated more than once and must therefore be free of side effects.
 */
#define SETVALUE0(array, i, v) ((array)[(i) >> 2] |= (cmph_uint8)((v) << (((i) & 0x00000003) << 1)))
/** \def GETVALUE(array, i)
 * \brief get a value for a 2-bit integer stored in an array.
 * \param array to get 2-bit integer values from
 * \param i is the index in array to get the value from
 *
 * GETVALUE(array, i) is a macro that gets a value for a 2-bit integer stored in an array.
 * The arguments are parenthesized in the expansion, so any expression may be passed;
 * i is evaluated more than once and must therefore be free of side effects.
 */
#define GETVALUE(array, i) ((cmph_uint8)(((array)[(i) >> 2] >> (((i) & 0x00000003U) << 1U)) & 0x00000003U))
/** \def SETBIT32(array, i)
 * \brief set 1 to a 1-bit integer stored in an array of 32-bit words.
 * \param array to store 1-bit integer values. The entries are 32-bit words.
 * \param i is the index in array to set the bit to 1
 *
 * SETBIT32(array, i) is a macro that sets 1 to a 1-bit integer stored in an array of 32-bit words.
 * The arguments are parenthesized in the expansion, so any expression may be passed;
 * i is evaluated more than once and must therefore be free of side effects.
 */
#define SETBIT32(array, i) ((array)[(i) >> 5] |= bitmask32[(i) & 0x0000001f])
/** \def GETBIT32(array, i)
 * \brief get the value of a 1-bit integer stored in an array of 32-bit words.
 * \param array to get 1-bit integer values from. The entries are 32-bit words.
 * \param i is the index in array to get the 1-bit integer value from
 *
 * GETBIT32(array, i) is a macro that tests a 1-bit integer stored in an array of 32-bit words.
 * The result is zero when the bit is clear and the bit's mask (non-zero, not
 * necessarily 1) when it is set.
 * The arguments are parenthesized in the expansion, so any expression may be passed;
 * i is evaluated more than once and must therefore be free of side effects.
 */
#define GETBIT32(array, i) ((array)[(i) >> 5] & bitmask32[(i) & 0x0000001f])
/** \def UNSETBIT32(array, i)
 * \brief set 0 to a 1-bit integer stored in an array of 32-bit words.
 * \param array to store 1-bit integer values. The entries are 32-bit words
 * \param i is the index in array to set the bit to 0
 *
 * UNSETBIT32(array, i) is a macro that sets 0 to a 1-bit integer stored in an array of 32-bit words.
 * The bit is cleared with AND-NOT, so clearing a bit that is already 0 leaves
 * it 0 (an XOR would have turned it on).
 * The arguments are parenthesized in the expansion, so any expression may be passed;
 * i is evaluated more than once and must therefore be free of side effects.
 */
#define UNSETBIT32(array, i) ((array)[(i) >> 5] &= ~(bitmask32[(i) & 0x0000001f]))
/* Number of 32-bit words needed to store n fields of bits_length bits each
 * (rounded up). Arguments are parenthesized so expressions may be passed. */
#define BITS_TABLE_SIZE(n, bits_length) ((((n) * (bits_length)) + 31) >> 5)
/* Stores `bits_string` as field number `index` of a table of fixed-width
 * fields packed back to back into 32-bit words.
 *
 * string_length is the field width in bits and string_mask the matching
 * mask of low bits. A field that straddles a word boundary is split across
 * bits_table[word] and bits_table[word + 1].
 */
static inline void set_bits_value(cmph_uint32 * bits_table, cmph_uint32 index, cmph_uint32 bits_string,
                                  cmph_uint32 string_length, cmph_uint32 string_mask)
{
	const cmph_uint32 first_bit = index * string_length;
	const cmph_uint32 word = first_bit >> 5;
	const cmph_uint32 low_shift = first_bit & 0x0000001f;
	const cmph_uint32 high_shift = 32 - low_shift;

	/* clear the field's bits in the first word, then drop the new value in */
	bits_table[word] = (bits_table[word] & ~(string_mask << low_shift)) | (bits_string << low_shift);

	if (high_shift < string_length)
	{
		/* the field spills over: its upper part goes to the next word */
		bits_table[word + 1] = (bits_table[word + 1] & ~(string_mask >> high_shift)) | (bits_string >> high_shift);
	}
}
/* Returns field number `index` of a table of fixed-width fields packed back
 * to back into 32-bit words.
 *
 * string_length is the field width in bits and string_mask the matching
 * mask of low bits. A field that straddles a word boundary is assembled from
 * bits_table[word] and bits_table[word + 1].
 */
static inline cmph_uint32 get_bits_value(cmph_uint32 * bits_table,cmph_uint32 index, cmph_uint32 string_length, cmph_uint32 string_mask)
{
	const cmph_uint32 first_bit = index * string_length;
	const cmph_uint32 word = first_bit >> 5;
	const cmph_uint32 low_shift = first_bit & 0x0000001f;
	const cmph_uint32 high_shift = 32 - low_shift;
	cmph_uint32 value = (bits_table[word] >> low_shift) & string_mask;

	if (high_shift < string_length)
	{
		/* pick up the part stored in the following word */
		value |= (bits_table[word + 1] << high_shift) & string_mask;
	}
	return value;
}
/* Stores the `string_length`-bit value `bits_string` starting at bit
 * position `pos` of a bit table made of 32-bit words.
 *
 * string_length may be anything from 0 to 32; the mask is built without
 * shifting by the full word width, which would be undefined behaviour.
 * A value that straddles a word boundary is split across two words.
 */
static inline void set_bits_at_pos(cmph_uint32 * bits_table, cmph_uint32 pos, cmph_uint32 bits_string, cmph_uint32 string_length)
{
	cmph_uint32 word_idx = pos >> 5;
	cmph_uint32 shift1 = pos & 0x0000001f;
	cmph_uint32 shift2 = 32-shift1;
	/* 1U << 32 is undefined, so a full-width field gets an all-ones mask explicitly */
	cmph_uint32 string_mask = (string_length >= 32) ? 0xffffffffU : ((1U << string_length) - 1);

	bits_table[word_idx] &= ~((string_mask) << shift1);
	bits_table[word_idx] |= bits_string << shift1;
	if(shift2 < string_length)
	{
		/* upper part of the value lives in the next word */
		bits_table[word_idx+1] &= ~((string_mask) >> shift2);
		bits_table[word_idx+1] |= bits_string >> shift2;
	}
}
/* Returns the `string_length`-bit value that starts at bit position `pos`
 * of a bit table made of 32-bit words.
 *
 * string_length may be anything from 0 to 32; the mask is built without
 * shifting by the full word width, which would be undefined behaviour.
 * A value that straddles a word boundary is assembled from two words.
 */
static inline cmph_uint32 get_bits_at_pos(cmph_uint32 * bits_table,cmph_uint32 pos,cmph_uint32 string_length)
{
	cmph_uint32 word_idx = pos >> 5;
	cmph_uint32 shift1 = pos & 0x0000001f;
	cmph_uint32 shift2 = 32 - shift1;
	/* 1U << 32 is undefined, so a full-width field gets an all-ones mask explicitly */
	cmph_uint32 string_mask = (string_length >= 32) ? 0xffffffffU : ((1U << string_length) - 1);
	cmph_uint32 bits_string;

	bits_string = (bits_table[word_idx] >> shift1) & string_mask;
	if(shift2 < string_length)
		bits_string |= (bits_table[word_idx+1] << shift2) & string_mask;
	return bits_string;
}
#endif
cmph-2.0.2/src/cmph.c 0000644 0001750 0001750 00000054535 13411542035 013700 0 ustar joseph joseph #include "cmph.h"
#include "cmph_structs.h"
#include "chm.h"
#include "bmz.h"
#include "bmz8.h"
#include "brz.h"
#include "fch.h"
#include "bdz.h"
#include "bdz_ph.h"
#include "chd_ph.h"
#include "chd.h"
#include
#include
#include
// #define DEBUG
#include "debug.h"
/* Printable algorithm names, indexed by a CMPH_ALGO value and terminated by NULL.
 * NOTE(review): the order must match the CMPH_ALGO enumeration, which is
 * declared outside this file -- confirm when adding an algorithm.
 */
const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", "chd", NULL };
/* Cursor over an in-memory vector of keys (used by the vector and byte-vector adapters). */
typedef struct
{
void *vector; // caller-owned array of key pointers; never freed by the adapter
cmph_uint32 position; // access position when data is a vector
} cmph_vector_t;
/**
 * Support a vector of structs as the source of keys.
 *
 * E.g. the keys could be the fieldB's in a vector of struct rec where
 * struct rec is defined as:
 * struct rec {
 * fieldA;
 * fieldB;
 * fieldC;
 * }
 *
 * The key of element i is the key_len bytes found at byte offset
 * i * struct_size + key_offset from the start of the vector.
 */
typedef struct
{
void *vector; /* Pointer to the vector of struct */
cmph_uint32 position; /* current position */
cmph_uint32 struct_size; /* The size of the struct */
cmph_uint32 key_offset; /* The byte offset of the key in the struct */
cmph_uint32 key_len; /* The length of the key */
} cmph_struct_vector_t;
static cmph_io_adapter_t *cmph_io_vector_new(void * vector, cmph_uint32 nkeys);
static void cmph_io_vector_destroy(cmph_io_adapter_t * key_source);
static cmph_io_adapter_t *cmph_io_struct_vector_new(void * vector, cmph_uint32 struct_size, cmph_uint32 key_offset, cmph_uint32 key_len, cmph_uint32 nkeys);
static void cmph_io_struct_vector_destroy(cmph_io_adapter_t * key_source);
/* Reads the next newline-terminated key from the FILE* passed as `data`.
 *
 * On success *key points to a freshly allocated, NUL-terminated copy of the
 * line without its trailing newline, *keylen holds its length and that
 * length is returned. Lines longer than BUFSIZ are assembled from several
 * fgets() chunks. A final line that is not terminated by a newline is
 * returned like any other line.
 *
 * Returns -1 with *key == NULL and *keylen == 0 when no more data is
 * available, on a read error, or when memory cannot be allocated.
 */
static int key_nlfile_read(void *data, char **key, cmph_uint32 *keylen)
{
	FILE *fd = (FILE *)data;
	char buf[BUFSIZ];

	*key = NULL;
	*keylen = 0;
	for (;;)
	{
		size_t chunk_len;
		char *grown;

		if (fgets(buf, BUFSIZ, fd) == NULL)
		{
			/* end of input right after a chunk without newline: the data
			 * collected so far is the (unterminated) last line */
			if (*key != NULL && !ferror(fd)) break;
			free(*key);
			*key = NULL;
			*keylen = 0;
			return -1;
		}

		chunk_len = strlen(buf);
		grown = (char *)realloc(*key, (size_t)*keylen + chunk_len + 1);
		if (grown == NULL)
		{
			/* keep nothing half-built around */
			free(*key);
			*key = NULL;
			*keylen = 0;
			return -1;
		}
		*key = grown;
		memcpy(*key + *keylen, buf, chunk_len);
		*keylen += (cmph_uint32)chunk_len;
		(*key)[*keylen] = '\0';

		/* chunk_len is 0 when the chunk starts with a NUL byte; never index buf[-1] */
		if (chunk_len > 0 && buf[chunk_len - 1] == '\n') break;
		if (feof(fd)) break; /* last line of the file has no trailing newline */
	}
	if ((*keylen) && (*key)[*keylen - 1] == '\n')
	{
		(*key)[(*keylen) - 1] = 0;
		--(*keylen);
	}
	return (int)(*keylen);
}
/* Reads the next key from a vector of length-prefixed byte strings.
 *
 * Each element starts with a cmph_uint32 length (in host byte layout)
 * followed by that many key bytes. On success *key is a freshly allocated
 * copy of the bytes (not NUL-terminated), *keylen its length, the cursor
 * advances and the length is returned.
 *
 * Returns -1 with *key == NULL and *keylen == 0, leaving the cursor where it
 * was, when memory cannot be allocated.
 */
static int key_byte_vector_read(void *data, char **key, cmph_uint32 *keylen)
{
	cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
	cmph_uint8 **keys_vd = (cmph_uint8 **)cmph_vector->vector;
	const cmph_uint8 *record = keys_vd[cmph_vector->position];
	size_t size;

	memcpy(keylen, record, sizeof(*keylen));
	size = *keylen;
	*key = (char *)malloc(size);
	if (*key == NULL && size != 0)
	{
		*keylen = 0;
		return -1;
	}
	/* malloc(0) may return NULL; there is nothing to copy in that case */
	if (size != 0) memcpy(*key, record + sizeof(*keylen), size);
	cmph_vector->position = cmph_vector->position + 1;
	return (int)(*keylen);
}
/* Reads the next key from a vector of fixed-size structs.
 *
 * The key is the key_len bytes at byte offset
 * position * struct_size + key_offset of the vector. On success *key is a
 * freshly allocated copy (not NUL-terminated), *keylen its length, the
 * cursor advances and the length is returned.
 *
 * Returns -1 with *key == NULL and *keylen == 0, leaving the cursor where it
 * was, when memory cannot be allocated.
 */
static int key_struct_vector_read(void *data, char **key, cmph_uint32 *keylen)
{
	cmph_struct_vector_t *cmph_struct_vector = (cmph_struct_vector_t *)data;
	char *keys_vd = (char *)cmph_struct_vector->vector;
	cmph_uint64 keys_vd_offset;
	size_t size;

	*keylen = cmph_struct_vector->key_len;
	size = *keylen;
	*key = (char *)malloc(size);
	if (*key == NULL && size != 0)
	{
		*keylen = 0;
		return -1;
	}
	/* 64-bit arithmetic so position * struct_size cannot wrap in 32 bits */
	keys_vd_offset = ((cmph_uint64)cmph_struct_vector->position *
	                  (cmph_uint64)cmph_struct_vector->struct_size) +
	                 (cmph_uint64)cmph_struct_vector->key_offset;
	/* malloc(0) may return NULL; there is nothing to copy in that case */
	if (size != 0) memcpy(*key, keys_vd + keys_vd_offset, size);
	cmph_struct_vector->position = cmph_struct_vector->position + 1;
	return (int)(*keylen);
}
/* Reads the next key from a vector of NUL-terminated strings.
 *
 * On success *key is a freshly allocated, NUL-terminated copy of the string,
 * *keylen its length, the cursor advances and the length is returned. The
 * buffer is sized from the full strlen() result, not from the value narrowed
 * to cmph_uint32, so the copy can never overrun it.
 *
 * Returns -1 with *key == NULL and *keylen == 0, leaving the cursor where it
 * was, when memory cannot be allocated.
 */
static int key_vector_read(void *data, char **key, cmph_uint32 *keylen)
{
	cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
	char **keys_vd = (char **)cmph_vector->vector;
	const char *source = keys_vd[cmph_vector->position];
	size_t size = strlen(source);

	*keylen = (cmph_uint32)size;
	*key = (char *)malloc(size + 1);
	if (*key == NULL)
	{
		*keylen = 0;
		return -1;
	}
	memcpy(*key, source, size + 1); /* includes the terminating NUL */
	cmph_vector->position = cmph_vector->position + 1;
	return (int)(*keylen);
}
/* Releases a key returned by key_nlfile_read(); `data` and `keylen` are not used. */
static void key_nlfile_dispose(void *data, char *key, cmph_uint32 keylen)
{
	(void)data;
	(void)keylen;
	free(key);
}
/* Releases a key returned by one of the vector readers; `data` and `keylen` are not used. */
static void key_vector_dispose(void *data, char *key, cmph_uint32 keylen)
{
	(void)data;
	(void)keylen;
	free(key);
}
/* Moves the key file (the FILE* passed as `data`) back to its beginning. */
static void key_nlfile_rewind(void *data)
{
	rewind((FILE *)data);
}
static void key_struct_vector_rewind(void *data)
{
cmph_struct_vector_t *cmph_struct_vector = (cmph_struct_vector_t *)data;
cmph_struct_vector->position = 0;
}
static void key_vector_rewind(void *data)
{
cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
cmph_vector->position = 0;
}
/* Counts the newline-terminated lines of `fd`.
 *
 * The file is rewound before and after counting. Lines longer than BUFSIZ
 * are counted once; a final line without a trailing newline is not counted.
 * Returns 0 after printing a message with perror() when reading fails (the
 * file is not rewound in that case).
 */
static cmph_uint32 count_nlfile_keys(FILE *fd)
{
	cmph_uint32 count = 0;
	char buf[BUFSIZ];

	rewind(fd);
	for (;;)
	{
		char *ptr = fgets(buf, BUFSIZ, fd);
		size_t len;

		if (feof(fd)) break;
		if (ferror(fd) || ptr == NULL) {
			perror("Error reading input file");
			return 0;
		}
		len = strlen(buf);
		/* len is 0 when the chunk starts with a NUL byte; never index buf[-1] */
		if (len == 0 || buf[len - 1] != '\n') continue;
		++count;
	}
	rewind(fd);
	return count;
}
/* Creates a key adapter over a file with one key per line.
 *
 * The number of keys is obtained by counting the newline-terminated lines of
 * `keys_fd`, which leaves the file rewound. The file stays owned by the
 * caller. Returns NULL when memory cannot be allocated (debug builds still
 * stop on the assertion).
 */
cmph_io_adapter_t *cmph_io_nlfile_adapter(FILE * keys_fd)
{
	cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
	assert(key_source);
	if (!key_source) return NULL; /* assert() is compiled out under NDEBUG */
	key_source->data = (void *)keys_fd;
	key_source->nkeys = count_nlfile_keys(keys_fd);
	key_source->read = key_nlfile_read;
	key_source->dispose = key_nlfile_dispose;
	key_source->rewind = key_nlfile_rewind;
	return key_source;
}
/* Frees an adapter created by cmph_io_nlfile_adapter(); the underlying file is left open. */
void cmph_io_nlfile_adapter_destroy(cmph_io_adapter_t * key_source)
{
	void *adapter = key_source;
	free(adapter);
}
/* Creates a key adapter over a file with one key per line, using the
 * caller-supplied key count `nkeys` instead of scanning the file.
 *
 * The file stays owned by the caller. Returns NULL when memory cannot be
 * allocated (debug builds still stop on the assertion).
 */
cmph_io_adapter_t *cmph_io_nlnkfile_adapter(FILE * keys_fd, cmph_uint32 nkeys)
{
	cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
	assert(key_source);
	if (!key_source) return NULL; /* assert() is compiled out under NDEBUG */
	key_source->data = (void *)keys_fd;
	key_source->nkeys = nkeys;
	key_source->read = key_nlfile_read;
	key_source->dispose = key_nlfile_dispose;
	key_source->rewind = key_nlfile_rewind;
	return key_source;
}
/* Frees an adapter created by cmph_io_nlnkfile_adapter(); the underlying file is left open. */
void cmph_io_nlnkfile_adapter_destroy(cmph_io_adapter_t * key_source)
{
	void *adapter = key_source;
	free(adapter);
}
/* Allocates an adapter plus its struct-vector cursor and fills in the data
 * fields; the read/dispose/rewind callbacks are left for the caller to set.
 *
 * `vector` stays owned by the caller. Returns NULL, with nothing leaked,
 * when either allocation fails (debug builds still stop on the assertions).
 */
static cmph_io_adapter_t *cmph_io_struct_vector_new(void * vector, cmph_uint32 struct_size, cmph_uint32 key_offset, cmph_uint32 key_len, cmph_uint32 nkeys)
{
	cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
	cmph_struct_vector_t * cmph_struct_vector = (cmph_struct_vector_t *)malloc(sizeof(cmph_struct_vector_t));
	assert(key_source);
	assert(cmph_struct_vector);
	if (!key_source || !cmph_struct_vector)
	{
		/* one of the two may have succeeded; free(NULL) is a no-op */
		free(cmph_struct_vector);
		free(key_source);
		return NULL;
	}
	cmph_struct_vector->vector = vector;
	cmph_struct_vector->position = 0;
	cmph_struct_vector->struct_size = struct_size;
	cmph_struct_vector->key_offset = key_offset;
	cmph_struct_vector->key_len = key_len;
	key_source->data = (void *)cmph_struct_vector;
	key_source->nkeys = nkeys;
	return key_source;
}
/* Frees the struct-vector cursor and the adapter; the caller's vector is
 * only detached, never freed.
 */
static void cmph_io_struct_vector_destroy(cmph_io_adapter_t * key_source)
{
	cmph_struct_vector_t *cursor = (cmph_struct_vector_t *)key_source->data;
	cursor->vector = NULL;
	free(cursor);
	free(key_source);
}
/* Allocates an adapter plus its vector cursor and fills in the data fields;
 * the read/dispose/rewind callbacks are left for the caller to set.
 *
 * `vector` stays owned by the caller. Returns NULL, with nothing leaked,
 * when either allocation fails (debug builds still stop on the assertions).
 */
static cmph_io_adapter_t *cmph_io_vector_new(void * vector, cmph_uint32 nkeys)
{
	cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
	cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
	assert(key_source);
	assert(cmph_vector);
	if (!key_source || !cmph_vector)
	{
		/* one of the two may have succeeded; free(NULL) is a no-op */
		free(cmph_vector);
		free(key_source);
		return NULL;
	}
	cmph_vector->vector = vector;
	cmph_vector->position = 0;
	key_source->data = (void *)cmph_vector;
	key_source->nkeys = nkeys;
	return key_source;
}
/* Frees the vector cursor and the adapter; the caller's vector is only
 * detached, never freed.
 */
static void cmph_io_vector_destroy(cmph_io_adapter_t * key_source)
{
	cmph_vector_t *cursor = (cmph_vector_t *)key_source->data;
	cursor->vector = NULL;
	free(cursor);
	free(key_source);
}
/* Creates a key adapter over `nkeys` length-prefixed byte strings.
 *
 * Each vector element starts with a cmph_uint32 length followed by the key
 * bytes. The vector stays owned by the caller. Returns NULL if the adapter
 * could not be created.
 */
cmph_io_adapter_t *cmph_io_byte_vector_adapter(cmph_uint8 ** vector, cmph_uint32 nkeys)
{
	cmph_io_adapter_t * key_source = cmph_io_vector_new(vector, nkeys);
	if (!key_source) return NULL;
	key_source->read = key_byte_vector_read;
	key_source->dispose = key_vector_dispose;
	key_source->rewind = key_vector_rewind;
	return key_source;
}
/* Frees an adapter created by cmph_io_byte_vector_adapter(); the caller's
 * vector is left untouched.
 */
void cmph_io_byte_vector_adapter_destroy(cmph_io_adapter_t * key_source)
{
	cmph_io_adapter_t *adapter = key_source;
	cmph_io_vector_destroy(adapter);
}
/* Creates a key adapter over a vector of `nkeys` fixed-size structs.
 *
 * The key of each element is the `key_len` bytes at `key_offset` inside the
 * `struct_size`-byte element. The vector stays owned by the caller. Returns
 * NULL if the adapter could not be created.
 */
cmph_io_adapter_t *cmph_io_struct_vector_adapter(void * vector, cmph_uint32 struct_size, cmph_uint32 key_offset, cmph_uint32 key_len, cmph_uint32 nkeys)
{
	cmph_io_adapter_t * key_source = cmph_io_struct_vector_new(vector, struct_size, key_offset, key_len, nkeys);
	if (!key_source) return NULL;
	key_source->read = key_struct_vector_read;
	key_source->dispose = key_vector_dispose;
	key_source->rewind = key_struct_vector_rewind;
	return key_source;
}
/* Frees an adapter created by cmph_io_struct_vector_adapter(); the caller's
 * vector is left untouched.
 */
void cmph_io_struct_vector_adapter_destroy(cmph_io_adapter_t * key_source)
{
	cmph_io_adapter_t *adapter = key_source;
	cmph_io_struct_vector_destroy(adapter);
}
/* Creates a key adapter over a vector of `nkeys` NUL-terminated strings.
 *
 * The vector stays owned by the caller. Returns NULL if the adapter could
 * not be created.
 */
cmph_io_adapter_t *cmph_io_vector_adapter(char ** vector, cmph_uint32 nkeys)
{
	cmph_io_adapter_t * key_source = cmph_io_vector_new(vector, nkeys);
	if (!key_source) return NULL;
	key_source->read = key_vector_read;
	key_source->dispose = key_vector_dispose;
	key_source->rewind = key_vector_rewind;
	return key_source;
}
/* Frees an adapter created by cmph_io_vector_adapter(); the caller's vector
 * is left untouched.
 */
void cmph_io_vector_adapter_destroy(cmph_io_adapter_t * key_source)
{
	cmph_io_adapter_t *adapter = key_source;
	cmph_io_vector_destroy(adapter);
}
/* Creates a configuration bound to `key_source`, preset to the CHM algorithm.
 *
 * Returns NULL when the configuration cannot be allocated (debug builds
 * still stop on the assertion).
 */
cmph_config_t *cmph_config_new(cmph_io_adapter_t *key_source)
{
	cmph_config_t *mph = __config_new(key_source);
	assert(mph);
	if (!mph) return NULL; /* assert() is compiled out under NDEBUG */
	mph->algo = CMPH_CHM; // default value
	mph->data = chm_config_new();
	return mph;
}
/* Switches `mph` to algorithm `algo`.
 *
 * When the algorithm actually changes, the algorithm-specific data of the
 * current one is destroyed and fresh data for the new one is created.
 * Selecting the algorithm that is already active keeps the existing data.
 */
void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo)
{
	if (algo == mph->algo)
	{
		/* nothing to rebuild; the stored value is simply written again */
		mph->algo = algo;
		return;
	}

	/* tear down the data of the algorithm configured so far */
	switch (mph->algo)
	{
		case CMPH_CHM:    chm_config_destroy(mph);    break;
		case CMPH_BMZ:    bmz_config_destroy(mph);    break;
		case CMPH_BMZ8:   bmz8_config_destroy(mph);   break;
		case CMPH_BRZ:    brz_config_destroy(mph);    break;
		case CMPH_FCH:    fch_config_destroy(mph);    break;
		case CMPH_BDZ:    bdz_config_destroy(mph);    break;
		case CMPH_BDZ_PH: bdz_ph_config_destroy(mph); break;
		case CMPH_CHD_PH: chd_ph_config_destroy(mph); break;
		case CMPH_CHD:    chd_config_destroy(mph);    break;
		default:
			assert(0);
	}

	/* build the data of the newly selected algorithm */
	switch (algo)
	{
		case CMPH_CHM:    mph->data = chm_config_new();    break;
		case CMPH_BMZ:    mph->data = bmz_config_new();    break;
		case CMPH_BMZ8:   mph->data = bmz8_config_new();   break;
		case CMPH_BRZ:    mph->data = brz_config_new();    break;
		case CMPH_FCH:    mph->data = fch_config_new();    break;
		case CMPH_BDZ:    mph->data = bdz_config_new();    break;
		case CMPH_BDZ_PH: mph->data = bdz_ph_config_new(); break;
		case CMPH_CHD_PH: mph->data = chd_ph_config_new(); break;
		case CMPH_CHD:    mph->data = chd_config_new(mph); break;
		default:
			assert(0);
	}

	mph->algo = algo;
}
/** \brief Forwards a temporary directory setting to the BRZ configuration.
 *  \param mph configuration to modify
 *  \param tmp_dir value passed on to brz_config_set_tmp_dir()
 *
 *  The call is a no-op unless the selected algorithm is CMPH_BRZ.
 */
void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir)
{
	if (mph->algo != CMPH_BRZ) return;
	brz_config_set_tmp_dir(mph, tmp_dir);
}
/** \brief Forwards an output stream to the BRZ configuration.
 *  \param mph configuration to modify
 *  \param mphf_fd stream passed on to brz_config_set_mphf_fd()
 *
 *  The call is a no-op unless the selected algorithm is CMPH_BRZ.
 */
void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd)
{
	if (mph->algo != CMPH_BRZ) return;
	brz_config_set_mphf_fd(mph, mphf_fd);
}
/** \brief Forwards the parameter b to the selected algorithm, if it takes one.
 *  \param mph configuration to modify
 *  \param b value passed on to the algorithm's *_config_set_b()
 *
 *  Only CMPH_BRZ, CMPH_BDZ, CMPH_CHD_PH and CMPH_CHD are handled; for any
 *  other algorithm the call does nothing.
 */
void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b)
{
	switch (mph->algo)
	{
		case CMPH_BRZ:    brz_config_set_b(mph, b);    break;
		case CMPH_BDZ:    bdz_config_set_b(mph, b);    break;
		case CMPH_CHD_PH: chd_ph_config_set_b(mph, b); break;
		case CMPH_CHD:    chd_config_set_b(mph, b);    break;
		default:
			break;
	}
}
/** \brief Forwards keys_per_bin to the CHD_PH or CHD configuration.
 *  \param mph configuration to modify
 *  \param keys_per_bin value passed on to the algorithm's setter
 *
 *  For any algorithm other than CMPH_CHD_PH and CMPH_CHD the call does nothing.
 */
void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
{
	switch (mph->algo)
	{
		case CMPH_CHD_PH:
			chd_ph_config_set_keys_per_bin(mph, keys_per_bin);
			break;
		case CMPH_CHD:
			chd_config_set_keys_per_bin(mph, keys_per_bin);
			break;
		default:
			break;
	}
}
/** \brief Forwards a memory availability setting to the BRZ configuration.
 *  \param mph configuration to modify
 *  \param memory_availability value passed on to
 *         brz_config_set_memory_availability()
 *
 *  The call is a no-op unless the selected algorithm is CMPH_BRZ.
 */
void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability)
{
	if (mph->algo != CMPH_BRZ) return;
	brz_config_set_memory_availability(mph, memory_availability);
}
/** \brief Destroys a configuration object and its algorithm-specific data.
 *  \param mph configuration to destroy; NULL is accepted and ignored
 *
 *  The *_config_destroy() routine matching mph->algo runs first, followed by
 *  __config_destroy().
 */
void cmph_config_destroy(cmph_config_t *mph)
{
	if (!mph) return;

	DEBUGP("Destroying mph with algo %s\n", cmph_names[mph->algo]);
	switch (mph->algo)
	{
		case CMPH_CHM:    chm_config_destroy(mph);    break;
		case CMPH_BMZ:    bmz_config_destroy(mph);    break;
		case CMPH_BMZ8:   bmz8_config_destroy(mph);   break;
		case CMPH_BRZ:    brz_config_destroy(mph);    break;
		case CMPH_FCH:    fch_config_destroy(mph);    break;
		case CMPH_BDZ:    bdz_config_destroy(mph);    break;
		case CMPH_BDZ_PH: bdz_ph_config_destroy(mph); break;
		case CMPH_CHD_PH: chd_ph_config_destroy(mph); break;
		case CMPH_CHD:    chd_config_destroy(mph);    break;
		default:
			assert(0);
	}
	__config_destroy(mph);
}
/** \brief Stores the verbosity level in the configuration.
 *  \param mph configuration to modify
 *  \param verbosity value written to mph->verbosity
 */
void cmph_config_set_verbosity(cmph_config_t *mph, cmph_uint32 verbosity)
{
	mph->verbosity = verbosity;
}
/** \brief Forwards the hash function selection to the selected algorithm.
 *  \param mph configuration to modify
 *  \param hashfuncs value passed on to the algorithm's *_config_set_hashfuncs()
 *
 *  An unrecognised mph->algo is silently ignored.
 */
void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	switch (mph->algo)
	{
		case CMPH_CHM:    chm_config_set_hashfuncs(mph, hashfuncs);    break;
		case CMPH_BMZ:    bmz_config_set_hashfuncs(mph, hashfuncs);    break;
		case CMPH_BMZ8:   bmz8_config_set_hashfuncs(mph, hashfuncs);   break;
		case CMPH_BRZ:    brz_config_set_hashfuncs(mph, hashfuncs);    break;
		case CMPH_FCH:    fch_config_set_hashfuncs(mph, hashfuncs);    break;
		case CMPH_BDZ:    bdz_config_set_hashfuncs(mph, hashfuncs);    break;
		case CMPH_BDZ_PH: bdz_ph_config_set_hashfuncs(mph, hashfuncs); break;
		case CMPH_CHD_PH: chd_ph_config_set_hashfuncs(mph, hashfuncs); break;
		case CMPH_CHD:    chd_config_set_hashfuncs(mph, hashfuncs);    break;
		default:
			break;
	}
}
/** \brief Stores the constant c in the configuration.
 *  \param mph configuration to modify
 *  \param c value written to mph->c (later handed to the *_new() routines
 *         by cmph_new())
 */
void cmph_config_set_graphsize(cmph_config_t *mph, double c)
{
	mph->c = c;
}
/** \brief Builds a hash function with the algorithm selected in mph.
 *  \param mph configuration describing the algorithm and its parameters
 *  \return whatever the algorithm's *_new() routine returns; NULL when
 *          mph->algo is not recognised (after assert(0))
 *
 *  For CMPH_BRZ the inner algorithm is chosen from mph->c first:
 *  CMPH_FCH when c >= 2.0, CMPH_BMZ8 otherwise.
 */
cmph_t *cmph_new(cmph_config_t *mph)
{
	const double graph_c = mph->c; /* read once, before any callee runs */
	cmph_t *result = NULL;

	DEBUGP("Creating mph with algorithm %s\n", cmph_names[mph->algo]);
	switch (mph->algo)
	{
		case CMPH_CHM:
			DEBUGP("Creating chm hash\n");
			result = chm_new(mph, graph_c);
			break;
		case CMPH_BMZ:
			DEBUGP("Creating bmz hash\n");
			result = bmz_new(mph, graph_c);
			break;
		case CMPH_BMZ8:
			DEBUGP("Creating bmz8 hash\n");
			result = bmz8_new(mph, graph_c);
			break;
		case CMPH_BRZ:
			DEBUGP("Creating brz hash\n");
			brz_config_set_algo(mph, (graph_c >= 2.0) ? CMPH_FCH : CMPH_BMZ8);
			result = brz_new(mph, graph_c);
			break;
		case CMPH_FCH:
			DEBUGP("Creating fch hash\n");
			result = fch_new(mph, graph_c);
			break;
		case CMPH_BDZ:
			DEBUGP("Creating bdz hash\n");
			result = bdz_new(mph, graph_c);
			break;
		case CMPH_BDZ_PH:
			DEBUGP("Creating bdz_ph hash\n");
			result = bdz_ph_new(mph, graph_c);
			break;
		case CMPH_CHD_PH:
			DEBUGP("Creating chd_ph hash\n");
			result = chd_ph_new(mph, graph_c);
			break;
		case CMPH_CHD:
			DEBUGP("Creating chd hash\n");
			result = chd_new(mph, graph_c);
			break;
		default:
			assert(0);
	}
	return result;
}
/** \brief Writes a hash function to a stream using its algorithm's dumper.
 *  \param mphf hash function to write
 *  \param f destination stream
 *  \return the value returned by the algorithm's *_dump(); 0 when mphf->algo
 *          is not recognised (after assert(0))
 */
int cmph_dump(cmph_t *mphf, FILE *f)
{
	switch (mphf->algo)
	{
		case CMPH_CHM:    return chm_dump(mphf, f);
		case CMPH_BMZ:    return bmz_dump(mphf, f);
		case CMPH_BMZ8:   return bmz8_dump(mphf, f);
		case CMPH_BRZ:    return brz_dump(mphf, f);
		case CMPH_FCH:    return fch_dump(mphf, f);
		case CMPH_BDZ:    return bdz_dump(mphf, f);
		case CMPH_BDZ_PH: return bdz_ph_dump(mphf, f);
		case CMPH_CHD_PH: return chd_ph_dump(mphf, f);
		case CMPH_CHD:    return chd_dump(mphf, f);
		default:
			break;
	}
	/* Only reachable for an unknown algorithm. */
	assert(0);
	return 0;
}
/** \brief Reads a hash function from a stream.
 *  \param f source stream
 *  \return the object produced by __cmph_load() after the algorithm-specific
 *          loader has run on it, or NULL when __cmph_load() returns NULL
 */
cmph_t *cmph_load(FILE *f)
{
	cmph_t *loaded;

	DEBUGP("Loading mphf generic parts\n");
	loaded = __cmph_load(f);
	if (!loaded) return NULL;

	DEBUGP("Loading mphf algorithm dependent parts\n");
	switch (loaded->algo)
	{
		case CMPH_CHM:
			chm_load(f, loaded);
			break;
		case CMPH_BMZ:
			DEBUGP("Loading bmz algorithm dependent parts\n");
			bmz_load(f, loaded);
			break;
		case CMPH_BMZ8:
			DEBUGP("Loading bmz8 algorithm dependent parts\n");
			bmz8_load(f, loaded);
			break;
		case CMPH_BRZ:
			DEBUGP("Loading brz algorithm dependent parts\n");
			brz_load(f, loaded);
			break;
		case CMPH_FCH:
			DEBUGP("Loading fch algorithm dependent parts\n");
			fch_load(f, loaded);
			break;
		case CMPH_BDZ:
			DEBUGP("Loading bdz algorithm dependent parts\n");
			bdz_load(f, loaded);
			break;
		case CMPH_BDZ_PH:
			DEBUGP("Loading bdz_ph algorithm dependent parts\n");
			bdz_ph_load(f, loaded);
			break;
		case CMPH_CHD_PH:
			DEBUGP("Loading chd_ph algorithm dependent parts\n");
			chd_ph_load(f, loaded);
			break;
		case CMPH_CHD:
			DEBUGP("Loading chd algorithm dependent parts\n");
			chd_load(f, loaded);
			break;
		default:
			assert(0);
	}
	DEBUGP("Loaded mphf\n");
	return loaded;
}
/** \brief Evaluates a hash function on a key.
 *  \param mphf hash function to evaluate
 *  \param key key to be hashed
 *  \param keylen key length in bytes
 *  \return the value returned by the algorithm's *_search(); 0 when
 *          mphf->algo is not recognised (after assert(0))
 */
cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	DEBUGP("mphf algorithm: %u \n", mphf->algo);
	switch(mphf->algo)
	{
		case CMPH_CHM:
			return chm_search(mphf, key, keylen);
		case CMPH_BMZ:
			DEBUGP("bmz algorithm search\n");
			return bmz_search(mphf, key, keylen);
		case CMPH_BMZ8:
			DEBUGP("bmz8 algorithm search\n");
			return bmz8_search(mphf, key, keylen);
		case CMPH_BRZ:
			DEBUGP("brz algorithm search\n");
			return brz_search(mphf, key, keylen);
		case CMPH_FCH:
			DEBUGP("fch algorithm search\n");
			return fch_search(mphf, key, keylen);
		case CMPH_BDZ:
			DEBUGP("bdz algorithm search\n");
			return bdz_search(mphf, key, keylen);
		case CMPH_BDZ_PH:
			DEBUGP("bdz_ph algorithm search\n");
			return bdz_ph_search(mphf, key, keylen);
		case CMPH_CHD_PH:
			DEBUGP("chd_ph algorithm search\n");
			return chd_ph_search(mphf, key, keylen);
		case CMPH_CHD:
			DEBUGP("chd algorithm search\n");
			return chd_search(mphf, key, keylen);
		default:
			break;
	}
	/* Only reachable for an unknown algorithm. */
	assert(0);
	return 0;
}
/** \brief Returns the size recorded in a hash function object.
 *  \param mphf hash function to query
 *  \return the value of mphf->size
 */
cmph_uint32 cmph_size(cmph_t *mphf)
{
	const cmph_uint32 recorded = mphf->size;
	return recorded;
}
/** \brief Destroys a hash function using its algorithm's destroy routine.
 *  \param mphf hash function to destroy; NULL is accepted and ignored, which
 *         matches cmph_config_destroy() and lets error paths call this
 *         unconditionally
 *
 *  An unrecognised mphf->algo triggers assert(0) and nothing is released.
 */
void cmph_destroy(cmph_t *mphf)
{
	if (mphf == NULL) return;

	switch(mphf->algo)
	{
		case CMPH_CHM:    chm_destroy(mphf);    return;
		case CMPH_BMZ:    bmz_destroy(mphf);    return;
		case CMPH_BMZ8:   bmz8_destroy(mphf);   return;
		case CMPH_BRZ:    brz_destroy(mphf);    return;
		case CMPH_FCH:    fch_destroy(mphf);    return;
		case CMPH_BDZ:    bdz_destroy(mphf);    return;
		case CMPH_BDZ_PH: bdz_ph_destroy(mphf); return;
		case CMPH_CHD_PH: chd_ph_destroy(mphf); return;
		case CMPH_CHD:    chd_destroy(mphf);    return;
		default:
			break;
	}
	/* Only reachable for an unknown algorithm. */
	assert(0);
	return;
}
/** \fn void cmph_pack(cmph_t *mphf, void *packed_mphf);
 *  \brief Packs a hash function into a preallocated contiguous memory area.
 *  \param mphf pointer to the hash function to pack
 *  \param packed_mphf pointer to the destination area, which must hold at
 *         least cmph_packed_size() bytes
 *
 *  The first 32-bit word receives mphf->algo (cmph_search_packed() dispatches
 *  on it); the algorithm's *_pack() routine fills the area that follows.
 */
void cmph_pack(cmph_t *mphf, void *packed_mphf)
{
	cmph_uint32 * header = (cmph_uint32 *) packed_mphf;
	cmph_uint32 * payload = header + 1;

	// packing algorithm type to be used in cmph.c
	header[0] = mphf->algo;
	DEBUGP("mphf->algo = %u\n", mphf->algo);
	switch(mphf->algo)
	{
		case CMPH_CHM:    chm_pack(mphf, payload);    break;
		case CMPH_BMZ:    bmz_pack(mphf, payload);    break;
		case CMPH_BMZ8:   bmz8_pack(mphf, payload);   break;
		case CMPH_BRZ:    brz_pack(mphf, payload);    break;
		case CMPH_FCH:    fch_pack(mphf, payload);    break;
		case CMPH_BDZ:    bdz_pack(mphf, payload);    break;
		case CMPH_BDZ_PH: bdz_ph_pack(mphf, payload); break;
		case CMPH_CHD_PH: chd_ph_pack(mphf, payload); break;
		case CMPH_CHD:    chd_pack(mphf, payload);    break;
		default:
			assert(0);
	}
}
/** \fn cmph_uint32 cmph_packed_size(cmph_t *mphf);
 *  \brief Returns the number of bytes needed to pack mphf.
 *  \param mphf pointer to a hash function
 *  \return the value reported by the algorithm's *_packed_size(), or zero
 *          when mphf->algo is not recognised (after assert(0))
 */
cmph_uint32 cmph_packed_size(cmph_t *mphf)
{
	cmph_uint32 bytes = 0; // FAILURE unless a case below overrides it

	switch(mphf->algo)
	{
		case CMPH_CHM:    bytes = chm_packed_size(mphf);    break;
		case CMPH_BMZ:    bytes = bmz_packed_size(mphf);    break;
		case CMPH_BMZ8:   bytes = bmz8_packed_size(mphf);   break;
		case CMPH_BRZ:    bytes = brz_packed_size(mphf);    break;
		case CMPH_FCH:    bytes = fch_packed_size(mphf);    break;
		case CMPH_BDZ:    bytes = bdz_packed_size(mphf);    break;
		case CMPH_BDZ_PH: bytes = bdz_ph_packed_size(mphf); break;
		case CMPH_CHD_PH: bytes = chd_ph_packed_size(mphf); break;
		case CMPH_CHD:    bytes = chd_packed_size(mphf);    break;
		default:
			assert(0);
	}
	return bytes;
}
/** \fn cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
 *  \brief Uses a packed hash function to do a search.
 *  \param packed_mphf pointer to the packed hash function; its first 32-bit
 *         word selects the algorithm
 *  \param key key to be hashed
 *  \param keylen key length in bytes
 *  \return the value returned by the algorithm's *_search_packed(), or zero
 *          when the algorithm word is not recognised (after assert(0))
 */
cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	cmph_uint32 *header = (cmph_uint32 *)packed_mphf;
	cmph_uint32 *payload = header + 1; /* algorithm data follows the type word */

	// fprintf(stderr, "algo:%u\n", *header);
	switch(*header)
	{
		case CMPH_CHM:    return chm_search_packed(payload, key, keylen);
		case CMPH_BMZ:    return bmz_search_packed(payload, key, keylen);
		case CMPH_BMZ8:   return bmz8_search_packed(payload, key, keylen);
		case CMPH_BRZ:    return brz_search_packed(payload, key, keylen);
		case CMPH_FCH:    return fch_search_packed(payload, key, keylen);
		case CMPH_BDZ:    return bdz_search_packed(payload, key, keylen);
		case CMPH_BDZ_PH: return bdz_ph_search_packed(payload, key, keylen);
		case CMPH_CHD_PH: return chd_ph_search_packed(payload, key, keylen);
		case CMPH_CHD:    return chd_search_packed(payload, key, keylen);
		default:
			assert(0);
	}
	return 0; // FAILURE
}
cmph-2.0.2/src/jenkins_hash.c 0000644 0001750 0001750 00000023131 13411542035 015401 0 ustar joseph joseph #include "jenkins_hash.h"
#include
#ifdef WIN32
#define _USE_MATH_DEFINES //For M_LOG2E
#endif
#include
#include
#include
//#define DEBUG
#include "debug.h"
/* hashsize(n): 2^n as a cmph_uint32, i.e. the number of slots in a table indexed by n bits. */
#define hashsize(n) ((cmph_uint32)1<<(n))
/* hashmask(n): bit mask with the n low bits set (hashsize(n) - 1). */
#define hashmask(n) (hashsize(n)-1)
//#define NM2 /* Define this if you do not want power of 2 table sizes*/
/*
--------------------------------------------------------------------
mix -- mix 3 32-bit values reversibly.
For every delta with one or two bits set, and the deltas of all three
high bits or all three low bits, whether the original value of a,b,c
is almost all zero or is uniformly distributed,
* If mix() is run forward or backward, at least 32 bits in a,b,c
have at least 1/4 probability of changing.
* If mix() is run forward, every bit of c will change between 1/3 and
2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
mix() was built out of 36 single-cycle latency instructions in a
structure that could support 2x parallelism, like so:
a -= b;
a -= c; x = (c>>13);
b -= c; a ^= x;
b -= a; x = (a<<8);
c -= a; b ^= x;
c -= b; x = (b>>13);
...
Unfortunately, superscalar Pentiums and Sparcs can't take advantage
of that parallelism. They've also turned some of those single-cycle
latency instructions into multi-cycle latency instructions. Still,
this is the fastest good hash I could find. There were about 2^^68
to choose from. I only looked at a billion or so.
--------------------------------------------------------------------
*/
/* mix(a,b,c): mixes three values in place. The arguments must be modifiable
 * lvalues and are evaluated many times, so they must be free of side effects.
 * The macro expands to a braced block of nine subtract/subtract/xor-shift
 * rounds. */
#define mix(a,b,c) \
{ \
a -= b; a -= c; a ^= (c>>13); \
b -= c; b -= a; b ^= (a<<8); \
c -= a; c -= b; c ^= (b>>13); \
a -= b; a -= c; a ^= (c>>12); \
b -= c; b -= a; b ^= (a<<16); \
c -= a; c -= b; c ^= (b>>5); \
a -= b; a -= c; a ^= (c>>3); \
b -= c; b -= a; b ^= (a<<10); \
c -= a; c -= b; c ^= (b>>15); \
}
/*
--------------------------------------------------------------------
hash() -- hash a variable-length key into a 32-bit value
k : the key (the unaligned variable-length array of bytes)
len : the length of the key, counting by bytes
initval : can be any 4-byte value
Returns a 32-bit value. Every bit of the key affects every bit of
the return value. Every 1-bit and 2-bit delta achieves avalanche.
About 6*len+35 instructions.
The best hash table sizes are powers of 2. There is no need to do
mod a prime (mod is sooo slow!). If you need less than 32 bits,
use a bitmask. For example, if you need only 10 bits, do
h = (h & hashmask(10));
In which case, the hash table should have hashsize(10) elements.
If you are hashing n strings (cmph_uint8 **)k, do it like this:
for (i=0, h=0; i 0) state->seed = ((cmph_uint32)rand() % size);
else state->seed = 0;
return state;
}
/** \brief Releases a jenkins state.
 *  \param state state to release; handed straight to free()
 */
void jenkins_state_destroy(jenkins_state_t *state)
{
	free(state);
}
/* Computes the three-word jenkins hash state of a key.
 *   seed   : initial value of the third word
 *   k      : key bytes
 *   keylen : key length in bytes
 *   hashes : output; receives three 32-bit words (hashes[0..2])
 * Key bytes are assembled into 32-bit words with explicit shifts, lowest
 * byte first, so the result does not depend on host byte order.
 */
static inline void __jenkins_hash_vector(cmph_uint32 seed, const unsigned char *k, cmph_uint32 keylen, cmph_uint32 * hashes)
{
register cmph_uint32 len, length;
/* Set up the internal state */
length = keylen;
len = length; /* len counts the bytes still to be consumed; length keeps the total */
hashes[0] = hashes[1] = 0x9e3779b9; /* the golden ratio; an arbitrary value */
hashes[2] = seed; /* the previous hash value - seed in our case */
/*---------------------------------------- handle most of the key */
/* Consume the key in 12-byte blocks: 4 bytes go into each of the three words. */
while (len >= 12)
{
hashes[0] += ((cmph_uint32)k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24));
hashes[1] += ((cmph_uint32)k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24));
hashes[2] += ((cmph_uint32)k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24));
mix(hashes[0],hashes[1],hashes[2]);
k += 12; len -= 12;
}
/*------------------------------------- handle the last 11 bytes */
hashes[2] += length; /* the total key length is folded into the third word */
/* len is now 0..11; entering at case len adds exactly the remaining bytes */
switch(len) /* all the case statements fall through */
{
case 11:
hashes[2] +=((cmph_uint32)k[10]<<24);
case 10:
hashes[2] +=((cmph_uint32)k[9]<<16);
case 9 :
hashes[2] +=((cmph_uint32)k[8]<<8);
/* the first byte of hashes[2] is reserved for the length */
case 8 :
hashes[1] +=((cmph_uint32)k[7]<<24);
case 7 :
hashes[1] +=((cmph_uint32)k[6]<<16);
case 6 :
hashes[1] +=((cmph_uint32)k[5]<<8);
case 5 :
hashes[1] +=(cmph_uint8) k[4];
case 4 :
hashes[0] +=((cmph_uint32)k[3]<<24);
case 3 :
hashes[0] +=((cmph_uint32)k[2]<<16);
case 2 :
hashes[0] +=((cmph_uint32)k[1]<<8);
case 1 :
hashes[0] +=(cmph_uint8)k[0];
/* case 0: nothing left to add */
}
/* final mixing round over the tail */
mix(hashes[0],hashes[1],hashes[2]);
}
/** \brief Hashes a key with the seed stored in state.
 *  \param state jenkins state providing the seed
 *  \param k pointer to the key bytes
 *  \param keylen key length in bytes
 *  \return the third word computed by __jenkins_hash_vector()
 */
cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen)
{
	cmph_uint32 words[3];
	const unsigned char *bytes = (const unsigned char *)k;

	__jenkins_hash_vector(state->seed, bytes, keylen, words);
	return words[2];
}
/** \brief Computes all three jenkins hash words of a key.
 *  \param state jenkins state providing the seed
 *  \param k pointer to the key bytes
 *  \param keylen key length in bytes
 *  \param hashes output area with room for three 32-bit integers
 */
void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes)
{
	const unsigned char *bytes = (const unsigned char *)k;

	__jenkins_hash_vector(state->seed, bytes, keylen, hashes);
}
/** \brief Serialises a jenkins state into a freshly allocated buffer.
 *  \param state state whose seed is written out
 *  \param buf receives the malloc()ed buffer holding the seed, or NULL when
 *         the allocation fails
 *  \param buflen receives sizeof(cmph_uint32) on success and UINT_MAX when
 *         the allocation fails
 */
void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen)
{
	char *out = (char *)malloc(sizeof(cmph_uint32));

	*buf = out;
	if (out == NULL)
	{
		*buflen = UINT_MAX;
		return;
	}
	*buflen = sizeof(cmph_uint32);
	memcpy(out, &(state->seed), sizeof(cmph_uint32));
	DEBUGP("Dumped jenkins state with seed %u\n", state->seed);
}
/** \brief Duplicates a jenkins state.
 *  \param src_state state to copy
 *  \return a malloc()ed copy carrying the same hashfunc and seed, or NULL
 *          when the allocation fails
 */
jenkins_state_t *jenkins_state_copy(jenkins_state_t *src_state)
{
	jenkins_state_t *dest_state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t));

	/* Do not write through a failed allocation. */
	if (dest_state == NULL) return NULL;

	dest_state->hashfunc = src_state->hashfunc;
	dest_state->seed = src_state->seed;
	return dest_state;
}
/** \brief Rebuilds a jenkins state from the bytes written by jenkins_state_dump().
 *  \param buf buffer that starts with the serialised seed
 *  \param buflen number of bytes available in buf
 *  \return a malloc()ed state with hashfunc set to CMPH_HASH_JENKINS, or NULL
 *          when buf is NULL, buf is shorter than a seed, or the allocation
 *          fails
 */
jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen)
{
	jenkins_state_t *state;

	/* Refuse to read past the end of a truncated buffer. */
	if (buf == NULL || buflen < sizeof(cmph_uint32)) return NULL;

	state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t));
	if (state == NULL) return NULL;

	/* memcpy mirrors jenkins_state_dump() and stays valid when buf is not
	 * aligned for a cmph_uint32 access. */
	memcpy(&(state->seed), buf, sizeof(cmph_uint32));
	state->hashfunc = CMPH_HASH_JENKINS;
	DEBUGP("Loaded jenkins state with seed %u\n", state->seed);
	return state;
}
/** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed);
 *  \brief Packs a jenkins function into a preallocated contiguous memory area.
 *  \param state points to the jenkins function
 *  \param jenkins_packed destination area of at least
 *         jenkins_state_packed_size() bytes; it receives the seed
 *
 *  Nothing is written when either pointer is NULL.
 */
void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed)
{
	if (!state || !jenkins_packed) return;
	memcpy(jenkins_packed, &(state->seed), sizeof(cmph_uint32));
}
/** \fn cmph_uint32 jenkins_state_packed_size(void);
 *  \brief Returns the number of bytes needed to pack a jenkins function.
 *  \return sizeof(cmph_uint32), the size of the seed written by
 *          jenkins_state_pack()
 */
cmph_uint32 jenkins_state_packed_size(void)
{
	const cmph_uint32 seed_bytes = sizeof(cmph_uint32);
	return seed_bytes;
}
/** \fn cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen);
 *  \param jenkins_packed is a pointer to a contiguous memory area that starts
 *         with the seed written by jenkins_state_pack()
 *  \param k is a pointer to a key
 *  \param keylen is the key length
 *  \return an integer that represents a hash value of 32 bits.
 */
cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen)
{
	cmph_uint32 hashes[3];
	cmph_uint32 seed;

	/* The packed area is a byte buffer with no alignment guarantee, so read
	 * the seed the same way jenkins_state_pack() wrote it. */
	memcpy(&seed, jenkins_packed, sizeof(cmph_uint32));
	__jenkins_hash_vector(seed, (const unsigned char*)k, keylen, hashes);
	return hashes[2];
}
/** \fn jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
 *  \param jenkins_packed is a pointer to a contiguous memory area that starts
 *         with the seed written by jenkins_state_pack()
 *  \param k is a pointer to a key
 *  \param keylen is the key length
 *  \param hashes is a pointer to a memory large enough to fit three 32-bit integers.
 */
void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes)
{
	cmph_uint32 seed;

	/* The packed area is a byte buffer with no alignment guarantee, so read
	 * the seed the same way jenkins_state_pack() wrote it. */
	memcpy(&seed, jenkins_packed, sizeof(cmph_uint32));
	__jenkins_hash_vector(seed, (const unsigned char*)k, keylen, hashes);
}
cmph-2.0.2/src/chd.c 0000644 0001750 0001750 00000017205 13411542035 013500 0 ustar joseph joseph #include
#include
#include
#include
#include
#include
#include
#include "cmph_structs.h"
#include "chd_structs.h"
#include "chd.h"
#include "bitbool.h"
//#define DEBUG
#include "debug.h"
/** \brief Creates the CHD-specific configuration data.
 *  \param mph outer configuration; only its key_source is read
 *  \return zero-initialised data owning an inner configuration set to
 *          CMPH_CHD_PH over the same key source, or NULL when an allocation
 *          fails
 */
chd_config_data_t *chd_config_new(cmph_config_t *mph)
{
	cmph_io_adapter_t *key_source = mph->key_source;
	chd_config_data_t *chd;
	chd = (chd_config_data_t *)malloc(sizeof(chd_config_data_t));
	if (!chd) return NULL;
	memset(chd, 0, sizeof(chd_config_data_t));
	chd->chd_ph = cmph_config_new(key_source);
	if (!chd->chd_ph)
	{
		/* Without an inner configuration cmph_config_set_algo() would
		 * dereference NULL, so give the block back and report failure. */
		free(chd);
		return NULL;
	}
	cmph_config_set_algo(chd->chd_ph, CMPH_CHD_PH);
	return chd;
}
/** \brief Destroys the CHD-specific configuration data stored in mph->data.
 *  \param mph outer configuration whose data is released
 *
 *  mph->data may be NULL because chd_config_new() returns NULL when its
 *  allocation fails; that case is tolerated here.
 */
void chd_config_destroy(cmph_config_t *mph)
{
	chd_config_data_t *data = (chd_config_data_t *) mph->data;
	DEBUGP("Destroying algorithm dependent data\n");
	if (data == NULL) return;
	if(data->chd_ph)
	{
		/* the inner CHD_PH configuration is owned by this data block */
		cmph_config_destroy(data->chd_ph);
		data->chd_ph = NULL;
	}
	free(data);
}
/** \brief Forwards the hash function selection to the inner CHD_PH configuration.
 *  \param mph outer CHD configuration
 *  \param hashfuncs value passed on to cmph_config_set_hashfuncs()
 */
void chd_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	chd_config_data_t *chd = (chd_config_data_t *) mph->data;
	cmph_config_t *inner = chd->chd_ph;

	cmph_config_set_hashfuncs(inner, hashfuncs);
}
/** \brief Forwards the parameter b to the inner CHD_PH configuration.
 *  \param mph outer CHD configuration
 *  \param keys_per_bucket value passed on to cmph_config_set_b()
 */
void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
{
	chd_config_data_t *chd = (chd_config_data_t *) mph->data;
	cmph_config_t *inner = chd->chd_ph;

	cmph_config_set_b(inner, keys_per_bucket);
}
/** \brief Forwards keys_per_bin to the inner CHD_PH configuration.
 *  \param mph outer CHD configuration
 *  \param keys_per_bin value passed on to cmph_config_set_keys_per_bin()
 */
void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
{
	chd_config_data_t *chd = (chd_config_data_t *) mph->data;
	cmph_config_t *inner = chd->chd_ph;

	cmph_config_set_keys_per_bin(inner, keys_per_bin);
}
/** \brief Builds a CHD hash function from a configuration.
 *  \param mph CHD configuration (mph->data holds the inner CHD_PH configuration)
 *  \param c value forwarded to the inner configuration via
 *         cmph_config_set_graphsize()
 *  \return the new hash function, or NULL when the inner CHD_PH function
 *          cannot be built or an allocation fails
 *
 *  Steps: build and pack the inner CHD_PH function, collect the indices of
 *  the bins left unoccupied (per the inner configuration's occup_table),
 *  build and pack a compressed rank structure over them, and store both
 *  packed blobs in the result.
 */
cmph_t *chd_new(cmph_config_t *mph, double c)
{
	DEBUGP("Creating new chd");
	cmph_t *mphf = NULL;
	chd_data_t *chdf = NULL;
	chd_config_data_t *chd = (chd_config_data_t *)mph->data;
	chd_ph_config_data_t * chd_ph = NULL;
	compressed_rank_t cr;
	cmph_t * chd_phf = NULL;
	cmph_uint32 packed_chd_phf_size = 0;
	cmph_uint8 * packed_chd_phf = NULL;
	cmph_uint32 packed_cr_size = 0;
	cmph_uint8 * packed_cr = NULL;
	cmph_uint32 i, idx, nkeys, nvals, nbins;
	cmph_uint32 * vals_table = NULL;
	cmph_uint32 * occup_table = NULL;
#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
#endif

	/* chd_config_new() can fail, leaving mph->data NULL. */
	if (chd == NULL || chd->chd_ph == NULL)
	{
		return NULL;
	}
	chd_ph = (chd_ph_config_data_t *)chd->chd_ph->data;

	cmph_config_set_verbosity(chd->chd_ph, mph->verbosity);
	cmph_config_set_graphsize(chd->chd_ph, c);
	if (mph->verbosity)
	{
		fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c);
	}
	chd_phf = cmph_new(chd->chd_ph);
	if(chd_phf == NULL)
	{
		return NULL;
	}
	packed_chd_phf_size = cmph_packed_size(chd_phf);
	DEBUGP("packed_chd_phf_size = %u\n", packed_chd_phf_size);
	/* Make sure that we have enough space to pack the mphf. */
	packed_chd_phf = (cmph_uint8 *)calloc((size_t)packed_chd_phf_size,(size_t)1);
	if (packed_chd_phf == NULL)
	{
		cmph_destroy(chd_phf);
		return NULL;
	}
	/* Pack the mphf. */
	cmph_pack(chd_phf, packed_chd_phf);
	cmph_destroy(chd_phf);
	if (mph->verbosity)
	{
		fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n");
	}
	nbins = chd_ph->n;
	nkeys = chd_ph->m;
	nvals = nbins - nkeys;
	vals_table = (cmph_uint32 *)calloc(nvals, sizeof(cmph_uint32));
	/* calloc(0, ...) may legitimately return NULL, so only a non-empty
	 * request counts as a failure. */
	if (vals_table == NULL && nvals > 0)
	{
		free(packed_chd_phf);
		return NULL;
	}
	/* Initialised only after the allocation above so that its failure path
	 * has no rank structure to tear down. */
	compressed_rank_init(&cr);
	occup_table = (cmph_uint32 *)chd_ph->occup_table;
	/* Record, in increasing order, every bin whose occupancy bit is clear. */
	for(i = 0, idx = 0; i < nbins; i++)
	{
		if(!GETBIT32(occup_table, i))
		{
			vals_table[idx++] = i;
		}
	}
	compressed_rank_generate(&cr, vals_table, nvals);
	free(vals_table);
	packed_cr_size = compressed_rank_packed_size(&cr);
	packed_cr = (cmph_uint8 *) calloc(packed_cr_size, sizeof(cmph_uint8));
	if (packed_cr == NULL && packed_cr_size > 0)
	{
		compressed_rank_destroy(&cr);
		free(packed_chd_phf);
		return NULL;
	}
	compressed_rank_pack(&cr, packed_cr);
	compressed_rank_destroy(&cr);
	mphf = (cmph_t *)malloc(sizeof(cmph_t));
	chdf = (chd_data_t *)malloc(sizeof(chd_data_t));
	if (mphf == NULL || chdf == NULL)
	{
		free(mphf);
		free(chdf);
		free(packed_cr);
		free(packed_chd_phf);
		return NULL;
	}
	mphf->algo = mph->algo;
	chdf->packed_cr = packed_cr;
	packed_cr = NULL; //transfer memory ownership
	chdf->packed_chd_phf = packed_chd_phf;
	packed_chd_phf = NULL; //transfer memory ownership
	chdf->packed_chd_phf_size = packed_chd_phf_size;
	chdf->packed_cr_size = packed_cr_size;
	mphf->data = chdf;
	mphf->size = nkeys;
	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&construction_time);
	register cmph_uint32 space_usage = chd_packed_size(mphf)*8;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", nkeys, c, chd_ph->keys_per_bucket, construction_time, space_usage/(double)nkeys);
#endif
	return mphf;
}
/** \brief Reads the CHD-specific part of a dumped hash function.
 *  \param fd stream positioned at the data written by chd_dump() after the
 *         generic part
 *  \param mphf object whose data pointer receives the loaded structure
 *
 *  Layout read: packed_chd_phf_size, packed_chd_phf bytes, packed_cr_size,
 *  packed_cr bytes. On an allocation failure or short read everything
 *  allocated here is released and mphf->data is left NULL, so callers can
 *  detect the failure instead of working on partially initialised data.
 */
void chd_load(FILE *fd, cmph_t *mphf)
{
	/* calloc keeps the buffer pointers NULL until they are assigned, which
	 * makes the shared cleanup below safe at every stage. */
	chd_data_t *chd = (chd_data_t *)calloc((size_t)1, sizeof(chd_data_t));
	DEBUGP("Loading chd mphf\n");
	mphf->data = chd;
	if (chd == NULL) return;

	if (fread(&chd->packed_chd_phf_size, sizeof(cmph_uint32), (size_t)1, fd) != 1) goto load_failed;
	DEBUGP("Loading CHD_PH perfect hash function with %u bytes to disk\n", chd->packed_chd_phf_size);
	chd->packed_chd_phf = (cmph_uint8 *) calloc((size_t)chd->packed_chd_phf_size,(size_t)1);
	if (chd->packed_chd_phf_size > 0)
	{
		if (chd->packed_chd_phf == NULL) goto load_failed;
		if (fread(chd->packed_chd_phf, chd->packed_chd_phf_size, (size_t)1, fd) != 1) goto load_failed;
	}

	if (fread(&chd->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd) != 1) goto load_failed;
	DEBUGP("Loading Compressed rank structure, which has %u bytes\n", chd->packed_cr_size);
	chd->packed_cr = (cmph_uint8 *) calloc((size_t)chd->packed_cr_size, (size_t)1);
	if (chd->packed_cr_size > 0)
	{
		if (chd->packed_cr == NULL) goto load_failed;
		if (fread(chd->packed_cr, chd->packed_cr_size, (size_t)1, fd) != 1) goto load_failed;
	}
	return;

load_failed:
	free(chd->packed_chd_phf);
	free(chd->packed_cr);
	free(chd);
	mphf->data = NULL;
}
/** \brief Writes a CHD hash function to a stream.
 *  \param mphf hash function to write
 *  \param fd destination stream
 *  \return 1 when every algorithm-specific write succeeded, 0 otherwise
 *
 *  The generic part is written by __cmph_dump(); then packed_chd_phf_size,
 *  the packed CHD_PH bytes, packed_cr_size and the packed rank bytes follow,
 *  the same order chd_load() reads them in.
 */
int chd_dump(cmph_t *mphf, FILE *fd)
{
	chd_data_t *data = (chd_data_t *)mphf->data;
	__cmph_dump(mphf, fd);
	// Dumping CHD_PH perfect hash function
	DEBUGP("Dumping CHD_PH perfect hash function with %u bytes to disk\n", data->packed_chd_phf_size);
	if (fwrite(&data->packed_chd_phf_size, sizeof(cmph_uint32), (size_t)1, fd) != 1) return 0;
	/* fwrite() returns 0 for a zero-sized element, so skip empty payloads
	 * instead of misreading that as an error. */
	if (data->packed_chd_phf_size > 0 &&
	    fwrite(data->packed_chd_phf, data->packed_chd_phf_size, (size_t)1, fd) != 1) return 0;
	DEBUGP("Dumping compressed rank structure with %u bytes to disk\n", data->packed_cr_size);
	if (fwrite(&data->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd) != 1) return 0;
	if (data->packed_cr_size > 0 &&
	    fwrite(data->packed_cr, data->packed_cr_size, (size_t)1, fd) != 1) return 0;
	return 1;
}
/** \brief Releases a CHD hash function and both of its packed blobs.
 *  \param mphf hash function to release; NULL is accepted and ignored, and a
 *         NULL mphf->data is tolerated as well
 */
void chd_destroy(cmph_t *mphf)
{
	chd_data_t *data;

	if (mphf == NULL) return;
	data = (chd_data_t *)mphf->data;
	if (data != NULL)
	{
		free(data->packed_chd_phf);
		free(data->packed_cr);
		free(data);
	}
	free(mphf);
}
/* Shared lookup used by chd_search() and chd_search_packed(): the packed
 * CHD_PH function yields a bin index, and the rank reported for that bin by
 * the packed compressed rank structure is subtracted from it. */
static inline cmph_uint32 _chd_search(void * packed_chd_phf, void * packed_cr, const char *key, cmph_uint32 keylen)
{
	const cmph_uint32 bin = cmph_search_packed(packed_chd_phf, key, keylen);
	const cmph_uint32 skipped = compressed_rank_query_packed(packed_cr, bin);

	return bin - skipped;
}
/** \brief Evaluates a CHD hash function on a key.
 *  \param mphf CHD hash function
 *  \param key key to be hashed
 *  \param keylen key length in bytes
 *  \return the value computed by _chd_search() from the two packed blobs
 */
cmph_uint32 chd_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	chd_data_t * data = (chd_data_t *)mphf->data;
	void * phf = data->packed_chd_phf;
	void * rank = data->packed_cr;

	return _chd_search(phf, rank, key, keylen);
}
/** \brief Packs a CHD hash function into a preallocated memory area.
 *  \param mphf CHD hash function to pack
 *  \param packed_mphf destination area
 *
 *  Layout written: packed_cr_size (32 bits), packed_cr bytes,
 *  packed_chd_phf_size (32 bits), packed_chd_phf bytes; this is the layout
 *  chd_search_packed() walks. The size words are stored with memcpy because
 *  the second one lands packed_cr_size bytes into the buffer, where a direct
 *  cmph_uint32 store is not guaranteed to be aligned.
 */
void chd_pack(cmph_t *mphf, void *packed_mphf)
{
	chd_data_t *data = (chd_data_t *)mphf->data;
	cmph_uint8 *out = (cmph_uint8 *)packed_mphf;

	// packing packed_cr_size and packed_cr
	memcpy(out, &data->packed_cr_size, sizeof(cmph_uint32));
	out += sizeof(cmph_uint32);
	memcpy(out, data->packed_cr, data->packed_cr_size);
	out += data->packed_cr_size;

	// packing packed_chd_phf_size and packed_chd_phf
	memcpy(out, &data->packed_chd_phf_size, sizeof(cmph_uint32));
	out += sizeof(cmph_uint32);
	memcpy(out, data->packed_chd_phf, data->packed_chd_phf_size);
}
/** \brief Returns the number of bytes needed to pack a CHD hash function.
 *  \param mphf CHD hash function
 *  \return sizeof(CMPH_ALGO) plus two 32-bit size words plus the sizes of
 *          both packed blobs
 */
cmph_uint32 chd_packed_size(cmph_t *mphf)
{
	chd_data_t *data = (chd_data_t *)mphf->data;
	const size_t header_bytes = sizeof(CMPH_ALGO) + 2*sizeof(cmph_uint32);

	return (cmph_uint32)(header_bytes + data->packed_cr_size + data->packed_chd_phf_size);
}
/** \brief Evaluates a packed CHD hash function on a key.
 *  \param packed_mphf area written by chd_pack()
 *  \param key key to be hashed
 *  \param keylen key length in bytes
 *  \return the value computed by _chd_search()
 *
 *  The first word is the size of the packed rank structure, which follows
 *  immediately; the packed CHD_PH function starts after the rank bytes and
 *  one further 32-bit size word.
 */
cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	cmph_uint32 * words = (cmph_uint32 *)packed_mphf;
	const cmph_uint32 cr_bytes = words[0];
	cmph_uint32 * packed_cr = words + 1;
	cmph_uint8 * packed_chd_phf = ((cmph_uint8 *) packed_cr) + cr_bytes + sizeof(cmph_uint32);

	return _chd_search(packed_chd_phf, packed_cr, key, keylen);
}
cmph-2.0.2/src/jenkins_hash.h 0000644 0001750 0001750 00000005612 13411542035 015412 0 ustar joseph joseph #ifndef __JEKINS_HASH_H__
#define __JEKINS_HASH_H__
#include "hash.h"
/* State of one jenkins hash function instance. */
typedef struct __jenkins_state_t
{
CMPH_HASH hashfunc; /* hash function identifier; jenkins_state_load() sets it to CMPH_HASH_JENKINS */
cmph_uint32 seed; /* seed fed to the hash computation; the only field written by jenkins_state_dump() and jenkins_state_pack() */
} jenkins_state_t;
jenkins_state_t *jenkins_state_new(cmph_uint32 size); //size of hash table
/** \fn cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen);
* \param state is a pointer to a jenkins_state_t structure
* \param k is a pointer to a key
* \param keylen is the key length
* \return an integer that represents a hash value of 32 bits.
*/
cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen);
/** \fn void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
* \param state is a pointer to a jenkins_state_t structure
* \param k is a pointer to a key
* \param keylen is the key length
* \param hashes is a pointer to a memory large enough to fit three 32-bit integers.
*/
void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen);
jenkins_state_t *jenkins_state_copy(jenkins_state_t *src_state);
jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen);
void jenkins_state_destroy(jenkins_state_t *state);
/** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed);
* \brief Support the ability to pack a jenkins function into a preallocated contiguous memory space pointed by jenkins_packed.
* \param state points to the jenkins function
* \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. The size of jenkins_packed must be at least jenkins_state_packed_size()
*/
void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed);
/** \fn cmph_uint32 jenkins_state_packed_size();
* \brief Return the amount of space needed to pack a jenkins function.
* \return the size of the packed function or zero for failures
*/
cmph_uint32 jenkins_state_packed_size(void);
/** \fn cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen);
* \param jenkins_packed is a pointer to a contiguous memory area
* \param k is a pointer to a key
* \param keylen is the key length
* \return an integer that represents a hash value of 32 bits.
*/
cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen);
/** \fn jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
* \param jenkins_packed is a pointer to a contiguous memory area
* \param k is a pointer to a key
* \param keylen is the key length
* \param hashes is a pointer to a memory large enough to fit three 32-bit integers.
*/
void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
#endif
cmph-2.0.2/src/hash.c 0000644 0001750 0001750 00000013252 13411542035 013663 0 ustar joseph joseph #include "hash_state.h"
#include
#include
#include
#include
//#define DEBUG
#include "debug.h"
/* Names of the supported hash functions, indexed by CMPH_HASH value; the name
 * is the prefix hash_state_dump() writes and hash_state_load() matches. The
 * array ends with a NULL entry. */
const char *cmph_hash_names[] = { "jenkins", NULL };
/** \brief Creates the state for a hash function.
 *  \param hashfunc hash function to instantiate
 *  \param hashsize value passed on to the hash function's constructor
 *  \return the new state with its hashfunc field set, or NULL when the
 *          constructor fails or hashfunc is not recognised
 */
hash_state_t *hash_state_new(CMPH_HASH hashfunc, cmph_uint32 hashsize)
{
	hash_state_t *state = NULL;
	switch (hashfunc)
	{
		case CMPH_HASH_JENKINS:
			DEBUGP("Jenkins function - %u\n", hashsize);
			state = (hash_state_t *)jenkins_state_new(hashsize);
			DEBUGP("Jenkins function created\n");
			break;
		default:
			assert(0);
	}
	/* state stays NULL for an unknown hashfunc in an NDEBUG build or when
	 * the constructor returns NULL; do not write through it. */
	if (state != NULL)
	{
		state->hashfunc = hashfunc;
	}
	return state;
}
/** \brief Hashes a key with the function recorded in state.
 *  \param state hash state; state->hashfunc selects the implementation
 *  \param key key to be hashed
 *  \param keylen key length in bytes
 *  \return the implementation's hash value; 0 when state->hashfunc is not
 *          recognised (after assert(0))
 */
cmph_uint32 hash(hash_state_t *state, const char *key, cmph_uint32 keylen)
{
	if (state->hashfunc == CMPH_HASH_JENKINS)
	{
		return jenkins_hash((jenkins_state_t *)state, key, keylen);
	}
	/* Only reachable for an unknown hash function. */
	assert(0);
	return 0;
}
/** \fn void hash_vector(hash_state_t *state, const char *key, cmph_uint32 keylen, cmph_uint32 * hashes)
 * \brief Compute the hash vector of a key with the function described by state.
 * \param state hash state; its hashfunc field selects the implementation
 * \param key pointer to the key
 * \param keylen key length
 * \param hashes output array filled by the type-specific implementation
 */
void hash_vector(hash_state_t *state, const char *key, cmph_uint32 keylen, cmph_uint32 * hashes)
{
	if (state->hashfunc != CMPH_HASH_JENKINS)
	{
		/* Unknown hash type: nothing is written to hashes. */
		assert(0);
		return;
	}
	jenkins_hash_vector_((jenkins_state_t *)state, key, keylen, hashes);
}
/** \fn void hash_state_dump(hash_state_t *state, char **buf, cmph_uint32 *buflen)
 * \brief Serialize a hash state as "<name>\0<type-specific bytes>".
 * \param state hash state to serialize
 * \param buf receives a malloc'ed buffer that the caller must free; left
 *            untouched on failure
 * \param buflen receives the total size of *buf, or UINT_MAX on failure
 */
void hash_state_dump(hash_state_t *state, char **buf, cmph_uint32 *buflen)
{
	/* Initialised so that the cleanup free() is always well defined, even if
	 * the type-specific dump never stored a pointer. */
	char *algobuf = NULL;
	const char *name;
	size_t namelen;
	size_t len;
	switch (state->hashfunc)
	{
		case CMPH_HASH_JENKINS:
			jenkins_state_dump((jenkins_state_t *)state, &algobuf, buflen);
			if (*buflen == UINT_MAX) {
				goto cmph_cleanup;
			}
			break;
		default:
			assert(0);
			/* With asserts disabled there is nothing to dump: report failure
			 * instead of indexing cmph_hash_names with an unknown type. */
			*buflen = UINT_MAX;
			goto cmph_cleanup;
	}
	name = cmph_hash_names[state->hashfunc];
	namelen = strlen(name) + 1; /* the terminating NUL is part of the dump */
	len = *buflen;
	*buf = (char *)malloc(namelen + len);
	if (*buf == NULL) {
		*buflen = UINT_MAX;
		goto cmph_cleanup;
	}
	memcpy(*buf, name, namelen);
	DEBUGP("Algobuf is %u\n", *(cmph_uint32 *)algobuf);
	memcpy(*buf + namelen, algobuf, len);
	*buflen = (cmph_uint32)namelen + *buflen;
cmph_cleanup:
	free(algobuf);
	return;
}
/** \fn hash_state_t *hash_state_copy(hash_state_t *src_state)
 * \brief Duplicate a hash state.
 * \param src_state state to copy
 * \return a new state with the same hashfunc tag, or NULL when the
 *         type-specific copy produced no state
 */
hash_state_t * hash_state_copy(hash_state_t *src_state)
{
	hash_state_t *dest_state = NULL;
	switch (src_state->hashfunc)
	{
		case CMPH_HASH_JENKINS:
			dest_state = (hash_state_t *)jenkins_state_copy((jenkins_state_t *)src_state);
			break;
		default:
			assert(0);
	}
	/* The original code wrote through dest_state unconditionally and so
	 * dereferenced NULL when the copy failed. */
	if (dest_state == NULL) return NULL;
	dest_state->hashfunc = src_state->hashfunc;
	return dest_state;
}
/** \fn hash_state_t *hash_state_load(const char *buf, cmph_uint32 buflen)
 * \brief Rebuild a hash state from a buffer laid out as "<name>\0<type-specific bytes>".
 * \param buf serialized hash state
 * \param buflen number of valid bytes in buf
 * \return the loaded state, or NULL when buf is NULL, too short, or does not
 *         start with a known hash function name
 */
hash_state_t *hash_state_load(const char *buf, cmph_uint32 buflen)
{
	cmph_uint32 i;
	cmph_uint32 offset;
	CMPH_HASH hashfunc = CMPH_HASH_COUNT;
	if (buf == NULL) return NULL;
	for (i = 0; i < CMPH_HASH_COUNT; ++i)
	{
		/* Compare name plus its NUL, but never read past buflen: buf is a
		 * length-delimited blob and is not guaranteed to be NUL-terminated. */
		size_t namelen = strlen(cmph_hash_names[i]) + 1;
		if (namelen <= buflen && memcmp(buf, cmph_hash_names[i], namelen) == 0)
		{
			hashfunc = (CMPH_HASH)(i);
			break;
		}
	}
	if (hashfunc == CMPH_HASH_COUNT) return NULL;
	/* offset <= buflen is guaranteed by the match above, so the subtraction
	 * below cannot wrap around. */
	offset = (cmph_uint32)strlen(cmph_hash_names[hashfunc]) + 1;
	switch (hashfunc)
	{
		case CMPH_HASH_JENKINS:
			return (hash_state_t *)jenkins_state_load(buf + offset, buflen - offset);
		default:
			return NULL;
	}
	return NULL;
}
/** \fn void hash_state_destroy(hash_state_t *state)
 * \brief Release a hash state through its type-specific destructor.
 * \param state hash state to destroy
 */
void hash_state_destroy(hash_state_t *state)
{
	if (state->hashfunc == CMPH_HASH_JENKINS)
	{
		jenkins_state_destroy((jenkins_state_t *)state);
		return;
	}
	/* Unknown hash type: nothing is released. */
	assert(0);
}
/** \fn void hash_state_pack(hash_state_t *state, void *hash_packed)
 * \brief Pack a hash function into the preallocated contiguous memory area pointed to by hash_packed.
 * \param state points to the hash function
 * \param hash_packed destination memory area; it must hold at least hash_state_packed_size() bytes
 *
 * Only the type-specific state is written here; the hash function type
 * itself must be packed by the caller.
 */
void hash_state_pack(hash_state_t *state, void *hash_packed)
{
	if (state->hashfunc != CMPH_HASH_JENKINS)
	{
		/* Unknown hash type: nothing is written. */
		assert(0);
		return;
	}
	/* pack the jenkins hash function */
	jenkins_state_pack((jenkins_state_t *)state, hash_packed);
}
/** \fn cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc)
 * \brief Return the amount of space needed to pack a hash function.
 * \param hashfunc function type
 * \return the size of the packed function, or zero for an unknown type
 */
cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc)
{
	if (hashfunc == CMPH_HASH_JENKINS)
	{
		return jenkins_state_packed_size();
	}
	/* Unknown hash type: abort in debug builds, report zero otherwise. */
	assert(0);
	return 0;
}
/** \fn cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen)
 * \brief Hash a key using a packed hash function.
 * \param hash_packed is a pointer to a contiguous memory area
 * \param hashfunc is the type of the hash function packed in hash_packed
 * \param k is a pointer to a key
 * \param keylen is the key length
 * \return an integer that represents a hash value of 32 bits.
 */
cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen)
{
	if (hashfunc == CMPH_HASH_JENKINS)
	{
		return jenkins_hash_packed(hash_packed, k, keylen);
	}
	/* Unknown hash type: abort in debug builds, fall back to 0 otherwise. */
	assert(0);
	return 0;
}
/** \fn void hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes)
 * \brief Compute the hash vector of a key using a packed hash function.
 * \param hash_packed is a pointer to a contiguous memory area
 * \param hashfunc is the type of the hash function packed in hash_packed
 * \param k is a pointer to a key
 * \param keylen is the key length
 * \param hashes is a pointer to a memory large enough to fit three 32-bit integers.
 */
void hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes)
{
	if (hashfunc != CMPH_HASH_JENKINS)
	{
		/* Unknown hash type: hashes is left untouched. */
		assert(0);
		return;
	}
	jenkins_hash_vector_packed(hash_packed, k, keylen, hashes);
}
/** \fn CMPH_HASH hash_get_type(hash_state_t *state)
 * \brief Report which hash function a state belongs to.
 * \param state is a pointer to a hash_state_t structure
 * \return the hash function type stored in state
 */
CMPH_HASH hash_get_type(hash_state_t *state)
{
	const CMPH_HASH type = state->hashfunc;
	return type;
}
cmph-2.0.2/src/Makefile.am 0000644 0001750 0001750 00000002504 13411542035 014626 0 ustar joseph joseph bin_PROGRAMS = cmph
noinst_PROGRAMS = bm_numbers
lib_LTLIBRARIES = libcmph.la
include_HEADERS = cmph.h cmph_types.h cmph_time.h chd_ph.h
libcmph_la_SOURCES = hash.h hash.c \
jenkins_hash.h jenkins_hash.c \
hash_state.h debug.h \
vstack.h vstack.c vqueue.h vqueue.c\
graph.h graph.c bitbool.h \
cmph.h cmph.c cmph_structs.h cmph_structs.c\
chm.h chm.c chm_structs.h \
bmz.h bmz.c bmz_structs.h \
bmz8.h bmz8.c bmz8_structs.h \
bdz.h bdz.c bdz_structs.h \
bdz_ph.h bdz_ph.c bdz_structs_ph.h \
brz.h brz.c brz_structs.h \
fch.h fch.c fch_structs.h \
fch_buckets.h fch_buckets.c \
chd.h chd.c chd_structs.h \
chd_ph.h chd_ph.c chd_structs_ph.h \
miller_rabin.h miller_rabin.c \
buffer_manager.h buffer_manager.c \
buffer_entry.h buffer_entry.c\
select.h select.c select_lookup_tables.h \
compressed_seq.h compressed_seq.c \
compressed_rank.h compressed_rank.c \
linear_string_map.h linear_string_map.c \
cmph_benchmark.h cmph_benchmark.c \
cmph_time.h
libcmph_la_LDFLAGS = -version-info 0:0:0
cmph_SOURCES = main.c wingetopt.h wingetopt.c
cmph_LDADD = libcmph.la
bm_numbers_SOURCES = bm_numbers.c
bm_numbers_LDADD = libcmph.la
cmph-2.0.2/src/debug.h 0000644 0001750 0001750 00000001675 13411542035 014041 0 ustar joseph joseph #ifdef DEBUGP
#undef DEBUGP
#endif
#ifdef __cplusplus
#include
#ifdef WIN32
#include
#endif
#else
#include
#ifdef WIN32
#include
#endif
#endif
#ifndef __GNUC__
#ifndef __DEBUG_H__
#define __DEBUG_H__
#include
/* printf-style debug output to stderr, used as DEBUGP on compilers without
 * GNU variadic macros.
 *
 * The previous implementation prepended "%s:%d " to the caller's format
 * without supplying a file name and line number, so vfprintf() consumed the
 * caller's first two arguments as a string and an int (undefined behaviour),
 * and it returned without va_end() when malloc() failed. A plain function
 * cannot see the caller's __FILE__/__LINE__, so the message is now printed
 * exactly as formatted by the caller. */
static void debugprintf(const char *format, ...)
{
	va_list ap;
	va_start(ap, format);
	vfprintf(stderr, format, ap);
	va_end(ap);
}
/* No-op stand-in for DEBUGP when DEBUG is not defined: accepts printf-style
 * arguments and discards them. */
static void dummyprintf(const char *format, ...)
{
	(void)format;
}
#endif
#endif
#ifdef DEBUG
#ifndef __GNUC__
#define DEBUGP debugprintf
#else
#define DEBUGP(args...) do { fprintf(stderr, "%s:%d ", __FILE__, __LINE__); fprintf(stderr, ## args); } while(0)
#endif
#else
#ifndef __GNUC__
#define DEBUGP dummyprintf
#else
#define DEBUGP(args...)
#endif
#endif
cmph-2.0.2/src/hashtree_structs.h 0000644 0001750 0001750 00000001421 13411542035 016332 0 ustar joseph joseph #ifndef __CMPH_HASHTREE_STRUCTS_H__
#define __CMPH_HASHTREE_STRUCTS_H__
#include "hash_state.h"
/* Data of a hashtree function: key count, constant c, per-component sizes, */
/* the g tables and the hash states h1/h2/h3. */
/* NOTE(review): h1/h2 presumably hold one hash state per component (k entries) - confirm against the users of this struct. */
struct __hashtree_data_t
{
cmph_uint32 m; //edges (words) count
double c; //constant c
cmph_uint8 *size; //size[i] stores the number of edges represented by g[i]
cmph_uint32 **g;
cmph_uint32 k; //number of components
hash_state_t **h1;
hash_state_t **h2;
hash_state_t *h3;
};
/* Configuration/working data for the hashtree algorithm: the leaf algorithm, */
/* up to three hash function types, and the per-component bookkeeping arrays. */
/* NOTE(review): the unit of `memory` is not visible in this header - confirm before relying on it. */
struct __hashtree_config_data_t
{
CMPH_ALGO leaf_algo;
CMPH_HASH hashfuncs[3];
cmph_uint32 m; //edges (words) count
cmph_uint8 *size; //size[i] stores the number of edges represented by g[i]
cmph_uint32 *offset; //offset[i] stores the sum size[0] + ... size[i - 1]
cmph_uint32 k; //number of components
cmph_uint32 memory;
hash_state_t **h1;
hash_state_t **h2;
hash_state_t *h3;
};
#endif
cmph-2.0.2/src/bdz_ph.h 0000755 0001750 0001750 00000003234 13411542035 014215 0 ustar joseph joseph #ifndef __CMPH_BDZ_PH_H__
#define __CMPH_BDZ_PH_H__
#include "cmph.h"
typedef struct __bdz_ph_data_t bdz_ph_data_t;
typedef struct __bdz_ph_config_data_t bdz_ph_config_data_t;
bdz_ph_config_data_t *bdz_ph_config_new(void);
void bdz_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void bdz_ph_config_destroy(cmph_config_t *mph);
cmph_t *bdz_ph_new(cmph_config_t *mph, double c);
void bdz_ph_load(FILE *f, cmph_t *mphf);
int bdz_ph_dump(cmph_t *mphf, FILE *f);
void bdz_ph_destroy(cmph_t *mphf);
cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void bdz_ph_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void bdz_ph_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 bdz_ph_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 bdz_ph_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/chd_ph.c 0000644 0001750 0001750 00000066167 13411542035 014202 0 ustar joseph joseph #include
#include
#include
#include
#include
#include
#include
#include "cmph_structs.h"
#include "chd_structs_ph.h"
#include "chd_ph.h"
#include"miller_rabin.h"
#include"bitbool.h"
//#define DEBUG
#include "debug.h"
// NO_ELEMENT is equivalent to null pointer
#ifndef NO_ELEMENT
#define NO_ELEMENT UINT_MAX
#endif
// struct used to represent items at mapping, ordering and searching phases
// f and h are the two per-key values derived from the hash vector:
// f is taken modulo n, h is in [1, n-1]; a key's bin is (f + h*probe0 + probe1) mod n.
struct _chd_ph_item_t
{
cmph_uint32 f;
cmph_uint32 h;
};
typedef struct _chd_ph_item_t chd_ph_item_t;
// struct to represent the items at mapping phase only.
// Same f/h pair as chd_ph_item_t plus the index of the bucket the key hashed to.
struct _chd_ph_map_item_t
{
cmph_uint32 f;
cmph_uint32 h;
cmph_uint32 bucket_num;
};
typedef struct _chd_ph_map_item_t chd_ph_map_item_t;
// struct to represent a bucket
// items_list is the offset of the bucket's first item in the shared items array.
// The union is read as `size` during mapping and rewritten as `bucket_id`
// (the original bucket index) by chd_ph_ordering once buckets are sorted by size.
struct _chd_ph_bucket_t
{
cmph_uint32 items_list; // offset
union
{
cmph_uint32 size;
cmph_uint32 bucket_id;
};
};
typedef struct _chd_ph_bucket_t chd_ph_bucket_t;
// One entry per bucket size: buckets_list is the offset, in the size-sorted
// bucket array, of the first bucket with that size; size is how many such buckets exist.
struct _chd_ph_sorted_list_t
{
cmph_uint32 buckets_list;
cmph_uint32 size;
};
typedef struct _chd_ph_sorted_list_t chd_ph_sorted_list_t;
static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets);
static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets);
static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets);
/* Allocate nbuckets zero-initialised buckets; returns whatever calloc returns
 * (NULL on allocation failure). */
chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets)
{
	return (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t));
}
/* Reset the size of every bucket to zero; items_list is left untouched. */
void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets)
{
	chd_ph_bucket_t *cur;
	chd_ph_bucket_t *end;
	assert(buckets);
	end = buckets + nbuckets;
	for (cur = buckets; cur != end; ++cur)
	{
		cur->size = 0;
	}
}
/* Append map_items[item_idx] to its bucket's slice of the items array.
 * Returns 0 (and adds nothing) when the bucket already holds an item with the
 * same (f, h) pair, 1 after a successful insertion.
 * nbuckets is accepted for interface compatibility and is not used. */
static cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items, chd_ph_item_t * items,
		cmph_uint32 nbuckets,cmph_uint32 item_idx)
{
	const chd_ph_map_item_t *candidate = &map_items[item_idx];
	chd_ph_bucket_t *target = &buckets[candidate->bucket_num];
	chd_ph_item_t *slots = &items[target->items_list];
	const cmph_uint32 used = target->size;
	cmph_uint32 k;

	/* Reject duplicates already stored in this bucket. */
	for (k = 0; k < used; k++)
	{
		if (slots[k].f == candidate->f && slots[k].h == candidate->h)
		{
			DEBUGP("Item not added\n");
			return 0;
		}
	}
	/* First free slot is right after the items inserted so far. */
	slots[used].f = candidate->f;
	slots[used].h = candidate->h;
	target->size = used + 1;
	return 1;
}
/* Release a bucket array obtained from chd_ph_bucket_new (NULL is accepted,
 * as with free). */
void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets)
{
	free(buckets);
}
static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items,
cmph_uint32 *max_bucket_size);
static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items,
cmph_uint32 nbuckets,cmph_uint32 nitems, cmph_uint32 max_bucket_size);
static cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table);
/* Evaluate (1 + (r/n - 1 + 1/(2n)) * ln(1 - n/r)) / ln 2 for the given
 * n and r; chd_ph_new calls it with n = number of keys and r = number of bins
 * and reports the result as bits per key. */
static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r)
{
	const double r = _r;
	const double n = _n;
	const double factor = r/n - 1.0 + 1.0/(2.0*n);
	const double log_free = log(1 - n/r);
	return (1 + factor*log_free)/log(2);
}
/* Computes the entropy (in bits, summed over all n entries) of the
 * displacement values stored in disp_table.
 *
 * disp_table: n displacement values
 * max_probes: exclusive upper bound used to size the histogram
 *
 * Returns 0 when the histogram cannot be allocated. Values that do not fit
 * the histogram are skipped instead of being written out of bounds. */
static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes)
{
	cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32));
	cmph_uint32 i;
	double entropy = 0;

	if (probe_counts == NULL)
	{
		return 0;
	}
	/* Histogram of displacement values. */
	for(i = 0; i < n; i++)
	{
		if (disp_table[i] < max_probes)
		{
			probe_counts[disp_table[i]]++;
		}
	}
	/* Sum of -count * log2(count / n) over the non-empty histogram cells. */
	for(i = 0; i < max_probes; i++)
	{
		if(probe_counts[i] > 0)
			entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2);
	}
	free(probe_counts);
	return entropy;
}
/* Allocate a zero-filled chd_ph configuration with its defaults:
 * Jenkins hash, use_h enabled, 1 key per bin and 4 keys per bucket.
 * Returns NULL on allocation failure. */
chd_ph_config_data_t *chd_ph_config_new(void)
{
	chd_ph_config_data_t *cfg = (chd_ph_config_data_t *)calloc(1, sizeof(chd_ph_config_data_t));
	if (cfg == NULL)
	{
		return NULL;
	}
	/* Explicit defaults, including those that merely restate the zero fill. */
	cfg->hashfunc = CMPH_HASH_JENKINS;
	cfg->hl = NULL;
	cfg->cs = NULL;
	cfg->occup_table = 0;
	cfg->nbuckets = 0;
	cfg->n = 0;
	cfg->m = 0;
	cfg->use_h = 1;
	cfg->keys_per_bin = 1;
	cfg->keys_per_bucket = 4;
	return cfg;
}
/* Free the chd_ph-specific configuration attached to mph, including its
 * occupancy table if one was allocated. */
void chd_ph_config_destroy(cmph_config_t *mph)
{
	chd_ph_config_data_t *cfg = (chd_ph_config_data_t *) mph->data;
	DEBUGP("Destroying algorithm dependent data\n");
	/* free(NULL) is a no-op, so no explicit NULL test is required. */
	free(cfg->occup_table);
	cfg->occup_table = NULL;
	free(cfg);
}
/* Select the hash function used by chd_ph.
 * hashfuncs is a list terminated by CMPH_HASH_COUNT; chd_ph only uses one
 * linear hash function, so only the first entry (if any) is taken. */
void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	chd_ph_config_data_t *cfg = (chd_ph_config_data_t *)mph->data;
	if (hashfuncs[0] != CMPH_HASH_COUNT)
	{
		cfg->hashfunc = hashfuncs[0];
	}
}
/* Set the average number of keys per bucket.
 * Values outside 1..14 fall back to the default of 4. */
void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
{
	chd_ph_config_data_t *cfg;
	assert(mph);
	cfg = (chd_ph_config_data_t *)mph->data;
	cfg->keys_per_bucket = (keys_per_bucket >= 1 && keys_per_bucket < 15) ? keys_per_bucket : 4;
}
/* Set the maximum number of keys allowed in one bin.
 * Values outside 2..127 fall back to 1 key per bin. */
void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
{
	chd_ph_config_data_t *cfg;
	assert(mph);
	cfg = (chd_ph_config_data_t *)mph->data;
	cfg->keys_per_bin = (keys_per_bin > 1 && keys_per_bin < 128) ? keys_per_bin : 1;
}
/* Mapping step: hash every key into a bucket and record its (f, h) pair.
 *
 * mph:             configuration; keys are read from mph->key_source
 * buckets:         array of chd_ph->nbuckets buckets, filled in place
 * items:           array of chd_ph->m items, filled bucket by bucket
 * max_bucket_size: receives the largest bucket size seen
 *
 * A fresh hash state is drawn for each attempt; an attempt fails when two
 * keys of the same bucket produce the same (f, h) pair. Up to 1000 attempts
 * are made.
 *
 * Returns 1 on success (chd_ph->hl holds the hash state that worked) and 0 on
 * failure, including failure to allocate the temporary mapping table. */
cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size)
{
	register cmph_uint32 i = 0, g = 0;
	cmph_uint32 hl[3];
	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
	char * key = NULL;
	cmph_uint32 keylen = 0;
	chd_ph_map_item_t * map_item;
	chd_ph_map_item_t * map_items = (chd_ph_map_item_t *)malloc(chd_ph->m*sizeof(chd_ph_map_item_t));
	register cmph_uint32 mapping_iterations = 1000;
	*max_bucket_size = 0;
	/* malloc(0) may legitimately return NULL, so only treat NULL as a failure
	 * when there is something to store. */
	if (map_items == NULL && chd_ph->m > 0)
	{
		return 0; // FAILURE
	}
	while(1)
	{
		mapping_iterations--;
		if (chd_ph->hl) hash_state_destroy(chd_ph->hl);
		chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m);

		chd_ph_bucket_clean(buckets, chd_ph->nbuckets);

		mph->key_source->rewind(mph->key_source->data);

		/* Pass 1: compute (bucket, f, h) for each key and count bucket sizes. */
		for(i = 0; i < chd_ph->m; i++)
		{
			mph->key_source->read(mph->key_source->data, &key, &keylen);
			hash_vector(chd_ph->hl, key, keylen, hl);

			map_item = (map_items + i);

			g = hl[0] % chd_ph->nbuckets;
			map_item->f = hl[1] % chd_ph->n;
			map_item->h = hl[2] % (chd_ph->n - 1) + 1;
			map_item->bucket_num=g;
			mph->key_source->dispose(mph->key_source->data, key, keylen);
			// if(buckets[g].size == (chd_ph->keys_per_bucket << 2))
			// {
			// DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2));
			// goto error;
			// }
			buckets[g].size++;
			if(buckets[g].size > *max_bucket_size)
			{
				*max_bucket_size = buckets[g].size;
			}
		}
		/* Turn the counts into start offsets (prefix sum) and reset the sizes
		 * so that chd_ph_bucket_insert can use them as fill counters. */
		buckets[0].items_list = 0;
		for(i = 1; i < chd_ph->nbuckets; i++)
		{
			buckets[i].items_list = buckets[i-1].items_list + buckets[i - 1].size;
			buckets[i - 1].size = 0;
		}
		buckets[i - 1].size = 0;
		/* Pass 2: store every item in its bucket; stop at the first duplicate. */
		for(i = 0; i < chd_ph->m; i++)
		{
			if(!chd_ph_bucket_insert(buckets, map_items, items, chd_ph->nbuckets, i))
				break;
		}
		if(i == chd_ph->m)
		{
			free(map_items);
			return 1; // SUCCESS
		}

		if(mapping_iterations == 0)
		{
			goto error;
		}
	}
error:
	free(map_items);
	hash_state_destroy(chd_ph->hl);
	chd_ph->hl = NULL;
	return 0; // FAILURE
}
/* Ordering step: sort buckets (and their items) by bucket size.
 *
 * _buckets:        in: buckets as produced by the mapping step (freed here);
 *                  out: new array grouped by increasing size, whose union
 *                  field now holds bucket_id (the original bucket index)
 * _items:          in: items in mapping order (freed here);
 *                  out: items laid out in the new bucket order
 * nbuckets/nitems: number of entries in the two arrays
 * max_bucket_size: largest bucket size present
 *
 * Returns an array indexed by bucket size (entries 1..max_bucket_size are
 * meaningful) giving, for each size, the offset of the first bucket of that
 * size in the new bucket array and the number of such buckets. The caller
 * owns and must free the returned array. */
chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_t ** _items,
		cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size)
{
	/* Entry [1] is written unconditionally below, so at least two entries are
	 * required even when max_bucket_size is 0 (empty key set). The original
	 * max_bucket_size + 1 allocation overflowed the heap in that case. */
	chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc((size_t)max_bucket_size + 2, sizeof(chd_ph_sorted_list_t));

	chd_ph_bucket_t * input_buckets = (*_buckets);
	chd_ph_bucket_t * output_buckets;
	chd_ph_item_t * input_items = (*_items);
	chd_ph_item_t * output_items;
	register cmph_uint32 i, j, bucket_size, position, position2;
	// cmph_uint32 non_empty_buckets;
	DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size);
	// Determine size of each list of buckets
	for(i = 0; i < nbuckets; i++)
	{
		bucket_size = input_buckets[i].size;
		if(bucket_size == 0)
			continue;
		sorted_lists[bucket_size].size++;
	}
	sorted_lists[1].buckets_list = 0;
	// Determine final position of list of buckets into the contiguous array that will store all the buckets
	for(i = 2; i <= max_bucket_size; i++)
	{
		sorted_lists[i].buckets_list = sorted_lists[i-1].buckets_list + sorted_lists[i-1].size;
		sorted_lists[i-1].size = 0;
	}
	sorted_lists[i-1].size = 0;
	// Store the buckets in a new array which is sorted by bucket sizes
	output_buckets = (chd_ph_bucket_t *)calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero
	// non_empty_buckets = nbuckets;

	for(i = 0; i < nbuckets; i++)
	{
		bucket_size = input_buckets[i].size;
		if(bucket_size == 0)
		{
			// non_empty_buckets--;
			continue;
		}
		/* sorted_lists[].size is rebuilt here as the fill counter of each list. */
		position = sorted_lists[bucket_size].buckets_list + sorted_lists[bucket_size].size;
		output_buckets[position].bucket_id = i;
		output_buckets[position].items_list = input_buckets[i].items_list;
		sorted_lists[bucket_size].size++;
	}
	/* for(i = non_empty_buckets; i < nbuckets; i++)
	output_buckets[i].size=0;*/
	// Return the buckets sorted in new order and free the old buckets sorted in old order
	free(input_buckets);
	(*_buckets) = output_buckets;

	// Store the items according to the new order of buckets.
	output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t));
	position = 0;
	i = 0;
	for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
	{
		for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++)
		{
			/* position2 walks the old item slice, position the new one. */
			position2 = output_buckets[i].items_list;
			output_buckets[i].items_list = position;
			for(j = 0; j < bucket_size; j++)
			{
				output_items[position].f = input_items[position2].f;
				output_items[position].h = input_items[position2].h;
				position++;
				position2++;
			}
		}
	}
	//Return the items sorted in new order and free the old items sorted in old order
	free(input_items);
	(*_items) = output_items;
	return sorted_lists;
}
/* Try to place all `size` items of bucket `bucket_num` using the probe pair */
/* (probe0_num, probe1_num). Each item goes to bin */
/* (f + h * probe0_num + probe1_num) mod n. */
/* With keys_per_bin > 1 occup_table is used as one counter byte per bin; */
/* otherwise it is used as a bitmap through GETBIT32/SETBIT32. */
/* Returns 1 when every item was placed; returns 0 after undoing the partial */
/* placement when some target bin was already full. */
static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets,
chd_ph_item_t *items, cmph_uint32 probe0_num, cmph_uint32 probe1_num,
cmph_uint32 bucket_num, cmph_uint32 size)
{
register cmph_uint32 i;
register chd_ph_item_t * item;
register cmph_uint32 position;
item = items + buckets[bucket_num].items_list;
// try place bucket with probe_num
if(chd_ph->keys_per_bin > 1)
{
for(i = 0; i < size; i++) // placement
{
position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
{
break;
}
(chd_ph->occup_table[position])++;
item++;
};
} else
{
for(i = 0; i < size; i++) // placement
{
position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
if(GETBIT32(((cmph_uint32 *)chd_ph->occup_table), position))
{
break;
}
SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
item++;
};
};
// i is the number of items placed before the first collision (== size on success)
if(i != size) // Undo the placement
{
// restart from the bucket's first item and revert exactly the i placements made above
item = items + buckets[bucket_num].items_list;
if(chd_ph->keys_per_bin > 1)
{
while(1)
{
if(i == 0)
{
break;
}
position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
(chd_ph->occup_table[position])--;
item++;
i--;
};
} else
{
while(1)
{
if(i == 0)
{
break;
}
position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
// ([position/32]^=(1<<(position%32));
item++;
i--;
};
};
return 0;
}
return 1;
};
/* Find a probe pair that places bucket `bucket_num` (holding `size` items).
 * Probe pairs are tried in order (0,0), (1,0), ..., (n-1,0), (0,1), ... for at
 * most max_probes attempts and while the second component stays below n.
 * On success the displacement probe0 + probe1 * n is stored in disp_table at
 * the bucket's original id and 1 is returned; otherwise 0 is returned. */
static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes,
		cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size)
{
	cmph_uint32 attempts = 0;
	cmph_uint32 p0 = 0;
	cmph_uint32 p1 = 0;

	do
	{
		if (place_bucket_probe(chd_ph, buckets, items, p0, p1, bucket_num, size))
		{
			disp_table[buckets[bucket_num].bucket_id] = p0 + p1 * chd_ph->n;
			return 1;
		}
		/* Advance to the next probe pair, carrying into p1 when p0 wraps. */
		p0++;
		if (p0 >= chd_ph->n)
		{
			p0 -= chd_ph->n;
			p1++;
		}
		attempts++;
	} while (attempts < max_probes && p1 < chd_ph->n);

	return 0;
}
/* Place every bucket, one at a time, from the largest bucket size down to 1.
 * Returns 0 as soon as one bucket cannot be placed, 1 when all were placed. */
static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items,
		cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
		cmph_uint32 * disp_table)
{
	cmph_uint32 bsize;
	cmph_uint32 idx;

	for (bsize = max_bucket_size; bsize > 0; bsize--)
	{
		for (idx = sorted_lists[bsize].buckets_list;
		     idx < sorted_lists[bsize].size + sorted_lists[bsize].buckets_list;
		     idx++)
		{
			if (!place_bucket(chd_ph, buckets, items, max_probes, disp_table, idx, bsize))
			{
				return 0;
			}
		}
	}
	return 1;
}
/* Heuristic placement: for each bucket size, from the largest down to 1, try */
/* the current probe pair on every still-unplaced bucket of that size before */
/* moving on to the next probe pair. */
/* Buckets that could not be placed are compacted to the front of the size's */
/* list so that the next round only revisits them; sorted_lists[i].size is */
/* temporarily shrunk to the number of unplaced buckets and restored afterwards. */
/* Returns 1 when all buckets were placed, 0 when max_probes rounds (or n values */
/* of the second probe component) were exhausted for some size. */
static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items,
cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
cmph_uint32 * disp_table)
{
register cmph_uint32 i,j, non_placed_bucket;
register cmph_uint32 curr_bucket;
register cmph_uint32 probe_num, probe0_num, probe1_num;
cmph_uint32 sorted_list_size;
#ifdef DEBUG
cmph_uint32 items_list;
cmph_uint32 bucket_id;
#endif
DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n");
for(i = max_bucket_size; i > 0; i--)
{
probe_num = 0;
probe0_num = 0;
probe1_num = 0;
// remember the real list length; it is restored once this size is done
sorted_list_size = sorted_lists[i].size;
while(sorted_lists[i].size != 0)
{
curr_bucket = sorted_lists[i].buckets_list;
for(j = 0, non_placed_bucket = 0; j < sorted_lists[i].size; j++)
{
// if bucket is successfully placed remove it from list
if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i))
{
disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n;
// DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]);
}
else
{
// DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket);
// move the unplaced bucket to slot non_placed_bucket of this list;
// DEBUG builds swap the two entries, other builds overwrite the slot
#ifdef DEBUG
items_list = buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list;
bucket_id = buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id;
#endif
buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list;
buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id;
#ifdef DEBUG
buckets[curr_bucket].items_list=items_list;
buckets[curr_bucket].bucket_id=bucket_id;
#endif
non_placed_bucket++;
}
curr_bucket++;
};
// only the unplaced buckets are revisited with the next probe pair
sorted_lists[i].size = non_placed_bucket;
probe0_num++;
if(probe0_num >= chd_ph->n)
{
probe0_num -= chd_ph->n;
probe1_num++;
};
probe_num++;
if(probe_num >= max_probes || probe1_num >= chd_ph->n)
{
sorted_lists[i].size = sorted_list_size;
return 0;
};
};
sorted_lists[i].size = sorted_list_size;
};
return 1;
};
/* Searching step: compute a displacement for every bucket.
 * Uses the heuristic placement (place_buckets2) when chd_ph->use_h is set and
 * the bucket-by-bucket placement (place_buckets1) otherwise.
 * Returns 1 on success, 0 when some bucket could not be placed. */
cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
		cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
		cmph_uint32 * disp_table)
{
	if (!chd_ph->use_h)
	{
		return place_buckets1(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
	}
	return place_buckets2(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
}
/* Debug check: clear the occupancy table, then replay every bucket's stored
 * displacement and verify that no bin receives more than keys_per_bin keys.
 * Returns 1 when the placement is consistent, 0 at the first overfull bin. */
static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items,
		cmph_uint32 * disp_table, chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size)
{
	cmph_uint32 list_len, b, k;
	cmph_uint32 slot, step0, step1, disp;
	cmph_uint32 placed = 0;
	chd_ph_item_t *cur;

	/* Counter bytes when several keys share a bin, bitmap words otherwise. */
	if (chd_ph->keys_per_bin > 1)
	{
		memset(chd_ph->occup_table, 0, chd_ph->n);
	}
	else
	{
		memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
	}

	for (list_len = 1; list_len <= max_bucket_size; list_len++)
	{
		for (b = sorted_lists[list_len].buckets_list;
		     b < sorted_lists[list_len].size + sorted_lists[list_len].buckets_list;
		     b++)
		{
			cur = items + buckets[b].items_list;
			/* Split the stored displacement back into its probe pair. */
			disp = disp_table[buckets[b].bucket_id];
			step0 = disp % chd_ph->n;
			step1 = disp / chd_ph->n;
			for (k = 0; k < list_len; k++, cur++)
			{
				placed++;
				slot = (cmph_uint32)((cur->f + ((cmph_uint64 )cur->h) * step0 + step1) % chd_ph->n);
				if (chd_ph->keys_per_bin > 1)
				{
					if (chd_ph->occup_table[slot] >= chd_ph->keys_per_bin)
					{
						return 0;
					}
					(chd_ph->occup_table[slot])++;
				}
				else
				{
					if (GETBIT32(((cmph_uint32*)chd_ph->occup_table), slot))
					{
						return 0;
					}
					SETBIT32(((cmph_uint32*)chd_ph->occup_table), slot);
				}
			}
		}
	}
	DEBUGP("We were able to place m = %u keys\n", placed);
	return 1;
}
/* Build a chd_ph perfect hash function for the keys of mph->key_source. */
/* mph: configuration whose data field is a chd_ph_config_data_t */
/* c:   requested load factor; clamped to the range [0.5, 0.99] */
/* Runs mapping, ordering and searching up to 100 times, then compresses the */
/* displacement table. Returns the new cmph_t (which takes ownership of the */
/* hash state and compressed sequence), or NULL on failure. */
cmph_t *chd_ph_new(cmph_config_t *mph, double c)
{
cmph_t *mphf = NULL;
chd_ph_data_t *chd_phf = NULL;
chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
register double load_factor = c;
register cmph_uint8 searching_success = 0;
register cmph_uint32 max_probes_default = 1 << 20; // default value for max_probes
register cmph_uint32 max_probes;
register cmph_uint32 iterations = 100;
chd_ph_bucket_t * buckets = NULL;
chd_ph_item_t * items = NULL;
register cmph_uint8 failure = 0;
cmph_uint32 max_bucket_size = 0;
chd_ph_sorted_list_t * sorted_lists = NULL;
cmph_uint32 * disp_table = NULL;
register double space_lower_bound = 0;
#ifdef CMPH_TIMING
double construction_time_begin = 0.0;
double construction_time = 0.0;
ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
#endif
chd_ph->m = mph->key_source->nkeys;
DEBUGP("m = %u\n", chd_ph->m);
// one bucket per keys_per_bucket keys, plus one
chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1;
DEBUGP("nbuckets = %u\n", chd_ph->nbuckets);
if(load_factor < 0.5 )
{
load_factor = 0.5;
}
if(load_factor >= 0.99)
{
load_factor = 0.99;
}
DEBUGP("load_factor = %.3f\n", load_factor);
// number of bins needed to hold m keys at the requested load factor
chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1;
//Round the number of bins to the prime immediately above
if(chd_ph->n % 2 == 0) chd_ph->n++;
for(;;)
{
if(check_primality(chd_ph->n) == 1)
break;
chd_ph->n += 2; // just odd numbers can be primes for n > 2
};
DEBUGP("n = %u \n", chd_ph->n);
// the lower bound is only computed for one key per bin; otherwise 0 is reported
if(chd_ph->keys_per_bin == 1)
{
space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n);
}
if(mph->verbosity)
{
fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound);
}
// We allocate the working tables
// NOTE(review): none of the allocations below is checked for NULL.
buckets = chd_ph_bucket_new(chd_ph->nbuckets);
items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));
// max_probes = floor(log2(m) / 20) * 2^20, or 2^20 when that factor is 0
// NOTE(review): for m == 0 this converts log(0) to an unsigned integer - confirm callers never pass an empty key set.
max_probes = (cmph_uint32)((log(chd_ph->m)/log(2))/20);
if (max_probes == 0) {
max_probes = max_probes_default;
} else {
max_probes = max_probes * max_probes_default;
}
// occupancy table: one bit per bin when keys_per_bin == 1, one byte per bin otherwise
// NOTE(review): a previously stored occup_table pointer is overwritten here without being freed.
if(chd_ph->keys_per_bin == 1)
chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));
else
chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));
//
// init_genrand(time(0));
// retry mapping + ordering + searching until searching succeeds or 100 iterations were used
while(1)
{
iterations --;
if (mph->verbosity)
{
fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n);
}
if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size))
{
if (mph->verbosity)
{
fprintf(stderr, "Failure in mapping step\n");
}
failure = 1;
goto cleanup;
}
if (mph->verbosity)
{
fprintf(stderr, "Starting ordering step\n");
}
if(sorted_lists)
{
free(sorted_lists);
}
// chd_ph_ordering frees the old buckets/items arrays and stores the new ones through the pointers
sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size);
if (mph->verbosity)
{
fprintf(stderr, "Starting searching step\n");
}
searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
if(searching_success) break;
// reset occup_table
if(chd_ph->keys_per_bin > 1)
memset(chd_ph->occup_table, 0, chd_ph->n);
else
memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
if(iterations == 0)
{
// Cleanup memory
if (mph->verbosity)
{
fprintf(stderr, "Failure because the max trials was exceeded\n");
}
failure = 1;
goto cleanup;
};
}
#ifdef DEBUG
{
if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size))
{
DEBUGP("Error for bin packing generation");
failure = 1;
goto cleanup;
}
}
#endif
if (mph->verbosity)
{
fprintf(stderr, "Starting compressing step\n");
}
// NOTE(review): an existing chd_ph->cs is released with free() only, without compressed_seq_destroy - confirm it owns no further memory.
if(chd_ph->cs)
{
free(chd_ph->cs);
}
chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
compressed_seq_init(chd_ph->cs);
compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
#ifdef CMPH_TIMING
ELAPSED_TIME_IN_SECONDS(&construction_time);
register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m);
#endif
// shared exit path: the working tables are released on success and on failure
cleanup:
chd_ph_bucket_destroy(buckets);
free(items);
free(sorted_lists);
free(disp_table);
if(failure)
{
if(chd_ph->hl)
{
hash_state_destroy(chd_ph->hl);
}
chd_ph->hl = NULL;
return NULL;
}
// NOTE(review): the two malloc results below are used without a NULL check.
mphf = (cmph_t *)malloc(sizeof(cmph_t));
mphf->algo = mph->algo;
chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
chd_phf->cs = chd_ph->cs;
chd_ph->cs = NULL; //transfer memory ownership
chd_phf->hl = chd_ph->hl;
chd_ph->hl = NULL; //transfer memory ownership
chd_phf->n = chd_ph->n;
chd_phf->nbuckets = chd_ph->nbuckets;
mphf->data = chd_phf;
mphf->size = chd_ph->n;
DEBUGP("Successfully generated minimal perfect hash\n");
if (mph->verbosity)
{
fprintf(stderr, "Successfully generated minimal perfect hash function\n");
}
#ifdef CMPH_TIMING
register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8;
construction_time = construction_time - construction_time_begin;
fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m);
#endif
return mphf;
}
/* Read a chd_ph function from fd into mphf->data.
 * The stream layout is: hash-state length, hash-state bytes,
 * compressed-sequence length, compressed-sequence bytes, n, nbuckets.
 * NOTE(review): read and allocation results are not checked, and the function
 * has no way to report a short or corrupt file to its caller. */
void chd_ph_load(FILE *fd, cmph_t *mphf)
{
	cmph_uint32 section_len;
	char *section = NULL;
	size_t got;
	chd_ph_data_t *loaded = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));

	DEBUGP("Loading chd_ph mphf\n");
	mphf->data = loaded;

	/* Hash state section. */
	got = fread(&section_len, sizeof(cmph_uint32), (size_t)1, fd);
	DEBUGP("Hash state has %u bytes\n", section_len);
	section = (char *)malloc((size_t)section_len);
	got = fread(section, (size_t)section_len, (size_t)1, fd);
	loaded->hl = hash_state_load(section, section_len);
	free(section);

	/* Compressed sequence section. */
	got = fread(&section_len, sizeof(cmph_uint32), (size_t)1, fd);
	DEBUGP("Compressed sequence structure has %u bytes\n", section_len);
	section = (char *)malloc((size_t)section_len);
	got = fread(section, (size_t)section_len, (size_t)1, fd);
	loaded->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
	compressed_seq_load(loaded->cs, section, section_len);
	free(section);

	// loading n and nbuckets
	DEBUGP("Reading n and nbuckets\n");
	got = fread(&(loaded->n), sizeof(cmph_uint32), (size_t)1, fd);
	got = fread(&(loaded->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
	(void)got;
}
/**
 * Serialize a chd_ph function to fd.
 *
 * Writes the generic header via __cmph_dump(), then the length-prefixed hash
 * state, the length-prefixed compressed sequence, and finally n and nbuckets.
 * Always returns 1; the fwrite() results are not inspected.
 */
int chd_ph_dump(cmph_t *mphf, FILE *fd)
{
	chd_ph_data_t *src = (chd_ph_data_t *)mphf->data;
	char *blob = NULL;
	cmph_uint32 blob_len;
	register size_t wrote;

	__cmph_dump(mphf, fd);

	/* Hash state: the dump routine hands back a buffer that is freed here. */
	hash_state_dump(src->hl, &blob, &blob_len);
	DEBUGP("Dumping hash state with %u bytes to disk\n", blob_len);
	wrote = fwrite(&blob_len, sizeof(cmph_uint32), (size_t)1, fd);
	wrote = fwrite(blob, (size_t)blob_len, (size_t)1, fd);
	free(blob);

	/* Compressed sequence, same length-prefixed framing. */
	compressed_seq_dump(src->cs, &blob, &blob_len);
	DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", blob_len);
	wrote = fwrite(&blob_len, sizeof(cmph_uint32), (size_t)1, fd);
	wrote = fwrite(blob, (size_t)blob_len, (size_t)1, fd);
	free(blob);

	/* Trailing scalars: n first, then nbuckets. */
	wrote = fwrite(&(src->n), sizeof(cmph_uint32), (size_t)1, fd);
	wrote = fwrite(&(src->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
	(void)wrote;
	return 1;
}
/**
 * Release a chd_ph function: the compressed sequence (contents, then the
 * struct itself), the hash state, the algorithm data and finally mphf.
 */
void chd_ph_destroy(cmph_t *mphf)
{
	chd_ph_data_t *payload = (chd_ph_data_t *)mphf->data;
	compressed_seq_t *seq = payload->cs;

	compressed_seq_destroy(seq);
	free(seq);
	hash_state_destroy(payload->hl);
	free(payload);
	free(mphf);
}
/**
 * Evaluate the chd_ph function for a key.
 *
 * Three hash values select the bucket (mod nbuckets), the start position
 * (mod n) and the step (in 1..n-1). The bucket's stored displacement is split
 * into two probe counters (disp % n and disp / n), which are combined with
 * the start and step to produce a position in [0, n).
 */
cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	chd_ph_data_t *fn = (chd_ph_data_t *)mphf->data;
	cmph_uint32 hashes[3];
	cmph_uint32 bucket, start, step;
	cmph_uint32 displacement, probe0, probe1;
	cmph_uint64 slot;

	hash_vector(fn->hl, key, keylen, hashes);
	bucket = hashes[0] % fn->nbuckets;
	start = hashes[1] % fn->n;
	step = hashes[2] % (fn->n - 1) + 1;

	displacement = compressed_seq_query(fn->cs, bucket);
	probe0 = displacement % fn->n;
	probe1 = displacement / fn->n;

	/* The product is widened to 64 bits so step * probe0 cannot wrap. */
	slot = start + ((cmph_uint64)step) * probe0 + probe1;
	return (cmph_uint32)(slot % fn->n);
}
/**
 * Pack a chd_ph function into the caller-provided buffer packed_mphf.
 *
 * Layout written: hash type tag (cmph_uint32), packed hash state, n,
 * nbuckets, packed compressed sequence.
 */
void chd_ph_pack(cmph_t *mphf, void *packed_mphf)
{
	chd_ph_data_t *src = (chd_ph_data_t *)mphf->data;
	cmph_uint8 *out = (cmph_uint8 *)packed_mphf;
	CMPH_HASH hash_kind = hash_get_type(src->hl);
	cmph_uint32 *word;

	/* Hash function type tag. */
	word = (cmph_uint32 *)out;
	*word = hash_kind;
	out += sizeof(cmph_uint32);

	/* Hash state. */
	hash_state_pack(src->hl, out);
	out += hash_state_packed_size(hash_kind);

	/* n */
	word = (cmph_uint32 *)out;
	*word = src->n;
	out += sizeof(src->n);

	/* nbuckets */
	word = (cmph_uint32 *)out;
	*word = src->nbuckets;
	out += sizeof(src->nbuckets);

	/* Compressed sequence is the last section, so the cursor is not advanced afterwards. */
	compressed_seq_pack(src->cs, out);
}
/**
 * Return the number of bytes reported as needed to pack mphf:
 * sizeof(CMPH_ALGO) + packed hash state + packed compressed sequence
 * + three cmph_uint32 words.
 */
cmph_uint32 chd_ph_packed_size(cmph_t *mphf)
{
	chd_ph_data_t *src = (chd_ph_data_t *)mphf->data;
	CMPH_HASH hash_kind = hash_get_type(src->hl);
	cmph_uint32 state_bytes = hash_state_packed_size(hash_kind);
	cmph_uint32 seq_bytes = compressed_seq_packed_size(src->cs);
	size_t total = sizeof(CMPH_ALGO) + state_bytes + seq_bytes + 3*sizeof(cmph_uint32);

	return (cmph_uint32)total;
}
/**
 * Evaluate a packed chd_ph function for a key.
 *
 * Reads the layout: hash type tag (4 bytes), packed hash state, n, nbuckets,
 * packed compressed sequence; then performs the same position computation as
 * the unpacked search.
 */
cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	cmph_uint8 *base = (cmph_uint8 *)packed_mphf;
	CMPH_HASH hash_kind = (CMPH_HASH)*(cmph_uint32 *)base;
	cmph_uint8 *state = base + 4;
	cmph_uint32 *words = (cmph_uint32 *)(state + hash_state_packed_size(hash_kind));
	cmph_uint32 n = words[0];
	cmph_uint32 nbuckets = words[1];
	cmph_uint32 *packed_seq = words + 2;
	cmph_uint32 hashes[3];
	cmph_uint32 bucket, start, step;
	cmph_uint32 displacement, probe0, probe1;
	cmph_uint64 slot;

	hash_vector_packed(state, hash_kind, key, keylen, hashes);
	bucket = hashes[0] % nbuckets;
	start = hashes[1] % n;
	step = hashes[2] % (n - 1) + 1;

	displacement = compressed_seq_query_packed(packed_seq, bucket);
	probe0 = displacement % n;
	probe1 = displacement / n;

	/* The product is widened to 64 bits so step * probe0 cannot wrap. */
	slot = start + ((cmph_uint64)step) * probe0 + probe1;
	return (cmph_uint32)(slot % n);
}
cmph-2.0.2/src/cmph_structs.c 0000644 0001750 0001750 00000003100 13411542035 015445 0 ustar joseph joseph #include "cmph_structs.h"
#include
//#define DEBUG
#include "debug.h"
/**
 * Allocate a zero-filled configuration bound to key_source.
 * Returns NULL when the allocation fails.
 */
cmph_config_t *__config_new(cmph_io_adapter_t *key_source)
{
	cmph_config_t *cfg = (cmph_config_t *)calloc((size_t)1, sizeof(cmph_config_t));

	if (cfg == NULL)
	{
		return NULL;
	}
	cfg->key_source = key_source;
	cfg->verbosity = 0;
	cfg->data = NULL;
	cfg->c = 0;
	return cfg;
}
/* Free the configuration struct itself; mph->data and mph->key_source are not touched here. */
void __config_destroy(cmph_config_t *mph)
{
free(mph);
}
/**
 * Write the generic header of a dumped function: the NUL-terminated
 * algorithm name followed by mphf->size. The fwrite() results are ignored.
 */
void __cmph_dump(cmph_t *mphf, FILE *fd)
{
	const char *algo_name = cmph_names[mphf->algo];
	size_t name_bytes = (size_t)(strlen(cmph_names[mphf->algo]) + 1); /* include the terminator */
	register size_t wrote;

	wrote = fwrite(algo_name, name_bytes, (size_t)1, fd);
	wrote = fwrite(&(mphf->size), sizeof(mphf->size), (size_t)1, fd);
	(void)wrote;
}
/**
 * Read the generic header written by __cmph_dump() and allocate the cmph_t.
 *
 * The header is a NUL-terminated algorithm name followed by the function
 * size. mphf->data is left NULL for the algorithm-specific loader to fill.
 *
 * Returns NULL when the stream ends early, the name is not terminated within
 * BUFSIZ bytes, the algorithm name is unknown, or the allocation fails.
 */
cmph_t *__cmph_load(FILE *f)
{
	cmph_t *mphf = NULL;
	cmph_uint32 i;
	char algo_name[BUFSIZ];
	CMPH_ALGO algo = CMPH_COUNT;
	int terminated = 0;

	DEBUGP("Loading mphf\n");
	for (i = 0; i < BUFSIZ; i++)
	{
		size_t c = fread(&algo_name[i], (size_t)1, (size_t)1, f);
		if (c != 1) return NULL;
		if (algo_name[i] == 0)
		{
			terminated = 1;
			break;
		}
	}
	/* Use an explicit flag: when no NUL was found, i == BUFSIZ and
	 * algo_name[i] would be one element past the end of the buffer. */
	if (!terminated)
	{
		DEBUGP("Attempted buffer overflow while loading mph file\n");
		return NULL;
	}
	for (i = 0; i < CMPH_COUNT; ++i)
	{
		if (strcmp(algo_name, cmph_names[i]) == 0)
		{
			algo = (CMPH_ALGO)(i);
		}
	}
	if (algo == CMPH_COUNT)
	{
		DEBUGP("Algorithm %s not found\n", algo_name);
		return NULL;
	}
	mphf = (cmph_t *)malloc(sizeof(cmph_t));
	if (mphf == NULL) return NULL;
	mphf->algo = algo;
	mphf->data = NULL;
	/* A header without the size field is truncated; do not hand back garbage. */
	if (fread(&(mphf->size), sizeof(mphf->size), (size_t)1, f) != 1)
	{
		free(mphf);
		return NULL;
	}
	DEBUGP("Algorithm is %s and mphf is sized %u\n", cmph_names[algo], mphf->size);
	return mphf;
}
cmph-2.0.2/src/djb2_hash.c 0000644 0001750 0001750 00000002061 13411542035 014560 0 ustar joseph joseph #include "djb2_hash.h"
#include
/**
 * Allocate a djb2 hash state tagged with CMPH_HASH_DJB2.
 * Returns NULL when the allocation fails.
 */
djb2_state_t *djb2_state_new()
{
	djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t));
	/* The check must test the local pointer; `djb2_state` is not a declared identifier. */
	if (!state) return NULL;
	state->hashfunc = CMPH_HASH_DJB2;
	return state;
}
/* Release a djb2 state; the struct is a single allocation, so one free() suffices. */
void djb2_state_destroy(djb2_state_t *state)
{
free(state);
}
/**
 * Hash keylen bytes of k: start from 5381 and, for each byte,
 * multiply the running value by 33 and XOR the byte in.
 * The state argument is not consulted.
 */
cmph_uint32 djb2_hash(djb2_state_t *state, const char *k, cmph_uint32 keylen)
{
	const unsigned char *bytes = (const unsigned char *)k;
	register cmph_uint32 acc = 5381;
	cmph_uint32 pos;

	for (pos = 0; pos < keylen; ++pos)
	{
		acc = (acc * 33) ^ bytes[pos];
	}
	return acc;
}
/**
 * Dump a djb2 state: nothing is serialized, so the caller receives
 * a NULL buffer and a length of zero.
 */
void djb2_state_dump(djb2_state_t *state, char **buf, cmph_uint32 *buflen)
{
	*buflen = 0;
	*buf = NULL;
}
/**
 * Duplicate a djb2 state into a fresh allocation.
 * Returns NULL when the allocation fails, matching djb2_state_new().
 */
djb2_state_t *djb2_state_copy(djb2_state_t *src_state)
{
	djb2_state_t *dest_state = (djb2_state_t *)malloc(sizeof(djb2_state_t));
	if (!dest_state) return NULL;
	dest_state->hashfunc = src_state->hashfunc;
	return dest_state;
}
/**
 * Rebuild a djb2 state from a dumped buffer. The buffer carries no data
 * (see djb2_state_dump), so buf and buflen are ignored.
 * Returns NULL when the allocation fails, matching djb2_state_new().
 */
djb2_state_t *djb2_state_load(const char *buf, cmph_uint32 buflen)
{
	djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t));
	if (!state) return NULL;
	state->hashfunc = CMPH_HASH_DJB2;
	return state;
}
cmph-2.0.2/src/hashtree.c 0000644 0001750 0001750 00000021406 13411542035 014543 0 ustar joseph joseph #include "graph.h"
#include "hashtree.h"
#include "cmph_structs.h"
#include "hastree_structs.h"
#include "hash.h"
#include "bitbool.h"
#include
#include
#include
#include
#include
//#define DEBUG
#include "debug.h"
hashtree_config_data_t *hashtree_config_new()
{
hashtree_config_data_t *hashtree;
hashtree = (hashtree_config_data_t *)malloc(sizeof(hashtree_config_data_t));
if (!hashtree) return NULL;
memset(hashtree, 0, sizeof(hashtree_config_data_t));
hashtree->hashfuncs[0] = CMPH_HASH_JENKINS;
hashtree->hashfuncs[1] = CMPH_HASH_JENKINS;
hashtree->hashfuncs[2] = CMPH_HASH_JENKINS;
hashtree->memory = 32 * 1024 * 1024;
return hashtree;
}
/* Free the algorithm-specific data hanging off mph->data; only the struct itself is released here. */
void hashtree_config_destroy(cmph_config_t *mph)
{
hashtree_config_data_t *data = (hashtree_config_data_t *)mph->data;
DEBUGP("Destroying algorithm dependent data\n");
free(data);
}
/**
 * Copy up to three hash function ids from hashfuncs into the configuration.
 *
 * hashfuncs is a list terminated by CMPH_HASH_COUNT. Copying stops at the
 * terminator or after three entries, whichever comes first. The bound is
 * tested before the element is read, so an unterminated three-element array
 * is never read past its end.
 */
void hashtree_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	hashtree_config_data_t *hashtree = (hashtree_config_data_t *)mph->data;
	cmph_uint32 i;

	/* hashtree only uses three hash functions */
	for (i = 0; i < 3 && hashfuncs[i] != CMPH_HASH_COUNT; ++i)
	{
		hashtree->hashfuncs[i] = hashfuncs[i];
	}
}
cmph_t *hashtree_new(cmph_config_t *mph, double c)
{
cmph_t *mphf = NULL;
hashtree_data_t *hashtreef = NULL;
cmph_uint32 i;
cmph_uint32 iterations = 20;
cmph_uint8 *visited = NULL;
hashtree_config_data_t *hashtree = (hashtree_config_data_t *)mph->data;
hashtree->m = mph->key_source->nkeys;
hashtree->n = ceil(c * mph->key_source->nkeys);
DEBUGP("m (edges): %u n (vertices): %u c: %f\n", hashtree->m, hashtree->n, c);
hashtree->graph = graph_new(hashtree->n, hashtree->m);
DEBUGP("Created graph\n");
hashtree->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
for(i = 0; i < 3; ++i) hashtree->hashes[i] = NULL;
//Mapping step
if (mph->verbosity)
{
fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", hashtree->m, hashtree->n);
}
while(1)
{
int ok;
hashtree->hashes[0] = hash_state_new(hashtree->hashfuncs[0], hashtree->n);
hashtree->hashes[1] = hash_state_new(hashtree->hashfuncs[1], hashtree->n);
ok = hashtree_gen_edges(mph);
if (!ok)
{
--iterations;
hash_state_destroy(hashtree->hashes[0]);
hashtree->hashes[0] = NULL;
hash_state_destroy(hashtree->hashes[1]);
hashtree->hashes[1] = NULL;
DEBUGP("%u iterations remaining\n", iterations);
if (mph->verbosity)
{
fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations);
}
if (iterations == 0) break;
}
else break;
}
if (iterations == 0)
{
graph_destroy(hashtree->graph);
return NULL;
}
//Assignment step
if (mph->verbosity)
{
fprintf(stderr, "Starting assignment step\n");
}
DEBUGP("Assignment step\n");
visited = (char *)malloc(hashtree->n/8 + 1);
memset(visited, 0, hashtree->n/8 + 1);
free(hashtree->g);
hashtree->g = (cmph_uint32 *)malloc(hashtree->n * sizeof(cmph_uint32));
assert(hashtree->g);
for (i = 0; i < hashtree->n; ++i)
{
if (!GETBIT(visited,i))
{
hashtree->g[i] = 0;
hashtree_traverse(hashtree, visited, i);
}
}
graph_destroy(hashtree->graph);
free(visited);
hashtree->graph = NULL;
mphf = (cmph_t *)malloc(sizeof(cmph_t));
mphf->algo = mph->algo;
hashtreef = (hashtree_data_t *)malloc(sizeof(hashtree_data_t));
hashtreef->g = hashtree->g;
hashtree->g = NULL; //transfer memory ownership
hashtreef->hashes = hashtree->hashes;
hashtree->hashes = NULL; //transfer memory ownership
hashtreef->n = hashtree->n;
hashtreef->m = hashtree->m;
mphf->data = hashtreef;
mphf->size = hashtree->m;
DEBUGP("Successfully generated minimal perfect hash\n");
if (mph->verbosity)
{
fprintf(stderr, "Successfully generated minimal perfect hash function\n");
}
return mphf;
}
/**
 * Depth-first walk from vertex v: mark v visited, then for every unvisited
 * neighbor set g[neighbor] = edge_id(v, neighbor) - g[v] and recurse into it.
 */
static void hashtree_traverse(hashtree_config_data_t *hashtree, cmph_uint8 *visited, cmph_uint32 v)
{
	graph_iterator_t cursor = graph_neighbors_it(hashtree->graph, v);
	cmph_uint32 neighbor = 0;

	SETBIT(visited,v);
	DEBUGP("Visiting vertex %u\n", v);
	for (;;)
	{
		neighbor = graph_next_neighbor(hashtree->graph, &cursor);
		if (neighbor == GRAPH_NO_NEIGHBOR)
		{
			break;
		}
		DEBUGP("Visiting neighbor %u\n", neighbor);
		if (!GETBIT(visited,neighbor))
		{
			DEBUGP("Visiting neighbor %u\n", neighbor);
			DEBUGP("Visiting edge %u->%u with id %u\n", v, neighbor, graph_edge_id(hashtree->graph, v, neighbor));
			hashtree->g[neighbor] = graph_edge_id(hashtree->graph, v, neighbor) - hashtree->g[v];
			DEBUGP("g is %u (%u - %u mod %u)\n", hashtree->g[neighbor], graph_edge_id(hashtree->graph, v, neighbor), hashtree->g[v], hashtree->m);
			hashtree_traverse(hashtree, visited, neighbor);
		}
	}
}
/**
 * Rebuild the graph from the key source using hashes[0] and hashes[1].
 *
 * Each key contributes one edge (h1, h2). When both endpoints coincide, h2
 * is moved to the next vertex (wrapping to 0). If they still coincide, the
 * attempt is abandoned.
 *
 * Returns 1 when every key produced an edge and the graph is acyclic,
 * 0 on a self loop or a cyclic graph.
 */
static int hashtree_gen_edges(cmph_config_t *mph)
{
	hashtree_config_data_t *cfg = (hashtree_config_data_t *)mph->data;
	cmph_uint32 e = 0;
	int has_cycle = 0;

	DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", cfg->n, cmph_hash_names[cfg->hashfuncs[0]], cmph_hash_names[cfg->hashfuncs[1]]);
	graph_clear_edges(cfg->graph);
	mph->key_source->rewind(mph->key_source->data);
	while (e < mph->key_source->nkeys)
	{
		char *key;
		cmph_uint32 keylen;
		cmph_uint32 from, to;

		mph->key_source->read(mph->key_source->data, &key, &keylen);
		from = hash(cfg->hashes[0], key, keylen) % cfg->n;
		to = hash(cfg->hashes[1], key, keylen) % cfg->n;
		if (from == to)
		{
			/* Nudge the second endpoint to the next vertex, wrapping around. */
			++to;
			if (to >= cfg->n) to = 0;
		}
		if (from == to)
		{
			if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e);
			mph->key_source->dispose(mph->key_source->data, key, keylen);
			return 0;
		}
		DEBUGP("Adding edge: %u -> %u for key %s\n", from, to, key);
		mph->key_source->dispose(mph->key_source->data, key, keylen);
		graph_add_edge(cfg->graph, from, to);
		++e;
	}
	has_cycle = graph_is_cyclic(cfg->graph);
	if (mph->verbosity && has_cycle) fprintf(stderr, "Cyclic graph generated\n");
	DEBUGP("Looking for cycles: %u\n", has_cycle);
	return ! has_cycle;
}
/**
 * Serialize a hashtree function to fd.
 *
 * Writes the generic header via __cmph_dump(), the number of hash functions
 * (2), each hash state as a length-prefixed blob, then n, m and the n-entry
 * g array. Always returns 1; the fwrite() results are not inspected.
 */
int hashtree_dump(cmph_t *mphf, FILE *fd)
{
	char *buf = NULL;
	cmph_uint32 buflen;
	cmph_uint32 two = 2; //number of hash functions
	hashtree_data_t *data = (hashtree_data_t *)mphf->data;
	__cmph_dump(mphf, fd);
	fwrite(&two, sizeof(cmph_uint32), 1, fd);
	hash_state_dump(data->hashes[0], &buf, &buflen);
	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
	fwrite(&buflen, sizeof(cmph_uint32), 1, fd);
	fwrite(buf, buflen, 1, fd);
	free(buf);
	hash_state_dump(data->hashes[1], &buf, &buflen);
	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
	fwrite(&buflen, sizeof(cmph_uint32), 1, fd);
	fwrite(buf, buflen, 1, fd);
	free(buf);
	fwrite(&(data->n), sizeof(cmph_uint32), 1, fd);
	fwrite(&(data->m), sizeof(cmph_uint32), 1, fd);
	fwrite(data->g, sizeof(cmph_uint32)*data->n, 1, fd);
#ifdef DEBUG
	{
		/* The loop index was never declared, so DEBUG builds did not compile. */
		cmph_uint32 i;
		fprintf(stderr, "G: ");
		for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
		fprintf(stderr, "\n");
	}
#endif
	return 1;
}
/**
 * Read a serialized hashtree function from f and attach it to mphf->data.
 *
 * Stream layout consumed: hash count, that many length-prefixed hash states,
 * n, m, and the n-entry g array. The hashes array gets one extra slot that
 * is set to NULL.
 *
 * NOTE(review): fread() results and allocations are not checked here.
 */
void hashtree_load(FILE *f, cmph_t *mphf)
{
	hashtree_data_t *loaded = (hashtree_data_t *)malloc(sizeof(hashtree_data_t));
	cmph_uint32 hash_count;
	cmph_uint32 blob_len;
	cmph_uint32 i;
	char *blob = NULL;

	DEBUGP("Loading hashtree mphf\n");
	mphf->data = loaded;
	fread(&hash_count, sizeof(cmph_uint32), 1, f);
	loaded->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(hash_count + 1));
	loaded->hashes[hash_count] = NULL;
	DEBUGP("Reading %u hashes\n", hash_count);
	i = 0;
	while (i < hash_count)
	{
		fread(&blob_len, sizeof(cmph_uint32), 1, f);
		DEBUGP("Hash state has %u bytes\n", blob_len);
		blob = (char *)malloc(blob_len);
		fread(blob, blob_len, 1, f);
		loaded->hashes[i] = hash_state_load(blob, blob_len);
		free(blob);
		++i;
	}
	DEBUGP("Reading m and n\n");
	fread(&(loaded->n), sizeof(cmph_uint32), 1, f);
	fread(&(loaded->m), sizeof(cmph_uint32), 1, f);
	loaded->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*loaded->n);
	fread(loaded->g, loaded->n*sizeof(cmph_uint32), 1, f);
#ifdef DEBUG
	fprintf(stderr, "G: ");
	for (i = 0; i < loaded->n; ++i) fprintf(stderr, "%u ", loaded->g[i]);
	fprintf(stderr, "\n");
#endif
}
/**
 * Evaluate the hashtree function for a key: hash it to two vertices
 * (moving the second to the next vertex, wrapping to 0, when they coincide)
 * and return (g[h1] + g[h2]) mod m.
 */
cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	hashtree_data_t *fn = mphf->data;
	cmph_uint32 first = hash(fn->hashes[0], key, keylen) % fn->n;
	cmph_uint32 second = hash(fn->hashes[1], key, keylen) % fn->n;

	DEBUGP("key: %s h1: %u h2: %u\n", key, first, second);
	if (first == second)
	{
		++second;
		if (second >= fn->n) second = 0;
	}
	DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, fn->g[first], fn->g[second], fn->m);
	return (fn->g[first] + fn->g[second]) % fn->m;
}
/**
 * Release a hashtree function: the g array, the first two hash states,
 * the hashes pointer array, the algorithm data and finally mphf.
 */
void hashtree_destroy(cmph_t *mphf)
{
	hashtree_data_t *payload = (hashtree_data_t *)mphf->data;
	cmph_uint32 slot;

	free(payload->g);
	for (slot = 0; slot < 2; ++slot)
	{
		hash_state_destroy(payload->hashes[slot]);
	}
	free(payload->hashes);
	free(payload);
	free(mphf);
}
cmph-2.0.2/src/cmph_benchmark.h 0000644 0001750 0001750 00000000603 13411542035 015702 0 ustar joseph joseph #ifndef __CMPH_BENCHMARK_H__
#define __CMPH_BENCHMARK_H__
#include
#include
#ifdef __cplusplus
extern "C"
{
#endif
/* Convenience wrapper: passes the function's own source-code spelling (via the
 * # stringizing operator) as the name argument of bm_register(). */
#define BM_REGISTER(func, iters) bm_register(#func, func, iters)
/* Registers func under name. NOTE(review): iters is presumably the iteration
 * count later handed to func -- confirm against the implementation. */
void bm_register(const char* name, void (*func)(int), int iters);
/* Takes the program's argc/argv; presumably runs the registered benchmarks --
 * the implementation is not visible from this header. */
void run_benchmarks(int argc, char** argv);
#ifdef __cplusplus
}
#endif
#endif // __CMPH_BENCHMARK_H__
cmph-2.0.2/src/fnv_hash.h 0000644 0001750 0001750 00000000746 13411542035 014545 0 ustar joseph joseph #ifndef __FNV_HASH_H__
#define __FNV_HASH_H__
#include "hash.h"
/* State of the FNV hash function; its only field is a CMPH_HASH tag. */
typedef struct __fnv_state_t
{
CMPH_HASH hashfunc; /* hash function identifier stored with the state */
} fnv_state_t;
/* Creates a new state. NOTE(review): presumably heap-allocated and released
 * with fnv_state_destroy() -- confirm in fnv_hash.c. */
fnv_state_t *fnv_state_new();
/* Hashes keylen bytes starting at k and returns a 32-bit value. */
cmph_uint32 fnv_hash(fnv_state_t *state, const char *k, cmph_uint32 keylen);
/* Serializes state; the buffer and its length are returned through buf/buflen. */
void fnv_state_dump(fnv_state_t *state, char **buf, cmph_uint32 *buflen);
/* Returns a copy of src_state. */
fnv_state_t *fnv_state_copy(fnv_state_t *src_state);
/* Rebuilds a state from a buffer of buflen bytes, the counterpart of fnv_state_dump(). */
fnv_state_t *fnv_state_load(const char *buf, cmph_uint32 buflen);
/* Releases a state. */
void fnv_state_destroy(fnv_state_t *state);
#endif
cmph-2.0.2/src/vstack.c 0000644 0001750 0001750 00000002737 13411542035 014241 0 ustar joseph joseph #include "vstack.h"
#include
#include
//#define DEBUG
#include "debug.h"
/* Growable stack of cmph_uint32 values. */
struct __vstack_t
{
cmph_uint32 pointer; /* number of stored values; the top element is values[pointer - 1] */
cmph_uint32 *values; /* heap storage, NULL until the first reserve; grown with realloc() */
cmph_uint32 capacity; /* number of elements values has room for */
};
/**
 * Create an empty stack: no storage is allocated until the first push or
 * reserve. Allocation failure trips the assert.
 */
vstack_t *vstack_new(void)
{
	vstack_t *stack = (vstack_t *)malloc(sizeof(vstack_t));

	assert(stack);
	stack->values = NULL;
	stack->capacity = 0;
	stack->pointer = 0;
	return stack;
}
/**
 * Release the stack's storage and then the stack itself.
 */
void vstack_destroy(vstack_t *stack)
{
	cmph_uint32 *storage;

	assert(stack);
	storage = stack->values;
	free(storage);
	free(stack);
}
/**
 * Push val on top of the stack, growing the storage first when needed.
 */
void vstack_push(vstack_t *stack, cmph_uint32 val)
{
	cmph_uint32 slot;

	assert(stack);
	vstack_reserve(stack, stack->pointer + 1);
	slot = stack->pointer;
	stack->values[slot] = val;
	stack->pointer = slot + 1;
}
/**
 * Drop the top element. The stack must not be empty (asserted).
 */
void vstack_pop(vstack_t *stack)
{
	assert(stack);
	assert(stack->pointer > 0);
	stack->pointer -= 1;
}
/**
 * Return the top element without removing it.
 * The stack must not be empty (asserted).
 */
cmph_uint32 vstack_top(vstack_t *stack)
{
	cmph_uint32 top_index;

	assert(stack);
	assert(stack->pointer > 0);
	top_index = stack->pointer - 1;
	return stack->values[top_index];
}
/**
 * Return 1 when the stack holds no elements, 0 otherwise.
 */
int vstack_empty(vstack_t *stack)
{
	assert(stack);
	if (stack->pointer == 0)
	{
		return 1;
	}
	return 0;
}
/**
 * Return the number of elements currently on the stack.
 * Like the other accessors, a NULL stack trips the assert.
 */
cmph_uint32 vstack_size(vstack_t *stack)
{
	assert(stack);
	return stack->pointer;
}
/**
 * Make sure the stack has room for at least size elements.
 *
 * Capacity grows by doubling from (capacity + 1) until it reaches size.
 * When another doubling would wrap a 32-bit counter, the capacity is set to
 * exactly size instead, so the loop always terminates.
 * Reallocation failure trips the assert.
 */
void vstack_reserve(vstack_t *stack, cmph_uint32 size)
{
	assert(stack);
	if (stack->capacity < size)
	{
		const cmph_uint32 max_capacity = (cmph_uint32)~(cmph_uint32)0;
		cmph_uint32 new_capacity = stack->capacity + 1;
		DEBUGP("Increasing current capacity %u to %u\n", stack->capacity, size);
		while (new_capacity < size)
		{
			if (new_capacity > max_capacity / 2)
			{
				/* Doubling would overflow (and could loop forever at 0). */
				new_capacity = size;
				break;
			}
			new_capacity *= 2;
		}
		stack->values = (cmph_uint32 *)realloc(stack->values, sizeof(cmph_uint32)*new_capacity);
		assert(stack->values);
		stack->capacity = new_capacity;
		DEBUGP("Increased\n");
	}
}
cmph-2.0.2/src/fch.c 0000644 0001750 0001750 00000037100 13411542035 013476 0 ustar joseph joseph #include "fch.h"
#include "cmph_structs.h"
#include "fch_structs.h"
#include "hash.h"
#include "bitbool.h"
#include "fch_buckets.h"
#include
#include
#include
#include
#include
#define INDEX 0 /* alignment index within a bucket */
//#define DEBUG
#include "debug.h"
static fch_buckets_t * mapping(cmph_config_t *mph);
static cmph_uint32 * ordering(fch_buckets_t * buckets);
static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes);
static void permut(cmph_uint32 * vector, cmph_uint32 n);
static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph_uint32 *sorted_indexes);
/**
 * Allocate zero-filled FCH configuration data with both hash function slots
 * set to CMPH_HASH_JENKINS. Returns NULL when the allocation fails.
 */
fch_config_data_t *fch_config_new()
{
	fch_config_data_t *cfg = (fch_config_data_t *)calloc((size_t)1, sizeof(fch_config_data_t));

	if (cfg == NULL)
	{
		return NULL;
	}
	cfg->hashfuncs[0] = CMPH_HASH_JENKINS;
	cfg->hashfuncs[1] = CMPH_HASH_JENKINS;
	/* Explicit resets kept so the initial state does not rely on the byte pattern alone. */
	cfg->m = 0;
	cfg->b = 0;
	cfg->c = 0.0;
	cfg->p1 = 0.0;
	cfg->p2 = 0.0;
	cfg->g = NULL;
	cfg->h1 = NULL;
	cfg->h2 = NULL;
	return cfg;
}
/* Free the algorithm-specific data hanging off mph->data; only the struct
 * itself is released here (its g/h1/h2 members are not touched). */
void fch_config_destroy(cmph_config_t *mph)
{
fch_config_data_t *data = (fch_config_data_t *)mph->data;
//DEBUGP("Destroying algorithm dependent data\n");
free(data);
}
/**
 * Copy up to two hash function ids from hashfuncs into the configuration.
 *
 * hashfuncs is a list terminated by CMPH_HASH_COUNT. Copying stops at the
 * terminator or after two entries, whichever comes first. The bound is
 * tested before the element is read, so an unterminated two-element array
 * is never read past its end.
 */
void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	fch_config_data_t *fch = (fch_config_data_t *)mph->data;
	cmph_uint32 i;

	/* fch only uses two hash functions */
	for (i = 0; i < 2 && hashfuncs[i] != CMPH_HASH_COUNT; ++i)
	{
		fch->hashfuncs[i] = hashfuncs[i];
	}
}
/**
 * Map an initial hash value to a bucket index.
 *
 * Values below p1 are reduced modulo (cmph_uint32)p2. All other values are
 * reduced modulo b and, when the result is below p2, shifted up by
 * (cmph_uint32)p2.
 */
cmph_uint32 mixh10h11h12(cmph_uint32 b, double p1, double p2, cmph_uint32 initial_index)
{
	register cmph_uint32 p2_whole = (cmph_uint32)p2;
	cmph_uint32 reduced;

	if (initial_index < p1)
	{
		/* h11 o h10 */
		return initial_index % p2_whole;
	}

	/* h12 o h10 */
	reduced = initial_index % b;
	return (reduced < p2) ? reduced + p2_whole : reduced;
}
/* Number of buckets for m keys: ceil(c*m / (log2(m) + 1)), with log2 computed as log(m)/log(2). */
cmph_uint32 fch_calc_b(double c, cmph_uint32 m)
{
return (cmph_uint32)ceil((c*m)/(log((double)m)/log(2.0) + 1));
}
/* Threshold p1 = ceil(0.55 * m); hash values below it take the first branch of mixh10h11h12(). */
double fch_calc_p1(cmph_uint32 m)
{
return ceil(0.55*m);
}
/* Threshold p2 = ceil(0.3 * b); used by mixh10h11h12() as modulus and offset. */
double fch_calc_p2(cmph_uint32 b)
{
return ceil(0.3*b);
}
/**
 * Mapping step: create a fresh h1, derive b, p1 and p2 from c and m, and
 * distribute all m keys from the key source into b buckets.
 *
 * Each key read from the source is handed to fch_buckets_insert() together
 * with its bucket index; it is not disposed of here.
 */
static fch_buckets_t * mapping(cmph_config_t *mph)
{
	fch_config_data_t *cfg = (fch_config_data_t *)mph->data;
	fch_buckets_t *table = NULL;
	cmph_uint32 done;

	/* Replace any h1 left over from a previous attempt. */
	if (cfg->h1) hash_state_destroy(cfg->h1);
	cfg->h1 = hash_state_new(cfg->hashfuncs[0], cfg->m);

	cfg->b = fch_calc_b(cfg->c, cfg->m);
	cfg->p1 = fch_calc_p1(cfg->m);
	cfg->p2 = fch_calc_p2(cfg->b);
	//DEBUGP("b:%u p1:%f p2:%f\n", cfg->b, cfg->p1, cfg->p2);

	table = fch_buckets_new(cfg->b);
	mph->key_source->rewind(mph->key_source->data);
	done = 0;
	while (done < cfg->m)
	{
		char *key = NULL;
		cmph_uint32 keylen;
		cmph_uint32 slot;

		mph->key_source->read(mph->key_source->data, &key, &keylen);
		slot = hash(cfg->h1, key, keylen) % cfg->m;
		slot = mixh10h11h12 (cfg->b, cfg->p1, cfg->p2, slot);
		fch_buckets_insert(table, slot, key, keylen); /* transfers memory ownership of key */
		done++;
	}
	return table;
}
// Ordering step: returns the bucket indexes sorted by bucket size.
// Thin wrapper around fch_buckets_get_indexes_sorted_by_size(); the caller (fch_new) frees the returned array.
static cmph_uint32 * ordering(fch_buckets_t * buckets)
{
return fch_buckets_get_indexes_sorted_by_size(buckets);
}
/**
 * Check whether h2 maps two keys of the same bucket to the same value
 * modulo m. Buckets are visited in the order given by sorted_indexes.
 *
 * Returns 1 as soon as a collision is found, 0 when every bucket is
 * collision free.
 */
static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes)
{
	cmph_uint8 *seen = (cmph_uint8 *)calloc((size_t)fch->m, sizeof(cmph_uint8));
	cmph_uint32 bucket_count = fch_buckets_get_nbuckets(buckets);
	cmph_uint8 collided = 0;
	cmph_uint32 b, k;

	for (b = 0; b < bucket_count && !collided; b++)
	{
		cmph_uint32 bucket = sorted_indexes[b];
		cmph_uint32 nkeys = fch_buckets_get_size(buckets, bucket);

		/* Collisions only matter inside one bucket, so start from a clean table. */
		memset(seen, 0, (size_t)fch->m);
		for (k = 0; k < nkeys; k++)
		{
			char *key = fch_buckets_get_key(buckets, bucket, k);
			cmph_uint32 keylen = fch_buckets_get_keylength(buckets, bucket, k);
			cmph_uint32 slot = hash(fch->h2, key, keylen) % fch->m;

			if (seen[slot])
			{
				collided = 1;
				break;
			}
			seen[slot] = 1;
		}
	}
	free(seen);
	return collided;
}
/**
 * Shuffle vector in place: every position i in turn is swapped with a
 * position chosen as rand() % n.
 */
static void permut(cmph_uint32 * vector, cmph_uint32 n)
{
	cmph_uint32 pos = 0;

	while (pos < n)
	{
		cmph_uint32 other = (cmph_uint32) rand() % n;
		cmph_uint32 held = vector[pos];

		vector[pos] = vector[other];
		vector[other] = held;
		pos++;
	}
}
/* Searching step: choose h2 and a displacement g[bucket] for every bucket so
 * that (h2(key) + g[bucket]) % m is distinct for all keys.
 *
 * random_table holds a permutation of 0..m-1 and map_table is its inverse
 * (map_table[random_table[i]] == i). Entries random_table[0..filled_count-1]
 * are the positions already taken; the remaining entries are still free.
 *
 * Returns 0 on success and non-zero when the search gave up (h2 could not be
 * made collision free within 1000 tries, or 10 placement rounds failed).
 */
static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph_uint32 *sorted_indexes)
{
cmph_uint32 * random_table = (cmph_uint32 *) calloc((size_t)fch->m, sizeof(cmph_uint32));
cmph_uint32 * map_table = (cmph_uint32 *) calloc((size_t)fch->m, sizeof(cmph_uint32));
cmph_uint32 iteration_to_generate_h2 = 0;
cmph_uint32 searching_iterations = 0;
cmph_uint8 restart = 0;
cmph_uint32 nbuckets = fch_buckets_get_nbuckets(buckets);
cmph_uint32 i, j, z, counter = 0, filled_count = 0;
// Discard the displacement table of any previous attempt.
if (fch->g) free (fch->g);
fch->g = (cmph_uint32 *) calloc((size_t)fch->b, sizeof(cmph_uint32));
//DEBUGP("max bucket size: %u\n", fch_buckets_get_max_size(buckets));
// Start from the identity, shuffle it, then build the inverse mapping.
for(i = 0; i < fch->m; i++)
{
random_table[i] = i;
}
permut(random_table, fch->m);
for(i = 0; i < fch->m; i++)
{
map_table[random_table[i]] = i;
}
do {
// Each round uses a fresh h2; a round is only attempted when h2 has no
// collision inside any bucket.
if (fch->h2) hash_state_destroy(fch->h2);
fch->h2 = hash_state_new(fch->hashfuncs[1], fch->m);
restart = check_for_collisions_h2(fch, buckets, sorted_indexes);
filled_count = 0;
if (!restart)
{
searching_iterations++; iteration_to_generate_h2 = 0;
//DEBUGP("searching_iterations: %u\n", searching_iterations);
}
else {
iteration_to_generate_h2++;
//DEBUGP("iteration_to_generate_h2: %u\n", iteration_to_generate_h2);
}
for(i = 0; (i < nbuckets) && !restart; i++) {
cmph_uint32 bucketsize = fch_buckets_get_size(buckets, sorted_indexes[i]);
// An empty bucket ends the round successfully.
// NOTE(review): this presumes sorted_indexes lists larger buckets first, so
// every later bucket is empty too -- confirm against fch_buckets.
if (bucketsize == 0)
{
restart = 0; // false
break;
}
else restart = 1; // true
// Try each still-free position as the target of the bucket's key at INDEX.
for(z = 0; (z < (fch->m - filled_count)) && restart; z++) {
char * key = fch_buckets_get_key(buckets, sorted_indexes[i], INDEX);
cmph_uint32 keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], INDEX);
cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m;
counter = 0;
restart = 0; // false
// Displacement that sends that key to free position random_table[filled_count + z].
fch->g[sorted_indexes[i]] = (fch->m + random_table[filled_count + z] - h2) % fch->m;
//DEBUGP("g[%u]: %u\n", sorted_indexes[i], fch->g[sorted_indexes[i]]);
j = INDEX;
// Place every key of the bucket with this displacement, cycling from INDEX.
do {
cmph_uint32 index = 0;
key = fch_buckets_get_key(buckets, sorted_indexes[i], j);
keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], j);
h2 = hash(fch->h2, key, keylen) % fch->m;
index = (h2 + fch->g[sorted_indexes[i]]) % fch->m;
//DEBUGP("key:%s keylen:%u index: %u h2:%u bucketsize:%u\n", key, keylen, index, h2, bucketsize);
if (map_table[index] >= filled_count) {
// Position is free: swap it into the taken prefix of random_table and keep
// map_table consistent as the inverse.
cmph_uint32 y = map_table[index];
cmph_uint32 ry = random_table[y];
random_table[y] = random_table[filled_count];
random_table[filled_count] = ry;
map_table[random_table[y]] = y;
map_table[random_table[filled_count]] = filled_count;
filled_count++;
counter ++;
}
else {
// Position already taken: release the positions claimed for this bucket
// under the current displacement and try the next candidate.
restart = 1; // true
filled_count = filled_count - counter;
counter = 0;
break;
}
j = (j + 1) % bucketsize;
} while(j % bucketsize != INDEX);
}
//getchar();
}
} while(restart && (searching_iterations < 10) && (iteration_to_generate_h2 < 1000));
free(map_table);
free(random_table);
return restart;
}
/**
 * Build an FCH function for the keys in mph->key_source.
 *
 * Runs mapping, ordering and searching up to 100 times until searching
 * succeeds. Values of c not greater than 2 are replaced by 2.6.
 *
 * Returns the new function, or NULL when every attempt failed or memory ran
 * out. On success, ownership of g, h1 and h2 moves from the configuration to
 * the returned function. On failure they are released here, because the
 * configuration destructor only frees the struct itself.
 */
cmph_t *fch_new(cmph_config_t *mph, double c)
{
	cmph_t *mphf = NULL;
	fch_data_t *fchf = NULL;
	cmph_uint32 iterations = 100;
	cmph_uint8 restart_mapping = 0;
	fch_buckets_t * buckets = NULL;
	cmph_uint32 * sorted_indexes = NULL;
	fch_config_data_t *fch = (fch_config_data_t *)mph->data;
	fch->m = mph->key_source->nkeys;
	//DEBUGP("m: %f\n", fch->m);
	if (c <= 2) c = 2.6; // validating restrictions over parameter c.
	fch->c = c;
	//DEBUGP("c: %f\n", fch->c);
	fch->h1 = NULL;
	fch->h2 = NULL;
	fch->g = NULL;
	do
	{
		if (mph->verbosity)
		{
			fprintf(stderr, "Entering mapping step for mph creation of %u keys\n", fch->m);
		}
		if (buckets) fch_buckets_destroy(buckets, mph);
		buckets = mapping(mph);
		if (mph->verbosity)
		{
			fprintf(stderr, "Starting ordering step\n");
		}
		if (sorted_indexes) free (sorted_indexes);
		sorted_indexes = ordering(buckets);
		if (mph->verbosity)
		{
			fprintf(stderr, "Starting searching step.\n");
		}
		restart_mapping = searching(fch, buckets, sorted_indexes);
		iterations--;
	} while(restart_mapping && iterations > 0);
	if (buckets) fch_buckets_destroy(buckets, mph);
	if (sorted_indexes) free (sorted_indexes);

	/* Decide on the outcome of the last attempt, not on the counter: a
	 * success on the final allowed attempt also leaves iterations at 0. */
	if (!restart_mapping)
	{
		mphf = (cmph_t *)malloc(sizeof(cmph_t));
		fchf = (fch_data_t *)malloc(sizeof(fch_data_t));
	}
	if (restart_mapping || mphf == NULL || fchf == NULL)
	{
		free(mphf);
		free(fchf);
		if (fch->h1) hash_state_destroy(fch->h1);
		if (fch->h2) hash_state_destroy(fch->h2);
		free(fch->g);
		fch->h1 = NULL;
		fch->h2 = NULL;
		fch->g = NULL;
		return NULL;
	}

	mphf->algo = mph->algo;
	fchf->g = fch->g;
	fch->g = NULL; //transfer memory ownership
	fchf->h1 = fch->h1;
	fch->h1 = NULL; //transfer memory ownership
	fchf->h2 = fch->h2;
	fch->h2 = NULL; //transfer memory ownership
	fchf->p2 = fch->p2;
	fchf->p1 = fch->p1;
	fchf->b = fch->b;
	fchf->c = fch->c;
	fchf->m = fch->m;
	mphf->data = fchf;
	mphf->size = fch->m;
	//DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
	return mphf;
}
/**
 * Serialize an FCH function to fd.
 *
 * Writes the generic header via __cmph_dump(), the length-prefixed states of
 * h1 and h2, then m, c, b, p1, p2 and the b-entry g array.
 * Always returns 1; the fwrite() results are not inspected.
 */
int fch_dump(cmph_t *mphf, FILE *fd)
{
	fch_data_t *src = (fch_data_t *)mphf->data;
	char *blob = NULL;
	cmph_uint32 blob_len;
	register size_t wrote;

	__cmph_dump(mphf, fd);

	/* h1 state */
	hash_state_dump(src->h1, &blob, &blob_len);
	//DEBUGP("Dumping hash state with %u bytes to disk\n", blob_len);
	wrote = fwrite(&blob_len, sizeof(cmph_uint32), (size_t)1, fd);
	wrote = fwrite(blob, (size_t)blob_len, (size_t)1, fd);
	free(blob);

	/* h2 state */
	hash_state_dump(src->h2, &blob, &blob_len);
	//DEBUGP("Dumping hash state with %u bytes to disk\n", blob_len);
	wrote = fwrite(&blob_len, sizeof(cmph_uint32), (size_t)1, fd);
	wrote = fwrite(blob, (size_t)blob_len, (size_t)1, fd);
	free(blob);

	/* Scalars in on-disk order, followed by the displacement table. */
	wrote = fwrite(&(src->m), sizeof(cmph_uint32), (size_t)1, fd);
	wrote = fwrite(&(src->c), sizeof(double), (size_t)1, fd);
	wrote = fwrite(&(src->b), sizeof(cmph_uint32), (size_t)1, fd);
	wrote = fwrite(&(src->p1), sizeof(double), (size_t)1, fd);
	wrote = fwrite(&(src->p2), sizeof(double), (size_t)1, fd);
	wrote = fwrite(src->g, sizeof(cmph_uint32)*(src->b), (size_t)1, fd);
	(void)wrote;
#ifdef DEBUG
	{
		cmph_uint32 i;
		fprintf(stderr, "G: ");
		for (i = 0; i < src->b; ++i) fprintf(stderr, "%u ", src->g[i]);
		fprintf(stderr, "\n");
	}
#endif
	return 1;
}
/**
 * Read a serialized FCH function from f and attach it to mphf->data.
 *
 * Stream layout consumed: length-prefixed h1 state, length-prefixed h2
 * state, m, c, b, p1, p2 and the b-entry g array.
 *
 * NOTE(review): fread() results and allocations are not checked here.
 */
void fch_load(FILE *f, cmph_t *mphf)
{
	fch_data_t *loaded = (fch_data_t *)malloc(sizeof(fch_data_t));
	char *blob = NULL;
	cmph_uint32 blob_len;
	register size_t got;

	//DEBUGP("Loading fch mphf\n");
	mphf->data = loaded;

	/* h1 state */
	loaded->h1 = NULL;
	got = fread(&blob_len, sizeof(cmph_uint32), (size_t)1, f);
	//DEBUGP("Hash state of h1 has %u bytes\n", blob_len);
	blob = (char *)malloc((size_t)blob_len);
	got = fread(blob, (size_t)blob_len, (size_t)1, f);
	loaded->h1 = hash_state_load(blob, blob_len);
	free(blob);

	/* h2 state */
	loaded->h2 = NULL;
	got = fread(&blob_len, sizeof(cmph_uint32), (size_t)1, f);
	//DEBUGP("Hash state of h2 has %u bytes\n", blob_len);
	blob = (char *)malloc((size_t)blob_len);
	got = fread(blob, (size_t)blob_len, (size_t)1, f);
	loaded->h2 = hash_state_load(blob, blob_len);
	free(blob);

	/* Scalars in on-disk order, followed by the displacement table. */
	got = fread(&(loaded->m), sizeof(cmph_uint32), (size_t)1, f);
	got = fread(&(loaded->c), sizeof(double), (size_t)1, f);
	got = fread(&(loaded->b), sizeof(cmph_uint32), (size_t)1, f);
	got = fread(&(loaded->p1), sizeof(double), (size_t)1, f);
	got = fread(&(loaded->p2), sizeof(double), (size_t)1, f);
	loaded->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*loaded->b);
	got = fread(loaded->g, loaded->b*sizeof(cmph_uint32), (size_t)1, f);
	(void)got;
#ifdef DEBUG
	{
		cmph_uint32 i;
		fprintf(stderr, "G: ");
		for (i = 0; i < loaded->b; ++i) fprintf(stderr, "%u ", loaded->g[i]);
		fprintf(stderr, "\n");
	}
#endif
}
/**
 * Evaluate the FCH function for a key: h1 (after mixh10h11h12) selects the
 * bucket, and the result is (h2 + g[bucket]) mod m.
 */
cmph_uint32 fch_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	fch_data_t *fn = (fch_data_t *)mphf->data;
	cmph_uint32 bucket = hash(fn->h1, key, keylen) % fn->m;
	cmph_uint32 offset = hash(fn->h2, key, keylen) % fn->m;

	bucket = mixh10h11h12 (fn->b, fn->p1, fn->p2, bucket);
	//DEBUGP("key: %s h1: %u h2: %u g[h1]: %u\n", key, bucket, offset, fn->g[bucket]);
	return (offset + fn->g[bucket]) % fn->m;
}
/**
 * Release an FCH function: the g table, both hash states, the algorithm
 * data and finally mphf.
 */
void fch_destroy(cmph_t *mphf)
{
	fch_data_t *payload = (fch_data_t *)mphf->data;
	cmph_uint32 *table = payload->g;

	free(table);
	hash_state_destroy(payload->h1);
	hash_state_destroy(payload->h2);
	free(payload);
	free(mphf);
}
/** \fn void fch_pack(cmph_t *mphf, void *packed_mphf);
 * \brief Pack a perfect hash function into a preallocated contiguous memory area.
 * \param mphf pointer to the mphf to pack
 * \param packed_mphf destination buffer; its size must be at least cmph_packed_size()
 *
 * Layout written: h1 type tag, packed h1 state, h2 type tag, packed h2 state,
 * m, b, p1 and p2 (each converted to a 64-bit unsigned integer), g[0..b-1].
 */
void fch_pack(cmph_t *mphf, void *packed_mphf)
{
	fch_data_t *src = (fch_data_t *)mphf->data;
	cmph_uint8 *out = (cmph_uint8 *)packed_mphf;
	CMPH_HASH first_kind;
	CMPH_HASH second_kind;

	/* h1: type tag, then state */
	first_kind = hash_get_type(src->h1);
	*((cmph_uint32 *) out) = first_kind;
	out += sizeof(cmph_uint32);
	hash_state_pack(src->h1, out);
	out += hash_state_packed_size(first_kind);

	/* h2: type tag, then state */
	second_kind = hash_get_type(src->h2);
	*((cmph_uint32 *) out) = second_kind;
	out += sizeof(cmph_uint32);
	hash_state_pack(src->h2, out);
	out += hash_state_packed_size(second_kind);

	/* m and b */
	*((cmph_uint32 *) out) = src->m;
	out += sizeof(src->m);
	*((cmph_uint32 *) out) = src->b;
	out += sizeof(src->b);

	/* p1 and p2 are stored as integers, not as doubles */
	*((cmph_uint64 *)out) = (cmph_uint64)src->p1;
	out += sizeof(src->p1);
	*((cmph_uint64 *)out) = (cmph_uint64)src->p2;
	out += sizeof(src->p2);

	/* displacement table */
	memcpy(out, src->g, sizeof(cmph_uint32)*(src->b));
}
/** \fn cmph_uint32 fch_packed_size(cmph_t *mphf);
 *  \brief Return the amount of space needed to pack mphf.
 *  \param mphf pointer to a mphf
 *  \return the size of the packed function or zero for failures
 */
cmph_uint32 fch_packed_size(cmph_t *mphf)
{
	fch_data_t *data = (fch_data_t *)mphf->data;
	CMPH_HASH first_type = hash_get_type(data->h1);
	CMPH_HASH second_type = hash_get_type(data->h2);
	/* NOTE(review): fch_pack() itself writes no CMPH_ALGO field; presumably
	 * this term covers a tag written by the caller -- confirm. */
	size_t total = sizeof(CMPH_ALGO);

	/* the two packed hash states */
	total += hash_state_packed_size(first_type);
	total += hash_state_packed_size(second_type);
	/* fixed-size part of the formula: four 32-bit words and two doubles */
	total += 4*sizeof(cmph_uint32);
	total += 2*sizeof(double);
	/* the g table, one 32-bit entry per bucket */
	total += sizeof(cmph_uint32)*(data->b);

	return (cmph_uint32)total;
}
/** \fn cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
 *  \brief Use the packed mphf to do a search.
 *  \param packed_mphf pointer to the packed mphf
 *  \param key key to be hashed
 *  \param keylen key length in bytes
 *  \return The mphf value
 *
 *  Reads the layout produced by fch_pack(): h1 type, h1 state, h2 type,
 *  h2 state, m, b, p1, p2 (64-bit integers) and the g table. Fields are
 *  fetched with memcpy() because their offsets depend on
 *  hash_state_packed_size() and are therefore not guaranteed to be aligned
 *  for a direct cmph_uint32/cmph_uint64 load.
 */
cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	cmph_uint8 *h1_ptr = (cmph_uint8 *)packed_mphf;
	cmph_uint8 *h2_ptr;
	cmph_uint8 *cursor;
	CMPH_HASH h1_type, h2_type;
	cmph_uint32 type_tag, m, b, h1, h2, g_entry;
	cmph_uint64 raw;
	double p1, p2;

	/* h1: 32-bit type tag followed by the packed state */
	memcpy(&type_tag, h1_ptr, sizeof(type_tag));
	h1_type = (CMPH_HASH)type_tag;
	h1_ptr += sizeof(cmph_uint32);

	/* h2: same shape, located right after h1's packed state */
	h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
	memcpy(&type_tag, h2_ptr, sizeof(type_tag));
	h2_type = (CMPH_HASH)type_tag;
	h2_ptr += sizeof(cmph_uint32);

	/* scalar parameters m, b, p1, p2 */
	cursor = h2_ptr + hash_state_packed_size(h2_type);
	memcpy(&m, cursor, sizeof(m));
	cursor += sizeof(m);
	memcpy(&b, cursor, sizeof(b));
	cursor += sizeof(b);
	memcpy(&raw, cursor, sizeof(raw));
	p1 = (double)raw;
	cursor += sizeof(raw);
	memcpy(&raw, cursor, sizeof(raw));
	p2 = (double)raw;
	cursor += sizeof(raw);

	/* cursor now points at g[0] */
	h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m;
	h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m;
	h1 = mixh10h11h12 (b, p1, p2, h1);

	memcpy(&g_entry, cursor + (size_t)h1 * sizeof(cmph_uint32), sizeof(g_entry));
	return (h2 + g_entry) % m;
}
cmph-2.0.2/src/hash.h 0000644 0001750 0001750 00000006415 13411542035 013673 0 ustar joseph joseph #ifndef __CMPH_HASH_H__
#define __CMPH_HASH_H__
#include "cmph_types.h"
typedef union __hash_state_t hash_state_t;
hash_state_t *hash_state_new(CMPH_HASH, cmph_uint32 hashsize);
/** \fn cmph_uint32 hash(hash_state_t *state, const char *key, cmph_uint32 keylen);
* \param state is a pointer to a hash_state_t structure
* \param key is a pointer to a key
* \param keylen is the key length
* \return an integer that represents a hash value of 32 bits.
*/
cmph_uint32 hash(hash_state_t *state, const char *key, cmph_uint32 keylen);
/** \fn void hash_vector(hash_state_t *state, const char *key, cmph_uint32 keylen, cmph_uint32 * hashes);
* \param state is a pointer to a hash_state_t structure
* \param key is a pointer to a key
* \param keylen is the key length
* \param hashes is a pointer to a memory large enough to fit three 32-bit integers.
*/
void hash_vector(hash_state_t *state, const char *key, cmph_uint32 keylen, cmph_uint32 * hashes);
void hash_state_dump(hash_state_t *state, char **buf, cmph_uint32 *buflen);
hash_state_t * hash_state_copy(hash_state_t *src_state);
hash_state_t *hash_state_load(const char *buf, cmph_uint32 buflen);
void hash_state_destroy(hash_state_t *state);
/** \fn void hash_state_pack(hash_state_t *state, void *hash_packed);
* \brief Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed.
* \param state points to the hash function
* \param hash_packed pointer to the contiguous memory area used to store the hash function. The size of hash_packed must be at least hash_state_packed_size()
*
* Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed.
* However, the hash function type must be packed outside.
*/
void hash_state_pack(hash_state_t *state, void *hash_packed);
/** \fn cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen);
* \param hash_packed is a pointer to a contiguous memory area
* \param hashfunc is the type of the hash function packed in hash_packed
* \param k is a pointer to a key
* \param keylen is the key length
* \return an integer that represents a hash value of 32 bits.
*/
cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen);
/** \fn cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc)
* \brief Return the amount of space needed to pack a hash function.
* \param hashfunc function type
* \return the size of the packed function or zero for failures
*/
cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc);
/** \fn void hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
* \param hash_packed is a pointer to a contiguous memory area
* \param hashfunc is the type of the hash function packed in hash_packed
* \param k is a pointer to a key
* \param keylen is the key length
* \param hashes is a pointer to a memory area large enough to fit three 32-bit integers.
*/
void hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
/** \fn CMPH_HASH hash_get_type(hash_state_t *state);
* \param state is a pointer to a hash_state_t structure
* \return the hash function type pointed by state
*/
CMPH_HASH hash_get_type(hash_state_t *state);
#endif
cmph-2.0.2/src/brz.h 0000644 0001750 0001750 00000005144 13411542035 013543 0 ustar joseph joseph #ifndef __CMPH_BRZ_H__
#define __CMPH_BRZ_H__
#include "cmph.h"
/*
* The BRZ algorithm has been built so as to consume the bare minimum
* amount of memory to generate the MPHFs. Thereby we decided
* to dump the resulting MPHFs to disk while creating them. Thus,
* to use the BRZ algorithm, one has to call brz_config_set_mphf_fd
* before calling brz_new. Otherwise we will fail the MPHF creation.
* One side effect of this design decision is that the resulting
* MPHF cannot be used until its dumping process is finalized
* by calling brz_dump and the caller must use brz_load before
* any call to either one of the following functions is made:
* brz_search
* brz_pack
* brz_packed_size
* brz_search_packed
*/
typedef struct __brz_data_t brz_data_t;
typedef struct __brz_config_data_t brz_config_data_t;
brz_config_data_t *brz_config_new(void);
void brz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir);
void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd);
void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b);
void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo);
void brz_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability);
void brz_config_destroy(cmph_config_t *mph);
cmph_t *brz_new(cmph_config_t *mph, double c);
void brz_load(FILE *f, cmph_t *mphf);
int brz_dump(cmph_t *mphf, FILE *f);
void brz_destroy(cmph_t *mphf);
cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void brz_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void brz_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 brz_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 brz_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/graph.h 0000644 0001750 0001750 00000002360 13411542035 014044 0 ustar joseph joseph #ifndef _CMPH_GRAPH_H__
#define _CMPH_GRAPH_H__
#include
#include "cmph_types.h"
#define GRAPH_NO_NEIGHBOR UINT_MAX
typedef struct __graph_t graph_t;
typedef struct __graph_iterator_t graph_iterator_t;
/* Cursor over the edges incident to one vertex. It is filled in by
 * graph_neighbors_it() and advanced by graph_next_neighbor(). */
struct __graph_iterator_t
{
cmph_uint32 vertex; /* vertex whose neighbors are being enumerated */
cmph_uint32 edge; /* current slot in that vertex's edge list */
};
graph_t *graph_new(cmph_uint32 nnodes, cmph_uint32 nedges);
void graph_destroy(graph_t *graph);
void graph_add_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2);
void graph_del_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2);
void graph_clear_edges(graph_t *g);
cmph_uint32 graph_edge_id(graph_t *g, cmph_uint32 v1, cmph_uint32 v2);
cmph_uint8 graph_contains_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2);
graph_iterator_t graph_neighbors_it(graph_t *g, cmph_uint32 v);
cmph_uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it);
void graph_obtain_critical_nodes(graph_t *g); /* included -- Fabiano*/
cmph_uint8 graph_node_is_critical(graph_t * g, cmph_uint32 v); /* included -- Fabiano */
cmph_uint32 graph_ncritical_nodes(graph_t *g); /* included -- Fabiano*/
cmph_uint32 graph_vertex_id(graph_t *g, cmph_uint32 e, cmph_uint32 id); /* included -- Fabiano*/
int graph_is_cyclic(graph_t *g);
void graph_print(graph_t *);
#endif
cmph-2.0.2/src/graph.c 0000644 0001750 0001750 00000017651 13411542035 014050 0 ustar joseph joseph #include "graph.h"
#include
#include
#include
#include
#include
#include "vstack.h"
#include "bitbool.h"
// #define DEBUG
#include "debug.h"
/* static const cmph_uint8 bitmask[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; */
/* #define GETBIT(array, i) (array[(i) / 8] & bitmask[(i) % 8]) */
/* #define SETBIT(array, i) (array[(i) / 8] |= bitmask[(i) % 8]) */
/* #define UNSETBIT(array, i) (array[(i) / 8] &= (~(bitmask[(i) % 8]))) */
#define abs_edge(e, i) (e % g->nedges + i * g->nedges)
/* Undirected graph stored as per-vertex singly linked lists of edge slots.
 * Edge number e occupies two slots: e (linked into the list of the first
 * endpoint, holding the second endpoint) and e + nedges (linked into the
 * list of the second endpoint, holding the first endpoint). */
struct __graph_t
{
cmph_uint32 nnodes; /* number of vertices; length of first[] */
cmph_uint32 nedges; /* edge capacity; edges[] and next[] hold 2*nedges slots */
cmph_uint32 *edges; /* per slot: the vertex stored by graph_add_edge() */
cmph_uint32 *first; /* per vertex: head slot of its list, or EMPTY */
cmph_uint32 *next; /* per slot: following slot in the same list, or EMPTY */
cmph_uint8 *critical_nodes; /* included -- Fabiano*/ /* bitmap built by graph_obtain_critical_nodes(); NULL until then */
cmph_uint32 ncritical_nodes; /* included -- Fabiano*/ /* number of bits set in critical_nodes */
cmph_uint32 cedges; /* number of edges added so far */
int shrinking; /* set by graph_del_edge(); graph_add_edge() asserts it is 0 */
};
static cmph_uint32 EMPTY = UINT_MAX;
/* Allocate a graph with room for nnodes vertices and nedges edges, with all
 * adjacency lists empty.
 *
 * Returns NULL when any allocation fails. Previously only the allocation of
 * the graph_t itself was checked, so a failed array allocation was
 * dereferenced by graph_clear_edges(). */
graph_t *graph_new(cmph_uint32 nnodes, cmph_uint32 nedges)
{
	graph_t *graph = (graph_t *)malloc(sizeof(graph_t));
	if (!graph) return NULL;

	graph->edges = (cmph_uint32 *)malloc(sizeof(cmph_uint32) * 2 * nedges);
	graph->next = (cmph_uint32 *)malloc(sizeof(cmph_uint32) * 2 * nedges);
	graph->first = (cmph_uint32 *)malloc(sizeof(cmph_uint32) * nnodes);
	graph->critical_nodes = NULL; /* included -- Fabiano*/
	graph->ncritical_nodes = 0; /* included -- Fabiano*/
	graph->nnodes = nnodes;
	graph->nedges = nedges;

	/* malloc(0) may legitimately return NULL, so a NULL array only counts
	 * as a failure when a non-zero size was requested. */
	if ((nedges != 0 && (!graph->edges || !graph->next)) ||
	    (nnodes != 0 && !graph->first))
	{
		free(graph->edges);
		free(graph->next);
		free(graph->first);
		free(graph);
		return NULL;
	}

	graph_clear_edges(graph);
	return graph;
}
/* Free a graph and all arrays it owns. A NULL graph is accepted and ignored,
 * matching free(); graph_new() can return NULL. */
void graph_destroy(graph_t *graph)
{
	DEBUGP("Destroying graph\n");
	if (!graph) return;
	free(graph->edges);
	free(graph->first);
	free(graph->next);
	free(graph->critical_nodes); /* included -- Fabiano*/
	free(graph);
	return;
}
/* Print, for every vertex in turn, each edge on its adjacency list as
 * "a -> b" on stdout. */
void graph_print(graph_t *g)
{
	cmph_uint32 node;
	cmph_uint32 slot;

	for (node = 0; node < g->nnodes; ++node)
	{
		DEBUGP("Printing edges connected to %u\n", node);
		/* follow the list until the EMPTY terminator */
		for (slot = g->first[node]; slot != EMPTY; slot = g->next[slot])
		{
			printf("%u -> %u\n", g->edges[abs_edge(slot, 0)], g->edges[abs_edge(slot, 1)]);
		}
	}
}
/* Append the edge (v1, v2): slot e is pushed on v1's list and stores v2,
 * slot e + nedges is pushed on v2's list and stores v1. */
void graph_add_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2)
{
	cmph_uint32 e = g->cedges;
	cmph_uint32 mirror;

	assert(v1 < g->nnodes);
	assert(v2 < g->nnodes);
	assert(e < g->nedges);
	assert(!g->shrinking);

	mirror = e + g->nedges;

	/* half stored on v1's list */
	g->edges[e] = v2;
	g->next[e] = g->first[v1];
	g->first[v1] = e;

	/* half stored on v2's list (must come after the v1 update) */
	g->edges[mirror] = v1;
	g->next[mirror] = g->first[v2];
	g->first[v2] = mirror;

	g->cedges = e + 1;
}
/* Return 1 when the edge owning slot e joins v1 and v2 (in either order),
 * 0 otherwise. */
static int check_edge(graph_t *g, cmph_uint32 e, cmph_uint32 v1, cmph_uint32 v2)
{
	const cmph_uint32 lower = g->edges[abs_edge(e, 0)];
	const cmph_uint32 upper = g->edges[abs_edge(e, 1)];

	DEBUGP("Checking edge %u %u looking for %u %u\n", lower, upper, v1, v2);
	return (lower == v1 && upper == v2) || (lower == v2 && upper == v1);
}
/* Return the id (lower-half slot index) of the edge joining v1 and v2.
 * The edge must exist: reaching the end of v1's list trips an assert. */
cmph_uint32 graph_edge_id(graph_t *g, cmph_uint32 v1, cmph_uint32 v2)
{
	cmph_uint32 e = g->first[v1];

	assert(e != EMPTY);
	while (!check_edge(g, e, v1, v2))
	{
		e = g->next[e];
		assert(e != EMPTY);
	}
	return abs_edge(e, 0);
}
/* Unlink from v1's adjacency list the slot belonging to the edge (v1, v2).
 *
 * The edge must be present. An empty list is now caught by an assert, as
 * graph_edge_id() already does; before, EMPTY was used as an index into
 * g->next[]. */
static void del_edge_point(graph_t *g, cmph_uint32 v1, cmph_uint32 v2)
{
	cmph_uint32 e, prev;

	DEBUGP("Deleting edge point %u %u\n", v1, v2);
	e = g->first[v1];
	assert(e != EMPTY);

	if (check_edge(g, e, v1, v2))
	{
		/* the edge is the list head: advance the head */
		g->first[v1] = g->next[e];
		//g->edges[e] = EMPTY;
		DEBUGP("Deleted\n");
		return;
	}

	DEBUGP("Checking linked list\n");
	do
	{
		prev = e;
		e = g->next[e];
		assert(e != EMPTY);
	}
	while (!check_edge(g, e, v1, v2));

	/* bypass the matching slot */
	g->next[prev] = g->next[e];
	//g->edges[e] = EMPTY;
	DEBUGP("Deleted\n");
}
/* Remove the edge (v1, v2) from the adjacency lists of both endpoints.
 * Sets g->shrinking, after which graph_add_edge() asserts until
 * graph_clear_edges() resets the flag. */
void graph_del_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2)
{
g->shrinking = 1;
del_edge_point(g, v1, v2);
del_edge_point(g, v2, v1);
}
/* Reset the graph to "no edges": every list head and every edge slot is set
 * to EMPTY, the edge counter returns to 0 and the shrinking flag is cleared. */
void graph_clear_edges(graph_t *g)
{
	cmph_uint32 node = 0;
	cmph_uint32 slot = 0;

	while (node < g->nnodes)
	{
		g->first[node] = EMPTY;
		++node;
	}
	while (slot < g->nedges*2)
	{
		g->edges[slot] = EMPTY;
		g->next[slot] = EMPTY;
		++slot;
	}
	g->shrinking = 0;
	g->cedges = 0;
}
/* Decide whether vertex v has exactly one incident edge whose bit in
 * `deleted` is clear. Returns 1 in that case, 0 otherwise (none, or more
 * than one). *e receives the first such slot found; it may therefore be
 * written even when 0 is returned because a second one exists. */
static cmph_uint8 find_degree1_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *deleted, cmph_uint32 *e)
{
	cmph_uint32 cur = g->first[v];
	cmph_uint8 have_one = 0;

	DEBUGP("Checking degree of vertex %u connected to edge %u\n", v, cur);
	if (cur == EMPTY) return 0;

	/* list head */
	if (!(GETBIT(deleted, abs_edge(cur, 0))))
	{
		*e = cur;
		have_one = 1;
	}

	/* remainder of the list */
	for (cur = g->next[cur]; cur != EMPTY; cur = g->next[cur])
	{
		if (GETBIT(deleted, abs_edge(cur, 0))) continue;
		if (have_one) return 0;
		DEBUGP("Found first edge\n");
		*e = cur;
		have_one = 1;
	}
	return have_one;
}
/* Starting at v, repeatedly mark as deleted the only remaining edge of the
 * current vertex and move on to that edge's other endpoint, for as long as
 * the vertex reached has exactly one remaining edge. */
static void cyclic_del_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *deleted)
{
	cmph_uint32 e = 0;
	cmph_uint32 here = v;
	cmph_uint32 there = 0;
	cmph_uint8 single = find_degree1_edge(g, here, deleted, &e);

	while (single)
	{
		DEBUGP("Deleting edge %u (%u->%u)\n", e, g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]);
		SETBIT(deleted, abs_edge(e, 0));

		/* pick the endpoint of e that is not the current vertex */
		there = g->edges[abs_edge(e, 0)];
		if (there == here) there = g->edges[abs_edge(e, 1)];

		DEBUGP("Checking if second endpoint %u has degree 1\n", there);
		single = find_degree1_edge(g, there, deleted, &e);
		if (single)
		{
			DEBUGP("Inspecting vertex %u\n", there);
			here = there;
		}
	}
}
/* Return 1 if some edge survives cyclic_del_edge() being run from every
 * vertex, 0 if every edge ends up marked deleted. */
int graph_is_cyclic(graph_t *g)
{
	cmph_uint32 edge;
	cmph_uint32 vertex;
	int cyclic = 0;
	/* one bit per edge, rounded up with a spare byte */
	size_t deleted_len = g->nedges/8 + 1;
	cmph_uint8 *deleted = (cmph_uint8 *)malloc(deleted_len);

	memset(deleted, 0, deleted_len);
	DEBUGP("Looking for cycles in graph with %u vertices and %u edges\n", g->nnodes, g->nedges);

	for (vertex = 0; vertex < g->nnodes; ++vertex)
	{
		cyclic_del_edge(g, vertex, deleted);
	}

	/* stop at the first edge that was not deleted */
	for (edge = 0; !cyclic && edge < g->nedges; ++edge)
	{
		if (!(GETBIT(deleted, edge)))
		{
			DEBUGP("Edge %u %u->%u was not deleted\n", edge, g->edges[edge], g->edges[edge + g->nedges]);
			cyclic = 1;
		}
	}

	free(deleted);
	return cyclic;
}
/* Return bit v of g->critical_nodes. The bitmap is allocated by
 * graph_obtain_critical_nodes(); before that call g->critical_nodes is NULL. */
cmph_uint8 graph_node_is_critical(graph_t * g, cmph_uint32 v) /* included -- Fabiano */
{
return (cmph_uint8)GETBIT(g->critical_nodes,v);
}
/* Rebuild g->critical_nodes and g->ncritical_nodes: after running
 * cyclic_del_edge() from every vertex, both endpoints of every edge that was
 * not deleted are flagged as critical (each vertex is counted once). */
void graph_obtain_critical_nodes(graph_t *g) /* included -- Fabiano*/
{
	cmph_uint32 edge;
	cmph_uint32 vertex;
	cmph_uint32 side;
	const size_t deleted_len = g->nedges/8 + 1;
	const size_t critical_len = (g->nnodes*sizeof(cmph_uint8))/8 + 1;
	cmph_uint8 *deleted = (cmph_uint8 *)malloc(deleted_len);

	memset(deleted, 0, deleted_len);

	/* throw away any previous bitmap and start from an all-zero one */
	free(g->critical_nodes);
	g->critical_nodes = (cmph_uint8 *)malloc(critical_len);
	g->ncritical_nodes = 0;
	memset(g->critical_nodes, 0, critical_len);

	DEBUGP("Looking for the 2-core in graph with %u vertices and %u edges\n", g->nnodes, g->nedges);
	for (vertex = 0; vertex < g->nnodes; ++vertex)
	{
		cyclic_del_edge(g, vertex, deleted);
	}

	for (edge = 0; edge < g->nedges; ++edge)
	{
		if (GETBIT(deleted, edge)) continue;
		DEBUGP("Edge %u %u->%u belongs to the 2-core\n", edge, g->edges[edge], g->edges[edge + g->nedges]);

		/* side 0 is slot `edge`, side 1 is slot `edge + nedges` */
		for (side = 0; side < 2; ++side)
		{
			const cmph_uint32 endpoint = g->edges[edge + side*g->nedges];
			if (GETBIT(g->critical_nodes, endpoint)) continue;
			g->ncritical_nodes ++;
			SETBIT(g->critical_nodes, endpoint);
		}
	}
	free(deleted);
}
/* Return 1 when v1's adjacency list contains an edge joining v1 and v2,
 * 0 otherwise. */
cmph_uint8 graph_contains_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) /* included -- Fabiano*/
{
	cmph_uint32 slot;

	for (slot = g->first[v1]; slot != EMPTY; slot = g->next[slot])
	{
		if (check_edge(g, slot, v1, v2)) return 1;
	}
	return 0;
}
/* Return the vertex stored in slot e + id*nedges: with id 0 the lower-half
 * slot of edge e, with id 1 its upper-half slot. No bounds checking. */
cmph_uint32 graph_vertex_id(graph_t *g, cmph_uint32 e, cmph_uint32 id) /* included -- Fabiano*/
{
return (g->edges[e + id*g->nedges]);
}
/* Return the critical-vertex count computed by the last call to
 * graph_obtain_critical_nodes() (0 for a freshly created graph). */
cmph_uint32 graph_ncritical_nodes(graph_t *g) /* included -- Fabiano*/
{
return g->ncritical_nodes;
}
/* Create an iterator positioned at the head of v's adjacency list, to be
 * advanced with graph_next_neighbor(). */
graph_iterator_t graph_neighbors_it(graph_t *g, cmph_uint32 v)
{
graph_iterator_t it;
it.vertex = v;
it.edge = g->first[v]; /* EMPTY when v has no edges */
return it;
}
/* Return the next neighbor of it->vertex and advance the iterator, or
 * GRAPH_NO_NEIGHBOR once the adjacency list is exhausted.
 *
 * it->edge may be an upper-half slot (graph_add_edge() links slot
 * e + nedges into the second endpoint's list). The opposite-endpoint lookup
 * therefore goes through abs_edge(), which wraps the slot back to its edge
 * before selecting the upper half. The previous `it->edge + g->nedges`
 * indexed beyond the 2*nedges array in that situation. For lower-half slots
 * the index is unchanged. */
cmph_uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it)
{
	cmph_uint32 ret;

	if(it->edge == EMPTY) return GRAPH_NO_NEIGHBOR;
	if (g->edges[it->edge] == it->vertex) ret = g->edges[abs_edge(it->edge, 1)];
	else ret = g->edges[it->edge];
	it->edge = g->next[it->edge];
	return ret;
}
cmph-2.0.2/src/linear_string_map.h 0000644 0001750 0001750 00000001127 13411542035 016440 0 ustar joseph joseph // A simple linked list based dynamic sized associative map from const char* to
// void*. Designed to maximize ease of use instead of performance. Should be
// used in benchmarks and tests only, not to be distributed with the cmph
// runtime headers.
typedef struct __linear_string_map_t lsmap_t;
lsmap_t *lsmap_new();
void lsmap_append(lsmap_t *lsmap, const char *key, void *value);
void* lsmap_search(lsmap_t *lsmap, const char *key);
void lsmap_foreach_key(lsmap_t* lsmap, void (*f)(const char*));
void lsmap_foreach_value(lsmap_t* lsmap, void (*f)(void*));
void lsmap_destroy(lsmap_t* lsmap);
cmph-2.0.2/src/bm_numbers.c 0000644 0001750 0001750 00000007600 13411542035 015071 0 ustar joseph joseph #include
#include
#include "bitbool.h"
#include "cmph.h"
#include "cmph_benchmark.h"
#include "linear_string_map.h"
// Generates a vector with random unique 32 bits integers.
// Uniqueness is enforced with a bitmap of 32*size bits indexed by the value
// modulo the bitmap size; a candidate whose bit is already set is redrawn.
// The caller owns the returned array.
cmph_uint32* random_numbers_vector_new(cmph_uint32 size) {
  cmph_uint32 filled;
  cmph_uint32 dup_bits = sizeof(cmph_uint32)*size*8;
  char* seen = (char*)malloc(dup_bits/8);
  cmph_uint32* out = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*size);

  memset(seen, 0, dup_bits/8);
  for (filled = 0; filled < size; ++filled) {
    cmph_uint32 candidate;
    cmph_uint32 bit;
    do {
      candidate = random();
      bit = candidate % dup_bits;
    } while (GETBIT(seen, bit));
    SETBIT(seen, bit);
    out[filled] = candidate;
  }
  free(seen);
  return out;
}
// qsort/bsearch-style comparator for cmph_uint32 values.
// Returns -1, 0 or 1. The operands are compared rather than subtracted:
// an unsigned difference converted to int has the wrong sign whenever the
// two values are more than INT_MAX apart.
int cmph_uint32_cmp(const void *a, const void *b) {
  const cmph_uint32 lhs = *(const cmph_uint32*)a;
  const cmph_uint32 rhs = *(const cmph_uint32*)b;
  return (lhs > rhs) - (lhs < rhs);
}
// Builds the map key "<algorithm name>:<iters>" for an (algo, iters) pair.
// Returns a strdup()'d string that the caller must free().
char* create_lsmap_key(CMPH_ALGO algo, int iters) {
  char mphf_name[128];
  // %u expects an unsigned int, so the int argument is converted explicitly;
  // the buffer size comes from the array itself instead of a repeated literal.
  snprintf(mphf_name, sizeof(mphf_name), "%s:%u", cmph_names[algo], (unsigned int)iters);
  return strdup(mphf_name);
}
// Benchmark-wide state, initialised in main().
static cmph_uint32 g_numbers_len = 0; // number of entries in g_numbers
static cmph_uint32 *g_numbers = NULL; // keys produced by random_numbers_vector_new()
static lsmap_t *g_created_mphf = NULL; // create_lsmap_key() string -> cmph_t* stored by bm_create()
static lsmap_t *g_expected_probes = NULL; // create_lsmap_key() string -> per-key counters stored by bm_search()
static lsmap_t *g_mphf_probes = NULL; // create_lsmap_key() string -> per-hash-value counters stored by bm_search()
// Benchmark body: builds an mphf with `algo` over the first `iters` entries of
// g_numbers and stores it in g_created_mphf under create_lsmap_key(algo, iters).
// Exits the process when there are not enough keys or construction fails.
void bm_create(CMPH_ALGO algo, int iters) {
  cmph_io_adapter_t* keys;
  cmph_config_t* cfg;
  cmph_t* built;

  if ((int)g_numbers_len < iters) {
    fprintf(stderr, "No input with proper size.");
    exit(-1);
  }

  // Each key is one whole cmph_uint32 element of g_numbers.
  keys = cmph_io_struct_vector_adapter(
      (void*)g_numbers, sizeof(cmph_uint32),
      0, sizeof(cmph_uint32), iters);
  cfg = cmph_config_new(keys);
  cmph_config_set_algo(cfg, algo);

  built = cmph_new(cfg);
  if (built == NULL) {
    fprintf(stderr, "Failed to create mphf for algorithm %s with %u keys",
            cmph_names[algo], iters);
    exit(-1);
  }

  cmph_config_destroy(cfg);
  cmph_io_struct_vector_adapter_destroy(keys);
  lsmap_append(g_created_mphf, create_lsmap_key(algo, iters), built);
}
// Benchmark body: looks up the mphf stored by bm_create() for (algo, iters)
// and performs iters*100 searches on randomly chosen keys, counting how often
// each key position and each hash value was hit. The two counter arrays are
// stored in g_expected_probes and g_mphf_probes.
//
// Fixes over the previous version:
//  - the counter arrays are zero-initialised (they were malloc()'d and then
//    incremented without ever being cleared);
//  - a missing mphf and failed allocations are reported instead of being
//    dereferenced.
void bm_search(CMPH_ALGO algo, int iters) {
  int i = 0;
  char *mphf_name;
  cmph_t* mphf = NULL;
  cmph_uint32* count;
  cmph_uint32* hash_count;

  mphf_name = create_lsmap_key(algo, iters);
  mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name);
  free(mphf_name);
  if (!mphf) {
    fprintf(stderr, "No mphf found for algorithm %s with %u keys",
            cmph_names[algo], (unsigned int)iters);
    exit(-1);
  }

  count = (cmph_uint32*)calloc((size_t)iters, sizeof(cmph_uint32));
  hash_count = (cmph_uint32*)calloc((size_t)iters, sizeof(cmph_uint32));
  // calloc(0, ...) may return NULL, so NULL is only an error for iters > 0.
  if (iters > 0 && (!count || !hash_count)) {
    fprintf(stderr, "Failed to allocate probe counters for %u keys",
            (unsigned int)iters);
    exit(-1);
  }

  for (i = 0; i < iters * 100; ++i) {
    cmph_uint32 pos = random() % iters;
    const char* buf = (const char*)(g_numbers + pos);
    cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32));
    ++count[pos];
    ++hash_count[h];
  }

  // Verify correctness later.
  lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count);
  lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count);
}
// Placeholder called by main() after the benchmarks: the body is empty, so
// nothing is checked yet.
void verify() { }
// For a CMPH_ALGO enumerator X, defines bm_create_X(int) and bm_search_X(int),
// thin wrappers that forward to bm_create(X, iters) and bm_search(X, iters).
#define DECLARE_ALGO(algo) \
void bm_create_ ## algo(int iters) { bm_create(algo, iters); } \
void bm_search_ ## algo(int iters) { bm_search(algo, iters); }
// One wrapper pair per algorithm; main() decides which of them are registered.
DECLARE_ALGO(CMPH_BMZ);
DECLARE_ALGO(CMPH_CHM);
DECLARE_ALGO(CMPH_BRZ);
DECLARE_ALGO(CMPH_FCH);
DECLARE_ALGO(CMPH_BDZ);
int main(int argc, char** argv) {
g_numbers_len = 1000 * 1000;
g_numbers = random_numbers_vector_new(g_numbers_len);
g_created_mphf = lsmap_new();
g_expected_probes = lsmap_new();
g_mphf_probes = lsmap_new();
BM_REGISTER(bm_create_CMPH_BMZ, 1000 * 1000);
BM_REGISTER(bm_search_CMPH_BMZ, 1000 * 1000);
BM_REGISTER(bm_create_CMPH_CHM, 1000 * 1000);
BM_REGISTER(bm_search_CMPH_CHM, 1000 * 1000);
// BM_REGISTER(bm_create_CMPH_BRZ, 1000 * 1000);
// BM_REGISTER(bm_search_CMPH_BRZ, 1000 * 1000);
// BM_REGISTER(bm_create_CMPH_FCH, 1000 * 1000);
// BM_REGISTER(bm_search_CMPH_FCH, 1000 * 1000);
BM_REGISTER(bm_create_CMPH_BDZ, 1000 * 1000);
BM_REGISTER(bm_search_CMPH_BDZ, 1000 * 1000);
run_benchmarks(argc, argv);
verify();
free(g_numbers);
lsmap_foreach_key(g_created_mphf, (void(*)(const char*))free);
lsmap_foreach_value(g_created_mphf, (void(*)(void*))cmph_destroy);
lsmap_destroy(g_created_mphf);
return 0;
}
cmph-2.0.2/src/cmph.h 0000644 0001750 0001750 00000010055 13411542035 013672 0 ustar joseph joseph #ifndef __CMPH_H__
#define __CMPH_H__
#include
#include
#ifdef __cplusplus
extern "C"
{
#endif
#include "cmph_types.h"
typedef struct __config_t cmph_config_t;
typedef struct __cmph_t cmph_t;
/* Key source handed to cmph_config_new().
 * NOTE(review): the field roles below are inferred from names and signatures
 * only -- confirm against the adapter implementations. */
typedef struct
{
void *data; /* adapter-private state; presumably the first argument of the callbacks */
cmph_uint32 nkeys; /* presumably the number of keys the source provides */
int (*read)(void *, char **, cmph_uint32 *); /* presumably fetches the next key and its length */
void (*dispose)(void *, char *, cmph_uint32); /* presumably releases a key obtained from read */
void (*rewind)(void *); /* presumably restarts iteration at the first key */
} cmph_io_adapter_t;
/** Adapter pattern API **/
/* please call free() in the created adapters */
cmph_io_adapter_t *cmph_io_nlfile_adapter(FILE * keys_fd);
void cmph_io_nlfile_adapter_destroy(cmph_io_adapter_t * key_source);
cmph_io_adapter_t *cmph_io_nlnkfile_adapter(FILE * keys_fd, cmph_uint32 nkeys);
void cmph_io_nlnkfile_adapter_destroy(cmph_io_adapter_t * key_source);
cmph_io_adapter_t *cmph_io_vector_adapter(char ** vector, cmph_uint32 nkeys);
void cmph_io_vector_adapter_destroy(cmph_io_adapter_t * key_source);
cmph_io_adapter_t *cmph_io_byte_vector_adapter(cmph_uint8 ** vector, cmph_uint32 nkeys);
void cmph_io_byte_vector_adapter_destroy(cmph_io_adapter_t * key_source);
cmph_io_adapter_t *cmph_io_struct_vector_adapter(void * vector,
cmph_uint32 struct_size,
cmph_uint32 key_offset,
cmph_uint32 key_len,
cmph_uint32 nkeys);
void cmph_io_struct_vector_adapter_destroy(cmph_io_adapter_t * key_source);
/** Hash configuration API **/
cmph_config_t *cmph_config_new(cmph_io_adapter_t *key_source);
void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void cmph_config_set_verbosity(cmph_config_t *mph, cmph_uint32 verbosity);
void cmph_config_set_graphsize(cmph_config_t *mph, double c);
void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo);
void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir);
void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd);
void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b);
void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability);
void cmph_config_destroy(cmph_config_t *mph);
/** Hash API **/
cmph_t *cmph_new(cmph_config_t *mph);
/** cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
* \brief Computes the mphf value.
* \param mphf pointer to the resulting function
* \param key is the key to be hashed
* \param keylen is the key length in bytes
* \return The mphf value
*/
cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
cmph_uint32 cmph_size(cmph_t *mphf);
void cmph_destroy(cmph_t *mphf);
/** Hash serialization/deserialization */
int cmph_dump(cmph_t *mphf, FILE *f);
cmph_t *cmph_load(FILE *f);
/** \fn void cmph_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the
* resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void cmph_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 cmph_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 cmph_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
// TIMING functions. To use the macro CMPH_TIMING must be defined
#include "cmph_time.h"
#ifdef __cplusplus
}
#endif
#endif
cmph-2.0.2/src/chd_ph.h 0000644 0001750 0001750 00000004567 13411542035 014203 0 ustar joseph joseph #ifndef _CMPH_CHD_PH_H__
#define _CMPH_CHD_PH_H__
#include "cmph.h"
typedef struct __chd_ph_data_t chd_ph_data_t;
typedef struct __chd_ph_config_data_t chd_ph_config_data_t;
/* Config API */
chd_ph_config_data_t *chd_ph_config_new(void);
void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
/** \fn void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
* \brief Allows to set the number of keys per bin.
* \param mph pointer to the configuration structure
* \param keys_per_bin value for the number of keys per bin
*/
void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
/** \fn void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket);
* \brief Allows to set the number of keys per bucket.
* \param mph pointer to the configuration structure
* \param keys_per_bucket value for the number of keys per bucket
*/
void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket);
void chd_ph_config_destroy(cmph_config_t *mph);
/* Chd algorithm API */
cmph_t *chd_ph_new(cmph_config_t *mph, double c);
void chd_ph_load(FILE *fd, cmph_t *mphf);
int chd_ph_dump(cmph_t *mphf, FILE *fd);
void chd_ph_destroy(cmph_t *mphf);
cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void chd_ph_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void chd_ph_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 chd_ph_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 chd_ph_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/bdz.c 0000755 0001750 0001750 00000052046 13411542035 013526 0 ustar joseph joseph #include "bdz.h"
#include "cmph_structs.h"
#include "bdz_structs.h"
#include "hash.h"
#include "bitbool.h"
#include
#include
#include
#include
#include
// #define DEBUG
#include "debug.h"
#define UNASSIGNED 3U
#define NULL_EDGE 0xffffffff
//cmph_uint32 ngrafos = 0;
//cmph_uint32 ngrafos_aciclicos = 0;
// table used for looking up the number of assigned vertices in an 8-bit integer
const cmph_uint8 bdz_lookup_table[] =
{
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2,
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1,
2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0
};
/* One 3-vertex edge of the BDZ graph. */
typedef struct
{
cmph_uint32 vertices[3]; /* the three endpoints, in the order given to bdz_add_edge() */
cmph_uint32 next_edges[3]; /* next_edges[k]: following edge in the list of vertices[k] */
}bdz_edge_t;
/* Queue of edge indices, backed by a plain array (see bdz_alloc_queue()). */
typedef cmph_uint32 * bdz_queue_t;
/* Allocate room for nedges edge indices and store the array in *queuep.
 * The malloc() result is not checked here. */
static void bdz_alloc_queue(bdz_queue_t * queuep, cmph_uint32 nedges)
{
(*queuep)=(cmph_uint32 *)malloc(nedges*sizeof(cmph_uint32));
};
/* Free the array behind *queue. The pointer itself is left unchanged. */
static void bdz_free_queue(bdz_queue_t * queue)
{
free(*queue);
};
/* Graph of 3-vertex edges with one linked list of edges per vertex. */
typedef struct
{
cmph_uint32 nedges; /* number of edges added so far */
bdz_edge_t * edges; /* edge records, indexed by edge number */
cmph_uint32 * first_edge; /* per vertex: head of its edge list (NULL_EDGE when empty) */
cmph_uint8 * vert_degree; /* per vertex: number of edges currently linked to it */
}bdz_graph3_t;
/* Allocate the three arrays of graph3 for nedges edges and nvertices
 * vertices. Nothing is initialised (see bdz_init_graph3()) and the malloc()
 * results are not checked here. */
static void bdz_alloc_graph3(bdz_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices)
{
graph3->edges=(bdz_edge_t *)malloc(nedges*sizeof(bdz_edge_t));
graph3->first_edge=(cmph_uint32 *)malloc(nvertices*sizeof(cmph_uint32));
graph3->vert_degree=(cmph_uint8 *)malloc((size_t)nvertices);
};
/* Reset graph3 to an empty graph over nvertices vertices. The nedges
 * parameter is not used by this function. */
static void bdz_init_graph3(bdz_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices)
{
/* all-ones bytes: every list head becomes 0xffffffff, i.e. NULL_EDGE */
memset(graph3->first_edge,0xff,nvertices*sizeof(cmph_uint32));
memset(graph3->vert_degree,0,(size_t)nvertices);
graph3->nedges=0;
};
/* Free the three arrays owned by graph3 (not the bdz_graph3_t itself). */
static void bdz_free_graph3(bdz_graph3_t *graph3)
{
free(graph3->edges);
free(graph3->first_edge);
free(graph3->vert_degree);
};
/* Free only the per-vertex arrays and keep graph3->edges. The freed pointers
 * are set to NULL, so a later bdz_free_graph3() passes NULL to free() for
 * them. */
static void bdz_partial_free_graph3(bdz_graph3_t *graph3)
{
free(graph3->first_edge);
free(graph3->vert_degree);
graph3->first_edge = NULL;
graph3->vert_degree = NULL;
};
/* Append the edge (v0, v1, v2) as edge number graph3->nedges: it is pushed
 * onto the front of each endpoint's list and each endpoint's degree is
 * incremented. */
static void bdz_add_edge(bdz_graph3_t * graph3, cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2)
{
	const cmph_uint32 id = graph3->nedges;
	bdz_edge_t *rec = &graph3->edges[id];

	rec->vertices[0] = v0;
	rec->vertices[1] = v1;
	rec->vertices[2] = v2;

	/* remember the previous list heads before overwriting them */
	rec->next_edges[0] = graph3->first_edge[v0];
	rec->next_edges[1] = graph3->first_edge[v1];
	rec->next_edges[2] = graph3->first_edge[v2];

	graph3->first_edge[v2] = id;
	graph3->first_edge[v1] = id;
	graph3->first_edge[v0] = id;

	graph3->vert_degree[v0]++;
	graph3->vert_degree[v1]++;
	graph3->vert_degree[v2]++;

	graph3->nedges = id + 1;
}
/* Print the first nedges edges of graph3 (vertices and next-edge links) on
 * stdout; with DEBUG defined, also print the list head of the first
 * nvertices vertices.
 *
 * NOTE(review): both loop headers and the start of the first and last
 * printf() calls were damaged in this copy of the file (text between '<'
 * and '>' was lost). The loop bounds follow from the parameters; the two
 * format strings "\nedge ..." and "\nfirst for vertice ..." are
 * reconstructions and must be checked against the upstream source. */
static void bdz_dump_graph(bdz_graph3_t* graph3, cmph_uint32 nedges, cmph_uint32 nvertices)
{
	cmph_uint32 i;
	for(i=0;i<nedges;i++){
		printf("\nedge %d %d %d %d ",i,graph3->edges[i].vertices[0],
			graph3->edges[i].vertices[1],graph3->edges[i].vertices[2]);
		printf(" nexts %d %d %d",graph3->edges[i].next_edges[0],
			graph3->edges[i].next_edges[1],graph3->edges[i].next_edges[2]);
	}
#ifdef DEBUG
	for(i=0;i<nvertices;i++){
		printf("\nfirst for vertice %d %d ",i,graph3->first_edge[i]);
	}
#endif
}
/* Unlink curr_edge from the edge list of each of its three vertices and
 * decrement those vertices' degrees. If curr_edge is not found on one of the
 * lists, an error is printed, the graph is dumped and the process exits. */
static void bdz_remove_edge(bdz_graph3_t * graph3, cmph_uint32 curr_edge)
{
/* j: position of vert inside the predecessor edge (edge2); it keeps its
 * value across iterations of the outer loop */
cmph_uint32 i,j=0,vert,edge1,edge2;
for(i=0;i<3;i++){
vert=graph3->edges[curr_edge].vertices[i];
edge1=graph3->first_edge[vert];
edge2=NULL_EDGE;
/* walk vert's list until curr_edge (or the end) is reached; edge2 trails
 * one step behind edge1 */
while(edge1!=curr_edge&&edge1!=NULL_EDGE){
edge2=edge1;
/* find which of edge1's three slots belongs to vert */
if(graph3->edges[edge1].vertices[0]==vert){
j=0;
} else if(graph3->edges[edge1].vertices[1]==vert){
j=1;
} else
j=2;
edge1=graph3->edges[edge1].next_edges[j];
};
/* curr_edge was not on vert's list */
if(edge1==NULL_EDGE){
printf("\nerror remove edge %d dump graph",curr_edge);
bdz_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4);
exit(-1);
};
/* here edge1 == curr_edge; its successor on vert's list is next_edges[i] */
if(edge2!=NULL_EDGE){
/* predecessor exists: bypass curr_edge */
graph3->edges[edge2].next_edges[j] =
graph3->edges[edge1].next_edges[i];
} else
/* curr_edge was the list head */
graph3->first_edge[vert]=
graph3->edges[edge1].next_edges[i];
graph3->vert_degree[vert]--;
};
};
/* Peel the hypergraph: repeatedly remove edges that contain a vertex of
 * degree 1 and record the removal order in `queue`.
 *
 * nedges    - number of edges in graph3
 * nvertices - not used by this function
 * queue     - output array receiving edge indices in removal order
 * graph3    - hypergraph; its incidence lists and degrees are modified
 *
 * Returns 0 when every edge was queued, a negative number otherwise. */
static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_queue_t queue, bdz_graph3_t* graph3)
{
	cmph_uint32 i,v0,v1,v2;
	cmph_uint32 queue_head=0,queue_tail=0;
	cmph_uint32 curr_edge;
	cmph_uint32 tmp_edge;
	/* One bit per edge: set once the edge has been placed in the queue. */
	cmph_uint8 * marked_edge = (cmph_uint8 *)malloc((size_t)(nedges >> 3) + 1);
	memset(marked_edge, 0, (size_t)(nedges >> 3) + 1);

	/* Seed the queue with every edge that already has a degree-1 vertex.
	 * (The loop header was garbled in this copy of the file and is restored
	 * here; the closing braces below match it.) */
	for(i=0;i<nedges;i++){
		v0=graph3->edges[i].vertices[0];
		v1=graph3->edges[i].vertices[1];
		v2=graph3->edges[i].vertices[2];
		if(graph3->vert_degree[v0]==1 ||
			graph3->vert_degree[v1]==1 ||
			graph3->vert_degree[v2]==1){
			if(!GETBIT(marked_edge,i)) {
				queue[queue_head++]=i;
				SETBIT(marked_edge,i);
			}
		};
	};
	DEBUGP("Queue head %d Queue tail %d\n", queue_head, queue_tail);
	#ifdef DEBUG
	bdz_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4);
	#endif

	/* Process the queue: removing an edge may drop one of its vertices to
	 * degree 1, which makes that vertex's single remaining edge removable. */
	while(queue_tail!=queue_head){
		curr_edge=queue[queue_tail++];
		bdz_remove_edge(graph3,curr_edge);
		DEBUGP("Removing edge %d\n", curr_edge);
		v0=graph3->edges[curr_edge].vertices[0];
		v1=graph3->edges[curr_edge].vertices[1];
		v2=graph3->edges[curr_edge].vertices[2];
		if(graph3->vert_degree[v0]==1 ) {
			tmp_edge=graph3->first_edge[v0];
			if(!GETBIT(marked_edge,tmp_edge)) {
				queue[queue_head++]=tmp_edge;
				SETBIT(marked_edge,tmp_edge);
			};
		};
		if(graph3->vert_degree[v1]==1) {
			tmp_edge=graph3->first_edge[v1];
			if(!GETBIT(marked_edge,tmp_edge)){
				queue[queue_head++]=tmp_edge;
				SETBIT(marked_edge,tmp_edge);
			};
		};
		if(graph3->vert_degree[v2]==1){
			tmp_edge=graph3->first_edge[v2];
			if(!GETBIT(marked_edge,tmp_edge)){
				queue[queue_head++]=tmp_edge;
				SETBIT(marked_edge,tmp_edge);
			};
		};
	};
	free(marked_edge);
	return (int)(queue_head-nedges);/* returns 0 if successful otherwise returns a negative number */
};
/* Forward declarations of the file-local construction steps defined below. */
/* Builds the hypergraph from the key source and peels it; returns non-zero on success. */
static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t queue);
/* Fills bdz->g by walking the peeling queue in reverse order. */
static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t queue);
/* Fills bdz->ranktable with cumulative counts derived from bdz->g. */
static void ranking(bdz_config_data_t *bdz);
/* Returns the rank of `vertex`: ranktable[vertex >> b] plus a scan of g up to the vertex. */
static cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint8 * g, cmph_uint32 vertex);
/* Allocate a zero-initialised BDZ configuration and set its defaults
 * (Jenkins hash, b = 7). Returns NULL when the allocation fails. */
bdz_config_data_t *bdz_config_new(void)
{
	bdz_config_data_t *cfg = (bdz_config_data_t *)calloc((size_t)1, sizeof(bdz_config_data_t));
	if (cfg == NULL)
	{
		return NULL;
	}
	cfg->hashfunc = CMPH_HASH_JENKINS;
	cfg->hl = NULL;
	cfg->g = NULL;
	cfg->ranktable = NULL;     // rank table
	cfg->ranktablesize = 0;    // number of entries in ranktable, $n/k +1$
	cfg->b = 7;                // number of bits of k
	cfg->k = 0;                // kth index in ranktable, $k = log_2(n=3r)/\varepsilon$
	return cfg;
}
/* Free the BDZ-specific configuration block attached to `mph`.
 * Only the block itself is freed; the pointers stored inside it are not. */
void bdz_config_destroy(cmph_config_t *mph)
{
	bdz_config_data_t *bdz_cfg = (bdz_config_data_t *)mph->data;

	DEBUGP("Destroying algorithm dependent data\n");
	free(bdz_cfg);
}
/* Set the BDZ parameter b. Values outside [3,10] are replaced by the
 * default of 7 before being stored. */
void bdz_config_set_b(cmph_config_t *mph, cmph_uint32 b)
{
	bdz_config_data_t *cfg = (bdz_config_data_t *)mph->data;

	// validating restrictions over parameter b.
	if (b < 3 || b > 10)
	{
		b = 7;
	}
	cfg->b = (cmph_uint8)b;
	DEBUGP("b: %u\n", b);
}
/* Select the hash function used by BDZ from a CMPH_HASH_COUNT-terminated
 * list. Only the first entry is used; an empty list leaves the current
 * setting unchanged. */
void bdz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	bdz_config_data_t *cfg = (bdz_config_data_t *)mph->data;

	//bdz only uses one linear hash function
	if (hashfuncs[0] != CMPH_HASH_COUNT)
	{
		cfg->hashfunc = hashfuncs[0];
	}
}
/* Build a BDZ minimal perfect hash function for the keys in mph->key_source.
 *
 * mph - configuration; mph->data must be a bdz_config_data_t
 * c   - graph size factor (number of vertices is about c * nkeys);
 *       0 selects the default of 1.23
 *
 * The construction runs a mapping step (retried with a fresh hash state up to
 * 1000 times), then the assigning and ranking steps. On success ownership of
 * g, hl and ranktable moves from the configuration to the returned object.
 *
 * Returns the new function, or NULL when no acyclic graph was found within
 * the iteration budget or when the result object cannot be allocated. */
cmph_t *bdz_new(cmph_config_t *mph, double c)
{
	cmph_t *mphf = NULL;
	bdz_data_t *bdzf = NULL;
	cmph_uint32 iterations;
	bdz_queue_t edges;
	bdz_graph3_t graph3;
	bdz_config_data_t *bdz = (bdz_config_data_t *)mph->data;
	#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
	#endif
	if (c == 0) c = 1.23; // validating restrictions over parameter c.
	DEBUGP("c: %f\n", c);
	bdz->m = mph->key_source->nkeys;
	bdz->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3);
	if ((bdz->r % 2) == 0) bdz->r+=1;
	if (bdz->r == 1) { // workaround for small key sets
		bdz->r = 3;
	}
	bdz->n = 3*bdz->r;
	bdz->k = (1U << bdz->b);
	DEBUGP("b: %u -- k: %u\n", bdz->b, bdz->k);
	bdz->ranktablesize = (cmph_uint32)ceil(bdz->n/(double)bdz->k);
	DEBUGP("ranktablesize: %u\n", bdz->ranktablesize);
	bdz_alloc_graph3(&graph3, bdz->m, bdz->n);
	bdz_alloc_queue(&edges,bdz->m);
	DEBUGP("Created hypergraph\n");
	DEBUGP("m (edges): %u n (vertices): %u r: %u c: %f \n", bdz->m, bdz->n, bdz->r, c);
	// Mapping step
	iterations = 1000;
	if (mph->verbosity)
	{
		fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n);
	}
	while(1)
	{
		int ok;
		DEBUGP("linear hash function \n");
		bdz->hl = hash_state_new(bdz->hashfunc, 15);
		ok = bdz_mapping(mph, &graph3, edges);
		//ok = 0;
		if (!ok)
		{
			--iterations;
			hash_state_destroy(bdz->hl);
			bdz->hl = NULL;
			DEBUGP("%u iterations remaining\n", iterations);
			if (mph->verbosity)
			{
				fprintf(stderr, "acyclic graph creation failure - %u iterations remaining\n", iterations);
			}
			if (iterations == 0) break;
		}
		else break;
	}
	if (iterations == 0)
	{
		bdz_free_queue(&edges);
		bdz_free_graph3(&graph3);
		return NULL;
	}
	// The per-vertex arrays are no longer needed once the queue is built.
	bdz_partial_free_graph3(&graph3);
	// Assigning step
	if (mph->verbosity)
	{
		fprintf(stderr, "Entering assigning step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n);
	}
	assigning(bdz, &graph3, edges);
	bdz_free_queue(&edges);
	bdz_free_graph3(&graph3);
	if (mph->verbosity)
	{
		fprintf(stderr, "Entering ranking step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n);
	}
	ranking(bdz);
	#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&construction_time);
	#endif
	mphf = (cmph_t *)malloc(sizeof(cmph_t));
	bdzf = (bdz_data_t *)malloc(sizeof(bdz_data_t));
	if (mphf == NULL || bdzf == NULL)
	{
		// Out of memory: drop everything built so far instead of
		// dereferencing a NULL pointer below.
		free(mphf);
		free(bdzf);
		free(bdz->g);
		bdz->g = NULL;
		hash_state_destroy(bdz->hl);
		bdz->hl = NULL;
		free(bdz->ranktable);
		bdz->ranktable = NULL;
		return NULL;
	}
	mphf->algo = mph->algo;
	bdzf->g = bdz->g;
	bdz->g = NULL; //transfer memory ownership
	bdzf->hl = bdz->hl;
	bdz->hl = NULL; //transfer memory ownership
	bdzf->ranktable = bdz->ranktable;
	bdz->ranktable = NULL; //transfer memory ownership
	bdzf->ranktablesize = bdz->ranktablesize;
	bdzf->k = bdz->k;
	bdzf->b = bdz->b;
	bdzf->n = bdz->n;
	bdzf->m = bdz->m;
	bdzf->r = bdz->r;
	mphf->data = bdzf;
	mphf->size = bdz->m;
	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
	#ifdef CMPH_TIMING
	register cmph_uint32 space_usage = bdz_packed_size(mphf)*8;
	register cmph_uint32 keys_per_bucket = 1;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz->m, bdz->m/(double)bdz->n, keys_per_bucket, construction_time, space_usage/(double)bdz->m);
	#endif
	return mphf;
}
/* Mapping step: reset the hypergraph, add one 3-edge per key (one vertex in
 * each third of the vertex range, chosen by the three hash values), then try
 * to peel the graph into `queue`.
 * Returns non-zero when the peeling consumed every edge. */
static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t queue)
{
	bdz_config_data_t *bdz = (bdz_config_data_t *)mph->data;
	cmph_uint32 hashes[3];
	cmph_uint32 idx;

	bdz_init_graph3(graph3, bdz->m, bdz->n);
	mph->key_source->rewind(mph->key_source->data);

	for (idx = 0; idx < mph->key_source->nkeys; ++idx)
	{
		char *key = NULL;
		cmph_uint32 keylen;
		cmph_uint32 h0, h1, h2;

		mph->key_source->read(mph->key_source->data, &key, &keylen);
		hash_vector(bdz->hl, key, keylen, hashes);

		/* Vertex k of the edge lives in [k*r, (k+1)*r). */
		h0 = hashes[0] % bdz->r;
		h1 = bdz->r + hashes[1] % bdz->r;
		h2 = (bdz->r << 1) + hashes[2] % bdz->r;
		DEBUGP("Key: %.*s (%u %u %u)\n", keylen, key, h0, h1, h2);

		mph->key_source->dispose(mph->key_source->data, key, keylen);
		bdz_add_edge(graph3,h0,h1,h2);
	}

	return bdz_generate_queue(bdz->m, bdz->n, queue, graph3) == 0;
}
/* Assigning step: allocate bdz->g (ceil(n/4) bytes, read and written through
 * the GETVALUE/SETVALUE1 macros) and fill it by visiting the queued edges from
 * the last queued to the first. For each edge the first of its vertices that
 * is not yet marked receives a value derived from the other two; unmarked
 * "other" vertices are first set to UNASSIGNED and marked. */
static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t queue)
{
cmph_uint32 i;
cmph_uint32 nedges=graph3->nedges;
cmph_uint32 curr_edge;
cmph_uint32 v0,v1,v2;
/* One bit per vertex: set once the vertex has been given a value. */
cmph_uint8 * marked_vertices = (cmph_uint8 *)malloc((size_t)(bdz->n >> 3) + 1);
cmph_uint32 sizeg = (cmph_uint32)ceil(bdz->n/4.0);
bdz->g = (cmph_uint8 *)calloc((size_t)(sizeg), sizeof(cmph_uint8));
memset(marked_vertices, 0, (size_t)(bdz->n >> 3) + 1);
/* Every byte of g starts as 0xff (all bits set). */
memset(bdz->g, 0xff, (size_t)(sizeg));
/* Unsigned countdown from nedges-1 to 0: the loop ends when i wraps around
 * so that i+1 becomes 0. */
for(i=nedges-1;i+1>=1;i--){
curr_edge=queue[i];
v0=graph3->edges[curr_edge].vertices[0];
v1=graph3->edges[curr_edge].vertices[1];
v2=graph3->edges[curr_edge].vertices[2];
DEBUGP("B:%u %u %u -- %u %u %u edge %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2), curr_edge);
/* Case 1: v0 is free and receives the value computed from v1 and v2. */
if(!GETBIT(marked_vertices, v0)){
if(!GETBIT(marked_vertices,v1))
{
SETVALUE1(bdz->g, v1, UNASSIGNED);
SETBIT(marked_vertices, v1);
}
if(!GETBIT(marked_vertices,v2))
{
SETVALUE1(bdz->g, v2, UNASSIGNED);
SETBIT(marked_vertices, v2);
}
SETVALUE1(bdz->g, v0, (6-(GETVALUE(bdz->g, v1) + GETVALUE(bdz->g,v2)))%3);
SETBIT(marked_vertices, v0);
/* Case 2: v0 already marked, v1 is free and receives the computed value. */
} else if(!GETBIT(marked_vertices, v1)) {
if(!GETBIT(marked_vertices, v2))
{
SETVALUE1(bdz->g, v2, UNASSIGNED);
SETBIT(marked_vertices, v2);
}
SETVALUE1(bdz->g, v1, (7-(GETVALUE(bdz->g, v0)+GETVALUE(bdz->g, v2)))%3);
SETBIT(marked_vertices, v1);
/* Case 3: v0 and v1 already marked, so v2 receives the computed value. */
}else {
SETVALUE1(bdz->g, v2, (8-(GETVALUE(bdz->g,v0)+GETVALUE(bdz->g, v1)))%3);
SETBIT(marked_vertices, v2);
}
DEBUGP("A:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2));
};
free(marked_vertices);
}
/* Ranking step: allocate bdz->ranktable and fill it with running totals.
 * Entry 0 is 0; entry i holds the sum of bdz_lookup_table[] over the bytes of
 * g that precede block i, where a block is k/4 bytes of g. */
static void ranking(bdz_config_data_t *bdz)
{
	const cmph_uint32 block_bytes = (bdz->k >> 2U);
	cmph_uint32 bytes_left = (cmph_uint32)ceil(bdz->n/4.0);
	cmph_uint32 pos = 0U;      /* first byte of g belonging to the current block */
	cmph_uint32 running = 0U;  /* total accumulated over all previous blocks */
	cmph_uint32 entry, take, step;

	bdz->ranktable = (cmph_uint32 *)calloc((size_t)bdz->ranktablesize, sizeof(cmph_uint32));
	// ranktable computation
	bdz->ranktable[0] = 0;

	for (entry = 1; entry != bdz->ranktablesize; entry++)
	{
		take = block_bytes < bytes_left ? block_bytes : bytes_left;
		for (step = 0; step < take; step++)
		{
			running += bdz_lookup_table[bdz->g[pos + step]];
		}
		bdz->ranktable[entry] = running;
		pos += take;
		bytes_left -= block_bytes;
	}
}
/* Write `len` bytes from `src` to `fd` as a single item.
 * A zero-length block counts as success (fwrite reports 0 items for it). */
static int bdz_write_block(const void *src, size_t len, FILE *fd)
{
	return len == 0 || fwrite(src, len, (size_t)1, fd) == 1;
}

/* Serialise a BDZ function to `fd`.
 * Layout after the common header written by __cmph_dump(): hash state length,
 * hash state, n, m, r, g (ceil(n/4) bytes), k, b, ranktablesize, ranktable.
 *
 * Returns 1 on success and 0 as soon as one of the writes fails; the
 * remaining fields are then not written. */
int bdz_dump(cmph_t *mphf, FILE *fd)
{
	char *buf = NULL;
	cmph_uint32 buflen;
	cmph_uint32 sizeg;
	int ok;
	bdz_data_t *data = (bdz_data_t *)mphf->data;
	__cmph_dump(mphf, fd);
	hash_state_dump(data->hl, &buf, &buflen);
	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
	ok = bdz_write_block(&buflen, sizeof(cmph_uint32), fd);
	ok = ok && bdz_write_block(buf, (size_t)buflen, fd);
	free(buf);
	ok = ok && bdz_write_block(&(data->n), sizeof(cmph_uint32), fd);
	ok = ok && bdz_write_block(&(data->m), sizeof(cmph_uint32), fd);
	ok = ok && bdz_write_block(&(data->r), sizeof(cmph_uint32), fd);
	sizeg = (cmph_uint32)ceil(data->n/4.0);
	ok = ok && bdz_write_block(data->g, sizeof(cmph_uint8)*sizeg, fd);
	ok = ok && bdz_write_block(&(data->k), sizeof(cmph_uint32), fd);
	ok = ok && bdz_write_block(&(data->b), sizeof(cmph_uint8), fd);
	ok = ok && bdz_write_block(&(data->ranktablesize), sizeof(cmph_uint32), fd);
	ok = ok && bdz_write_block(data->ranktable, sizeof(cmph_uint32)*(data->ranktablesize), fd);
	#ifdef DEBUG
	{
		cmph_uint32 i;
		fprintf(stderr, "G: ");
		for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", GETVALUE(data->g, i));
		fprintf(stderr, "\n");
	}
	#endif
	return ok ? 1 : 0;
}
/* Read a BDZ function from `f` into a freshly allocated bdz_data_t and attach
 * it to mphf->data. Fields are read in this order: hash state length, hash
 * state, n, m, r, g (ceil(n/4) bytes), k, b, ranktablesize, ranktable.
 * Read and allocation results are not checked. */
void bdz_load(FILE *f, cmph_t *mphf)
{
	bdz_data_t *loaded = (bdz_data_t *)malloc(sizeof(bdz_data_t));
	char *state_buf = NULL;
	cmph_uint32 state_len, g_bytes;
	register size_t nread;
	#ifdef DEBUG
	cmph_uint32 i = 0;
	#endif

	DEBUGP("Loading bdz mphf\n");
	mphf->data = loaded;

	/* Hash state: length prefix followed by the opaque state bytes. */
	nread = fread(&state_len, sizeof(cmph_uint32), (size_t)1, f);
	DEBUGP("Hash state has %u bytes\n", state_len);
	state_buf = (char *)malloc((size_t)state_len);
	nread = fread(state_buf, (size_t)state_len, (size_t)1, f);
	loaded->hl = hash_state_load(state_buf, state_len);
	free(state_buf);

	DEBUGP("Reading m and n\n");
	nread = fread(&(loaded->n), sizeof(cmph_uint32), (size_t)1, f);
	nread = fread(&(loaded->m), sizeof(cmph_uint32), (size_t)1, f);
	nread = fread(&(loaded->r), sizeof(cmph_uint32), (size_t)1, f);

	/* g array */
	g_bytes = (cmph_uint32)ceil(loaded->n/4.0);
	loaded->g = (cmph_uint8 *)calloc((size_t)(g_bytes), sizeof(cmph_uint8));
	nread = fread(loaded->g, g_bytes*sizeof(cmph_uint8), (size_t)1, f);

	/* rank structure */
	nread = fread(&(loaded->k), sizeof(cmph_uint32), (size_t)1, f);
	nread = fread(&(loaded->b), sizeof(cmph_uint8), (size_t)1, f);
	nread = fread(&(loaded->ranktablesize), sizeof(cmph_uint32), (size_t)1, f);
	loaded->ranktable = (cmph_uint32 *)calloc((size_t)loaded->ranktablesize, sizeof(cmph_uint32));
	nread = fread(loaded->ranktable, sizeof(cmph_uint32)*(loaded->ranktablesize), (size_t)1, f);

	#ifdef DEBUG
	fprintf(stderr, "G: ");
	for (i = 0; i < loaded->n; ++i) fprintf(stderr, "%u ", GETVALUE(loaded->g,i));
	fprintf(stderr, "\n");
	#endif
}
/* Return the number of vertices below `vertex` whose g value is not
 * UNASSIGNED. The count starts from the precomputed total of the block
 * containing the vertex (ranktable[vertex >> b]), adds whole bytes of g via
 * bdz_lookup_table, and finishes vertex by vertex inside the last byte. */
static inline cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint8 * g, cmph_uint32 vertex)
{
	const cmph_uint32 block = vertex >> b;
	const cmph_uint32 last_byte = vertex >> 2;
	cmph_uint32 total = ranktable[block];
	cmph_uint32 byte_pos = (block << b) >> 2;
	cmph_uint32 v;

	/* Whole bytes between the start of the block and the vertex's byte. */
	for (; byte_pos < last_byte; byte_pos++)
	{
		total += bdz_lookup_table[g[byte_pos]];
	}
	DEBUGP("base rank %u\n", total);

	/* Remaining vertices inside the vertex's own byte. */
	v = byte_pos << 2;
	DEBUGP("beg_idx_v %u\n", v);
	for (; v < vertex; v++)
	{
		if (GETVALUE(g, v) != UNASSIGNED) total++;
	}
	return total;
}
/* Evaluate the function for `key`: hash the key to three candidate vertices
 * (one per third of the vertex range), pick one of them using the sum of
 * their g values modulo 3, and return the rank of the chosen vertex. */
cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	bdz_data_t *fn = (bdz_data_t *)mphf->data;
	cmph_uint32 cand[3];
	cmph_uint32 pick;
	cmph_uint32 vertex;

	hash_vector(fn->hl, key, keylen, cand);
	cand[0] %= fn->r;
	cand[1] = fn->r + cand[1] % fn->r;
	cand[2] = (fn->r << 1) + cand[2] % fn->r;

	pick = (GETVALUE(fn->g, cand[0]) + GETVALUE(fn->g, cand[1]) + GETVALUE(fn->g, cand[2])) % 3;
	vertex = cand[pick];
	DEBUGP("Search found vertex %u\n", vertex);
	return rank(fn->b, fn->ranktable, fn->g, vertex);
}
/* Free a BDZ function: its hash state, g array, rank table, the bdz_data_t
 * block and finally the cmph_t wrapper itself. */
void bdz_destroy(cmph_t *mphf)
{
	bdz_data_t *fn = (bdz_data_t *)mphf->data;

	hash_state_destroy(fn->hl);
	free(fn->ranktable);
	free(fn->g);
	free(fn);
	free(mphf);
}
/** \fn void bdz_pack(cmph_t *mphf, void *packed_mphf);
 * \brief Serialise a BDZ function into a caller-provided contiguous buffer.
 *
 * Layout written, in order: hash type (32 bits), packed hash state, r,
 * ranktablesize, the rank table, b (one byte), and the g array (ceil(n/4) bytes).
 * \param mphf pointer to the mphf to pack
 * \param packed_mphf destination buffer; it must be at least cmph_packed_size() bytes
 */
void bdz_pack(cmph_t *mphf, void *packed_mphf)
{
	bdz_data_t *fn = (bdz_data_t *)mphf->data;
	cmph_uint8 *out = (cmph_uint8 *)packed_mphf;
	const CMPH_HASH hl_type = hash_get_type(fn->hl);
	const size_t ranktable_bytes = sizeof(cmph_uint32)*(fn->ranktablesize);
	const cmph_uint32 sizeg = (cmph_uint32)ceil(fn->n/4.0);

	// hash type followed by the packed hash state
	*((cmph_uint32 *) out) = hl_type;
	out += sizeof(cmph_uint32);
	hash_state_pack(fn->hl, out);
	out += hash_state_packed_size(hl_type);

	// r and ranktablesize
	*((cmph_uint32 *) out) = fn->r;
	out += sizeof(fn->r);
	*((cmph_uint32 *) out) = fn->ranktablesize;
	out += sizeof(fn->ranktablesize);

	// rank table
	memcpy(out, fn->ranktable, ranktable_bytes);
	out += ranktable_bytes;

	// b, then g
	*out++ = fn->b;
	memcpy(out, fn->g, sizeof(cmph_uint8)*sizeg);
}
/** \fn cmph_uint32 bdz_packed_size(cmph_t *mphf);
 * \brief Compute how many bytes the packed form of mphf occupies.
 * \param mphf pointer to a mphf
 * \return the size of the packed function or zero for failures
 */
cmph_uint32 bdz_packed_size(cmph_t *mphf)
{
	bdz_data_t *fn = (bdz_data_t *)mphf->data;
	CMPH_HASH hl_type = hash_get_type(fn->hl);
	size_t total = 0;

	total += sizeof(CMPH_ALGO);                       // presumably the algorithm tag stored by the caller
	total += hash_state_packed_size(hl_type);         // packed hash state
	total += 3*sizeof(cmph_uint32);                   // hash type, r, ranktablesize
	total += sizeof(cmph_uint32)*(fn->ranktablesize); // rank table
	total += sizeof(cmph_uint8);                      // b
	total += sizeof(cmph_uint8)*(cmph_uint32)(ceil(fn->n/4.0)); // g
	return (cmph_uint32)total;
}
/** \fn cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
 * \brief Evaluate a packed BDZ function without unpacking it.
 *
 * The buffer is read in this order: hash type, packed hash state, r,
 * ranktablesize, rank table, b, g.
 * \param packed_mphf pointer to the packed mphf
 * \param key key to be hashed
 * \param keylen key length in bytes
 * \return The mphf value
 */
cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	cmph_uint8 *base = (cmph_uint8 *)packed_mphf;
	CMPH_HASH hl_type = (CMPH_HASH)(*(cmph_uint32 *)base);
	cmph_uint8 *hl_ptr = base + 4;
	cmph_uint32 *words = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type));
	cmph_uint32 r = words[0];
	cmph_uint32 ranktablesize = words[1];
	cmph_uint32 *ranktable = words + 2;
	cmph_uint8 *after_table = (cmph_uint8 *)(ranktable + ranktablesize);
	cmph_uint8 b = after_table[0];
	cmph_uint8 *g = after_table + 1;
	cmph_uint32 cand[3];
	cmph_uint32 pick;

	hash_vector_packed(hl_ptr, hl_type, key, keylen, cand);
	cand[0] %= r;
	cand[1] = r + cand[1] % r;
	cand[2] = (r << 1) + cand[2] % r;

	pick = (GETVALUE(g, cand[0]) + GETVALUE(g, cand[1]) + GETVALUE(g, cand[2])) % 3;
	return rank(b, ranktable, g, cand[pick]);
}
cmph-2.0.2/src/buffer_manager.h 0000644 0001750 0001750 00000001024 13411542035 015702 0 ustar joseph joseph #ifndef __CMPH_BUFFER_MANAGE_H__
#define __CMPH_BUFFER_MANAGE_H__
#include "cmph_types.h"
#include
/* Opaque handle; the structure itself is defined outside this header. */
typedef struct __buffer_manager_t buffer_manager_t;
/* Creates a manager. NOTE(review): presumably memory_avail is a byte budget shared by nentries buffers; confirm in the implementation. */
buffer_manager_t * buffer_manager_new(cmph_uint32 memory_avail, cmph_uint32 nentries);
/* Associates entry `index` with the file `filename`. */
void buffer_manager_open(buffer_manager_t * buffer_manager, cmph_uint32 index, char * filename);
/* Reads the next key of entry `index`; the key length is stored in *keylen. NOTE(review): ownership of the returned buffer is not visible here. */
cmph_uint8 * buffer_manager_read_key(buffer_manager_t * buffer_manager, cmph_uint32 index, cmph_uint32 * keylen);
/* Releases the manager. */
void buffer_manager_destroy(buffer_manager_t * buffer_manager);
#endif
cmph-2.0.2/src/fch.h 0000644 0001750 0001750 00000003514 13411542035 013505 0 ustar joseph joseph #ifndef __CMPH_FCH_H__
#define __CMPH_FCH_H__
#include "cmph.h"
/* Opaque FCH types; their layouts are defined outside this header. */
typedef struct __fch_data_t fch_data_t;
typedef struct __fch_config_data_t fch_config_data_t;
/* Parameters calculation */
cmph_uint32 fch_calc_b(double c, cmph_uint32 m);
double fch_calc_p1(cmph_uint32 m);
double fch_calc_p2(cmph_uint32 b);
cmph_uint32 mixh10h11h12(cmph_uint32 b, double p1, double p2, cmph_uint32 initial_index);
/* Configuration life cycle and hash function selection. */
fch_config_data_t *fch_config_new(void);
void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void fch_config_destroy(cmph_config_t *mph);
/* Construction, (de)serialisation, destruction and lookup of an FCH function. */
cmph_t *fch_new(cmph_config_t *mph, double c);
void fch_load(FILE *f, cmph_t *mphf);
int fch_dump(cmph_t *mphf, FILE *f);
void fch_destroy(cmph_t *mphf);
cmph_uint32 fch_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void fch_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void fch_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 fch_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 fch_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/main.c 0000644 0001750 0001750 00000023620 13411542035 013664 0 ustar joseph joseph #ifdef WIN32
#include "wingetopt.h"
#else
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include "cmph.h"
#include "hash.h"
#ifdef WIN32
#define VERSION "0.8"
#else
#include "config.h"
#endif
/* Print the one-line command synopsis to stderr; `prg` is the program name
 * shown at the start of the line. */
void usage(const char *prg)
{
	fprintf(stderr,
		"usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] "
		"[-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] "
		"[-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] "
		"[-d tmp_dir] [-m file.mph] keysfile\n",
		prg);
}
/* Print the full help text to stderr: the synopsis (shared with usage())
 * followed by a description of every option. The lists of valid algorithm
 * and hash function names are taken from cmph_names[] and cmph_hash_names[]. */
void usage_long(const char *prg)
{
	cmph_uint32 i;
	/* Reuse usage() so the synopsis is kept in a single place. */
	usage(prg);
	fprintf(stderr, "Minimum perfect hashing tool\n\n");
	fprintf(stderr, " -h\t print this help message\n");
	fprintf(stderr, " -c\t c value determines:\n");
	fprintf(stderr, " \t * the number of vertices in the graph for the algorithms BMZ and CHM\n");
	fprintf(stderr, " \t * the number of bits per key required in the FCH algorithm\n");
	fprintf(stderr, " \t * the load factor in the CHD_PH algorithm\n");
	fprintf(stderr, " -a\t algorithm - valid values are\n");
	for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]);
	fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n");
	for (i = 0; i < CMPH_HASH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_hash_names[i]);
	fprintf(stderr, " -V\t print version number and exit\n");
	fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n");
	fprintf(stderr, " -k\t number of keys\n");
	fprintf(stderr, " -g\t generation mode\n");
	fprintf(stderr, " -s\t random seed\n");
	fprintf(stderr, " -m\t minimum perfect hash function file \n");
	fprintf(stderr, " -M\t main memory availability (in MB) used in BRZ algorithm \n");
	fprintf(stderr, " -d\t temporary directory used in BRZ algorithm \n");
	fprintf(stderr, " -b\t the meaning of this parameter depends on the algorithm selected in the -a option:\n");
	fprintf(stderr, " \t * For BRZ it is used to make the maximal number of keys in a bucket lower than 256.\n");
	fprintf(stderr, " \t In this case its value should be an integer in the range [64,175]. Default is 128.\n\n");
	fprintf(stderr, " \t * For BDZ it is used to determine the size of some precomputed rank\n");
	fprintf(stderr, " \t information and its value should be an integer in the range [3,10]. Default\n");
	fprintf(stderr, " \t is 7. The larger is this value, the more compact are the resulting functions\n");
	fprintf(stderr, " \t and the slower are them at evaluation time.\n\n");
	fprintf(stderr, " \t * For CHD and CHD_PH it is used to set the average number of keys per bucket\n");
	fprintf(stderr, " \t and its value should be an integer in the range [1,32]. Default is 4. The\n");
	fprintf(stderr, " \t larger is this value, the slower is the construction of the functions.\n");
	fprintf(stderr, " \t This parameter has no effect for other algorithms.\n\n");
	fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n");
	fprintf(stderr, " \t hash function allows at most t collisions in a given bin. This parameter applies\n");
	fprintf(stderr, " \t only to the CHD and CHD_PH algorithms. Its value should be an integer in the\n");
	fprintf(stderr, " \t range [1,128]. Default is 1\n");
	fprintf(stderr, " keysfile\t line separated file with keys\n");
}
/* Command line driver.
 *
 * With -g it builds a minimal perfect hash function for the keys in
 * `keysfile` and writes it to the .mph file; without -g it loads the .mph
 * file and checks every key of `keysfile` against it.
 *
 * Returns 0 on success, 1 when the usage is wrong or the check finds a
 * problem, and -1 on other errors. */
int main(int argc, char **argv)
{
	cmph_uint32 verbosity = 0;
	char generate = 0;
	char *mphf_file = NULL;
	FILE *mphf_fd = stdout;
	const char *keys_file = NULL;
	FILE *keys_fd;
	cmph_uint32 nkeys = UINT_MAX;
	cmph_uint32 seed = UINT_MAX;
	CMPH_HASH *hashes = NULL;
	cmph_uint32 nhashes = 0;
	cmph_uint32 i;
	CMPH_ALGO mph_algo = CMPH_CHM;
	double c = 0;
	cmph_config_t *config = NULL;
	cmph_t *mphf = NULL;
	char * tmp_dir = NULL;
	cmph_io_adapter_t *source;
	cmph_uint32 memory_availability = 0;
	cmph_uint32 b = 0;
	cmph_uint32 keys_per_bin = 1;
	int ret = 0;
	while (1)
	{
		/* getopt() returns an int; keeping it in a plain char breaks the
		 * comparison with -1 on platforms where char is unsigned. */
		int ch = getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
		if (ch == -1) break;
		switch (ch)
		{
			case 's':
				{
					char *cptr;
					seed = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Invalid seed %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'c':
				{
					char *endptr;
					c = strtod(optarg, &endptr);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid c value %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'g':
				generate = 1;
				break;
			case 'k':
				{
					char *endptr;
					nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid number of keys %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'm':
				mphf_file = strdup(optarg);
				break;
			case 'd':
				tmp_dir = strdup(optarg);
				break;
			case 'M':
				{
					char *cptr;
					memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Invalid memory availability %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'b':
				{
					char *cptr;
					b = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter b was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 't':
				{
					char *cptr;
					keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter t was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'v':
				++verbosity;
				break;
			case 'V':
				printf("%s\n", VERSION);
				return 0;
			case 'h':
				usage_long(argv[0]);
				return 0;
			case 'a':
				{
					char valid = 0;
					for (i = 0; i < CMPH_COUNT; ++i)
					{
						if (strcmp(cmph_names[i], optarg) == 0)
						{
							mph_algo = (CMPH_ALGO)i;
							valid = 1;
							break;
						}
					}
					if (!valid)
					{
						fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION);
						return -1;
					}
				}
				break;
			case 'f':
				{
					char valid = 0;
					for (i = 0; i < CMPH_HASH_COUNT; ++i)
					{
						if (strcmp(cmph_hash_names[i], optarg) == 0)
						{
							/* Grow the CMPH_HASH_COUNT-terminated list by one entry. */
							CMPH_HASH *grown = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
							if (grown == NULL)
							{
								fprintf(stderr, "Out of memory\n");
								free(hashes);
								return -1;
							}
							hashes = grown;
							hashes[nhashes] = (CMPH_HASH)i;
							hashes[nhashes + 1] = CMPH_HASH_COUNT;
							++nhashes;
							valid = 1;
							break;
						}
					}
					if (!valid)
					{
						fprintf(stderr, "Invalid hash function: %s\n", optarg);
						return -1;
					}
				}
				break;
			default:
				usage(argv[0]);
				return 1;
		}
	}
	if (optind != argc - 1)
	{
		usage(argv[0]);
		return 1;
	}
	keys_file = argv[optind];
	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
	srand(seed);
	if (mphf_file == NULL)
	{
		/* Default output name: the keys file name with ".mph" appended. */
		mphf_file = (char *)malloc(strlen(keys_file) + 5);
		if (mphf_file == NULL)
		{
			fprintf(stderr, "Out of memory\n");
			return -1;
		}
		memcpy(mphf_file, keys_file, strlen(keys_file));
		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
	}
	keys_fd = fopen(keys_file, "r");
	if (keys_fd == NULL)
	{
		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
		return -1;
	}
	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
	if (generate)
	{
		//Create mphf
		mphf_fd = fopen(mphf_file, "wb");
		/* Fail before building: the descriptor is handed to the
		 * configuration below and must not be NULL. */
		if (mphf_fd == NULL)
		{
			fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
			free(mphf_file);
			return -1;
		}
		config = cmph_config_new(source);
		cmph_config_set_algo(config, mph_algo);
		if (nhashes) cmph_config_set_hashfuncs(config, hashes);
		cmph_config_set_verbosity(config, verbosity);
		cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir);
		cmph_config_set_mphf_fd(config, mphf_fd);
		cmph_config_set_memory_availability(config, memory_availability);
		cmph_config_set_b(config, b);
		cmph_config_set_keys_per_bin(config, keys_per_bin);
		//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
		if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15;
		if (c != 0) cmph_config_set_graphsize(config, c);
		mphf = cmph_new(config);
		cmph_config_destroy(config);
		if (mphf == NULL)
		{
			fprintf(stderr, "Unable to create minimum perfect hashing function\n");
			fclose(mphf_fd);
			free(mphf_file);
			return -1;
		}
		cmph_dump(mphf, mphf_fd);
		cmph_destroy(mphf);
		fclose(mphf_fd);
	}
	else
	{
		cmph_uint8 * hashtable = NULL;
		cmph_uint32 siz;
		mphf_fd = fopen(mphf_file, "rb");
		if (mphf_fd == NULL)
		{
			fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
			free(mphf_file);
			return -1;
		}
		mphf = cmph_load(mphf_fd);
		fclose(mphf_fd);
		if (!mphf)
		{
			fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
			free(mphf_file);
			return -1;
		}
		siz = cmph_size(mphf);
		/* One counter per bin, zeroed by calloc. */
		hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
		if (hashtable == NULL && siz != 0)
		{
			fprintf(stderr, "Out of memory\n");
			cmph_destroy(mphf);
			free(mphf_file);
			return -1;
		}
		//check all keys
		for (i = 0; i < source->nkeys; ++i)
		{
			cmph_uint32 h;
			char *buf;
			cmph_uint32 buflen = 0;
			source->read(source->data, &buf, &buflen);
			h = cmph_search(mphf, buf, buflen);
			if (!(h < siz))
			{
				/* "%.*s" bounds the print to buflen bytes of the key. */
				fprintf(stderr, "Unknown key %.*s in the input.\n", (int)buflen, buf);
				ret = 1;
			} else if(hashtable[h] >= keys_per_bin)
			{
				fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
				fprintf(stderr, "Duplicated or unknown key %.*s in the input\n", (int)buflen, buf);
				ret = 1;
			} else hashtable[h]++;
			if (verbosity)
			{
				printf("%s -> %u\n", buf, h);
			}
			source->dispose(source->data, buf, buflen);
		}
		cmph_destroy(mphf);
		free(hashtable);
	}
	fclose(keys_fd);
	free(mphf_file);
	free(tmp_dir);
	free(hashes);
	cmph_io_nlfile_adapter_destroy(source);
	return ret;
}
cmph-2.0.2/src/djb2_hash.h 0000644 0001750 0001750 00000000767 13411542035 014600 0 ustar joseph joseph #ifndef __DJB2_HASH_H__
#define __DJB2_HASH_H__
#include "hash.h"
/* State of the djb2 hash; it only records which hash function it belongs to. */
typedef struct __djb2_state_t
{
CMPH_HASH hashfunc;
} djb2_state_t;
/* Creates a new state. NOTE(review): presumably heap-allocated and released with djb2_state_destroy(); confirm in the implementation. */
djb2_state_t *djb2_state_new();
/* Hashes the keylen bytes at k. */
cmph_uint32 djb2_hash(djb2_state_t *state, const char *k, cmph_uint32 keylen);
/* Serialises the state; the buffer and its length are returned through *buf and *buflen. */
void djb2_state_dump(djb2_state_t *state, char **buf, cmph_uint32 *buflen);
/* Returns a copy of src_state. */
djb2_state_t *djb2_state_copy(djb2_state_t *src_state);
/* Rebuilds a state from a buffer of buflen bytes. */
djb2_state_t *djb2_state_load(const char *buf, cmph_uint32 buflen);
/* Releases a state. */
void djb2_state_destroy(djb2_state_t *state);
#endif
cmph-2.0.2/src/bdz_structs_ph.h 0000755 0001750 0001750 00000001002 13411542035 015773 0 ustar joseph joseph #ifndef __CMPH_BDZ_STRUCTS_PH_H__
#define __CMPH_BDZ_STRUCTS_PH_H__
#include "hash_state.h"
/* Data of a BDZ_PH function: graph dimensions, the g array and the hash state. */
struct __bdz_ph_data_t
{
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
cmph_uint32 r; //partition vertex count
cmph_uint8 *g;
hash_state_t *hl; // linear hashing
};
/* Configuration used while building a BDZ_PH function: the same fields as
 * __bdz_ph_data_t plus the selected hash function. */
struct __bdz_ph_config_data_t
{
CMPH_HASH hashfunc;
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
cmph_uint32 r; //partition vertex count
cmph_uint8 *g;
hash_state_t *hl; // linear hashing
};
#endif
cmph-2.0.2/src/buffer_entry.h 0000644 0001750 0001750 00000001123 13411542035 015431 0 ustar joseph joseph #ifndef __CMPH_BUFFER_ENTRY_H__
#define __CMPH_BUFFER_ENTRY_H__
#include "cmph_types.h"
#include
/* Opaque handle; the structure itself is defined outside this header. */
typedef struct __buffer_entry_t buffer_entry_t;
/* Creates an entry with the given capacity. NOTE(review): the unit of `capacity` (presumably bytes) is not visible here. */
buffer_entry_t * buffer_entry_new(cmph_uint32 capacity);
/* Capacity accessors. */
void buffer_entry_set_capacity(buffer_entry_t * buffer_entry, cmph_uint32 capacity);
cmph_uint32 buffer_entry_get_capacity(buffer_entry_t * buffer_entry);
/* Associates the entry with the file `filename`. */
void buffer_entry_open(buffer_entry_t * buffer_entry, char * filename);
/* Reads the next key; its length is stored in *keylen. NOTE(review): ownership of the returned buffer is not visible here. */
cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * keylen);
/* Releases the entry. */
void buffer_entry_destroy(buffer_entry_t * buffer_entry);
#endif
cmph-2.0.2/src/cmph_types.h 0000644 0001750 0001750 00000002201 13411542035 015110 0 ustar joseph joseph #ifndef __CMPH_TYPES_H__
#define __CMPH_TYPES_H__
/* Fixed-width integer aliases used throughout the library. */
/* NOTE(review): plain char may be signed or unsigned depending on the platform. */
typedef char cmph_int8;
typedef unsigned char cmph_uint8;
typedef short cmph_int16;
typedef unsigned short cmph_uint16;
typedef int cmph_int32;
typedef unsigned int cmph_uint32;
/* NOTE(review): this branch assumes long is 64 bits wide whenever __ia64 or
 * __x86_64__ is defined; that does not hold on LLP64 targets (64-bit Windows). */
#if defined(__ia64) || defined(__x86_64__)
/** \typedef long cmph_int64;
* \brief 64-bit integer for a 64-bit architecture.
*/
typedef long cmph_int64;
/** \typedef unsigned long cmph_uint64;
* \brief Unsigned 64-bit integer for a 64-bit architecture.
*/
typedef unsigned long cmph_uint64;
#else
/** \typedef long long cmph_int64;
* \brief 64-bit integer for a 32-bit architecture.
*/
typedef long long cmph_int64;
/** \typedef unsigned long long cmph_uint64;
* \brief Unsigned 64-bit integer for a 32-bit architecture.
*/
typedef unsigned long long cmph_uint64;
#endif
/* Available hash functions; CMPH_HASH_COUNT is the number of entries and also
 * serves as the terminator of CMPH_HASH lists. */
typedef enum { CMPH_HASH_JENKINS, CMPH_HASH_COUNT } CMPH_HASH;
/* Printable names indexed by CMPH_HASH. */
extern const char *cmph_hash_names[];
/* Available algorithms; CMPH_COUNT is the number of entries. */
typedef enum { CMPH_BMZ, CMPH_BMZ8, CMPH_CHM, CMPH_BRZ, CMPH_FCH,
CMPH_BDZ, CMPH_BDZ_PH,
CMPH_CHD_PH, CMPH_CHD, CMPH_COUNT } CMPH_ALGO;
/* Printable names indexed by CMPH_ALGO. */
extern const char *cmph_names[];
#endif
cmph-2.0.2/src/linear_string_map.c 0000644 0001750 0001750 00000002625 13411542035 016437 0 ustar joseph joseph #include
#include
#include
#include "linear_string_map.h"
/* Node of a singly linked string map. The list always ends with a sentinel
 * node whose `next` is NULL; the functions below never read the sentinel's
 * key or value as an entry. */
struct __linear_string_map_t {
const char *key; /* stored as given; the string is not copied */
void *value;
struct __linear_string_map_t* next; /* NULL only in the sentinel node */
};
lsmap_t *lsmap_new() {
lsmap_t* lsmap = (lsmap_t*)malloc(sizeof(lsmap_t));
if (!lsmap) return NULL;
lsmap->key = "dummy node";
lsmap->next = NULL;
return lsmap;
}
/* Return the number of entries stored in the map (the sentinel node is not
 * counted). The cursor is advanced on every step, so the walk terminates at
 * the sentinel instead of spinning on the first node. */
int lsmap_size(lsmap_t *lsmap) {
  int size = 0;
  for (; lsmap->next != NULL; lsmap = lsmap->next) {
    ++size;
  }
  return size;
}
/* Append (key, value) at the end of the map.
 * The current sentinel node receives the entry and a fresh sentinel is linked
 * after it. The key pointer is stored as given (the string is not copied).
 * If the new sentinel cannot be allocated the map is left unchanged. */
void lsmap_append(lsmap_t *lsmap, const char *key, void *value) {
  lsmap_t *sentinel = (lsmap_t*)malloc(sizeof(lsmap_t));
  if (sentinel == NULL) return;
  sentinel->key = "dummy node";
  sentinel->value = NULL;
  sentinel->next = NULL;

  /* Find the current sentinel and turn it into the new entry. */
  while (lsmap->next != NULL) lsmap = lsmap->next;
  lsmap->key = key;
  lsmap->value = value;
  lsmap->next = sentinel;
}
/* Return the value of the first entry whose key compares equal to `key`
 * (strcmp), or NULL when no entry matches. */
void* lsmap_search(lsmap_t *lsmap, const char *key) {
  lsmap_t *node;
  for (node = lsmap; node->next != NULL; node = node->next) {
    if (strcmp(node->key, key) == 0) return node->value;
  }
  return NULL;
}
/* Call f once with the key of every entry, in list order. */
void lsmap_foreach_key(lsmap_t *lsmap, void (*f)(const char*)) {
    lsmap_t *node;

    for (node = lsmap; node->next != NULL; node = node->next) {
        f(node->key);
    }
}
/* Call f once with the value of every entry, in list order. */
void lsmap_foreach_value(lsmap_t *lsmap, void (*f)(void*)) {
    lsmap_t *node;

    for (node = lsmap; node->next != NULL; node = node->next) {
        f(node->value);
    }
}
/* Free every node of the map, sentinel included.
 * Keys and values are not freed by this function. */
void lsmap_destroy(lsmap_t *lsmap) {
    lsmap_t *node = lsmap;

    for (;;) {
        lsmap_t *following = node->next; /* read the link before the node goes away */
        free(node);
        if (following == NULL) break;
        node = following;
    }
}
cmph-2.0.2/src/bmz.h 0000644 0001750 0001750 00000003135 13411542035 013534 0 ustar joseph joseph #ifndef __CMPH_BMZ_H__
#define __CMPH_BMZ_H__
#include "cmph.h"
typedef struct __bmz_data_t bmz_data_t;
typedef struct __bmz_config_data_t bmz_config_data_t;
bmz_config_data_t *bmz_config_new(void);
void bmz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void bmz_config_destroy(cmph_config_t *mph);
cmph_t *bmz_new(cmph_config_t *mph, double c);
void bmz_load(FILE *f, cmph_t *mphf);
int bmz_dump(cmph_t *mphf, FILE *f);
void bmz_destroy(cmph_t *mphf);
cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void bmz_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void bmz_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 bmz_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 bmz_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/bmz8.c 0000644 0001750 0001750 00000050700 13411542035 013617 0 ustar joseph joseph #include "graph.h"
#include "bmz8.h"
#include "cmph_structs.h"
#include "bmz8_structs.h"
#include "hash.h"
#include "vqueue.h"
#include "bitbool.h"
#include
#include
#include
#include
#include
//#define DEBUG
#include "debug.h"
static int bmz8_gen_edges(cmph_config_t *mph);
static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited);
static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited);
static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint8 * visited);
/* Allocate a zero-filled BMZ8 configuration that uses the Jenkins hash for
 * both hash functions. Returns NULL when the allocation fails. */
bmz8_config_data_t *bmz8_config_new(void)
{
    bmz8_config_data_t *config =
        (bmz8_config_data_t *)calloc((size_t)1, sizeof(bmz8_config_data_t));

    if (config == NULL) return NULL;

    config->hashfuncs[0] = CMPH_HASH_JENKINS;
    config->hashfuncs[1] = CMPH_HASH_JENKINS;
    /* the pointers are spelled out although the block is already zeroed */
    config->hashes = NULL;
    config->graph = NULL;
    config->g = NULL;
    return config;
}
/* Free the BMZ8 specific configuration attached to mph (mph->data).
 * Only the configuration block itself is released here. */
void bmz8_config_destroy(cmph_config_t *mph)
{
    DEBUGP("Destroying algorithm dependent data\n");
    free((bmz8_config_data_t *)mph->data);
}
/* Copy up to two hash function selectors from hashfuncs into the BMZ8
 * configuration. The input list is terminated by CMPH_HASH_COUNT; entries
 * beyond the second are ignored because bmz8 only uses two hash functions. */
void bmz8_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
    bmz8_config_data_t *config = (bmz8_config_data_t *)mph->data;
    cmph_uint8 slot;

    for (slot = 0; hashfuncs[slot] != CMPH_HASH_COUNT && slot < 2; ++slot)
    {
        config->hashfuncs[slot] = hashfuncs[slot];
    }
}
cmph_t *bmz8_new(cmph_config_t *mph, double c)
{
cmph_t *mphf = NULL;
bmz8_data_t *bmz8f = NULL;
cmph_uint8 i;
cmph_uint8 iterations;
cmph_uint8 iterations_map = 20;
cmph_uint8 *used_edges = NULL;
cmph_uint8 restart_mapping = 0;
cmph_uint8 * visited = NULL;
bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data;
if (mph->key_source->nkeys >= 256)
{
if (mph->verbosity) fprintf(stderr, "The number of keys in BMZ8 must be lower than 256.\n");
return NULL;
}
if (c == 0) c = 1.15; // validating restrictions over parameter c.
DEBUGP("c: %f\n", c);
bmz8->m = (cmph_uint8) mph->key_source->nkeys;
bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys);
if (bmz8->n < 5) // workaround for small key sets
{
bmz8->n = 5;
}
DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz8->m, bmz8->n, c);
bmz8->graph = graph_new(bmz8->n, bmz8->m);
DEBUGP("Created graph\n");
bmz8->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
for(i = 0; i < 3; ++i) bmz8->hashes[i] = NULL;
do
{
// Mapping step
cmph_uint8 biggest_g_value = 0;
cmph_uint8 biggest_edge_value = 1;
iterations = 100;
if (mph->verbosity)
{
fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bmz8->m, bmz8->n);
}
while(1)
{
int ok;
DEBUGP("hash function 1\n");
bmz8->hashes[0] = hash_state_new(bmz8->hashfuncs[0], bmz8->n);
DEBUGP("hash function 2\n");
bmz8->hashes[1] = hash_state_new(bmz8->hashfuncs[1], bmz8->n);
DEBUGP("Generating edges\n");
ok = bmz8_gen_edges(mph);
if (!ok)
{
--iterations;
hash_state_destroy(bmz8->hashes[0]);
bmz8->hashes[0] = NULL;
hash_state_destroy(bmz8->hashes[1]);
bmz8->hashes[1] = NULL;
DEBUGP("%u iterations remaining\n", iterations);
if (mph->verbosity)
{
fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations);
}
if (iterations == 0) break;
}
else break;
}
if (iterations == 0)
{
graph_destroy(bmz8->graph);
return NULL;
}
// Ordering step
if (mph->verbosity)
{
fprintf(stderr, "Starting ordering step\n");
}
graph_obtain_critical_nodes(bmz8->graph);
// Searching step
if (mph->verbosity)
{
fprintf(stderr, "Starting Searching step.\n");
fprintf(stderr, "\tTraversing critical vertices.\n");
}
DEBUGP("Searching step\n");
visited = (cmph_uint8 *)malloc((size_t)bmz8->n/8 + 1);
memset(visited, 0, (size_t)bmz8->n/8 + 1);
used_edges = (cmph_uint8 *)malloc((size_t)bmz8->m/8 + 1);
memset(used_edges, 0, (size_t)bmz8->m/8 + 1);
free(bmz8->g);
bmz8->g = (cmph_uint8 *)calloc((size_t)bmz8->n, sizeof(cmph_uint8));
assert(bmz8->g);
for (i = 0; i < bmz8->n; ++i) // critical nodes
{
if (graph_node_is_critical(bmz8->graph, i) && (!GETBIT(visited,i)))
{
if(c > 1.14) restart_mapping = bmz8_traverse_critical_nodes(bmz8, i, &biggest_g_value, &biggest_edge_value, used_edges, visited);
else restart_mapping = bmz8_traverse_critical_nodes_heuristic(bmz8, i, &biggest_g_value, &biggest_edge_value, used_edges, visited);
if(restart_mapping) break;
}
}
if(!restart_mapping)
{
if (mph->verbosity)
{
fprintf(stderr, "\tTraversing non critical vertices.\n");
}
bmz8_traverse_non_critical_nodes(bmz8, used_edges, visited); // non_critical_nodes
}
else
{
iterations_map--;
if (mph->verbosity) fprintf(stderr, "Restarting mapping step. %u iterations remaining.\n", iterations_map);
}
free(used_edges);
free(visited);
}while(restart_mapping && iterations_map > 0);
graph_destroy(bmz8->graph);
bmz8->graph = NULL;
if (iterations_map == 0)
{
return NULL;
}
mphf = (cmph_t *)malloc(sizeof(cmph_t));
mphf->algo = mph->algo;
bmz8f = (bmz8_data_t *)malloc(sizeof(bmz8_data_t));
bmz8f->g = bmz8->g;
bmz8->g = NULL; //transfer memory ownership
bmz8f->hashes = bmz8->hashes;
bmz8->hashes = NULL; //transfer memory ownership
bmz8f->n = bmz8->n;
bmz8f->m = bmz8->m;
mphf->data = bmz8f;
mphf->size = bmz8->m;
DEBUGP("Successfully generated minimal perfect hash\n");
if (mph->verbosity)
{
fprintf(stderr, "Successfully generated minimal perfect hash function\n");
}
return mphf;
}
/* Label (assign g values to) the critical vertices reachable from v.
 *
 * Starting at v, vertices are taken from a queue and every unvisited critical
 * neighbour u gets a g value: candidates are *biggest_g_value + 1, tried
 * repeatedly until no edge value candidate + g[lav], over the already visited
 * critical neighbours lav of u, is marked in used_edges. The edge values of
 * the chosen label are then marked as used.
 *
 * biggest_g_value / biggest_edge_value are running maxima shared with the
 * caller; used_edges and visited are bit arrays updated in place.
 *
 * Returns 1 when an edge value would reach bmz8->m (the caller restarts the
 * mapping step), 0 when all reachable critical vertices were labelled.
 */
static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited)
{
cmph_uint8 next_g;
cmph_uint32 u; /* Auxiliary vertex */
cmph_uint32 lav; /* lookahead vertex */
cmph_uint8 collision;
/* NOTE(review): the result of vqueue_new is used without a NULL check */
vqueue_t * q = vqueue_new((cmph_uint32)(graph_ncritical_nodes(bmz8->graph)));
graph_iterator_t it, it1;
DEBUGP("Labelling critical vertices\n");
/* the start vertex gets its label directly from the biggest edge value so far */
bmz8->g[v] = (cmph_uint8)(ceil ((double)(*biggest_edge_value)/2) - 1);
SETBIT(visited, v);
/* NOTE(review): this value is overwritten inside the collision loop before it is read */
next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); /* next_g is incremented in the do..while statement*/
vqueue_insert(q, v);
while(!vqueue_is_empty(q))
{
v = vqueue_remove(q);
it = graph_neighbors_it(bmz8->graph, v);
while ((u = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz8->graph, u) && (!GETBIT(visited,u)))
{
collision = 1;
while(collision) // lookahead to resolve collisions
{
/* each retry proposes one more than the largest label tried so far */
next_g = (cmph_uint8)(*biggest_g_value + 1);
it1 = graph_neighbors_it(bmz8->graph, u);
collision = 0;
while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited,lav))
{
/* an edge value outside [0, m) cannot be used: give up on this graph */
if(next_g + bmz8->g[lav] >= bmz8->m)
{
vqueue_destroy(q);
return 1; // restart mapping step.
}
if (GETBIT(used_edges, (next_g + bmz8->g[lav])))
{
collision = 1;
break;
}
}
}
/* record the candidate even when it collided, so the next retry moves on */
if (next_g > *biggest_g_value) *biggest_g_value = next_g;
}
// Marking used edges...
it1 = graph_neighbors_it(bmz8->graph, u);
while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited, lav))
{
SETBIT(used_edges,(next_g + bmz8->g[lav]));
if(next_g + bmz8->g[lav] > *biggest_edge_value)
*biggest_edge_value = (cmph_uint8)(next_g + bmz8->g[lav]);
}
}
bmz8->g[u] = next_g; // Labelling vertex u.
SETBIT(visited,u);
vqueue_insert(q, u);
}
}
}
vqueue_destroy(q);
return 0;
}
/* Variant of bmz8_traverse_critical_nodes that recycles rejected labels.
 *
 * The traversal and the collision test are the same, but a candidate g value
 * that collided when it was first proposed (i.e. while it was still larger
 * than *biggest_g_value) is remembered in unused_g_values. For later vertices
 * the remembered values are tried first, in order, before a new value
 * *biggest_g_value + 1 is proposed.
 *
 * Returns 1 when an edge value would reach bmz8->m (the caller restarts the
 * mapping step), 0 otherwise. The scratch list is freed on both paths.
 */
static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited)
{
cmph_uint8 next_g;
cmph_uint32 u; /* vertex being labelled */
cmph_uint32 lav; /* lookahead vertex: visited critical neighbour of u */
cmph_uint8 collision;
cmph_uint8 * unused_g_values = NULL; /* g values that collided when first tried */
cmph_uint8 unused_g_values_capacity = 0;
cmph_uint8 nunused_g_values = 0;
/* NOTE(review): the result of vqueue_new is used without a NULL check */
vqueue_t * q = vqueue_new((cmph_uint32)(graph_ncritical_nodes(bmz8->graph)));
graph_iterator_t it, it1;
DEBUGP("Labelling critical vertices\n");
bmz8->g[v] = (cmph_uint8)(ceil ((double)(*biggest_edge_value)/2) - 1);
SETBIT(visited, v);
/* NOTE(review): this value is overwritten inside the collision loop before it is read */
next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2));
vqueue_insert(q, v);
while(!vqueue_is_empty(q))
{
v = vqueue_remove(q);
it = graph_neighbors_it(bmz8->graph, v);
while ((u = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz8->graph, u) && (!GETBIT(visited,u)))
{
cmph_uint8 next_g_index = 0;
collision = 1;
while(collision) // lookahead to resolve collisions
{
/* remembered values are tried first, then a brand new one */
if (next_g_index < nunused_g_values)
{
next_g = unused_g_values[next_g_index++];
}
else
{
next_g = (cmph_uint8)(*biggest_g_value + 1);
/* marks "not taken from the list" for the removal step after the loop */
next_g_index = 255;//UINT_MAX;
}
it1 = graph_neighbors_it(bmz8->graph, u);
collision = 0;
while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited,lav))
{
if(next_g + bmz8->g[lav] >= bmz8->m)
{
vqueue_destroy(q);
free(unused_g_values);
return 1; // restart mapping step.
}
if (GETBIT(used_edges, (next_g + bmz8->g[lav])))
{
collision = 1;
break;
}
}
}
if(collision && (next_g > *biggest_g_value)) // saving the current g value stored in next_g.
{
if(nunused_g_values == unused_g_values_capacity)
{
/* NOTE(review): the realloc result is not checked. The capacity counter is
 * 8 bits wide, so adding (cmph_uint8)BUFSIZ leaves it unchanged whenever
 * BUFSIZ is a multiple of 256 -- confirm this is harmless on all targets. */
unused_g_values = (cmph_uint8*)realloc(unused_g_values, ((size_t)(unused_g_values_capacity + BUFSIZ))*sizeof(cmph_uint8));
unused_g_values_capacity += (cmph_uint8)BUFSIZ;
}
unused_g_values[nunused_g_values++] = next_g;
}
if (next_g > *biggest_g_value) *biggest_g_value = next_g;
}
/* step back to the list slot that supplied next_g (254 when it was a new value) */
next_g_index--;
/* a value taken from the list is removed by moving the last element into its slot */
if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values];
// Marking used edges...
it1 = graph_neighbors_it(bmz8->graph, u);
while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited, lav))
{
SETBIT(used_edges,(next_g + bmz8->g[lav]));
if(next_g + bmz8->g[lav] > *biggest_edge_value)
*biggest_edge_value = (cmph_uint8)(next_g + bmz8->g[lav]);
}
}
bmz8->g[u] = next_g; // Labelling vertex u.
SETBIT(visited, u);
vqueue_insert(q, u);
}
}
}
vqueue_destroy(q);
free(unused_g_values);
return 0;
}
/* Return the first edge value at or after unused_edge_index whose bit is
 * clear in used_edges. Asserts that the scan stays below bmz8->m. */
static cmph_uint8 next_unused_edge(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint32 unused_edge_index)
{
    for (;;)
    {
        assert(unused_edge_index < bmz8->m);
        if (!GETBIT(used_edges, unused_edge_index)) break;
        ++unused_edge_index;
    }
    return (cmph_uint8)unused_edge_index;
}
/* Recursively label the unvisited vertices reachable from v.
 * Each newly reached vertex gets the g value that makes the edge from v to it
 * evaluate to the next unused edge value; *unused_edge_index is advanced past
 * every value handed out. */
static void bmz8_traverse(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint32 v, cmph_uint8 * unused_edge_index, cmph_uint8 * visited)
{
    cmph_uint32 next_vertex;
    graph_iterator_t edge_it = graph_neighbors_it(bmz8->graph, v);

    for (next_vertex = graph_next_neighbor(bmz8->graph, &edge_it);
         next_vertex != GRAPH_NO_NEIGHBOR;
         next_vertex = graph_next_neighbor(bmz8->graph, &edge_it))
    {
        if (!GETBIT(visited, next_vertex))
        {
            *unused_edge_index = next_unused_edge(bmz8, used_edges, *unused_edge_index);
            bmz8->g[next_vertex] = (cmph_uint8)(*unused_edge_index - bmz8->g[v]);
            SETBIT(visited, next_vertex);
            (*unused_edge_index)++;
            bmz8_traverse(bmz8, used_edges, next_vertex, unused_edge_index, visited);
        }
    }
}
/* Label every vertex that the critical-vertex pass left unvisited.
 * First, each edge with exactly one visited endpoint starts a traversal from
 * that endpoint. Then every vertex that is still unvisited gets g = 0 and
 * starts a traversal of its own. */
static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint8 * visited)
{
    cmph_uint8 edge, vertex, src, dst, next_edge_value = 0;

    DEBUGP("Labelling non critical vertices\n");
    for (edge = 0; edge < bmz8->m; edge++)
    {
        src = (cmph_uint8)graph_vertex_id(bmz8->graph, edge, 0);
        dst = (cmph_uint8)graph_vertex_id(bmz8->graph, edge, 1);
        if (GETBIT(visited, src))
        {
            if (!GETBIT(visited, dst)) bmz8_traverse(bmz8, used_edges, src, &next_edge_value, visited);
        }
        else if (GETBIT(visited, dst))
        {
            bmz8_traverse(bmz8, used_edges, dst, &next_edge_value, visited);
        }
    }

    for (vertex = 0; vertex < bmz8->n; vertex++)
    {
        if (GETBIT(visited, vertex)) continue;
        bmz8->g[vertex] = 0;
        SETBIT(visited, vertex);
        bmz8_traverse(bmz8, used_edges, vertex, &next_edge_value, visited);
    }
}
/* Rebuild the graph from the key source: one edge (h1, h2) per key.
 * When both hashes agree, h2 moves to the next vertex (wrapping to 0).
 * Returns 1 when every key produced a distinct, non-loop edge, and 0 as soon
 * as a self loop or a repeated edge shows up. */
static int bmz8_gen_edges(cmph_config_t *mph)
{
    bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data;
    cmph_uint8 key_index;

    DEBUGP("Generating edges for %u vertices\n", bmz8->n);
    graph_clear_edges(bmz8->graph);
    mph->key_source->rewind(mph->key_source->data);

    for (key_index = 0; key_index < mph->key_source->nkeys; ++key_index)
    {
        char *key = NULL;
        cmph_uint32 keylen;
        cmph_uint8 src, dst;
        cmph_uint8 duplicate;

        mph->key_source->read(mph->key_source->data, &key, &keylen);
        src = (cmph_uint8)(hash(bmz8->hashes[0], key, keylen) % bmz8->n);
        dst = (cmph_uint8)(hash(bmz8->hashes[1], key, keylen) % bmz8->n);
        if (src == dst)
        {
            ++dst;
            if (dst >= bmz8->n) dst = 0;
        }
        if (src == dst)
        {
            if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", key_index);
            mph->key_source->dispose(mph->key_source->data, key, keylen);
            return 0;
        }
        mph->key_source->dispose(mph->key_source->data, key, keylen);

        duplicate = graph_contains_edge(bmz8->graph, src, dst);
        if (duplicate)
        {
            // checking multiple edge restriction.
            if (mph->verbosity) fprintf(stderr, "A non simple graph was generated\n");
            return 0;
        }
        graph_add_edge(bmz8->graph, src, dst);
    }
    return 1;
}
/* Serialize a BMZ8 function to fd.
 * Layout after the common header written by __cmph_dump: the number of hash
 * functions (one byte, always 2), then for each hash function its state
 * length (cmph_uint32) followed by the state bytes, then n, m (one byte each)
 * and the n bytes of the g table.
 *
 * Returns 1 on success and 0 when the stream reports a write error.
 */
int bmz8_dump(cmph_t *mphf, FILE *fd)
{
    char *buf = NULL;
    cmph_uint32 buflen;
    cmph_uint8 two = 2; //number of hash functions
    cmph_uint8 h;
    bmz8_data_t *data = (bmz8_data_t *)mphf->data;
    register size_t nbytes;

    __cmph_dump(mphf, fd);
    nbytes = fwrite(&two, sizeof(cmph_uint8), (size_t)1, fd);

    for (h = 0; h < two; ++h)
    {
        hash_state_dump(data->hashes[h], &buf, &buflen);
        DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
        nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
        nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
        free(buf);
        buf = NULL;
    }

    nbytes = fwrite(&(data->n), sizeof(cmph_uint8), (size_t)1, fd);
    nbytes = fwrite(&(data->m), sizeof(cmph_uint8), (size_t)1, fd);
    nbytes = fwrite(data->g, sizeof(cmph_uint8)*(data->n), (size_t)1, fd);
    (void)nbytes;
    /* #ifdef DEBUG
    fprintf(stderr, "G: ");
    for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
    fprintf(stderr, "\n");
    #endif*/

    // fwrite legitimately returns 0 for zero-sized items, so the stream's
    // error indicator is the reliable signal that something went wrong.
    if (ferror(fd)) return 0;
    return 1;
}
/* Read a BMZ8 function from f and attach it to mphf->data.
 * The stream is read in this order: hash function count (one byte), then per
 * hash function a cmph_uint32 length and that many state bytes, then n, m
 * (one byte each) and n bytes of g.
 * NOTE(review): neither the allocations nor the reads are checked here. */
void bmz8_load(FILE *f, cmph_t *mphf)
{
    bmz8_data_t *loaded = (bmz8_data_t *)malloc(sizeof(bmz8_data_t));
    cmph_uint8 hash_count;
    cmph_uint8 i;
    register size_t items_read;

    DEBUGP("Loading bmz8 mphf\n");
    mphf->data = loaded;

    items_read = fread(&hash_count, sizeof(cmph_uint8), (size_t)1, f);
    loaded->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(size_t)(hash_count + 1));
    loaded->hashes[hash_count] = NULL; /* the table is NULL terminated */
    DEBUGP("Reading %u hashes\n", hash_count);
    for (i = 0; i < hash_count; ++i)
    {
        cmph_uint32 state_len;
        char *state_buf;

        items_read = fread(&state_len, sizeof(cmph_uint32), (size_t)1, f);
        DEBUGP("Hash state has %u bytes\n", state_len);
        state_buf = (char *)malloc((size_t)state_len);
        items_read = fread(state_buf, (size_t)state_len, (size_t)1, f);
        loaded->hashes[i] = hash_state_load(state_buf, state_len);
        free(state_buf);
    }

    DEBUGP("Reading m and n\n");
    items_read = fread(&(loaded->n), sizeof(cmph_uint8), (size_t)1, f);
    items_read = fread(&(loaded->m), sizeof(cmph_uint8), (size_t)1, f);
    loaded->g = (cmph_uint8 *)malloc(sizeof(cmph_uint8)*loaded->n);
    items_read = fread(loaded->g, loaded->n*sizeof(cmph_uint8), (size_t)1, f);
#ifdef DEBUG
    fprintf(stderr, "G: ");
    for (i = 0; i < loaded->n; ++i) fprintf(stderr, "%u ", loaded->g[i]);
    fprintf(stderr, "\n");
#endif
    return;
}
/* Evaluate the minimal perfect hash for key.
 *
 * mphf    function built by bmz8_new or read by bmz8_load
 * key     key bytes
 * keylen  key length in bytes
 *
 * Returns g[h1] + g[h2] reduced to 8 bits.
 */
cmph_uint8 bmz8_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
    bmz8_data_t *bmz8 = (bmz8_data_t *)mphf->data;
    cmph_uint8 h1 = (cmph_uint8)(hash(bmz8->hashes[0], key, keylen) % bmz8->n);
    cmph_uint8 h2 = (cmph_uint8)(hash(bmz8->hashes[1], key, keylen) % bmz8->n);
    DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
    // Same collision rule as bmz8_gen_edges: move h2 to the next vertex and
    // wrap at n. Valid vertices are 0..n-1, so the wrap test must be >=;
    // otherwise h1 == h2 == n-1 would index g[n].
    if (h1 == h2 && ++h2 >= bmz8->n) h2 = 0;
    DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz8->g[h1], bmz8->g[h2], bmz8->m);
    return (cmph_uint8)(bmz8->g[h1] + bmz8->g[h2]);
}
/* Free a BMZ8 function: its g table, both hash states, the hash table, the
 * BMZ8 data block and finally mphf itself. */
void bmz8_destroy(cmph_t *mphf)
{
    bmz8_data_t *bmz8 = (bmz8_data_t *)mphf->data;
    hash_state_t **states = bmz8->hashes;

    free(bmz8->g);
    hash_state_destroy(states[0]);
    hash_state_destroy(states[1]);
    free(states);
    free(bmz8);
    free(mphf);
}
/** \fn void bmz8_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void bmz8_pack(cmph_t *mphf, void *packed_mphf)
{
    bmz8_data_t *data = (bmz8_data_t *)mphf->data;
    cmph_uint8 *cursor = (cmph_uint8 *)packed_mphf;
    int which;

    // h1 then h2: a 32-bit type tag followed by the packed hash state
    for (which = 0; which < 2; ++which)
    {
        CMPH_HASH hash_type = hash_get_type(data->hashes[which]);
        *((cmph_uint32 *) cursor) = hash_type;
        cursor += sizeof(cmph_uint32);
        hash_state_pack(data->hashes[which], cursor);
        cursor += hash_state_packed_size(hash_type);
    }

    // n (one byte) followed by the n bytes of g
    *cursor++ = data->n;
    memcpy(cursor, data->g, sizeof(cmph_uint8)*data->n);
}
/** \fn cmph_uint32 bmz8_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 bmz8_packed_size(cmph_t *mphf)
{
    bmz8_data_t *data = (bmz8_data_t *)mphf->data;
    CMPH_HASH h1_type = hash_get_type(data->hashes[0]);
    CMPH_HASH h2_type = hash_get_type(data->hashes[1]);
    size_t total = sizeof(CMPH_ALGO);              // algorithm tag

    total += hash_state_packed_size(h1_type);      // packed h1
    total += hash_state_packed_size(h2_type);      // packed h2
    total += 2*sizeof(cmph_uint32);                // the two hash type tags
    total += sizeof(cmph_uint8);                   // n
    total += sizeof(cmph_uint8)*data->n;           // g
    return (cmph_uint32)total;
}
/** \fn cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
 *  \brief Use the packed mphf to do a search.
 *  \param packed_mphf pointer to the packed mphf
 *  \param key key to be hashed
 *  \param keylen key length in bytes
 *  \return The mphf value
 */
cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
    // Packed layout (see bmz8_pack): h1 type, h1 state, h2 type, h2 state, n, g[n]
    register cmph_uint8 *h1_ptr = (cmph_uint8 *)packed_mphf;
    register CMPH_HASH h1_type = (CMPH_HASH)(*((cmph_uint32 *)h1_ptr));
    h1_ptr += 4;
    register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type);
    register CMPH_HASH h2_type = (CMPH_HASH)(*((cmph_uint32 *)h2_ptr));
    h2_ptr += 4;
    register cmph_uint8 *g_ptr = h2_ptr + hash_state_packed_size(h2_type);
    register cmph_uint8 n = *g_ptr++;
    register cmph_uint8 h1 = (cmph_uint8)(hash_packed(h1_ptr, h1_type, key, keylen) % n);
    register cmph_uint8 h2 = (cmph_uint8)(hash_packed(h2_ptr, h2_type, key, keylen) % n);
    DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
    // Same collision rule as bmz8_gen_edges: valid vertices are 0..n-1, so h2
    // wraps to 0 as soon as it reaches n (>=, not >), keeping g_ptr[h2] in bounds.
    if (h1 == h2 && ++h2 >= n) h2 = 0;
    return (cmph_uint8)(g_ptr[h1] + g_ptr[h2]);
}
cmph-2.0.2/src/compressed_rank.h 0000644 0001750 0001750 00000004253 13411542035 016125 0 ustar joseph joseph #ifndef __CMPH_COMPRESSED_RANK_H__
#define __CMPH_COMPRESSED_RANK_H__
#include "select.h"
/* State of a compressed rank structure (see the compressed_rank_* functions). */
struct _compressed_rank_t
{
cmph_uint32 max_val;
cmph_uint32 n; // number of values stored in vals_rems
// The length in bits of each value is decomposed into two components: the lg(n) MSBs are stored in rank_select data structure
// the remaining LSBs are stored in a table of n cells, each one of rem_r bits.
cmph_uint32 rem_r;
select_t sel;
cmph_uint32 * vals_rems;
};
typedef struct _compressed_rank_t compressed_rank_t;
void compressed_rank_init(compressed_rank_t * cr);
void compressed_rank_destroy(compressed_rank_t * cr);
void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n);
cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx);
cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr);
void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen);
void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen);
/** \fn void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed);
* \brief Support the ability to pack a compressed_rank structure into a preallocated contiguous memory space pointed by cr_packed.
* \param cr points to the compressed_rank structure
* \param cr_packed pointer to the contiguous memory area used to store the compressed_rank structure. The size of cr_packed must be at least @see compressed_rank_packed_size
*/
void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed);
/** \fn cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr);
* \brief Return the amount of space needed to pack a compressed_rank structure.
* \return the size of the packed compressed_rank structure or zero for failures
*/
cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr);
/** \fn cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx);
* \param cr_packed is a pointer to a contiguous memory area
* \param idx is an index to compute the rank
* \return an integer that represents the compressed_rank value.
*/
cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx);
#endif
cmph-2.0.2/src/fch_structs.h 0000755 0001750 0001750 00000001774 13411542035 015305 0 ustar joseph joseph #ifndef __CMPH_FCH_STRUCTS_H__
#define __CMPH_FCH_STRUCTS_H__
#include "hash_state.h"
/* FCH function data.
 * NOTE(review): presumably the state of a finished FCH function, mirroring
 * __fch_config_data_t without the hash function selectors -- confirm in fch.c. */
struct __fch_data_t
{
cmph_uint32 m; // words count
double c; // constant c
cmph_uint32 b; // parameter b = ceil(c*m/(log(m)/log(2) + 1)). Don't need to be stored
double p1; // constant p1 = ceil(0.6*m). Don't need to be stored
double p2; // constant p2 = ceil(0.3*b). Don't need to be stored
cmph_uint32 *g; // g function.
hash_state_t *h1; // h10 function.
hash_state_t *h2; // h20 function.
};
/* FCH configuration data: the same fields as __fch_data_t plus the selectors
 * of the two hash functions to use.
 * NOTE(review): presumably the working state while an FCH function is being
 * built -- confirm in fch.c. */
struct __fch_config_data_t
{
CMPH_HASH hashfuncs[2]; // selectors of the two hash functions
cmph_uint32 m; // words count
double c; // constant c
cmph_uint32 b; // parameter b = ceil(c*m/(log(m)/log(2) + 1)). Don't need to be stored
double p1; // constant p1 = ceil(0.6*m). Don't need to be stored
double p2; // constant p2 = ceil(0.3*b). Don't need to be stored
cmph_uint32 *g; // g function.
hash_state_t *h1; // h10 function.
hash_state_t *h2; // h20 function.
};
#endif
cmph-2.0.2/src/buffer_entry.c 0000644 0001750 0001750 00000006305 13411542035 015433 0 ustar joseph joseph #include "buffer_entry.h"
#include
#include
#include
#include
/* Read buffer over a file of length-prefixed keys: the file is consumed in
 * chunks of `capacity` bytes and keys are cut out of the current chunk. */
struct __buffer_entry_t
{
FILE *fd; // input stream, set by buffer_entry_open
cmph_uint8 * buff; // current chunk, reallocated on every load
cmph_uint32 capacity, // buffer entry capacity
nbytes, // buffer entry used bytes
pos; // current read position in buffer entry
cmph_uint8 eof; // flag to indicate end of file
};
/* Create a buffer entry with the given chunk capacity and no file attached.
 * nbytes and pos both start at capacity, so the first read finds the buffer
 * exhausted and triggers a load. Returns NULL when the allocation fails. */
buffer_entry_t * buffer_entry_new(cmph_uint32 capacity)
{
    buffer_entry_t *entry = (buffer_entry_t *)malloc(sizeof(*entry));

    if (entry == NULL) return NULL;

    entry->capacity = capacity;
    entry->nbytes = capacity;
    entry->pos = capacity;
    entry->eof = 0;
    entry->buff = NULL;
    entry->fd = NULL;
    return entry;
}
/* Open filename for binary reading and attach the stream to the entry.
 * The stored stream is NULL when fopen fails. */
void buffer_entry_open(buffer_entry_t * buffer_entry, char * filename)
{
    FILE *stream = fopen(filename, "rb");

    buffer_entry->fd = stream;
}
/* Set the chunk size used by subsequent loads. */
void buffer_entry_set_capacity(buffer_entry_t * buffer_entry, cmph_uint32 capacity)
{
    (*buffer_entry).capacity = capacity;
}
/* Return the chunk size currently configured for the entry. */
cmph_uint32 buffer_entry_get_capacity(buffer_entry_t * buffer_entry)
{
    const cmph_uint32 current = buffer_entry->capacity;

    return current;
}
/* Replace the current chunk with the next `capacity` bytes of the file.
 * A short read sets the eof flag; the read position restarts at 0. */
static void buffer_entry_load(buffer_entry_t * buffer_entry)
{
    size_t got;

    free(buffer_entry->buff);
    buffer_entry->buff = (cmph_uint8 *)calloc((size_t)buffer_entry->capacity, sizeof(cmph_uint8));
    got = fread(buffer_entry->buff, (size_t)1, (size_t)buffer_entry->capacity, buffer_entry->fd);
    buffer_entry->nbytes = (cmph_uint32)got;
    buffer_entry->pos = 0;
    if (buffer_entry->nbytes != buffer_entry->capacity)
    {
        buffer_entry->eof = 1;
    }
}
/* Read the next length-prefixed key.
 *
 * A key is stored as a cmph_uint32 length followed by that many bytes. Both
 * parts may straddle a chunk boundary, in which case the first part is taken
 * from the current chunk, the next chunk is loaded and the rest is taken
 * from there.
 *
 * keylen  receives the key length
 *
 * Returns a malloc'ed buffer holding the length field followed by the key
 * bytes (the caller frees it), or NULL once the input is exhausted.
 * NOTE(review): the malloc result is not checked, and a truncated file is
 * not detected -- confirm the input is always well formed.
 */
cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * keylen)
{
    cmph_uint8 * buf = NULL;
    cmph_uint32 lacked_bytes = sizeof(*keylen);
    cmph_uint32 copied_bytes = 0;

    if(buffer_entry->eof && (buffer_entry->pos == buffer_entry->nbytes)) // end
    {
        return NULL;
    }

    // length field
    if((buffer_entry->pos + lacked_bytes) > buffer_entry->nbytes)
    {
        copied_bytes = buffer_entry->nbytes - buffer_entry->pos;
        lacked_bytes = (buffer_entry->pos + lacked_bytes) - buffer_entry->nbytes;
        if (copied_bytes != 0) memcpy(keylen, buffer_entry->buff + buffer_entry->pos, (size_t)copied_bytes);
        buffer_entry_load(buffer_entry);
    }
    // copied_bytes is a byte offset into the length field, so the arithmetic
    // must be done on a byte pointer; on a cmph_uint32 pointer it would be
    // scaled by four and land outside *keylen.
    memcpy((cmph_uint8 *)keylen + copied_bytes, buffer_entry->buff + buffer_entry->pos, (size_t)lacked_bytes);
    buffer_entry->pos += lacked_bytes;

    // key bytes
    lacked_bytes = *keylen;
    copied_bytes = 0;
    buf = (cmph_uint8 *)malloc(*keylen + sizeof(*keylen));
    memcpy(buf, keylen, sizeof(*keylen));
    if((buffer_entry->pos + lacked_bytes) > buffer_entry->nbytes) {
        copied_bytes = buffer_entry->nbytes - buffer_entry->pos;
        lacked_bytes = (buffer_entry->pos + lacked_bytes) - buffer_entry->nbytes;
        if (copied_bytes != 0) {
            memcpy(buf + sizeof(*keylen), buffer_entry->buff + buffer_entry->pos, (size_t)copied_bytes);
        }
        buffer_entry_load(buffer_entry);
    }
    memcpy(buf+sizeof(*keylen)+copied_bytes, buffer_entry->buff + buffer_entry->pos, (size_t)lacked_bytes);
    buffer_entry->pos += lacked_bytes;
    return buf;
}
/* Close the attached file (if any), free the chunk and the entry itself. */
void buffer_entry_destroy(buffer_entry_t * buffer_entry)
{
    // fd stays NULL when the entry was never opened or fopen failed;
    // fclose must not be called with a NULL stream.
    if (buffer_entry->fd != NULL)
    {
        fclose(buffer_entry->fd);
        buffer_entry->fd = NULL;
    }
    free(buffer_entry->buff);
    buffer_entry->buff = NULL;
    buffer_entry->capacity = 0;
    buffer_entry->nbytes = 0;
    buffer_entry->pos = 0;
    buffer_entry->eof = 0;
    free(buffer_entry);
}
cmph-2.0.2/src/cmph_time.h 0000644 0001750 0001750 00000003010 13411542035 014701 0 ustar joseph joseph #ifdef ELAPSED_TIME_IN_SECONDS
#undef ELAPSED_TIME_IN_SECONDS
#endif
#ifdef ELAPSED_TIME_IN_uSECONDS
#undef ELAPSED_TIME_IN_uSECONDS
#endif
#ifdef WIN32
// include headers to use gettimeofday
#else
#ifdef __GNUC__
#include
#include
#endif
#endif
#ifdef __GNUC__
#ifndef __CMPH_TIME_H__
#define __CMPH_TIME_H__
/* Store the current time of day, in seconds with a fractional part, in
 * *elapsed_time. The output is left untouched when gettimeofday fails. */
static inline void elapsed_time_in_seconds(double * elapsed_time)
{
    struct timeval now;

    if (gettimeofday(&now, NULL) >= 0) {
        *elapsed_time = (double)now.tv_sec + ((double)now.tv_usec/1000000.0);
    }
}
/* No-op that the ELAPSED_TIME_IN_SECONDS macro can expand to.
 * NOTE(review): the empty, unprototyped parameter list presumably exists so
 * that call sites passing a pointer still compile -- confirm against callers
 * before changing it to (void). */
static inline void dummy_elapsed_time_in_seconds()
{
}
/* Store the current time of day, in microseconds, in *elapsed_time.
 * The output is left untouched when gettimeofday fails. */
static inline void elapsed_time_in_useconds(cmph_uint64 * elapsed_time)
{
    struct timeval e_time;
    if (gettimeofday(&e_time, NULL) < 0) {
        return;
    }
    /* Widen before multiplying: tv_sec*1000000 evaluated in the width of
     * time_t overflows on targets where that type has only 32 bits. */
    *elapsed_time = (cmph_uint64)e_time.tv_sec * 1000000 + (cmph_uint64)e_time.tv_usec;
}
/* No-op that the ELAPSED_TIME_IN_uSECONDS macro can expand to.
 * NOTE(review): the empty, unprototyped parameter list presumably exists so
 * that call sites passing a pointer still compile -- confirm against callers
 * before changing it to (void). */
static inline void dummy_elapsed_time_in_useconds()
{
}
#endif
#endif
#ifdef CMPH_TIMING
#ifdef __GNUC__
#define ELAPSED_TIME_IN_SECONDS elapsed_time_in_seconds
#define ELAPSED_TIME_IN_uSECONDS elapsed_time_in_useconds
#else
#define ELAPSED_TIME_IN_SECONDS dummy_elapsed_time_in_seconds
#define ELAPSED_TIME_IN_uSECONDS dummy_elapsed_time_in_useconds
#endif
#else
#ifdef __GNUC__
#define ELAPSED_TIME_IN_SECONDS
#define ELAPSED_TIME_IN_uSECONDS
#else
#define ELAPSED_TIME_IN_SECONDS dummy_elapsed_time_in_seconds
#define ELAPSED_TIME_IN_uSECONDS dummy_elapsed_time_in_useconds
#endif
#endif
cmph-2.0.2/src/cmph_benchmark.c 0000644 0001750 0001750 00000006606 13411542035 015706 0 ustar joseph joseph // A simple benchmark tool around getrusage
#include
#include
#include
#include
#include
#include "cmph_benchmark.h"
/* One registered benchmark together with its resource usage snapshots. */
typedef struct {
const char* name; /* lookup key; NULL marks the terminating table entry */
void (*func)(int); /* benchmark body, called with iters */
int iters; /* argument handed to func */
struct rusage begin; /* getrusage snapshot taken by bm_start */
struct rusage end; /* getrusage snapshot taken by bm_end */
} benchmark_t;
/* Table of registered benchmarks, terminated by an entry whose name is NULL;
 * NULL until the first registration. */
static benchmark_t* global_benchmarks = NULL;
/* Compute X - Y for two `struct timeval' values and store it in RESULT.
   Y is normalised in place first, so it may be modified by the call.
   Returns 1 when the difference is negative and 0 otherwise. */
int timeval_subtract (
    struct timeval *result, struct timeval *x, struct timeval* y) {
  /* Borrow whole seconds from y when x has fewer microseconds. */
  if (y->tv_usec > x->tv_usec) {
    int carry = (y->tv_usec - x->tv_usec) / 1000000 + 1;
    y->tv_usec -= 1000000 * carry;
    y->tv_sec += carry;
  }
  /* Give whole seconds back when the microsecond gap exceeds one second. */
  if (x->tv_usec - y->tv_usec > 1000000) {
    int borrow = (x->tv_usec - y->tv_usec) / 1000000;
    y->tv_usec += 1000000 * borrow;
    y->tv_sec -= borrow;
  }

  /* After the adjustments the microsecond difference is not negative. */
  result->tv_sec = x->tv_sec - y->tv_sec;
  result->tv_usec = x->tv_usec - y->tv_usec;

  return (x->tv_sec < y->tv_sec) ? 1 : 0;
}
/* Return the registered benchmark called name, or NULL when there is none. */
benchmark_t* find_benchmark(const char* name) {
  benchmark_t* entry;

  if (global_benchmarks == NULL) return NULL;
  for (entry = global_benchmarks; entry->name != NULL; ++entry) {
    if (strcmp(entry->name, name) == 0) return entry;
  }
  return NULL;
}
/* Number of registered benchmarks (the terminating entry is not counted). */
int global_benchmarks_length() {
  int count = 0;
  const benchmark_t* entry = global_benchmarks;

  if (entry != NULL) {
    for (; entry->name != NULL; ++entry) ++count;
  }
  return count;
}
void bm_register(const char* name, void (*func)(int), int iters) {
benchmark_t benchmark;
int length = global_benchmarks_length();
benchmark.name = name;
benchmark.func = func;
benchmark.iters = iters;
assert(!find_benchmark(name));
global_benchmarks = (benchmark_t *)realloc(
global_benchmarks, (length + 2)*sizeof(benchmark_t));
global_benchmarks[length] = benchmark;
memset(&benchmark, 0, sizeof(benchmark_t)); // pivot
global_benchmarks[length + 1] = benchmark;
}
/* Take the starting resource usage snapshot for the named benchmark and then
 * run its function. The benchmark must have been registered; a getrusage
 * failure terminates the program. */
void bm_start(const char* name) {
  struct rusage usage_now;
  benchmark_t* target = find_benchmark(name);

  assert(target);
  if (getrusage(RUSAGE_SELF, &usage_now) != 0) {
    perror("rusage failed");
    exit(-1);
  }
  target->begin = usage_now;
  target->func(target->iters);
}
/* Take the final resource usage snapshot for the named benchmark and print
 * the user and system CPU time spent since bm_start. The benchmark must have
 * been registered; a getrusage failure terminates the program. */
void bm_end(const char* name) {
  benchmark_t* benchmark;
  struct rusage rs;
  int ret = getrusage(RUSAGE_SELF, &rs);
  if (ret != 0) {
    perror("rusage failed");
    exit(-1);
  }
  benchmark = find_benchmark(name);
  assert(benchmark);  /* same contract as bm_start: unknown names are a bug */
  benchmark->end = rs;
  struct timeval utime;
  timeval_subtract(&utime, &benchmark->end.ru_utime, &benchmark->begin.ru_utime);
  struct timeval stime;
  timeval_subtract(&stime, &benchmark->end.ru_stime, &benchmark->begin.ru_stime);
  /* tv_sec is a time_t, which need not be long: cast it to match %ld. */
  printf("Benchmark: %s\n", benchmark->name);
  printf("User time used : %ld.%06ld\n",
         (long int)utime.tv_sec, (long int)utime.tv_usec);
  printf("System time used: %ld.%06ld\n",
         (long int)stime.tv_sec, (long int)stime.tv_usec);
  printf("\n");
}
/* Run every registered benchmark in registration order: bm_start()
 * followed by bm_end() for each entry of global_benchmarks.  The argc/argv
 * parameters are accepted but not used. */
void run_benchmarks(int argc, char** argv) {
  benchmark_t* entry;

  if (global_benchmarks == NULL) return;
  for (entry = global_benchmarks; entry->name != NULL; ++entry) {
    bm_start(entry->name);
    bm_end(entry->name);
  }
}
cmph-2.0.2/src/bdz_gen_lookup_table.c 0000755 0001750 0001750 00000001174 13411542035 017113 0 ustar joseph joseph #include
#include
#include
/* Print the usage line for program `prname` on stderr and terminate the
 * process with exit status 1.  Does not return. */
void help(char * prname)
{
	fprintf(stderr, "USE: %s  \n", prname);
	exit(1);
}
/* Count how many of the lowest `nfields` 2-bit fields of `value` hold a
 * value different from 3. */
static int count_assigned(int value, int nfields)
{
	int assigned = 0;
	int field;

	for (field = 0; field < nfields; field++)
	{
		if ((value & 0x0003) != 3)
		{
			assigned++;
		}
		value = value >> 2;
	}
	return assigned;
}

/* Emit a lookup table on stderr.
 *
 * argv[1] is the number of table entries and argv[2] the word size in
 * bits (halved to get the number of 2-bit fields per word).  Entry i is
 * the number of 2-bit fields of i that are not equal to 3.  Entries are
 * written comma separated, 16 per line. */
int main(int argc, char ** argv)
{
	int entries, fields, entry;

	if (argc != 3)
	{
		help(argv[0]);
	}
	entries = atoi(argv[1]);
	fields = atoi(argv[2]) >> 1;

	entry = 0;
	while (entry < entries)
	{
		int n_assigned = count_assigned(entry, fields);
		if (entry % 16 == 0)
		{
			fprintf(stderr, "\n");
		}
		fprintf(stderr, "%d, ", n_assigned);
		entry++;
	}
	fprintf(stderr, "\n");
	return 0;
}
cmph-2.0.2/src/wingetopt.h 0000644 0001750 0001750 00000001103 13411542035 014755 0 ustar joseph joseph #ifdef __cplusplus
extern "C" {
#endif
#ifndef WIN32
#include
#else
#ifndef _GETOPT_
#define _GETOPT_
#include /* for EOF */
#include /* for strchr() */
/* getopt() state and prototype for WIN32 builds (this branch is only
 * compiled when WIN32 is defined and _GETOPT_ is not yet defined).
 * NOTE(review): these are variable definitions, not extern declarations,
 * so including this header from more than one translation unit of the
 * same program would define each variable more than once -- confirm it is
 * only included from a single source file per program. */
char *optarg = NULL; /* pointer to the start of the option argument */
int optind = 1; /* number of the next argv[] to be evaluated */
int opterr = 1; /* non-zero if a question mark should be returned */
/* Option parser; `opstring` is the option specification string. */
int getopt(int argc, char *argv[], char *opstring);
#endif //_GETOPT_
#endif //WIN32
#ifdef __cplusplus
}
#endif
cmph-2.0.2/src/bmz_structs.h 0000644 0001750 0001750 00000000641 13411542035 015322 0 ustar joseph joseph #ifndef __CMPH_BMZ_STRUCTS_H__
#define __CMPH_BMZ_STRUCTS_H__
#include "hash_state.h"
/* Data of a BMZ function (typedef'd elsewhere; only the layout is given here). */
struct __bmz_data_t
{
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
cmph_uint32 *g; // presumably the g table of the function, one value per vertex -- TODO confirm against bmz.c
hash_state_t **hashes; // array of hash state pointers; length is not visible here
};
/* Configuration/working state for BMZ; shares m, n, g and hashes with
 * struct __bmz_data_t and adds the hash function choice and a graph. */
struct __bmz_config_data_t
{
CMPH_HASH hashfuncs[2]; // two hash function identifiers
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
graph_t *graph; // presumably the graph built over the keys during construction -- TODO confirm
cmph_uint32 *g; // presumably the g table being computed -- TODO confirm against bmz.c
hash_state_t **hashes; // array of hash state pointers; length is not visible here
};
#endif
cmph-2.0.2/src/chm.h 0000644 0001750 0001750 00000003135 13411542035 013513 0 ustar joseph joseph #ifndef __CMPH_CHM_H__
#define __CMPH_CHM_H__
#include "cmph.h"
typedef struct __chm_data_t chm_data_t;
typedef struct __chm_config_data_t chm_config_data_t;
chm_config_data_t *chm_config_new(void);
void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void chm_config_destroy(cmph_config_t *mph);
cmph_t *chm_new(cmph_config_t *mph, double c);
void chm_load(FILE *f, cmph_t *mphf);
int chm_dump(cmph_t *mphf, FILE *f);
void chm_destroy(cmph_t *mphf);
cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void chm_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void chm_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 chm_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 chm_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif
cmph-2.0.2/src/compressed_seq.c 0000644 0001750 0001750 00000024132 13411542035 015753 0 ustar joseph joseph #include "compressed_seq.h"
#include
#include
#include
#include
#include
#include "bitbool.h"
// #define DEBUG
#include "debug.h"
// Integer base-2 logarithm rounded down: the number of times x can be
// halved before it drops to 1 or below. Returns 0 for x == 0 and x == 1.
static inline cmph_uint32 compressed_seq_i_log2(cmph_uint32 x)
{
	register cmph_uint32 halvings;

	for(halvings = 0; x > 1; x >>= 1)
	{
		halvings++;
	}
	return halvings;
}
// Put cs into the empty state: initialised select structure, zero counters
// and no tables allocated.
void compressed_seq_init(compressed_seq_t * cs)
{
	select_init(&cs->sel);

	cs->store_table = 0;
	cs->length_rems = 0;

	cs->total_length = 0;
	cs->rem_r = 0;
	cs->n = 0;
}
// Release the tables owned by cs and destroy its select structure.
// The table pointers are reset to 0 so a later load/generate does not free
// them a second time.
void compressed_seq_destroy(compressed_seq_t * cs)
{
	cmph_uint32 * store = cs->store_table;
	cmph_uint32 * rems = cs->length_rems;

	cs->store_table = 0;
	free(store);

	cs->length_rems = 0;
	free(rems);

	select_destroy(&cs->sel);
}
// Build the compressed representation of vals_table[0..n-1] in cs.
//
// Each non-zero value v is stored in floor(log2(v + 1)) bits as
// v - (2^len - 1); zero is stored in 0 bits. The encoded values are
// concatenated in store_table. The cumulative end position of every value
// is split in two: its low rem_r bits go to length_rems, and its high part
// is handed to the select structure.
//
// \param cs structure to fill; previously held tables are freed
// \param vals_table values to encode
// \param n number of values
//
// NOTE(review): allocation results are not checked here; the function has
// no way to report failure to its caller.
void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n)
{
	register cmph_uint32 i;
	// lengths: first the encoded length of each value, later reused for the
	// high parts of the cumulative lengths
	register cmph_uint32 * lengths = (cmph_uint32 *)calloc(n, sizeof(cmph_uint32));
	register cmph_uint32 rems_mask;
	register cmph_uint32 stored_value;

	cs->n = n;
	cs->total_length = 0;

	// first pass: length of every encoded value and the total bit count
	for(i = 0; i < cs->n; i++)
	{
		if(vals_table[i] == 0)
		{
			lengths[i] = 0;
		}
		else
		{
			lengths[i] = compressed_seq_i_log2(vals_table[i] + 1);
			cs->total_length += lengths[i];
		}
	}

	if(cs->store_table)
	{
		free(cs->store_table);
	}
	cs->store_table = (cmph_uint32 *) calloc(((cs->total_length + 31) >> 5), sizeof(cmph_uint32));

	// second pass: write the encoded values back to back
	cs->total_length = 0;
	for(i = 0; i < cs->n; i++)
	{
		if(vals_table[i] == 0)
			continue;
		stored_value = vals_table[i] - ((1U << lengths[i]) - 1U);
		set_bits_at_pos(cs->store_table, cs->total_length, stored_value, lengths[i]);
		cs->total_length += lengths[i];
	}

	// rem_r is derived from the average encoded length. An empty sequence
	// has no average: use 0 instead of dividing by zero.
	cs->rem_r = compressed_seq_i_log2(cs->n ? cs->total_length/cs->n : 0);
	if(cs->rem_r == 0)
	{
		cs->rem_r = 1;
	}

	if(cs->length_rems)
	{
		free(cs->length_rems);
	}
	cs->length_rems = (cmph_uint32 *) calloc(BITS_TABLE_SIZE(cs->n, cs->rem_r), sizeof(cmph_uint32));

	// third pass: split every cumulative length into low bits (length_rems)
	// and high part (input of the select structure)
	rems_mask = (1U << cs->rem_r) - 1U;
	cs->total_length = 0;
	for(i = 0; i < cs->n; i++)
	{
		cs->total_length += lengths[i];
		set_bits_value(cs->length_rems, i, cs->total_length & rems_mask, cs->rem_r, rems_mask);
		lengths[i] = cs->total_length >> cs->rem_r;
	}

	select_init(&cs->sel);
	// FABIANO: before it was (cs->total_length >> cs->rem_r) + 1. But I wiped out the + 1 because
	// I changed the select structure to work up to m, instead of up to m - 1.
	select_generate(&cs->sel, lengths, cs->n, (cs->total_length >> cs->rem_r));

	free(lengths);
}
// Space used by cs, in bits: the select structure, the store table, the
// length_rems table and four 32-bit words of fixed overhead.
cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs)
{
	const cmph_uint32 bits_per_word = (cmph_uint32)sizeof(cmph_uint32) * 8;
	const cmph_uint32 store_words = (cs->total_length + 31) >> 5;
	const cmph_uint32 rems_words = BITS_TABLE_SIZE(cs->n, cs->rem_r);
	cmph_uint32 total_bits = select_get_space_usage(&cs->sel);

	total_bits += store_words * bits_per_word;
	total_bits += rems_words * bits_per_word;
	total_bits += 4 * bits_per_word;
	return total_bits;
}
// Return the value stored at position idx (idx < cs->n).
//
// The bit range of the value inside store_table is recovered from the
// cumulative lengths: the high part comes from the select structure and
// the low rem_r bits from length_rems. A zero-length range decodes to 0.
cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx)
{
	cmph_uint32 low_mask;
	cmph_uint32 low_bits;
	cmph_uint32 sel_pos;
	cmph_uint32 first_bit = 0;	// where the encoded value starts
	cmph_uint32 last_bit;		// cumulative length up to and including idx
	cmph_uint32 nbits;
	cmph_uint32 payload;

	assert(idx < cs->n); // FABIANO ADDED
	low_mask = (1U << cs->rem_r) - 1U;

	if(idx != 0)
	{
		// the value starts where its predecessor ends
		sel_pos = select_query(&cs->sel, idx - 1);
		low_bits = get_bits_value(cs->length_rems, idx-1, cs->rem_r, low_mask);
		first_bit = ((sel_pos - (idx - 1)) << cs->rem_r) + low_bits;
		sel_pos = select_next_query(&cs->sel, sel_pos);
	}
	else
	{
		sel_pos = select_query(&cs->sel, idx);
	}

	low_bits = get_bits_value(cs->length_rems, idx, cs->rem_r, low_mask);
	last_bit = ((sel_pos - idx) << cs->rem_r) + low_bits;
	nbits = last_bit - first_bit;

	if(nbits == 0)
	{
		return 0;
	}
	payload = get_bits_at_pos(cs->store_table, first_bit, nbits);
	return payload + ((1U << nbits) - 1U);
}
// Serialise cs into a newly allocated buffer.
//
// Layout: n, rem_r, total_length, size of the select dump (four 32-bit
// words), then the select dump, the length_rems table and the store table.
//
// \param cs structure to serialise
// \param buf receives the allocated buffer; the caller frees it
// \param buflen receives the buffer size in bytes
//
// On failure *buflen is set to UINT_MAX and *buf is NULL. Failure covers
// the buffer allocation, a select dump that returns no buffer, and a
// select dump larger than the space reserved from select_packed_size().
void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen)
{
	register cmph_uint32 sel_size = select_packed_size(&(cs->sel));
	register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * 4;
	register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * 4;
	register cmph_uint32 pos = 0;
	char * buf_sel = 0;
	cmph_uint32 buflen_sel = 0;

	*buflen = 4*(cmph_uint32)sizeof(cmph_uint32) + sel_size + length_rems_size + store_table_size;

	DEBUGP("sel_size = %u\n", sel_size);
	DEBUGP("length_rems_size = %u\n", length_rems_size);
	DEBUGP("store_table_size = %u\n", store_table_size);

	*buf = (char *)calloc(*buflen, sizeof(char));
	if (!*buf)
	{
		*buflen = UINT_MAX;
		return;
	}

	// dumping n, rem_r and total_length
	memcpy(*buf, &(cs->n), sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("n = %u\n", cs->n);

	memcpy(*buf + pos, &(cs->rem_r), sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("rem_r = %u\n", cs->rem_r);

	memcpy(*buf + pos, &(cs->total_length), sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("total_length = %u\n", cs->total_length);

	// dumping sel
	select_dump(&cs->sel, &buf_sel, &buflen_sel);
	// Copying from a missing buffer, or more bytes than were reserved for
	// the select part, would corrupt memory: report failure instead.
	if (!buf_sel || buflen_sel > sel_size)
	{
		free(buf_sel);
		free(*buf);
		*buf = 0;
		*buflen = UINT_MAX;
		return;
	}
	memcpy(*buf + pos, &buflen_sel, sizeof(cmph_uint32));
	pos += (cmph_uint32)sizeof(cmph_uint32);
	DEBUGP("buflen_sel = %u\n", buflen_sel);

	memcpy(*buf + pos, buf_sel, buflen_sel);
	#ifdef DEBUG
	cmph_uint32 i = 0;
	for(i = 0; i < buflen_sel; i++)
	{
	    DEBUGP("pos = %u  -- buf_sel[%u] = %u\n", pos, i, *(*buf + pos + i));
	}
	#endif
	pos += buflen_sel;

	free(buf_sel);

	// dumping length_rems
	memcpy(*buf + pos, cs->length_rems, length_rems_size);
	#ifdef DEBUG
	for(i = 0; i < length_rems_size; i++)
	{
	    DEBUGP("pos = %u -- length_rems_size = %u  -- length_rems[%u] = %u\n", pos, length_rems_size, i, *(*buf + pos + i));
	}
	#endif
	pos += length_rems_size;

	// dumping store_table
	memcpy(*buf + pos, cs->store_table, store_table_size);

	#ifdef DEBUG
	for(i = 0; i < store_table_size; i++)
	{
	    DEBUGP("pos = %u -- store_table_size = %u  -- store_table[%u] = %u\n", pos, store_table_size, i, *(*buf + pos + i));
	}
	#endif
	DEBUGP("Dumped compressed sequence structure with size %u bytes\n", *buflen);
}
// Rebuild cs from a buffer laid out as: n, rem_r, total_length, size of
// the select dump (four 32-bit words), the select dump, the length_rems
// table and the store table. Tables already held by cs are freed first.
//
// \param cs structure to fill
// \param buf serialised data
// \param buflen size of buf in bytes (used for the debug message only)
void compressed_seq_load(compressed_seq_t * cs, const char * buf, cmph_uint32 buflen)
{
	const cmph_uint32 word_bytes = (cmph_uint32)sizeof(cmph_uint32);
	register cmph_uint32 pos = 0;
	cmph_uint32 buflen_sel = 0;
	register cmph_uint32 rems_words = 0;
	register cmph_uint32 rems_bytes = 0;
	register cmph_uint32 store_words = 0;
	register cmph_uint32 store_bytes = 0;

	// loading n, rem_r and total_length
	memcpy(&(cs->n), buf, sizeof(cmph_uint32));
	pos += word_bytes;
	DEBUGP("n = %u\n", cs->n);

	memcpy(&(cs->rem_r), buf + pos, sizeof(cmph_uint32));
	pos += word_bytes;
	DEBUGP("rem_r = %u\n", cs->rem_r);

	memcpy(&(cs->total_length), buf + pos, sizeof(cmph_uint32));
	pos += word_bytes;
	DEBUGP("total_length = %u\n", cs->total_length);

	// loading sel
	memcpy(&buflen_sel, buf + pos, sizeof(cmph_uint32));
	pos += word_bytes;
	DEBUGP("buflen_sel = %u\n", buflen_sel);

	select_load(&cs->sel, buf + pos, buflen_sel);
	#ifdef DEBUG
	cmph_uint32 i = 0;
	for(i = 0; i < buflen_sel; i++)
	{
	    DEBUGP("pos = %u  -- buf_sel[%u] = %u\n", pos, i, *(buf + pos + i));
	}
	#endif
	pos += buflen_sel;

	// loading length_rems
	if(cs->length_rems)
	{
		free(cs->length_rems);
	}
	rems_words = BITS_TABLE_SIZE(cs->n, cs->rem_r);
	cs->length_rems = (cmph_uint32 *) calloc(rems_words, sizeof(cmph_uint32));
	rems_bytes = rems_words * 4;
	memcpy(cs->length_rems, buf + pos, rems_bytes);

	#ifdef DEBUG
	for(i = 0; i < rems_bytes; i++)
	{
	    DEBUGP("pos = %u -- length_rems_size = %u  -- length_rems[%u] = %u\n", pos, rems_bytes, i, *(buf + pos + i));
	}
	#endif
	pos += rems_bytes;

	// loading store_table
	store_words = ((cs->total_length + 31) >> 5);
	if(cs->store_table)
	{
		free(cs->store_table);
	}
	cs->store_table = (cmph_uint32 *) calloc(store_words, sizeof(cmph_uint32));
	store_bytes = store_words * 4;
	memcpy(cs->store_table, buf + pos, store_bytes);

	#ifdef DEBUG
	for(i = 0; i < store_bytes; i++)
	{
	    DEBUGP("pos = %u -- store_table_size = %u  -- store_table[%u] = %u\n", pos, store_bytes, i, *(buf + pos + i));
	}
	#endif

	DEBUGP("Loaded compressed sequence structure with size %u bytes\n", buflen);
}
// Write the serialised form of cs into the caller-provided area cs_packed.
//
// \param cs structure to pack; nothing is done when NULL
// \param cs_packed destination area; nothing is done when NULL. It must
//        hold at least compressed_seq_packed_size(cs) bytes.
//
// When the dump fails (compressed_seq_dump leaves the buffer NULL and sets
// the length to UINT_MAX) cs_packed is left untouched.
void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed)
{
	if (cs && cs_packed)
	{
		char *buf = NULL;
		cmph_uint32 buflen = 0;

		compressed_seq_dump(cs, &buf, &buflen);
		if (buf == NULL)
		{
			// dump failed: buflen is not a valid size, so do not copy
			return;
		}
		memcpy(cs_packed, buf, buflen);
		free(buf);
	}
}
// Number of bytes needed to pack cs: four 32-bit header words, the packed
// select structure, the store table and the length_rems table.
cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs)
{
	const cmph_uint32 word_bytes = (cmph_uint32)sizeof(cmph_uint32);
	const cmph_uint32 store_words = (cs->total_length + 31) >> 5;
	const cmph_uint32 rems_words = BITS_TABLE_SIZE(cs->n, cs->rem_r);
	cmph_uint32 bytes = 4 * word_bytes;

	bytes += select_packed_size(&cs->sel);
	bytes += store_words * word_bytes;
	bytes += rems_words * word_bytes;
	return bytes;
}
// Return the value stored at position idx, reading directly from a packed
// compressed sequence (same computation as compressed_seq_query).
//
// Packed layout, in 32-bit words: n, rem_r, total_length, byte size of the
// packed select structure, then the select structure, the length_rems
// table and the store table.
cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx)
{
	// unpacking cs_packed
	cmph_uint32 * header = (cmph_uint32 *)cs_packed;
	cmph_uint32 n = header[0];
	cmph_uint32 rem_r = header[1];
	// header[2] is total_length, which is not needed for a query
	cmph_uint32 buflen_sel = header[3];
	cmph_uint32 * sel_packed = header + 4;
	cmph_uint32 * length_rems = sel_packed + (buflen_sel >> 2);
	cmph_uint32 rems_words = BITS_TABLE_SIZE(n, rem_r);
	cmph_uint32 * store_table = length_rems + rems_words;

	// compressed sequence query computation
	cmph_uint32 low_mask = (1U << rem_r) - 1U;
	cmph_uint32 low_bits;
	cmph_uint32 sel_pos;
	cmph_uint32 first_bit = 0;	// where the encoded value starts
	cmph_uint32 last_bit;		// cumulative length up to and including idx
	cmph_uint32 nbits;
	cmph_uint32 payload;

	if(idx != 0)
	{
		// the value starts where its predecessor ends
		sel_pos = select_query_packed(sel_packed, idx - 1);
		low_bits = get_bits_value(length_rems, idx-1, rem_r, low_mask);
		first_bit = ((sel_pos - (idx - 1)) << rem_r) + low_bits;
		sel_pos = select_next_query_packed(sel_packed, sel_pos);
	}
	else
	{
		sel_pos = select_query_packed(sel_packed, idx);
	}

	low_bits = get_bits_value(length_rems, idx, rem_r, low_mask);
	last_bit = ((sel_pos - idx) << rem_r) + low_bits;
	nbits = last_bit - first_bit;

	if(nbits == 0)
	{
		return 0;
	}
	payload = get_bits_at_pos(store_table, first_bit, nbits);
	return payload + ((1U << nbits) - 1U);
}
cmph-2.0.2/scpscript 0000755 0001750 0001750 00000002066 13411542035 013746 0 ustar joseph joseph #!/bin/bash
# Parse the command line: "-u <value>" selects what to upload.
# Any other option is ignored here; an empty selection is rejected later.
upload_opt=""
while getopts "u:" opt
do
    if [ "$opt" = "u" ]
    then
        upload_opt="$OPTARG"
    fi
done
# Print the accepted values of -u and exit with status 1.
usage() {
    echo "Usage: $0 -u "
    echo "-u What to upload. It can be either of the following options:"
    echo " * html"
    echo " * examples"
    echo " * papers"
    echo " * all"
    exit 1
}
# Reject a missing or unknown selection (usage exits the script).
case "$upload_opt" in
    html|examples|papers|all)
        ;;
    *)
        usage
        ;;
esac

# Upload the HTML pages.
case "$upload_opt" in
    html|all)
        scp -r *.html fc_botelho@web.sourceforge.net:/home/project-web/cmph/htdocs/
        ;;
esac

# Upload the example sources and the sample key file.
case "$upload_opt" in
    examples|all)
        scp -r examples/*.c examples/keys.txt fc_botelho@web.sourceforge.net:/home/project-web/cmph/htdocs/examples/
        ;;
esac

# Upload the papers.
case "$upload_opt" in
    papers|all)
        scp -r papers/*.pdf fc_botelho@web.sourceforge.net:/home/project-web/cmph/htdocs/papers/
        ;;
esac
cmph-2.0.2/tests/ 0000755 0001750 0001750 00000000000 13411542035 013144 5 ustar joseph joseph cmph-2.0.2/tests/packed_mphf_tests.c 0000644 0001750 0001750 00000011614 13411542035 016776 0 ustar joseph joseph #ifdef WIN32
#include "../wingetopt.h"
#else
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
//#include "hash.h"
#ifdef WIN32
#define VERSION "0.8"
#else
#include "config.h"
#endif
/* Print the one-line synopsis of the program `prg` on stderr. */
void usage(const char *prg)
{
	FILE *out = stderr;

	fprintf(out, "usage: %s [-v] [-h] [-V] [-t keys_per_bin] [-k nkeys] [-m file.mph] keysfile\n", prg);
}
/* Print the synopsis of the program `prg` followed by a description of
 * every option on stderr. */
void usage_long(const char *prg)
{
	/* Fixed help text; none of these lines contains a conversion. */
	static const char *const help_lines[] = {
		"Packed MPHFs testing tool\n\n",
		"  -h\t print this help message\n",
		"  -V\t print version number and exit\n",
		"  -v\t increase verbosity (may be used multiple times)\n",
		"  -t\t set the number of keys per bin for a t-perfect hashing function.\n",
		"    \t A t-perfect hashing function allows at most t collisions in a given bin.\n",
		"  -k\t number of keys\n",
		"  -m\t minimum perfect hash function file \n",
		"  keysfile\t line separated file with keys\n"
	};
	size_t line;

	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-t keys_per_bin] [-k nkeys] [-m file.mph] keysfile\n", prg);
	for (line = 0; line < sizeof(help_lines) / sizeof(help_lines[0]); ++line)
	{
		fputs(help_lines[line], stderr);
	}
}
/* Packed MPHF test driver.
 *
 * Loads a minimal perfect hash function from the .mph file (given with -m
 * or derived from the keys file name), packs it into a contiguous buffer
 * and evaluates every key of the keys file through the packed function.
 * A key is reported when its hash is out of range or when more than
 * keys_per_bin (-t, default 1) keys land in the same bin.
 *
 * Returns 0 when every key checks out, 1 on a usage error or a failed
 * check, and -1 when a file cannot be opened/parsed or memory runs out. */
int main(int argc, char **argv)
{
	char verbosity = 0;
	char *mphf_file = NULL;
	const char *keys_file = NULL;
	FILE *mphf_fd = NULL;
	FILE *keys_fd = NULL;
	cmph_uint32 nkeys = UINT_MAX;
	cmph_uint32 i = 0;
	cmph_t *mphf = NULL;
	cmph_io_adapter_t *source = NULL;
	cmph_uint32 keys_per_bin = 1;
	cmph_uint8 *hashtable = NULL;
	cmph_uint8 *packed_mphf = NULL;
	cmph_uint32 siz = 0;
	cmph_uint32 packed_size = 0;
	int ret = 0;
#ifdef CMPH_TIMING
	double evaluation_time_begin = 0.0;
	double evaluation_time = 0.0;
#endif

	while (1)
	{
		/* getopt() returns an int; storing it in a char would make the
		 * comparison with -1 fail where char is unsigned. */
		int ch = getopt(argc, argv, "hVvt:k:m:");
		if (ch == -1) break;
		switch (ch)
		{
			case 'k':
				{
					char *endptr;
					nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid number of keys %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'm':
				free(mphf_file); /* -m may be given more than once */
				mphf_file = strdup(optarg);
				if (mphf_file == NULL)
				{
					fprintf(stderr, "Out of memory\n");
					return -1;
				}
				break;
			case 'v':
				++verbosity;
				break;
			case 't':
				{
					char *cptr;
					keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter t was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'V':
				printf("%s\n", VERSION);
				free(mphf_file);
				return 0;
			case 'h':
				usage_long(argv[0]);
				free(mphf_file);
				return 0;
			default:
				usage(argv[0]);
				free(mphf_file);
				return 1;
		}
	}

	if (optind != argc - 1)
	{
		usage(argv[0]);
		free(mphf_file);
		return 1;
	}
	keys_file = argv[optind];

	/* Default function file: "<keysfile>.mph". */
	if (mphf_file == NULL)
	{
		mphf_file = (char *)malloc(strlen(keys_file) + 5);
		if (mphf_file == NULL)
		{
			fprintf(stderr, "Out of memory\n");
			return -1;
		}
		memcpy(mphf_file, keys_file, strlen(keys_file));
		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
	}

	keys_fd = fopen(keys_file, "r");
	if (keys_fd == NULL)
	{
		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
		ret = -1;
		goto cleanup;
	}

	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
	if (source == NULL)
	{
		fprintf(stderr, "Unable to read keys from %s\n", keys_file);
		ret = -1;
		goto cleanup;
	}

	mphf_fd = fopen(mphf_file, "rb");
	if (mphf_fd == NULL)
	{
		fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
		ret = -1;
		goto cleanup;
	}
	mphf = cmph_load(mphf_fd);
	fclose(mphf_fd);
	if (!mphf)
	{
		fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
		ret = -1;
		goto cleanup;
	}

	/* One counter per bin; calloc already zeroes the table. */
	siz = cmph_size(mphf);
	hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
	if (hashtable == NULL && siz > 0)
	{
		fprintf(stderr, "Out of memory\n");
		ret = -1;
		goto cleanup;
	}

	// packing the function
	/* Determine how much space is needed to pack the mphf. */
	packed_size = cmph_packed_size(mphf);
	fprintf(stderr, "packed_size = %u\n", packed_size);

	/* Make sure that we have enough space to pack the mphf. */
	packed_mphf = (cmph_uint8 *)calloc((size_t)packed_size,(size_t)1);
	if (packed_mphf == NULL && packed_size > 0)
	{
		fprintf(stderr, "Out of memory\n");
		ret = -1;
		goto cleanup;
	}

	/* Pack the mphf. */
	cmph_pack(mphf, packed_mphf);

	// testing the packed function
	//check all keys
#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&evaluation_time_begin);
#endif

	for (i = 0; i < source->nkeys; ++i)
	{
		cmph_uint32 h;
		char *buf;
		cmph_uint32 buflen = 0;
		source->read(source->data, &buf, &buflen);
		h = cmph_search_packed(packed_mphf, buf, buflen);

		if (!(h < siz))
		{
			/* the '*' width takes an int argument */
			fprintf(stderr, "Unknown key %*s in the input.\n", (int)buflen, buf);
			ret = 1;
		} else if(hashtable[h] >= keys_per_bin)
		{
			fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
			fprintf(stderr, "Duplicated or unknown key %*s in the input\n", (int)buflen, buf);
			ret = 1;
		} else hashtable[h]++;

		if (verbosity)
		{
			printf("%s -> %u\n", buf, h);
		}
		source->dispose(source->data, buf, buflen);
	}
#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&evaluation_time);
	evaluation_time = evaluation_time - evaluation_time_begin;
	fprintf(stdout, "%u\t%.2f\n", source->nkeys, evaluation_time);
#endif

cleanup:
	/* Single exit path so no error return leaks a file or allocation. */
	free(packed_mphf);
	if (mphf) cmph_destroy(mphf);
	free(hashtable);
	if (keys_fd) fclose(keys_fd);
	free(mphf_file);
	if (source) cmph_io_nlfile_adapter_destroy(source);
	return ret;
}
cmph-2.0.2/tests/compressed_seq_tests.c 0000644 0001750 0001750 00000003264 13411542035 017553 0 ustar joseph joseph #include "../src/compressed_seq.h"
#define DEBUG
#include "../src/debug.h"
#include
/* Query position idx of the compressed sequence and print the result on stderr. */
static inline void print_values(compressed_seq_t * cs, cmph_uint32 idx)
{
	cmph_uint32 value = compressed_seq_query(cs, idx);

	fprintf(stderr, "Index[%u]\t= %u\n", idx, value);
}
/* Query position idx of a packed compressed sequence and print the result on stderr. */
static inline void print_values_packed(char * cs_packed, cmph_uint32 idx)
{
	cmph_uint32 value = compressed_seq_query_packed(cs_packed, idx);

	fprintf(stderr, "Index[%u]\t= %u\n", idx, value);
}
/* Exercise the compressed sequence: generate it from a fixed vector, query
 * every position, then repeat the queries after a dump/load round trip and
 * once more on the packed form. All output goes to stderr. */
int main(int argc, char **argv)
{
	cmph_uint32 keys_vec[] = { 0,  1,  1,  1,  2,  2,  2,  3,  5,  5,
	                           6,  6,  9,  9,  9, 12, 12, 13, 17, 1077};
	cmph_uint32 n = 20;
	compressed_seq_t cs;
	char *dumped = NULL;
	cmph_uint32 dumped_len = 0;
	char *packed = NULL;
	cmph_uint32 packed_len = 0;
	cmph_uint32 k;

	/* build and query the in-memory structure */
	compressed_seq_init(&cs);
	compressed_seq_generate(&cs, keys_vec, n);
	fprintf(stderr, "Space usage = %u\n", compressed_seq_get_space_usage(&cs));
	k = 0;
	while(k < n)
	{
		print_values(&cs, k);
		k++;
	}

	/* dump, destroy, load again and query */
	fprintf(stderr, "Dumping compressed seq structure\n");
	compressed_seq_dump(&cs, &dumped, &dumped_len);
	compressed_seq_destroy(&cs);

	fprintf(stderr, "Loading compressed seq structure\n");
	compressed_seq_load(&cs, dumped, dumped_len);
	k = 0;
	while(k < n)
	{
		print_values(&cs, k);
		k++;
	}
	free(dumped);

	/* pack, destroy the original and query the packed copy */
	packed_len = compressed_seq_packed_size(&cs);
	packed = (char *) calloc(packed_len, sizeof(char));
	compressed_seq_pack(&cs, packed);
	compressed_seq_destroy(&cs);

	fprintf(stderr, "Querying the packed compressed seq structure\n");
	k = 0;
	while(k < n)
	{
		print_values_packed(packed, k);
		k++;
	}

	free(packed);
	return 0;
}
cmph-2.0.2/tests/select_tests.c 0000644 0001750 0001750 00000004655 13411542035 016023 0 ustar joseph joseph #include "../src/select.h"
#define DEBUG
#include "../src/debug.h"
#include
/* Print, on stderr, the result of select_query for ranks 0..3 (with the
 * rank subtracted) and, after each of the first three, the position
 * returned by select_next_query. */
static inline void print_values(select_t * sel)
{
	cmph_uint32 pos;

	pos = select_query(sel, 0);
	fprintf(stderr, "Index[0]\t= %u\n", pos - 0);
	pos = select_next_query(sel, pos);
	fprintf(stderr, "Next Index\t= %u\n", pos);

	pos = select_query(sel, 1);
	fprintf(stderr, "Index[1]\t= %u\n", pos - 1);
	pos = select_next_query(sel, pos);
	fprintf(stderr, "Next Index\t= %u\n", pos);

	pos = select_query(sel, 2);
	fprintf(stderr, "Index[2]\t= %u\n", pos - 2);
	pos = select_next_query(sel, pos);
	fprintf(stderr, "Next Index\t= %u\n", pos);

	pos = select_query(sel, 3);
	fprintf(stderr, "Index[3]\t= %u\n", pos - 3);
}
/* Same report as print_values, but computed on a packed select structure. */
static inline void print_values_packed(char * sel_packed)
{
	cmph_uint32 pos;

	pos = select_query_packed(sel_packed, 0);
	fprintf(stderr, "Index[0]\t= %u\n", pos - 0);
	pos = select_next_query_packed(sel_packed, pos);
	fprintf(stderr, "Next Index\t= %u\n", pos);

	pos = select_query_packed(sel_packed, 1);
	fprintf(stderr, "Index[1]\t= %u\n", pos - 1);
	pos = select_next_query_packed(sel_packed, pos);
	fprintf(stderr, "Next Index\t= %u\n", pos);

	pos = select_query_packed(sel_packed, 2);
	fprintf(stderr, "Index[2]\t= %u\n", pos - 2);
	pos = select_next_query_packed(sel_packed, pos);
	fprintf(stderr, "Next Index\t= %u\n", pos);

	pos = select_query_packed(sel_packed, 3);
	fprintf(stderr, "Index[3]\t= %u\n", pos - 3);
}
/* Exercise the select structure: generate it from {0,1,2,3}, print the
 * query results, then print them again after a dump/load round trip and
 * once more from the packed form. All output goes to stderr. */
int main(int argc, char **argv)
{
	cmph_uint32 keys_vec[4] = {0,1,2,3};
	cmph_uint32 n = 4;
	cmph_uint32 m = keys_vec[3];
	select_t sel;
	char *dumped = NULL;
	cmph_uint32 dumped_len = 0;
	char *packed = NULL;
	cmph_uint32 packed_len = 0;

	/* build and query the in-memory structure */
	select_init(&sel);
	select_generate(&sel, keys_vec, n, m);
	fprintf(stderr, "Space usage = %u\n", select_get_space_usage(&sel));
	print_values(&sel);

	/* dump, destroy, load again and query */
	fprintf(stderr, "Dumping select structure\n");
	select_dump(&sel, &dumped, &dumped_len);
	select_destroy(&sel);

	fprintf(stderr, "Loading select structure\n");
	select_load(&sel, dumped, dumped_len);
	print_values(&sel);
	free(dumped);

	/* pack, destroy the original and query the packed copy */
	packed_len = select_packed_size(&sel);
	packed = (char *) calloc(packed_len, sizeof(char));
	select_pack(&sel, packed);
	select_destroy(&sel);

	fprintf(stderr, "Querying the packed select structure\n");
	print_values_packed(packed);

	free(packed);
	return 0;
}
cmph-2.0.2/tests/compressed_rank_tests.c 0000644 0001750 0001750 00000003614 13411542035 017715 0 ustar joseph joseph #include "../src/compressed_rank.h"
#define DEBUG
#include "../src/debug.h"
#include
/* Query the compressed rank structure at idx and print the result on stderr. */
static inline void print_values(compressed_rank_t * cr, cmph_uint32 idx)
{
	cmph_uint32 rank = compressed_rank_query(cr, idx);

	fprintf(stderr, "Index[%u]\t= %u\n", idx, rank);
}
/* Query a packed compressed rank structure at idx and print the result on stderr. */
static inline void print_values_packed(char * cr_packed, cmph_uint32 idx)
{
	cmph_uint32 rank = compressed_rank_query_packed(cr_packed, idx);

	fprintf(stderr, "Index[%u]\t= %u\n", idx, rank);
}
/*
n = 20
Indices: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
vector[] = {0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1}
nzeros = 12
zeroIndices[] = {0, 1, 2, 5, 7, 9, 11, 12, 13, 16, 17, 18}
*/
/* Exercise the compressed rank structure: generate it from the 12 zero
 * positions of a 20-entry bit vector, query all 20 indices, then repeat
 * the queries after a dump/load round trip and once more on the packed
 * form. All output goes to stderr. */
int main(int argc, char **argv)
{
	cmph_uint32 keys_vec[] = {0, 1, 2, 5, 7, 9, 11, 12, 13, 16, 17, 18};
	cmph_uint32 n = 12;
	cmph_uint32 nIndices = 20;
	compressed_rank_t cr;
	char *dumped = NULL;
	cmph_uint32 dumped_len = 0;
	char *packed = NULL;
	cmph_uint32 packed_len = 0;
	cmph_uint32 k;

	/* build and query the in-memory structure */
	compressed_rank_init(&cr);
	compressed_rank_generate(&cr, keys_vec, n);
	fprintf(stderr, "Space usage = %u\n", compressed_rank_get_space_usage(&cr));
	k = 0;
	while(k < nIndices)
	{
		print_values(&cr, k);
		k++;
	}

	/* dump, destroy, load again and query */
	fprintf(stderr, "Dumping compressed rank structure\n");
	compressed_rank_dump(&cr, &dumped, &dumped_len);
	compressed_rank_destroy(&cr);

	fprintf(stderr, "Loading compressed rank structure\n");
	compressed_rank_load(&cr, dumped, dumped_len);
	k = 0;
	while(k < nIndices)
	{
		print_values(&cr, k);
		k++;
	}
	free(dumped);

	/* pack, destroy the original and query the packed copy */
	packed_len = compressed_rank_packed_size(&cr);
	packed = (char *) calloc(packed_len, sizeof(char));
	compressed_rank_pack(&cr, packed);
	compressed_rank_destroy(&cr);

	fprintf(stderr, "Querying the packed compressed rank structure\n");
	k = 0;
	while(k < nIndices)
	{
		print_values_packed(packed, k);
		k++;
	}

	free(packed);
	return 0;
}
cmph-2.0.2/tests/Makefile.am 0000644 0001750 0001750 00000001473 13411542035 015205 0 ustar joseph joseph TESTS = $(check_PROGRAMS)
# Programs built by "make check".
check_PROGRAMS = graph_tests select_tests compressed_seq_tests compressed_rank_tests cmph_benchmark_test
# Built by a normal "make" but never installed; not part of check_PROGRAMS.
noinst_PROGRAMS = packed_mphf_tests mphf_tests
# Let the tests find the library headers in ../src.
AM_CPPFLAGS = -I../src/
# Each program below is built from a single source file and linked
# against the libtool library ../src/libcmph.la.
graph_tests_SOURCES = graph_tests.c
graph_tests_LDADD = ../src/libcmph.la
packed_mphf_tests_SOURCES = packed_mphf_tests.c
packed_mphf_tests_LDADD = ../src/libcmph.la
mphf_tests_SOURCES = mphf_tests.c
mphf_tests_LDADD = ../src/libcmph.la
select_tests_SOURCES = select_tests.c
select_tests_LDADD = ../src/libcmph.la
compressed_seq_tests_SOURCES = compressed_seq_tests.c
compressed_seq_tests_LDADD = ../src/libcmph.la
compressed_rank_tests_SOURCES = compressed_rank_tests.c
compressed_rank_tests_LDADD = ../src/libcmph.la
cmph_benchmark_test_SOURCES = cmph_benchmark_test.c
cmph_benchmark_test_LDADD = ../src/libcmph.la
cmph-2.0.2/tests/cmph_benchmark_test.c 0000644 0001750 0001750 00000000555 13411542035 017315 0 ustar joseph joseph #include // for sleep
#include
#include "cmph_benchmark.h"
/* Benchmark body that sleeps for one second; `iters` is not used. */
void bm_sleep(int iters)
{
  sleep(1);
}
/* Benchmark body that keeps the CPU busy by summing 0 .. INT_MAX-1;
 * `iters` is not used.
 *
 * The accumulator is unsigned because the sum exceeds INT_MAX and signed
 * overflow is undefined behaviour; unsigned arithmetic wraps instead.  It
 * is volatile so the compiler cannot drop the otherwise unused loop. */
void bm_increment(int iters) {
  int i;
  volatile unsigned int v = 0;

  (void)iters;
  for (i = 0; i < INT_MAX; ++i) {
    v += (unsigned int)i;
  }
}
/* Register the two sample benchmarks (one iteration each) and run them. */
int main(int argc, char** argv)
{
  BM_REGISTER(bm_sleep, 1);
  BM_REGISTER(bm_increment, 1);

  run_benchmarks(argc, argv);

  return 0;
}
cmph-2.0.2/tests/mphf_tests.c 0000644 0001750 0001750 00000007014 13411542035 015466 0 ustar joseph joseph #ifdef WIN32
#include "../wingetopt.h"
#else
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef WIN32
#define VERSION "0.8"
#else
#include "config.h"
#endif
/* Print the one-line synopsis of the program `prg` on stderr. */
void usage(const char *prg)
{
	FILE *out = stderr;

	fprintf(out, "usage: %s [-v] [-h] [-V] [-k nkeys] [-m file.mph] keysfile\n", prg);
}
/* Print the synopsis of the program `prg` followed by a description of
 * every option on stderr. */
void usage_long(const char *prg)
{
	/* Fixed help text; none of these lines contains a conversion. */
	static const char *const help_lines[] = {
		"Packed MPHFs testing tool\n\n",
		"  -h\t print this help message\n",
		"  -V\t print version number and exit\n",
		"  -v\t increase verbosity (may be used multiple times)\n",
		"  -k\t number of keys\n",
		"  -m\t minimum perfect hash function file \n",
		"  keysfile\t line separated file with keys\n"
	};
	size_t line;

	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-m file.mph] keysfile\n", prg);
	for (line = 0; line < sizeof(help_lines) / sizeof(help_lines[0]); ++line)
	{
		fputs(help_lines[line], stderr);
	}
}
/* MPHF test driver.
 *
 * Loads a minimal perfect hash function from the .mph file (given with -m
 * or derived from the keys file name) and evaluates every key of the keys
 * file through it.  A key is reported when its hash is out of range or
 * when its bin was already taken by an earlier key.
 *
 * Returns 0 when every key checks out, 1 on a usage error or a failed
 * check, and -1 when a file cannot be opened/parsed or memory runs out. */
int main(int argc, char **argv)
{
	char verbosity = 0;
	char *mphf_file = NULL;
	const char *keys_file = NULL;
	FILE *mphf_fd = NULL;
	FILE *keys_fd = NULL;
	cmph_uint32 nkeys = UINT_MAX;
	cmph_uint32 i = 0;
	cmph_t *mphf = NULL;
	cmph_io_adapter_t *source = NULL;
	cmph_uint8 *hashtable = NULL;
	cmph_uint32 siz = 0;
	int ret = 0;

	while (1)
	{
		/* getopt() returns an int; storing it in a char would make the
		 * comparison with -1 fail where char is unsigned. */
		int ch = getopt(argc, argv, "hVvk:m:");
		if (ch == -1) break;
		switch (ch)
		{
			case 'k':
				{
					char *endptr;
					nkeys = (cmph_uint32) strtoul(optarg, &endptr, 10);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid number of keys %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'm':
				free(mphf_file); /* -m may be given more than once */
				mphf_file = strdup(optarg);
				if (mphf_file == NULL)
				{
					fprintf(stderr, "Out of memory\n");
					return -1;
				}
				break;
			case 'v':
				++verbosity;
				break;
			case 'V':
				printf("%s\n", VERSION);
				free(mphf_file);
				return 0;
			case 'h':
				usage_long(argv[0]);
				free(mphf_file);
				return 0;
			default:
				usage(argv[0]);
				free(mphf_file);
				return 1;
		}
	}

	if (optind != argc - 1)
	{
		usage(argv[0]);
		free(mphf_file);
		return 1;
	}
	keys_file = argv[optind];

	/* Default function file: "<keysfile>.mph". */
	if (mphf_file == NULL)
	{
		mphf_file = (char *)malloc(strlen(keys_file) + 5);
		if (mphf_file == NULL)
		{
			fprintf(stderr, "Out of memory\n");
			return -1;
		}
		memcpy(mphf_file, keys_file, strlen(keys_file));
		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
	}

	keys_fd = fopen(keys_file, "r");
	if (keys_fd == NULL)
	{
		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
		ret = -1;
		goto cleanup;
	}

	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
	if (source == NULL)
	{
		fprintf(stderr, "Unable to read keys from %s\n", keys_file);
		ret = -1;
		goto cleanup;
	}

	mphf_fd = fopen(mphf_file, "rb");
	if (mphf_fd == NULL)
	{
		fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
		ret = -1;
		goto cleanup;
	}
	mphf = cmph_load(mphf_fd);
	fclose(mphf_fd);
	if (!mphf)
	{
		fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
		ret = -1;
		goto cleanup;
	}

	/* One "already used" flag per bin, all initially clear. */
	siz = cmph_size(mphf);
	hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
	if (hashtable == NULL && siz > 0)
	{
		fprintf(stderr, "Out of memory\n");
		ret = -1;
		goto cleanup;
	}

	//check all keys
	for (i = 0; i < source->nkeys; ++i)
	{
		cmph_uint32 h;
		char *buf;
		cmph_uint32 buflen = 0;
		source->read(source->data, &buf, &buflen);
		h = cmph_search(mphf, buf, buflen);
		if (!(h < siz))
		{
			/* the '*' width takes an int argument */
			fprintf(stderr, "Unknown key %*s in the input.\n", (int)buflen, buf);
			ret = 1;
		} else if(hashtable[h])
		{
			fprintf(stderr, "Duplicated or unknown key %*s in the input\n", (int)buflen, buf);
			ret = 1;
		} else hashtable[h] = 1;

		if (verbosity)
		{
			printf("%s -> %u\n", buf, h);
		}
		source->dispose(source->data, buf, buflen);
	}

cleanup:
	/* Single exit path so no error return leaks a file or allocation. */
	if (mphf) cmph_destroy(mphf);
	free(hashtable);
	if (keys_fd) fclose(keys_fd);
	free(mphf_file);
	if (source) cmph_io_nlfile_adapter_destroy(source);
	return ret;
}
cmph-2.0.2/tests/graph_tests.c 0000644 0001750 0001750 00000002713 13411542035 015636 0 ustar joseph joseph #include "../src/graph.h"
#define DEBUG
#include "../src/debug.h"
/* Exercise the graph module.
 *
 * 1. Build a 5-vertex graph from a fixed edge pattern, print it, delete
 *    one edge, print, clear all edges, print.
 * 2. Build a graph containing the cycle 0-1-2-0 and require
 *    graph_is_cyclic() to report it.
 * 3. Build the path 0-1-2-3-4, require it to be acyclic, and check that
 *    the neighbor iterator of vertex 1 yields only 0 and 2 and then
 *    GRAPH_NO_NEIGHBOR.
 *
 * Returns 0 on success and 1 as soon as a check fails. */
int main(int argc, char **argv)
{
	graph_iterator_t it;
	cmph_uint32 i, neighbor;
	graph_t *g = graph_new(5, 10);

	fprintf(stderr, "Building random graph\n");
	for (i = 0; i < 10; ++i)
	{
		cmph_uint32 v1 = i % 5;
		cmph_uint32 v2 = (i*2) % 5;
		if (v1 != v2)
		{
			graph_add_edge(g, v1, v2);
			DEBUGP("Added edge %u %u\n", v1, v2);
		}
	}
	graph_print(g);
	graph_del_edge(g, 4, 3);
	graph_print(g);
	graph_clear_edges(g);
	graph_print(g);
	graph_destroy(g);

	fprintf(stderr, "Building cyclic graph\n");
	g = graph_new(4, 5);
	graph_add_edge(g, 0, 3);
	graph_add_edge(g, 0, 1);
	graph_add_edge(g, 1, 2);
	graph_add_edge(g, 2, 0);
	if (!graph_is_cyclic(g)) return 1;
	graph_destroy(g);

	fprintf(stderr, "Building non-cyclic graph\n");
	g = graph_new(5, 4);
	/* path 0-1-2-3-4 */
	for (i = 0; i < 4; ++i)
	{
		graph_add_edge(g, i, i + 1);
	}
	if (graph_is_cyclic(g)) return 1;

	fprintf(stderr, "Checking neighbors iterator\n");
	it = graph_neighbors_it(g, 1);

	neighbor = graph_next_neighbor(g, &it);
	DEBUGP("Neighbor is %u\n", neighbor);
	if (!(neighbor == 0 || neighbor == 2)) return 1;

	neighbor = graph_next_neighbor(g, &it);
	DEBUGP("Neighbor is %u\n", neighbor);
	if (!(neighbor == 0 || neighbor == 2)) return 1;

	neighbor = graph_next_neighbor(g, &it);
	DEBUGP("Neighbor is %u\n", neighbor);
	if (neighbor != GRAPH_NO_NEIGHBOR) return 1;

	graph_destroy(g);
	return 0;
}
cmph-2.0.2/FCH.t2t 0000644 0001750 0001750 00000005067 13411542035 013045 0 ustar joseph joseph FCH Algorithm
%!includeconf: CONFIG.t2t
----------------------------------------
==The Algorithm==
The algorithm is presented in [[1 #papers]].
----------------------------------------
==Memory Consumption==
Now we detail the memory consumption to generate and to store minimal perfect hash functions
using the FCH algorithm. The structures responsible for memory consumption are the
following:
- A vector containing all the //n// keys.
- Data structure to speed up the searching step:
+ **random_table**: is a vector used to remember currently empty slots in the hash table. It stores //n// 4 byte long integer numbers. This vector initially contains a random permutation of the //n// hash addresses. A pointer called filled_count is used to keep the invariant that any slots to the right side of filled_count (inclusive) are empty and any ones to the left are filled.
+ **hash_table**: Table used to check whether all the collisions were resolved. It has //n// entries of one byte.
+ **map_table**: For any unfilled slot //x// in hash_table, the map_table vector contains //n// 4 byte long pointers pointing at random_table such that random_table[map_table[x]] = x. Thus, given an empty slot x in the hash_table, we can locate its position in the random_table vector through map_table.
- Other auxiliary structures
+ **sorted_indexes**: is a vector of //cn/(log(n) + 1)// 4 byte long pointers to indirectly keep the buckets sorted by decreasing order of their sizes.
+ **function //g//**: is represented by a vector of //cn/(log(n) + 1)// 4 byte long integer numbers, one for each bucket. It is used to spread all the keys in a given bucket into the hash table without collisions.
Thus, the total memory consumption of FCH algorithm for generating a minimal
perfect hash function (MPHF) is: //O(n) + 9n + 8cn/(log(n) + 1)// bytes.
The value of parameter //c// must be greater than or equal to 2.6.
Now we present the memory consumption to store the resulting function.
We only need to store the //g// function and a constant number of bytes for the seed of the hash functions used in the resulting MPHF. Thus, we need //cn/(log(n) + 1) + O(1)// bytes.
----------------------------------------
==Papers==[papers]
+ E.A. Fox, Q.F. Chen, and L.S. Heath. [A faster algorithm for constructing minimal perfect hash functions. papers/fch92.pdf] In Proc. 15th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pages 266-273, 1992.
%!include: ALGORITHMS.t2t
%!include: FOOTER.t2t
%!include(html): ''GOOGLEANALYTICS.t2t'' cmph-2.0.2/gendocs 0000755 0001750 0001750 00000002252 13411542035 013353 0 ustar joseph joseph #!/bin/sh
# Regenerate the HTML pages and the plain-text documents from the txt2tags
# sources in this directory.

# Every page except README, which needs --mask-email and its own output name.
pages="CHD BDZ BMZ BRZ CHM FCH COMPARISON GPERF FAQ CONCEPTS NEWSLOG EXAMPLES"

# HTML: README becomes index.html, every other page becomes <lowercase>.html.
txt2tags -t html --mask-email -i README.t2t -o index.html
for page in $pages; do
    html_name=$(printf '%s' "$page" | tr 'A-Z' 'a-z')
    txt2tags -t html -i "$page.t2t" -o "$html_name.html"
done

# Plain text: each page is written to a file named after the page itself.
txt2tags -t txt --mask-email -i README.t2t -o README
for page in $pages; do
    if [ "$page" = EXAMPLES ]; then
        # txt2tags struggles with the lower cap examples directory
        txt2tags -t txt -i EXAMPLES.t2t -o EXAMPLES.txt
        mv -v EXAMPLES.txt EXAMPLES
    else
        txt2tags -t txt -i "$page.t2t" -o "$page"
    fi
done
cmph-2.0.2/tex/ 0000755 0001750 0001750 00000000000 13411542035 012602 5 ustar joseph joseph cmph-2.0.2/tex/chd/ 0000755 0001750 0001750 00000000000 13411542035 013340 5 ustar joseph joseph cmph-2.0.2/tex/chd/chd.bib 0000755 0001750 0001750 00000012626 13411542035 014566 0 ustar joseph joseph @inproceedings{bpz07,
author = {F.C. Botelho and R. Pagh and N. Ziviani},
title = {Simple and Space-Efficient Minimal Perfect Hash Functions},
booktitle = {Proceedings of the 10th Workshop on Algorithms and Data Structures (WADs'07)},
publisher = {Springer LNCS vol. 4619},
pages = {139-150},
month = aug,
location = {Halifax, Canada},
year = 2007,
key = {author}
}
@inproceedings{pb06,
author = {B. Prabhakar and F. Bonomi},
title = {Perfect Hashing for Network Applications},
booktitle = {Proceedings of the IEEE International Symposium
on Information Theory},
year = {2006},
location = {Seattle, Washington, USA},
publisher = {IEEE Press}
}
@inproceedings{dp08,
author = {Martin Dietzfelbinger and Rasmus Pagh},
title = {Succinct Data Structures for Retrieval and Approximate Membership},
booktitle = {Proceedings of the 35th international colloquium on Automata, Languages and Programming (ICALP'08)},
year = {2008},
isbn = {978-3-540-70574-1},
pages = {385--396},
location = {Reykjavik, Iceland},
doi = {http://dx.doi.org/10.1007/978-3-540-70575-8_32},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
}
@inproceedings{bbd09,
author = {D. Belazzougui and F.C. Botelho and M. Dietzfelbinger},
title = {Compress, Hash and Displace},
booktitle = {Proceedings of the 17th European Symposium on Algorithms (ESA'09)},
publisher = {Springer LNCS},
OPTpages = {139-150},
month = sep,
location = {Copenhagen, Denmark},
year = 2009,
key = {author}
}
@PhdThesis{b08,
author = {F. C. Botelho},
title = {Near-Optimal Space Perfect Hashing Algorithms},
school = {Federal University of Minas Gerais},
year = {2008},
OPTkey = {},
OPTtype = {},
OPTaddress = {},
month = {September},
note = {Supervised by Nivio Ziviani, \url{http://www.dcc.ufmg.br/pos/cursos/defesas/255D.PDF}},
OPTannote = {},
OPTurl = {http://www.dcc.ufmg.br/pos/cursos/defesas/255D.PDF},
OPTdoi = {},
OPTissn = {},
OPTlocalfile = {},
OPTabstract = {}
}
@Article{mwhc96,
author = {B.S. Majewski and N.C. Wormald and G. Havas and Z.J. Czech},
title = {A family of perfect hashing methods},
journal = {The Computer Journal},
year = {1996},
volume = {39},
number = {6},
pages = {547-554},
key = {author}
}
@inproceedings{ckrt04,
author = {B. Chazelle and J. Kilian and R. Rubinfeld and A. Tal},
title = {The Bloomier Filter: An Efficient Data Structure for Static Support Lookup Tables},
booktitle = {Proceedings of the 15th annual ACM-SIAM symposium on Discrete algorithms (SODA'04)},
year = {2004},
isbn = {0-89871-558-X},
pages = {30--39},
location = {New Orleans, Louisiana},
publisher = {Society for Industrial and Applied Mathematics},
address = {Philadelphia, PA, USA},
optpublisher = {Society for Industrial and Applied Mathematics}
}
@Article{j97,
author = {B. Jenkins},
title = {Algorithm Alley: Hash Functions},
journal = {Dr. Dobb's Journal of Software Tools},
volume = {22},
number = {9},
month = {september},
year = {1997},
note = {Extended version available at \url{http://burtleburtle.net/bob/hash/doobs.html}}
}
@Article{e87,
author = {J. Ebert},
title = {A Versatile Data Structure for Edges Oriented Graph Algorithms},
journal = {Communications of the ACM},
year = {1987},
OPTkey = {},
OPTvolume = {},
number = {30},
pages = {513-519},
OPTmonth = {},
OPTnote = {},
OPTannote = {}
}
@article {dict-jour,
AUTHOR = {R. Pagh},
TITLE = {Low Redundancy in Static Dictionaries with Constant Query Time},
OPTJOURNAL = sicomp,
JOURNAL = fsicomp,
VOLUME = {31},
YEAR = {2001},
NUMBER = {2},
PAGES = {353--363},
}
@inproceedings{sg06,
author = {K. Sadakane and R. Grossi},
title = {Squeezing succinct data structures into entropy bounds},
booktitle = {Proceedings of the 17th annual ACM-SIAM symposium on Discrete algorithms (SODA'06)},
year = {2006},
pages = {1230--1239}
}
@inproceedings{gn06,
author = {R. Gonzalez and
G. Navarro},
title = {Statistical Encoding of Succinct Data Structures},
booktitle = {Proceedings of the 19th Annual Symposium on Combinatorial Pattern Matching (CPM'06)},
year = {2006},
pages = {294--305}
}
@inproceedings{fn07,
author = {K. Fredriksson and
F. Nikitin},
title = {Simple Compression Code Supporting Random Access and Fast
String Matching},
booktitle = {Proceedings of the 6th International Workshop on Efficient and Experimental Algorithms (WEA'07)},
year = {2007},
pages = {203--216}
}
@inproceedings{os07,
author = {D. Okanohara and K. Sadakane},
title = {Practical Entropy-Compressed Rank/Select Dictionary},
booktitle = {Proceedings of the Workshop on Algorithm Engineering and
Experiments (ALENEX'07)},
year = {2007},
location = {New Orleans, Louisiana, USA}
}
@inproceedings{rrr02,
author = {R. Raman and V. Raman and S. S. Rao},
title = {Succinct indexable dictionaries with applications to encoding k-ary trees and multisets},
booktitle = {Proceedings of the thirteenth annual ACM-SIAM symposium on Discrete algorithms (SODA'02)},
year = {2002},
isbn = {0-89871-513-X},
pages = {233--242},
location = {San Francisco, California},
publisher = {Society for Industrial and Applied Mathematics},
address = {Philadelphia, PA, USA},
}
cmph-2.0.2/tex/chd/makefile 0000755 0001750 0001750 00000000335 13411542035 015044 0 ustar joseph joseph all:
latex chd.tex
bibtex chd
latex chd.tex
latex chd.tex
dvips chd.dvi -o chd.ps
# None of these targets produces a file of its own name; keep make from
# being confused by a stray file called "clean", "run", "html" or "all".
.PHONY: all run html clean

# Rebuild from scratch and open the PostScript output in gv.
run: clean all
	gv chd.ps &

# Rebuild from scratch and convert the document to HTML.
html: clean all
	latex2html chd.tex

# Remove every generated artifact.  -f keeps this from failing when some
# (or all) of the files do not exist, e.g. on a fresh checkout, which would
# otherwise abort "run" and "html" because they depend on "clean".
clean:
	rm -f chd.dvi chd.ps *.lot *.lof *.aux *.bbl *.blg *.log *.toc
cmph-2.0.2/tex/chd/chd.tex 0000755 0001750 0001750 00000002627 13411542035 014632 0 ustar joseph joseph \documentclass[12pt]{article}
\usepackage{graphicx}
\usepackage{latexsym}
\usepackage{url}
\usepackage{a4wide}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{fancyhdr}
\usepackage{graphics}
\usepackage{multicol}
\usepackage{epsfig}
\usepackage{textcomp}
\usepackage{url}
% \usepackage{subfigure}
% \usepackage{subfig}
% \usepackage{wrapfig}
\bibliographystyle{plain}
% \bibliographystyle{sbc}
% \bibliographystyle{abnt-alf}
% \bibliographystyle{abnt-num}
\begin{document}
\sloppy
% \renewcommand{\baselinestretch}{1.24}\normalsize % set the space between lines to 1.24
% set headings
% \pagestyle{fancy}
% \lhead[\fancyplain{}{\footnotesize\thepage}]
% {\fancyplain{}{\footnotesize\rightmark}}
% \rhead[\fancyplain{}{\footnotesize\leftmark}]
% {\fancyplain{}{\footnotesize\thepage}}
%
% \cfoot{}
\lstset{
language=C,
basicstyle=\fontsize{8}{8}\selectfont,
captionpos=t,
aboveskip=0mm,
belowskip=0mm,
abovecaptionskip=0.5mm,
belowcaptionskip=0.5mm,
% numbers = left,
mathescape=true,
escapechar=@,
extendedchars=true,
showstringspaces=false,
% columns=fixed,
basewidth=0.515em,
frame=single,
framesep=1mm,
xleftmargin=1mm,
xrightmargin=1mm,
framerule=0pt
}
\include{introduction} % Introducao
\bibliography{chd}
\end{document}
cmph-2.0.2/tex/chd/introduction.tex 0000755 0001750 0001750 00000006611 13411542035 016612 0 ustar joseph joseph \section{Introduction} \label{sec:introduction}
The important performance parameters of a PHF are representation size, evaluation time and construction time. The representation size plays an important role when the whole function fits in a faster memory and the actual data is stored in a slower memory. For instance, compact PHFs can fit entirely in a CPU cache and this makes their computation really fast by avoiding cache misses. The CHD algorithm plays an important role in this context. It was designed by Djamal Belazzougui, Fabiano C. Botelho, and Martin Dietzfelbinger in \cite{bbd09}.
The CHD algorithm makes it possible to obtain PHFs with representation size very close to optimal while retaining $O(n)$ construction time and $O(1)$ evaluation time. For example, in the case $m=2n$ we obtain a PHF that uses space $0.67$ bits per key, and for $m=1.23n$ we obtain space $1.4$ bits per key, which was not achievable with previously known methods. The CHD algorithm is inspired by several known algorithms;
the main new feature is that it combines a modification of Pagh's ``hash-and-displace'' approach
with data compression on a sequence of hash function indices.
That combination makes it possible to significantly reduce space usage
while retaining linear construction time and constant query time.
The CHD algorithm can also be used for $k$-perfect hashing,
where at most $k$ keys may be mapped to the same value.
For the analysis we assume that fully random hash functions are given for free;
such assumptions can be justified and were made in previous papers.
The compact PHFs generated by the CHD algorithm can be used in many applications in which we want to assign a unique identifier to each key without storing any information on the key. One of the most obvious applications of those functions
(or $k$-perfect hash functions) is when we have a small fast memory in which we can store the perfect hash function while the keys and associated satellite data are stored in slower but larger memory.
The size of a block or a transfer unit may be chosen so that $k$ data items can be retrieved in
one read access. In this case we can ensure that data associated with a key can be retrieved in a single probe to slower memory. This has been used for example in hardware routers~\cite{pb06}.
% Perfect hashing has also been found to be competitive with traditional hashing in internal memory~\cite{blmz08} on standard computers. Recently perfect hashing has been used to accelerate algorithms on graphs~\cite{ESS08} when the graph representation does not fit in main memory.
The CHD algorithm generates the most compact PHFs and MPHFs we know of in~$O(n)$ time.
The time required to evaluate the generated functions is constant (in practice less than $1.4$ microseconds).
The storage space of the resulting PHFs and MPHFs is distant from the information
theoretic lower bound by a factor of $1.43$.
The closest competitor is the algorithm by Dietzfelbinger and Pagh \cite{dp08} but
their algorithm does not work in linear time.
Furthermore, the CHD algorithm
can be tuned to run faster than the BPZ algorithm \cite{bpz07} (the fastest algorithm
available in the literature so far) and to obtain more compact functions.
The most impressive characteristic is that it has the ability, in principle, to
approximate the information theoretic lower bound while being practical.
A detailed description of the CHD algorithm can be found in \cite{bbd09}.
cmph-2.0.2/tex/bdz/ 0000755 0001750 0001750 00000000000 13411542035 013361 5 ustar joseph joseph cmph-2.0.2/tex/bdz/bdz.bib 0000755 0001750 0001750 00000010425 13411542035 014623 0 ustar joseph joseph @inproceedings{bpz07,
author = {F.C. Botelho and R. Pagh and N. Ziviani},
title = {Simple and Space-Efficient Minimal Perfect Hash Functions},
booktitle = {Proceedings of the 10th Workshop on Algorithms and Data Structures (WADs'07)},
publisher = {Springer LNCS vol. 4619},
pages = {139-150},
month = aug,
location = {Halifax, Canada},
year = 2007,
key = {author}
}
@PhdThesis{b08,
author = {F. C. Botelho},
title = {Near-Optimal Space Perfect Hashing Algorithms},
school = {Federal University of Minas Gerais},
year = {2008},
OPTkey = {},
OPTtype = {},
OPTaddress = {},
month = {September},
note = {Supervised by Nivio Ziviani, \url{http://www.dcc.ufmg.br/pos/cursos/defesas/255D.PDF}},
OPTannote = {},
OPTurl = {http://www.dcc.ufmg.br/pos/cursos/defesas/255D.PDF},
OPTdoi = {},
OPTissn = {},
OPTlocalfile = {},
OPTabstract = {}
}
@Article{mwhc96,
author = {B.S. Majewski and N.C. Wormald and G. Havas and Z.J. Czech},
title = {A family of perfect hashing methods},
journal = {The Computer Journal},
year = {1996},
volume = {39},
number = {6},
pages = {547-554},
key = {author}
}
@inproceedings{ckrt04,
author = {B. Chazelle and J. Kilian and R. Rubinfeld and A. Tal},
title = {The Bloomier Filter: An Efficient Data Structure for Static Support Lookup Tables},
booktitle = {Proceedings of the 15th annual ACM-SIAM symposium on Discrete algorithms (SODA'04)},
year = {2004},
isbn = {0-89871-558-X},
pages = {30--39},
location = {New Orleans, Louisiana},
publisher = {Society for Industrial and Applied Mathematics},
address = {Philadelphia, PA, USA},
optpublisher = {Society for Industrial and Applied Mathematics}
}
@Article{j97,
author = {B. Jenkins},
title = {Algorithm Alley: Hash Functions},
journal = {Dr. Dobb's Journal of Software Tools},
volume = {22},
number = {9},
month = {september},
year = {1997},
note = {Extended version available at \url{http://burtleburtle.net/bob/hash/doobs.html}}
}
@Article{e87,
author = {J. Ebert},
title = {A Versatile Data Structure for Edges Oriented Graph Algorithms},
journal = {Communications of the ACM},
year = {1987},
OPTkey = {},
OPTvolume = {},
number = {30},
pages = {513-519},
OPTmonth = {},
OPTnote = {},
OPTannote = {}
}
@article {dict-jour,
AUTHOR = {R. Pagh},
TITLE = {Low Redundancy in Static Dictionaries with Constant Query Time},
OPTJOURNAL = sicomp,
JOURNAL = fsicomp,
VOLUME = {31},
YEAR = {2001},
NUMBER = {2},
PAGES = {353--363},
}
@inproceedings{sg06,
author = {K. Sadakane and R. Grossi},
title = {Squeezing succinct data structures into entropy bounds},
booktitle = {Proceedings of the 17th annual ACM-SIAM symposium on Discrete algorithms (SODA'06)},
year = {2006},
pages = {1230--1239}
}
@inproceedings{gn06,
author = {R. Gonzalez and
G. Navarro},
title = {Statistical Encoding of Succinct Data Structures},
booktitle = {Proceedings of the 19th Annual Symposium on Combinatorial Pattern Matching (CPM'06)},
year = {2006},
pages = {294--305}
}
@inproceedings{fn07,
author = {K. Fredriksson and
F. Nikitin},
title = {Simple Compression Code Supporting Random Access and Fast
String Matching},
booktitle = {Proceedings of the 6th International Workshop on Efficient and Experimental Algorithms (WEA'07)},
year = {2007},
pages = {203--216}
}
@inproceedings{os07,
author = {D. Okanohara and K. Sadakane},
title = {Practical Entropy-Compressed Rank/Select Dictionary},
booktitle = {Proceedings of the Workshop on Algorithm Engineering and
Experiments (ALENEX'07)},
year = {2007},
location = {New Orleans, Louisiana, USA}
}
@inproceedings{rrr02,
author = {R. Raman and V. Raman and S. S. Rao},
title = {Succinct indexable dictionaries with applications to encoding k-ary trees and multisets},
booktitle = {Proceedings of the thirteenth annual ACM-SIAM symposium on Discrete algorithms (SODA'02)},
year = {2002},
isbn = {0-89871-513-X},
pages = {233--242},
location = {San Francisco, California},
publisher = {Society for Industrial and Applied Mathematics},
address = {Philadelphia, PA, USA},
}
cmph-2.0.2/tex/bdz/figs/ 0000755 0001750 0001750 00000000000 13411542035 014311 5 ustar joseph joseph cmph-2.0.2/tex/bdz/figs/overviewinternal3g.eps 0000644 0001750 0001750 00000052477 13411542035 020676 0 ustar joseph joseph %!PS-Adobe-2.0 EPSF-2.0
%%Title: overviewinternal3g.fig
%%Creator: fig2dev Version 3.2 Patchlevel 5
%%CreationDate: Fri May 29 11:09:04 2009
%%For: fbotelho@fbotelho-laptop (Fabiano C. Botelho,,,)
%%BoundingBox: 0 0 342 128
%Magnification: 1.0000
%%EndComments
%%BeginProlog
/MyAppDict 100 dict dup begin def
/$F2psDict 200 dict def
$F2psDict begin
$F2psDict /mtrx matrix put
/col-1 {0 setgray} bind def
/col0 {0.000 0.000 0.000 srgb} bind def
/col1 {0.000 0.000 1.000 srgb} bind def
/col2 {0.000 1.000 0.000 srgb} bind def
/col3 {0.000 1.000 1.000 srgb} bind def
/col4 {1.000 0.000 0.000 srgb} bind def
/col5 {1.000 0.000 1.000 srgb} bind def
/col6 {1.000 1.000 0.000 srgb} bind def
/col7 {1.000 1.000 1.000 srgb} bind def
/col8 {0.000 0.000 0.560 srgb} bind def
/col9 {0.000 0.000 0.690 srgb} bind def
/col10 {0.000 0.000 0.820 srgb} bind def
/col11 {0.530 0.810 1.000 srgb} bind def
/col12 {0.000 0.560 0.000 srgb} bind def
/col13 {0.000 0.690 0.000 srgb} bind def
/col14 {0.000 0.820 0.000 srgb} bind def
/col15 {0.000 0.560 0.560 srgb} bind def
/col16 {0.000 0.690 0.690 srgb} bind def
/col17 {0.000 0.820 0.820 srgb} bind def
/col18 {0.560 0.000 0.000 srgb} bind def
/col19 {0.690 0.000 0.000 srgb} bind def
/col20 {0.820 0.000 0.000 srgb} bind def
/col21 {0.560 0.000 0.560 srgb} bind def
/col22 {0.690 0.000 0.690 srgb} bind def
/col23 {0.820 0.000 0.820 srgb} bind def
/col24 {0.500 0.190 0.000 srgb} bind def
/col25 {0.630 0.250 0.000 srgb} bind def
/col26 {0.750 0.380 0.000 srgb} bind def
/col27 {1.000 0.500 0.500 srgb} bind def
/col28 {1.000 0.630 0.630 srgb} bind def
/col29 {1.000 0.750 0.750 srgb} bind def
/col30 {1.000 0.880 0.880 srgb} bind def
/col31 {1.000 0.840 0.000 srgb} bind def
end
% This junk string is used by the show operators
/PATsstr 1 string def
/PATawidthshow { % cx cy cchar rx ry string
% Loop over each character in the string
{ % cx cy cchar rx ry char
% Show the character
dup % cx cy cchar rx ry char char
PATsstr dup 0 4 -1 roll put % cx cy cchar rx ry char (char)
false charpath % cx cy cchar rx ry char
/clip load PATdraw
% Move past the character (charpath modified the
% current point)
currentpoint % cx cy cchar rx ry char x y
newpath
moveto % cx cy cchar rx ry char
% Reposition by cx,cy if the character in the string is cchar
3 index eq { % cx cy cchar rx ry
4 index 4 index rmoveto
} if
% Reposition all characters by rx ry
2 copy rmoveto % cx cy cchar rx ry
} forall
pop pop pop pop pop % -
currentpoint
newpath
moveto
} bind def
/PATcg {
7 dict dup begin
/lw currentlinewidth def
/lc currentlinecap def
/lj currentlinejoin def
/ml currentmiterlimit def
/ds [ currentdash ] def
/cc [ currentrgbcolor ] def
/cm matrix currentmatrix def
end
} bind def
% PATdraw - calculates the boundaries of the object and
% fills it with the current pattern
/PATdraw { % proc
save exch
PATpcalc % proc nw nh px py
5 -1 roll exec % nw nh px py
newpath
PATfill % -
restore
} bind def
% PATfill - performs the tiling for the shape
% Expects the tile counts (nw, nh) and the starting point (px, py) on the
% operand stack and consumes all four.  The current pattern is read from
% PATDict /CurrentPattern; it is installed with setfont and painted with
% oldshow, one string per row of cells.
/PATfill { % nw nh px py PATfill -
PATDict /CurrentPattern get dup begin
setfont
% Set the coordinate system to Pattern Space
PatternGState PATsg
% Set the color for uncolored patterns
PaintType 2 eq { PATDict /PColor get PATsc } if
% Create the string for showing (its length is nw, one byte per cell in a row)
3 index string % nw nh px py str
% Loop for each of the pattern sources
0 1 Multi 1 sub { % nw nh px py str source
% Move to the starting location
3 index 3 index % nw nh px py str source px py
moveto % nw nh px py str source
% For multiple sources, set the appropriate color
Multi 1 ne { dup PC exch get PATsc } if
% Set the appropriate string for the source
% (every byte of str is overwritten with the source index, then the index is dropped)
0 1 7 index 1 sub { 2 index exch 2 index put } for pop
% Loop over the number of vertical cells
3 index % nw nh px py str nh
{ % nw nh px py str
currentpoint % nw nh px py str cx cy
2 index oldshow % nw nh px py str cx cy
% Step up by YStep from the row's starting point for the next row
YStep add moveto % nw nh px py str
} repeat % nw nh px py str
} for
% Drop nw nh px py str
5 { pop } repeat
end
} bind def
% PATkshow - kshow with the current pattern
% Pattern-filling replacement for kshow.  Each character of the string is
% turned into a path with charpath and painted through PATdraw; between two
% consecutive characters the user procedure is executed with both character
% codes on the stack.  Relies on PATsstr (the one-byte scratch string) and
% PATdraw being defined in the same dictionary.
/PATkshow { % proc string
exch bind % string proc
1 index 0 get % string proc char
% Loop over all but the last character in the string
0 1 4 index length 2 sub {
% string proc char idx
% Find the n+1th character in the string
3 index exch 1 add get % string proc char char+1
exch 2 copy % string proc char+1 char char+1 char
% Now show the nth character
PATsstr dup 0 4 -1 roll put % string proc chr+1 chr chr+1 (chr)
false charpath % string proc char+1 char char+1
/clip load PATdraw
% Move past the character (charpath modified the current point)
currentpoint newpath moveto
% Execute the user proc (should consume char and char+1)
mark 3 1 roll % string proc char+1 mark char char+1
4 index exec % string proc char+1 mark...
cleartomark % string proc char+1
} for
% Now display the last character
PATsstr dup 0 4 -1 roll put % string proc (char+1)
false charpath % string proc
/clip load PATdraw
% Discard the character outline left behind by charpath.  This line used to
% read "neewath", which is not an operator and raised an undefined error as
% soon as the procedure was executed.
newpath
pop pop % -
} bind def
% PATmp - the makepattern equivalent
% Copies the pattern dictionary into a larger one, adds the entries that
% make it usable as a Type 3 font (FontType, Encoding, FontMatrix, FontBBox,
% BuildChar) and registers it with definefont.  The resulting font is what
% is left on the stack as the pattern instance.
/PATmp { % patdict patmtx PATmp patinstance
exch dup length 7 add % We will add 6 new entries plus 1 FID
dict copy % Create a new dictionary
begin
% Matrix to install when painting the pattern
TilingType PATtcalc
/PatternGState PATcg def
PatternGState /cm 3 -1 roll put
% Check for multi pattern sources (Level 1 fast color patterns)
currentdict /Multi known not { /Multi 1 def } if
% Font dictionary definitions
/FontType 3 def
% Create a dummy encoding vector
/Encoding 256 array def
3 string 0 1 255 {
Encoding exch dup 3 index cvs cvn put } for pop
/FontMatrix matrix def
/FontBBox BBox def
/BuildChar {
mark 3 1 roll % mark dict char
exch begin
Multi 1 ne {PaintData exch get}{pop} ifelse % mark [paintdata]
PaintType 2 eq Multi 1 ne or
{ XStep 0 FontBBox aload pop setcachedevice }
{ XStep 0 setcharwidth } ifelse
currentdict % mark [paintdata] dict
/PaintProc load % mark [paintdata] dict paintproc
end
gsave
% Paint with the original operators, then switch the pattern versions back on
false PATredef exec true PATredef
grestore
cleartomark % -
} bind def
currentdict
end % newdict
/foo exch % /foo newdict
definefont % newfont
} bind def
% PATpcalc - calculates the starting point and width/height
% of the tile fill for the shape
/PATpcalc { % - PATpcalc nw nh px py
PATDict /CurrentPattern get begin
gsave
% Set up the coordinate system to Pattern Space
% and lock down pattern
PatternGState /cm get setmatrix
BBox aload pop pop pop translate
% Determine the bounding box of the shape
pathbbox % llx lly urx ury
grestore
% Determine (nw, nh) the # of cells to paint width and height
PatHeight div ceiling % llx lly urx qh
4 1 roll % qh llx lly urx
PatWidth div ceiling % qh llx lly qw
4 1 roll % qw qh llx lly
PatHeight div floor % qw qh llx ph
4 1 roll % ph qw qh llx
PatWidth div floor % ph qw qh pw
4 1 roll % pw ph qw qh
2 index sub cvi abs % pw ph qs qh-ph
exch 3 index sub cvi abs exch % pw ph nw=qw-pw nh=qh-ph
% Determine the starting point of the pattern fill
%(px, py)
4 2 roll % nw nh pw ph
PatHeight mul % nw nh pw py
exch % nw nh py pw
PatWidth mul exch % nw nh px py
end
} bind def
% Save the original routines so that we can use them later on
/oldfill /fill load def
/oldeofill /eofill load def
/oldstroke /stroke load def
/oldshow /show load def
/oldashow /ashow load def
/oldwidthshow /widthshow load def
/oldawidthshow /awidthshow load def
/oldkshow /kshow load def
% These defs are necessary so that subsequent procs don't bind in
% the originals
/fill { oldfill } bind def
/eofill { oldeofill } bind def
/stroke { oldstroke } bind def
/show { oldshow } bind def
/ashow { oldashow } bind def
/widthshow { oldwidthshow } bind def
/awidthshow { oldawidthshow } bind def
/kshow { oldkshow } bind def
/PATredef {
MyAppDict begin
{
/fill { /clip load PATdraw newpath } bind def
/eofill { /eoclip load PATdraw newpath } bind def
/stroke { PATstroke } bind def
/show { 0 0 null 0 0 6 -1 roll PATawidthshow } bind def
/ashow { 0 0 null 6 3 roll PATawidthshow }
bind def
/widthshow { 0 0 3 -1 roll PATawidthshow }
bind def
/awidthshow { PATawidthshow } bind def
/kshow { PATkshow } bind def
} {
/fill { oldfill } bind def
/eofill { oldeofill } bind def
/stroke { oldstroke } bind def
/show { oldshow } bind def
/ashow { oldashow } bind def
/widthshow { oldwidthshow } bind def
/awidthshow { oldawidthshow } bind def
/kshow { oldkshow } bind def
} ifelse
end
} bind def
false PATredef
% Conditionally define setcmykcolor if not available
% Fallback for interpreters without a native setcmykcolor: converts the
% four CMYK operands to RGB as max(0, 1 - component - k) per channel and
% hands the result to setrgbcolor.
/setcmykcolor where { pop } {
/setcmykcolor { % c m y k
1 sub 4 1 roll % k-1 c m y
3 {
% Take the top component x, replace it with max(0, 1 - x - k) and rotate
% it below the two components that are still to be converted.
3 index add neg dup 0 lt { pop 0 } if 3 1 roll
} repeat % k-1 r g b
% A stray "-" token (the remains of a "% -" stack comment) used to follow
% setrgbcolor here; executing it raised an undefined error.
setrgbcolor pop % -
} bind def
} ifelse
/PATsc { % colorarray
aload length % c1 ... cn length
dup 1 eq { pop setgray } { 3 eq { setrgbcolor } { setcmykcolor
} ifelse } ifelse
} bind def
/PATsg { % dict
begin
lw setlinewidth
lc setlinecap
lj setlinejoin
ml setmiterlimit
ds aload pop setdash
cc aload pop setrgbcolor
cm setmatrix
end
} bind def
/PATDict 3 dict def
/PATsp {
true PATredef
PATDict begin
/CurrentPattern exch def
% If it's an uncolored pattern, save the color
CurrentPattern /PaintType get 2 eq {
/PColor exch def
} if
/CColor [ currentrgbcolor ] def
end
} bind def
% PATstroke - stroke with the current pattern
/PATstroke {
countdictstack
save
mark
{
currentpoint strokepath moveto
PATpcalc % proc nw nh px py
clip newpath PATfill
} stopped {
(*** PATstroke Warning: Path is too complex, stroking
with gray) =
cleartomark
restore
countdictstack exch sub dup 0 gt
{ { end } repeat } { pop } ifelse
gsave 0.5 setgray oldstroke grestore
} { pop restore pop } ifelse
newpath
} bind def
/PATtcalc { % modmtx tilingtype PATtcalc tilematrix
% Note: tiling types 2 and 3 are not supported
gsave
exch concat % tilingtype
matrix currentmatrix exch % cmtx tilingtype
% Tiling type 1 and 3: constant spacing
2 ne {
% Distort the pattern so that it occupies
% an integral number of device pixels
dup 4 get exch dup 5 get exch % tx ty cmtx
XStep 0 dtransform
round exch round exch % tx ty cmtx dx.x dx.y
XStep div exch XStep div exch % tx ty cmtx a b
0 YStep dtransform
round exch round exch % tx ty cmtx a b dy.x dy.y
YStep div exch YStep div exch % tx ty cmtx a b c d
7 -3 roll astore % { a b c d tx ty }
} if
grestore
} bind def
/PATusp {
false PATredef
PATDict begin
CColor PATsc
end
} bind def
% crosshatch30
11 dict begin
/PaintType 1 def
/PatternType 1 def
/TilingType 1 def
/BBox [0 0 1 1] def
/XStep 1 def
/YStep 1 def
/PatWidth 1 def
/PatHeight 1 def
/Multi 2 def
/PaintData [
{ clippath } bind
{ 32 16 true [ 32 0 0 -16 0 16 ]
{<033003300c0c0c0c30033003c000c000300330030c0c0c0c
0330033000c000c0033003300c0c0c0c30033003c000c000
300330030c0c0c0c0330033000c000c0>}
imagemask } bind
] def
/PaintProc {
pop
exec fill
} def
currentdict
end
/P3 exch def
/cp {closepath} bind def
/ef {eofill} bind def
/gr {grestore} bind def
/gs {gsave} bind def
/sa {save} bind def
/rs {restore} bind def
/l {lineto} bind def
/m {moveto} bind def
/rm {rmoveto} bind def
/n {newpath} bind def
/s {stroke} bind def
/sh {show} bind def
/slc {setlinecap} bind def
/slj {setlinejoin} bind def
/slw {setlinewidth} bind def
/srgb {setrgbcolor} bind def
/rot {rotate} bind def
/sc {scale} bind def
/sd {setdash} bind def
/ff {findfont} bind def
/sf {setfont} bind def
/scf {scalefont} bind def
/sw {stringwidth} bind def
/tr {translate} bind def
/tnt {dup dup currentrgbcolor
4 -2 roll dup 1 exch sub 3 -1 roll mul add
4 -2 roll dup 1 exch sub 3 -1 roll mul add
4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
bind def
/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
4 -2 roll mul srgb} bind def
/DrawEllipse {
/endangle exch def
/startangle exch def
/yrad exch def
/xrad exch def
/y exch def
/x exch def
/savematrix mtrx currentmatrix def
x y tr xrad yrad sc 0 0 1 startangle endangle arc
closepath
savematrix setmatrix
} def
/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
/$F2psEnd {$F2psEnteredState restore end} def
/pageheader {
save
newpath 0 128 moveto 0 0 lineto 342 0 lineto 342 128 lineto closepath clip newpath
-40.3 230.6 translate
1 -1 scale
$F2psBegin
10 setmiterlimit
0 slj 0 slc
0.06299 0.06299 sc
} bind def
/pagefooter {
$F2psEnd
restore
} bind def
%%EndProlog
pageheader
%
% Fig objects follow
%
%
% here starts figure with depth 53
% Polyline
0 slj
0 slc
7.500 slw
n 757 1980 m 652 1980 652 2640 105 arcto 4 {pop} repeat
652 2745 1155 2745 105 arcto 4 {pop} repeat
1260 2745 1260 2085 105 arcto 4 {pop} repeat
1260 1980 757 1980 105 arcto 4 {pop} repeat
cp gs col0 s gr
% here ends figure;
%
% here starts figure with depth 51
% Polyline
0 slj
0 slc
7.500 slw
gs clippath
5215 2261 m 5264 2278 l 5278 2235 l 5229 2219 l 5229 2219 l 5251 2250 l 5215 2261 l cp
eoclip
n 4399 1969 m
5257 2252 l gs col7 1.00 shd ef gr gs col0 s gr gr
% arrowhead
n 5215 2261 m 5251 2250 l 5229 2219 l 5215 2261 l cp gs 0.00 setgray ef gr col0 s
% Polyline
gs clippath
5223 2432 m 5272 2449 l 5286 2406 l 5237 2390 l 5237 2390 l 5259 2421 l 5223 2432 l cp
eoclip
n 4407 2140 m
5265 2423 l gs col7 1.00 shd ef gr gs col0 s gr gr
% arrowhead
n 5223 2432 m 5259 2421 l 5237 2390 l 5223 2432 l cp gs 0.00 setgray ef gr col0 s
% Polyline
gs clippath
5216 2650 m 5267 2647 l 5264 2602 l 5213 2605 l 5213 2605 l 5245 2626 l 5216 2650 l cp
eoclip
n 4398 2687 m
5251 2626 l gs col7 1.00 shd ef gr gs col0 s gr gr
% arrowhead
n 5216 2650 m 5245 2626 l 5213 2605 l 5216 2650 l cp gs 0.00 setgray ef gr col0 s
% Polyline
n 5362 2523 m 5752 2523 l 5752 2696 l 5362 2696 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
n 5362 2165 m 5752 2165 l 5752 2338 l 5362 2338 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
0.000 slw
n 720 2070 m 900 2070 l 900 2160 l 720 2160 l
cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def
15.00 15.00 sc P3 [16 0 0 -8 48.00 138.00] PATmp PATsp ef gr PATusp
% Polyline
n 720 2565 m 900 2565 l 900 2655 l 720 2655 l
cp gs col7 0.00 shd ef gr
% Polyline
7.500 slw
n 4245 2415 m 4425 2415 l 4425 2595 l 4245 2595 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
n 4245 2235 m 4425 2235 l 4425 2415 l 4245 2415 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
n 5362 2343 m 5752 2343 l 5752 2516 l 5362 2516 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
n 2835 3150 m 3330 3150 l 3330 3465 l 2835 3465 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
0.000 slw
n 2880 3330 m 3015 3330 l 3015 3420 l 2880 3420 l
cp gs col7 0.00 shd ef gr
% Polyline
7.500 slw
n 2340 3150 m 2835 3150 l 2835 3465 l 2340 3465 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
n 1845 3150 m 2340 3150 l 2340 3465 l 1845 3465 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
0.000 slw
n 2385 3330 m 2520 3330 l 2520 3420 l 2385 3420 l
cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def
15.00 15.00 sc P3 [16 0 0 -8 159.00 222.00] PATmp PATsp ef gr PATusp
% Polyline
n 2602 3017 m 2605 2425 l 2792 2423 l 2788 3044 l 2588 3030 l
cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def
15.00 15.00 sc P3 [16 0 0 -8 172.53 161.53] PATmp PATsp ef gr PATusp
% Polyline
n 2609 2477 m 2612 1885 l 2799 1883 l 2795 2504 l 2595 2490 l
cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def
15.00 15.00 sc P3 [16 0 0 -8 173.00 125.53] PATmp PATsp ef gr PATusp
% Polyline
7.500 slw
n 4245 1890 m 4425 1890 l 4425 2070 l 4245 2070 l
cp gs col7 0.85 shd ef gr gs col0 s gr
% Polyline
n 4245 2063 m 4425 2063 l 4425 2243 l 4245 2243 l
cp gs col7 0.85 shd ef gr gs col0 s gr
% Polyline
n 4245 2595 m 4425 2595 l 4425 2775 l 4245 2775 l
cp gs col7 0.85 shd ef gr gs col0 s gr
% Polyline
n 4247 2748 m 4427 2748 l 4427 2928 l 4247 2928 l
cp gs col7 1.00 shd ef gr gs col0 s gr
% Polyline
0.000 slw
n 2657 3060 m 2111 2491 l 2244 2360 l 2786 2937 l
cp gs col7 0.00 shd ef gr
% Polyline
n 2111 2402 m 2660 1838 l 2797 1966 l 2242 2527 l
cp gs col7 0.55 shd ef gr
% Polyline
n 2115 3017 m 2118 2425 l 2305 2423 l 2301 3044 l 2101 3030 l
cp gs col7 0.55 shd ef gr
% Polyline
n 1890 3330 m 2025 3330 l 2025 3420 l 1890 3420 l
cp gs col7 0.55 shd ef gr
% Polyline
n 720 2340 m 900 2340 l 900 2430 l 720 2430 l
cp gs col7 0.55 shd ef gr
% Polyline
n 2113 2439 m 2116 1847 l 2303 1845 l 2299 2466 l 2099 2452 l
cp gs col7 0.00 shd ef gr
/Times-Italic ff 142.88 scf sf
2835 2474 m
gs 1 -1 sc (h \(x\)) col0 sh gr
/Times-Roman ff 111.13 scf sf
2916 2520 m
gs 1 -1 sc (1) col0 sh gr
/Times-Italic ff 142.88 scf sf
2835 3030 m
gs 1 -1 sc (h \(x\)) col0 sh gr
/Times-Roman ff 111.13 scf sf
2916 3076 m
gs 1 -1 sc (2) col0 sh gr
/Times-Italic ff 142.88 scf sf
2835 1950 m
gs 1 -1 sc (h \(x\)) col0 sh gr
/Times-Roman ff 111.13 scf sf
2916 1996 m
gs 1 -1 sc (0) col0 sh gr
/Times-Roman ff 142.88 scf sf
4095 2025 m
gs 1 -1 sc (0) col0 sh gr
/Times-Roman ff 142.88 scf sf
4095 2205 m
gs 1 -1 sc (1) col0 sh gr
/Times-Roman ff 142.88 scf sf
4095 2385 m
gs 1 -1 sc (2) col0 sh gr
/Times-Roman ff 142.88 scf sf
4095 2565 m
gs 1 -1 sc (3) col0 sh gr
/Times-Roman ff 142.88 scf sf
4095 2745 m
gs 1 -1 sc (4) col0 sh gr
/Times-Roman ff 142.88 scf sf
4095 2925 m
gs 1 -1 sc (5) col0 sh gr
/Times-Italic ff 142.88 scf sf
4320 1800 m
gs 1 -1 sc (g) col0 sh gr
/Times-Roman ff 142.88 scf sf
5220 2115 m
gs 1 -1 sc (Hash Table ) col0 sh gr
/Times-Roman ff 142.88 scf sf
5265 2475 m
gs 1 -1 sc (1) col0 sh gr
/Times-Roman ff 142.88 scf sf
5265 2655 m
gs 1 -1 sc (2) col0 sh gr
/Times-Roman ff 142.88 scf sf
5265 2295 m
gs 1 -1 sc (0) col0 sh gr
/Times-Roman ff 158.75 scf sf
1575 1755 m
gs 1 -1 sc (\(a\)) col0 sh gr
/Times-Roman ff 158.75 scf sf
3465 1755 m
gs 1 -1 sc (\(b\)) col0 sh gr
/Times-Roman ff 158.75 scf sf
4680 1755 m
gs 1 -1 sc (\(c\)) col0 sh gr
/Times-Roman ff 142.88 scf sf
3015 3645 m
gs 1 -1 sc (2) col0 sh gr
/Times-Roman ff 142.88 scf sf
2565 3645 m
gs 1 -1 sc (1) col0 sh gr
/Times-Roman ff 142.88 scf sf
2070 3645 m
gs 1 -1 sc (0) col0 sh gr
/ZapfChancery-MediumItalic ff 174.63 scf sf
3420 3375 m
gs 1 -1 sc (L) col0 sh gr
/Times-Roman ff 142.88 scf sf
2865 3277 m
gs 1 -1 sc ({0,2,5}) col0 sh gr
/Times-Roman ff 142.88 scf sf
2370 3277 m
gs 1 -1 sc ({1,3,5}) col0 sh gr
/Times-Roman ff 142.88 scf sf
1895 3277 m
gs 1 -1 sc ({1,2,4}) col0 sh gr
% here ends figure;
%
% here starts figure with depth 45
% Polyline
0 slj
0 slc
7.500 slw
gs clippath
1944 2497 m 1995 2497 l 1995 2452 l 1944 2452 l 1944 2452 l 1974 2475 l 1944 2497 l cp
eoclip
n 1357 2475 m
1980 2475 l gs col7 1.00 shd ef gr gs col0 s gr gr
% arrowhead
n 1944 2497 m 1974 2475 l 1944 2452 l 1944 2497 l cp gs 0.00 setgray ef gr col0 s
% Polyline
gs clippath
3879 2497 m 3930 2497 l 3930 2452 l 3879 2452 l 3879 2452 l 3909 2475 l 3879 2497 l cp
eoclip
n 3292 2475 m
3915 2475 l gs col7 1.00 shd ef gr gs col0 s gr gr
% arrowhead
n 3879 2497 m 3909 2475 l 3879 2452 l 3879 2497 l cp gs 0.00 setgray ef gr col0 s
% Ellipse
n 2704 2448 101 101 0 360 DrawEllipse gs col7 1.00 shd ef gr gs col0 s gr
% Ellipse
n 2209 2448 101 101 0 360 DrawEllipse gs col7 1.00 shd ef gr gs col0 s gr
% Ellipse
n 2704 2988 101 101 0 360 DrawEllipse gs col7 1.00 shd ef gr gs col0 s gr
% Ellipse
n 2209 1908 101 101 0 360 DrawEllipse gs col7 1.00 shd ef gr gs col0 s gr
% Ellipse
n 2704 1908 101 101 0 360 DrawEllipse gs col7 1.00 shd ef gr gs col0 s gr
% Ellipse
n 2209 2988 101 101 0 360 DrawEllipse gs col7 1.00 shd ef gr gs col0 s gr
/Times-Roman ff 142.88 scf sf
5423 2663 m
gs 1 -1 sc (band) col0 sh gr
/Times-Roman ff 142.88 scf sf
5460 2304 m
gs 1 -1 sc (the) col0 sh gr
/Times-Roman ff 142.88 scf sf
1418 2430 m
gs 1 -1 sc (Mapping) col0 sh gr
/Times-Roman ff 142.88 scf sf
3285 2430 m
gs 1 -1 sc (Assigning) col0 sh gr
/Times-Roman ff 142.88 scf sf
2674 2485 m
gs 1 -1 sc (3) col0 sh gr
/Times-Roman ff 142.88 scf sf
2179 2485 m
gs 1 -1 sc (2) col0 sh gr
/Times-Italic ff 142.88 scf sf
945 1935 m
gs 1 -1 sc (S) col0 sh gr
/Times-Roman ff 142.88 scf sf
967 2160 m
gs 1 -1 sc (who) col0 sh gr
/Times-Roman ff 142.88 scf sf
960 2430 m
gs 1 -1 sc (band) col0 sh gr
/Times-Roman ff 142.88 scf sf
1005 2655 m
gs 1 -1 sc (the) col0 sh gr
/Times-Roman ff 142.88 scf sf
4305 2378 m
gs 1 -1 sc (3) col0 sh gr
/Times-Roman ff 142.88 scf sf
5422 2482 m
gs 1 -1 sc (who) col0 sh gr
/Times-Roman ff 142.88 scf sf
4545 2430 m
gs 1 -1 sc (Ranking) col0 sh gr
/Times-Roman ff 142.88 scf sf
3060 3420 m
gs 1 -1 sc (the) col0 sh gr
/Times-Roman ff 142.88 scf sf
2539 3420 m
gs 1 -1 sc (who) col0 sh gr
/Times-Roman ff 142.88 scf sf
2045 3420 m
gs 1 -1 sc (band) col0 sh gr
/Times-Roman ff 142.88 scf sf
2179 1945 m
gs 1 -1 sc (0) col0 sh gr
/Times-Roman ff 142.88 scf sf
2674 1945 m
gs 1 -1 sc (1) col0 sh gr
/Times-Roman ff 142.88 scf sf
2179 3025 m
gs 1 -1 sc (4) col0 sh gr
/Times-Roman ff 142.88 scf sf
2674 3025 m
gs 1 -1 sc (5) col0 sh gr
/Times-Roman ff 142.88 scf sf
4300 2875 m
gs 1 -1 sc (3) col0 sh gr
/Times-Roman ff 142.88 scf sf
4305 2548 m
gs 1 -1 sc (3) col0 sh gr
/Times-Roman ff 142.88 scf sf
4305 2715 m
gs 1 -1 sc (2) col0 sh gr
/Times-Roman ff 142.88 scf sf
4299 2190 m
gs 1 -1 sc (0) col0 sh gr
/Times-Roman ff 142.88 scf sf
4299 2033 m
gs 1 -1 sc (0) col0 sh gr
% here ends figure;
pagefooter
showpage
%%Trailer
end
%EOF
cmph-2.0.2/tex/bdz/figs/overviewinternal3g.fig 0000644 0001750 0001750 00000014022 13411542035 020634 0 ustar joseph joseph #FIG 3.2 Produced by xfig version 3.2.5
Landscape
Center
Metric
A4
100.00
Single
-2
1200 2
6 5355 2520 5760 2700
6 5400 2520 5715 2700
4 0 0 45 -1 0 9 0.0000 4 105 285 5423 2663 band\001
-6
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
5362 2523 5752 2523 5752 2696 5362 2696 5362 2523
-6
6 5355 2162 5760 2342
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
5362 2165 5752 2165 5752 2338 5362 2338 5362 2165
4 0 0 45 -1 0 9 0.0000 4 105 195 5460 2304 the\001
-6
6 1350 2340 1980 2520
6 1350 2340 1980 2520
2 1 0 1 0 7 45 -1 20 0.000 0 0 7 1 0 2
1 1 1.00 45.00 30.00
1357 2475 1980 2475
4 0 0 45 -1 0 9 0.0000 4 135 555 1418 2430 Mapping\001
-6
-6
6 3285 2340 3915 2520
6 3285 2340 3915 2520
2 1 0 1 0 7 45 -1 20 0.000 0 0 7 1 0 2
1 1 1.00 45.00 30.00
3292 2475 3915 2475
4 0 0 45 -1 0 9 0.0000 4 135 630 3285 2430 Assigning\001
-6
-6
6 2603 2347 2805 2549
1 3 0 1 0 7 45 -1 20 0.000 1 0.0000 2704 2448 101 101 2704 2448 2749 2538
4 0 0 45 -1 0 9 0.0000 4 105 75 2674 2485 3\001
-6
6 2108 2347 2310 2549
1 3 0 1 0 7 45 -1 20 0.000 1 0.0000 2209 2448 101 101 2209 2448 2254 2538
4 0 0 45 -1 0 9 0.0000 4 105 75 2179 2485 2\001
-6
6 2835 2340 3150 2520
4 0 0 50 -1 1 9 0.0000 4 135 300 2835 2474 h (x)\001
4 0 0 50 -1 0 7 0.0000 4 75 60 2916 2520 1\001
-6
6 2835 2925 3150 3105
4 0 0 50 -1 1 9 0.0000 4 135 300 2835 3030 h (x)\001
4 0 0 50 -1 0 7 0.0000 4 75 60 2916 3076 2\001
-6
6 2835 1845 3135 1996
4 0 0 50 -1 1 9 0.0000 4 135 300 2835 1950 h (x)\001
4 0 0 50 -1 0 7 0.0000 4 75 60 2916 1996 0\001
-6
1 3 0 1 0 7 45 -1 20 0.000 1 0.0000 2704 2988 101 101 2704 2988 2749 3078
1 3 0 1 0 7 45 -1 20 0.000 1 0.0000 2209 1908 101 101 2209 1908 2254 1998
1 3 0 1 0 7 45 -1 20 0.000 1 0.0000 2704 1908 101 101 2704 1908 2749 1998
1 3 0 1 0 7 45 -1 20 0.000 1 0.0000 2209 2988 101 101 2209 2988 2254 3078
2 4 0 1 0 7 53 -1 -1 0.000 0 0 7 0 0 5
1260 2745 1260 1980 652 1980 652 2745 1260 2745
2 2 0 0 0 7 50 -1 43 0.000 0 0 -1 0 0 5
720 2070 900 2070 900 2160 720 2160 720 2070
2 2 0 0 0 7 50 -1 0 0.000 0 0 7 0 0 5
720 2565 900 2565 900 2655 720 2655 720 2565
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
4245 2415 4425 2415 4425 2595 4245 2595 4245 2415
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
4245 2235 4425 2235 4425 2415 4245 2415 4245 2235
2 1 0 1 0 7 51 -1 20 0.000 0 0 7 1 0 2
1 1 1.00 45.00 30.00
4399 1969 5257 2252
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
5362 2343 5752 2343 5752 2516 5362 2516 5362 2343
2 1 0 1 0 7 51 -1 20 0.000 0 0 7 1 0 2
1 1 1.00 45.00 30.00
4407 2140 5265 2423
2 1 0 1 0 7 51 -1 20 0.000 0 0 7 1 0 2
1 1 1.00 45.00 30.00
4398 2687 5251 2626
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
2835 3150 3330 3150 3330 3465 2835 3465 2835 3150
2 2 0 0 0 7 50 -1 0 0.000 0 0 7 0 0 5
2880 3330 3015 3330 3015 3420 2880 3420 2880 3330
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
2340 3150 2835 3150 2835 3465 2340 3465 2340 3150
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
1845 3150 2340 3150 2340 3465 1845 3465 1845 3150
2 2 0 0 0 7 50 -1 43 0.000 0 0 7 0 0 5
2385 3330 2520 3330 2520 3420 2385 3420 2385 3330
2 3 0 0 0 7 50 -1 43 0.000 0 0 7 0 0 6
2602 3017 2605 2425 2792 2423 2788 3044 2588 3030 2602 3017
2 3 0 0 0 7 50 -1 43 0.000 0 0 7 0 0 6
2609 2477 2612 1885 2799 1883 2795 2504 2595 2490 2609 2477
2 2 0 1 0 7 50 -1 17 0.000 0 0 7 0 0 5
4245 1890 4425 1890 4425 2070 4245 2070 4245 1890
2 2 0 1 0 7 50 -1 17 0.000 0 0 7 0 0 5
4245 2063 4425 2063 4425 2243 4245 2243 4245 2063
2 2 0 1 0 7 50 -1 17 0.000 0 0 7 0 0 5
4245 2595 4425 2595 4425 2775 4245 2775 4245 2595
2 2 0 1 0 7 50 -1 20 0.000 0 0 7 0 0 5
4247 2748 4427 2748 4427 2928 4247 2928 4247 2748
2 3 0 0 0 7 50 -1 0 0.000 0 0 7 0 0 5
2657 3060 2111 2491 2244 2360 2786 2937 2657 3060
2 3 0 0 0 7 50 -1 11 0.000 0 0 7 0 0 5
2111 2402 2660 1838 2797 1966 2242 2527 2111 2402
2 3 0 0 0 7 50 -1 11 0.000 0 0 7 0 0 6
2115 3017 2118 2425 2305 2423 2301 3044 2101 3030 2115 3017
2 2 0 0 0 7 50 -1 11 0.000 0 0 7 0 0 5
1890 3330 2025 3330 2025 3420 1890 3420 1890 3330
2 2 0 0 0 7 50 -1 11 0.000 0 0 7 0 0 5
720 2340 900 2340 900 2430 720 2430 720 2340
2 3 0 0 0 7 50 -1 0 0.000 0 0 7 0 0 6
2113 2439 2116 1847 2303 1845 2299 2466 2099 2452 2113 2439
4 0 0 45 -1 1 9 0.0000 4 105 75 945 1935 S\001
4 0 0 45 -1 0 9 0.0000 4 105 270 967 2160 who\001
4 0 0 45 -1 0 9 0.0000 4 105 285 960 2430 band\001
4 0 0 45 -1 0 9 0.0000 4 105 195 1005 2655 the\001
4 0 0 50 -1 0 9 0.0000 4 105 75 4095 2025 0\001
4 0 0 50 -1 0 9 0.0000 4 105 75 4095 2205 1\001
4 0 0 50 -1 0 9 0.0000 4 105 75 4095 2385 2\001
4 0 0 50 -1 0 9 0.0000 4 105 75 4095 2565 3\001
4 0 0 50 -1 0 9 0.0000 4 105 75 4095 2745 4\001
4 0 0 50 -1 0 9 0.0000 4 105 75 4095 2925 5\001
4 0 0 50 -1 1 9 0.0000 4 120 60 4320 1800 g\001
4 0 0 45 -1 0 9 0.0000 4 105 75 4305 2378 3\001
4 0 0 50 -1 0 9 0.0000 4 105 810 5220 2115 Hash Table \001
4 0 0 45 -1 0 9 0.0000 4 105 270 5422 2482 who\001
4 0 0 50 -1 0 9 0.0000 4 105 75 5265 2475 1\001
4 0 0 50 -1 0 9 0.0000 4 105 75 5265 2655 2\001
4 0 0 50 -1 0 9 0.0000 4 105 75 5265 2295 0\001
4 0 0 50 -1 0 10 0.0000 4 135 180 1575 1755 (a)\001
4 0 0 50 -1 0 10 0.0000 4 135 195 3465 1755 (b)\001
4 0 0 50 -1 0 10 0.0000 4 135 180 4680 1755 (c)\001
4 0 0 45 -1 0 9 0.0000 4 135 510 4545 2430 Ranking\001
4 0 0 50 -1 0 9 0.0000 4 105 75 3015 3645 2\001
4 0 0 50 -1 0 9 0.0000 4 105 75 2565 3645 1\001
4 0 0 50 -1 0 9 0.0000 4 105 75 2070 3645 0\001
4 0 0 50 -1 33 11 0.0000 4 135 90 3420 3375 L\001
4 0 0 50 -1 0 9 0.0000 4 135 435 2865 3277 {0,2,5}\001
4 0 0 45 -1 0 9 0.0000 4 105 195 3060 3420 the\001
4 0 0 50 -1 0 9 0.0000 4 135 435 2370 3277 {1,3,5}\001
4 0 0 45 -1 0 9 0.0000 4 105 270 2539 3420 who\001
4 0 0 45 -1 0 9 0.0000 4 105 285 2045 3420 band\001
4 0 0 50 -1 0 9 0.0000 4 135 435 1895 3277 {1,2,4}\001
4 0 0 45 -1 0 9 0.0000 4 105 75 2179 1945 0\001
4 0 0 45 -1 0 9 0.0000 4 105 75 2674 1945 1\001
4 0 0 45 -1 0 9 0.0000 4 105 75 2179 3025 4\001
4 0 0 45 -1 0 9 0.0000 4 105 75 2674 3025 5\001
4 0 0 45 -1 0 9 0.0000 4 105 75 4300 2875 3\001
4 0 0 45 -1 0 9 0.0000 4 105 75 4305 2548 3\001
4 0 0 45 -1 0 9 0.0000 4 105 75 4305 2715 2\001
4 0 0 45 -1 0 9 0.0000 4 105 75 4299 2190 0\001
4 0 0 45 -1 0 9 0.0000 4 105 75 4299 2033 0\001
cmph-2.0.2/tex/bdz/makefile 0000755 0001750 0001750 00000000335 13411542035 015065 0 ustar joseph joseph all:
latex bdz.tex
bibtex bdz
latex bdz.tex
latex bdz.tex
dvips bdz.dvi -o bdz.ps
# None of these targets produce a file of the same name; declaring them phony
# keeps a stray file called e.g. "clean" from silently disabling the rule.
.PHONY: all run html clean

# Rebuild the document from scratch and preview the PostScript output in gv
# (started in the background so make returns immediately).
run: clean all
	gv bdz.ps &

# Rebuild the document from scratch and convert it to HTML.
html: clean all
	latex2html bdz.tex

# Remove the generated output and the LaTeX/BibTeX intermediates.
# -f makes rm succeed when some (or all) of the files do not exist, so that
# "make run" and "make html" also work on a freshly unpacked tree.
clean:
	rm -f bdz.dvi bdz.ps *.lot *.lof *.aux *.bbl *.blg *.log *.toc
cmph-2.0.2/tex/bdz/bdz.tex 0000755 0001750 0001750 00000002627 13411542035 014674 0 ustar joseph joseph \documentclass[12pt]{article}
\usepackage{graphicx}
\usepackage{latexsym}
\usepackage{url}
\usepackage{a4wide}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{fancyhdr}
\usepackage{graphics}
\usepackage{multicol}
\usepackage{epsfig}
\usepackage{textcomp}
\usepackage{url}
% \usepackage{subfigure}
% \usepackage{subfig}
% \usepackage{wrapfig}
\bibliographystyle{plain}
% \bibliographystyle{sbc}
% \bibliographystyle{abnt-alf}
% \bibliographystyle{abnt-num}
\begin{document}
\sloppy
% \renewcommand{\baselinestretch}{1.24}\normalsize % set the space between lines to 1.24
% set headings
% \pagestyle{fancy}
% \lhead[\fancyplain{}{\footnotesize\thepage}]
% {\fancyplain{}{\footnotesize\rightmark}}
% \rhead[\fancyplain{}{\footnotesize\leftmark}]
% {\fancyplain{}{\footnotesize\thepage}}
%
% \cfoot{}
\lstset{
language=C,
basicstyle=\fontsize{8}{8}\selectfont,
captionpos=t,
aboveskip=0mm,
belowskip=0mm,
abovecaptionskip=0.5mm,
belowcaptionskip=0.5mm,
% numbers = left,
mathescape=true,
escapechar=@,
extendedchars=true,
showstringspaces=false,
% columns=fixed,
basewidth=0.515em,
frame=single,
framesep=1mm,
xleftmargin=1mm,
xrightmargin=1mm,
framerule=0pt
}
\include{introduction} % Introduction
\bibliography{bdz}
\end{document}
cmph-2.0.2/tex/bdz/introduction.tex 0000755 0001750 0001750 00000042164 13411542035 016636 0 ustar joseph joseph \section{Introduction} \label{sec:introduction}
The BDZ algorithm was designed by Fabiano C. Botelho, Djamal Belazzougui, Rasmus Pagh and Nivio Ziviani.
It is a simple, efficient, practical and near space-optimal
algorithm to generate a family $\cal F$ of PHFs and MPHFs.
It is also referred to as the BPZ algorithm because of the work presented
by Botelho, Pagh and Ziviani in \cite{bpz07}.
In Botelho's PhD dissertation \cite{b08} it is also referred to as the RAM algorithm
because it is more suitable for key sets that can be handled in internal memory.
The BDZ algorithm uses $r$-uniform random hypergraphs
given by function values of $r$
uniform random hash functions on the input key set $S$ for generating PHFs and MPHFs that
require $O(n)$ bits to be stored.
A hypergraph is the generalization of a standard undirected
graph where each edge connects $r\geq 2$ vertices.
This idea is not new, see e.g. \cite{mwhc96},
but we have proceeded differently to achieve
a space usage of $O(n)$ bits rather than $O(n\log n)$ bits.
Evaluation time for all schemes considered is constant.
For $r=3$ we obtain a space usage of approximately $2.6n$ bits for
an MPHF. More compact, and even simpler, representations can be
achieved for larger $m$. For example, for $m=1.23n$ we can get a
space usage of $1.95n$ bits.
Our best MPHF space upper bound is within a
factor of 2 from the information theoretical lower bound of approximately
$1.4427n$ bits. We have shown that the BDZ algorithm is far more
practical than previous methods with proven space complexity, both
because of its simplicity, and because the constant factor of the
space complexity is more than 6 times lower than its closest
competitor, for plausible problem sizes. We verify the practicality
experimentally, using slightly more space than in the mentioned
theoretical bounds.
\section{The Algorithm}
The BDZ algorithm is a three-step algorithm that generates PHFs and MPHFs based on
random $r$-partite hypergraphs.
This is an approach that provides a much tighter analysis and is
much simpler than the one presented in
\cite{ckrt04}, where it was implicit how to construct
similar PHFs.
The fastest and most compact functions
are generated when $r=3$.
In this case a PHF can be stored in
approximately $1.95$ bits per key and
an MPHF in approximately
$2.62$ bits per key.
Figure~\ref{fig:overview} gives an overview of the algorithm for $r=3$,
taking as input a key set $S \subseteq U$ containing three English words, i.e., $S=\{\mathrm{who},\mathrm{band},\mathrm{the}\}$.
% which are nicely hashed to the name of a rock band ``the who band''.
The edge-oriented data structure proposed in~\cite{e87} is used
to represent hypergraphs, where each edge is explicitly represented
as an array of $r$ vertices and, for each vertex $v$,
there is a list of edges that are incident on $v$.
The {\em Mapping Step} in Figure~\ref{fig:overview}(a) carries out two
important tasks:
\begin{enumerate}
\item
It assumes that it is possible to find three uniform
hash functions, $h_0$, $h_1$ and $h_2$, with ranges $\{0,1\}$, $\{2,3\}$ and $\{4,5\}$, respectively.
These functions build a one-to-one mapping of the key set $S$ to the edge set $E$
of a random acyclic
$3$-partite hypergraph $G=(V,E)$, where $|V|=m=6$ and $|E|=n=3$.
In \cite{b08,bpz07} it is shown that
it is possible to obtain such a hypergraph with probability tending to $1$ as $n$
tends to infinity
whenever $m=cn$ and $c \ge 1.23$. The value of $c$ that minimizes the hypergraph size
(and thereby the amount of bits to represent the resulting functions) is $c \approx 1.23$.
To illustrate the mapping,
key ``who'' is mapped to edge $\{h_0(\text{``who''}),h_1(\text{``who''}),h_2(\text{``who''})\}=\{1,3,5\}$,
key ``band'' is mapped to edge $\{h_0(\text{``band''}),h_1(\text{``band''}),h_2(\text{``band''})\}=\{1,2,4\}$, and
key ``the'' is mapped to edge $\{h_0(\text{``the''}),h_1(\text{``the''}),h_2(\text{``the''})\}=\{0,2,5\}$.
\item
It tests whether the resulting random $3$-partite hypergraph contains cycles
by iteratively deleting edges connecting vertices of degree 1.
The deleted edges are stored in the order of deletion in a list $\cal L$
to be used in the assigning step.
The first deleted edge in Figure~\ref{fig:overview}(a)
was $\{1,2,4\}$, the second one was $\{1,3,5\}$ and
the third one was $\{0,2,5\}$.
% the last one was $\{0,2,5\}$.
If it ends with an empty graph, then the test succeeds,
otherwise it fails.
\end{enumerate}
\begin{figure}
\begin{center}
\scalebox{0.9}{\epsfig{file=figs/overviewinternal3g.eps}}
\end{center}
\caption{(a) The mapping step generates a random acyclic $3$-partite hypergraph with $m=6$ vertices and $n=3$ edges
and a list $\cal L$ of edges obtained when we test whether the hypergraph is acyclic.
(b) The assigning step builds an array $g:[0,5] \to [0,3]$ to uniquely
assign an edge to a vertex. (c) The ranking step builds the data structure used to
compute function $\mathit{rank}: [0,5] \to [0,2]$ in $O(1)$ time.~~~~}
\label{fig:overview}
\end{figure}
We now show how to use the Jenkins hash functions \cite{j97}
to implement the three hash functions $h_i: S \to V_i$, $0\le i \le 2$, which are used to build a random $3$-partite hypergraph
$G=(V,E)$,
where $V= V_0 \cup V_1 \cup V_2$ and $|V_i| = \eta = \lceil \frac{m}{3} \rceil$.
Let $h':S \to \{0,1\}^\gamma$ be a Jenkins hash function
for $\gamma = 3 \times w$, where
$w = 32 \text{ or } 64$ for
32-bit and 64-bit architectures, respectively.
Let $H'$ be an array of 3 $w$-bit values.
The Jenkins hash function
allows us to compute in parallel the three entries in $H'$
and thereby the three hash functions $h_i$, as follows:
% Thus we can compute the three hash functions $h_i$
% as follows:
\begin{eqnarray}
H' &=& h'(x) \nonumber \\
h_0(x) &=& H'[0] \bmod \eta \nonumber \\
h_1(x) &=& H'[1] \bmod \eta + \eta \nonumber \\
h_2(x) &=& H'[2] \bmod \eta + 2\eta
\end{eqnarray}
The {\em Assigning Step} in Figure~\ref{fig:overview}(b) outputs
a PHF that maps the key set $S$ into the range $[0,m-1]$ and is represented by
an array $g$ storing values from the range $[0,3]$.
The array $g$ makes it possible to select one of the $3$
vertices of a given edge, which is associated with a
key $k$.
A vertex for a key $k$ is given
by either $h_0(k)$, $h_1(k)$ or $h_2(k)$.
The function $h_i(k)$
to be used for $k$ is chosen by calculating $i = (g[h_0(k)] + g[h_1(k)] + g[h_2(k)]) \bmod 3$.
For instance,
the values 1 and 4 represent the keys ``who'' and ``band''
because $i = (g[1] + g[3] + g[5]) \bmod 3 = 0$ and $h_0(\text{``who''}) = 1$,
and $i = (g[1] + g[2] + g[4]) \bmod 3 = 2$ and $h_2(\text{``band''}) = 4$, respectively.
% Likewise, the value 4 represents the key
% because $(g[1] + g[2] + g[4]) \bmod 3 = 2$ and $h_2(\text{``band''}) = 4$, and so on.
The assigning step first initializes $g[i]=3$
to mark every vertex as unassigned
% (i.e., each vertex is unassigned)
and
$\mathit{Visited}[i]=\mathit{false}$, $0\leq i \leq m-1$.
Let $\mathit{Visited}$ be a boolean vector of size $m$
to indicate whether a vertex has been visited.
Then, for each edge $e \in \cal L$ from tail to head,
it looks for the first
vertex $u$ belonging to $e$ not yet visited.
This is a sufficient condition for success \cite{b08,bpz07,mwhc96}.
Let $j$, $0 \leq j \leq 2$, be the index of $u$ in $e$.
Then, it assigns $g[u]=(j-\sum_{v \in e \wedge \mathit{Visited}[v] = true} g[v]) \bmod 3$.
Whenever it passes through a vertex $u$ from $e$,
if $u$ has not yet been visited,
it sets $\mathit{Visited}[u] = true$.
% The value $g[i]=3$ is used to represent unassigned vertices.
If we stop the BDZ algorithm in the assigning step
we obtain a PHF with range $[0,m-1]$.
The PHF has the following form:
$phf(x) = h_{i(x)}(x)$, where $x\in S$ and $i(x) = (g[h_0(x)] + g[h_1(x)] + g[h_2(x)]) \bmod 3$.
In this case we do not need information for ranking and
can set $g[i] = 0$ whenever $g[i]$ is equal to 3, where $0 \le i \le m-1$.
Therefore, the range of the values stored in $g$ is narrowed
from $[0,3]$ to $[0,2]$. By using arithmetic coding on blocks of
values (see \cite{b08,bpz07} for details),
or any compression technique that allows
random access in constant time to an array of compressed values \cite{fn07,gn06,sg06},
we can store the resulting PHFs in $m\log 3 = c n\log 3$ bits,
where $c \ge 1.23$. For $c = 1.23$, the space requirement is $1.95n$ bits.
The {\em Ranking Step} in Figure~\ref{fig:overview}(c)
outputs a data structure
that makes it possible to narrow the range of a PHF generated in the
assigning step from $[0,m-1]$ to $[0,n-1]$, thereby
producing an MPHF.
The data structure makes it possible to compute in constant time
a function $\mathit{rank}\!\!:[0,m-1]\to [0,n-1]$
that counts the number of assigned positions
before a given position $v$ in $g$.
For instance, $\mathit{rank}(4) = 2$ because
the positions $0$ and $1$ are assigned,
since $g[0] \neq 3$ and $g[1] \neq 3$.
% and they come before position 4 in $g$.
For the implementation of the ranking step
we have borrowed
a simple and efficient implementation from
\cite{dict-jour}.
It requires $\epsilon \, m$ additional bits of space, where $0 < \epsilon < 1$,
and is obtained by storing explicitly the
$\mathit{rank}$ of every $k$th index in a rankTable, where $k
=\lfloor\log(m)/\epsilon\rfloor$.
The larger $k$ is, the more compact the resulting MPHF is.
Therefore, users can trade off space for evaluation time
by setting $k$ appropriately in the implementation.
% In the implementation we let
% $k$ to be set by the users so that they can trade off
% space for evaluation time and vice-versa.
We only allow values for $k$
that are powers of two (i.e., $k=2^{b_k}$ for some constant $b_k$) in order to replace the expensive
division and modulo operations by
bit-shift and bitwise ``and'' operations, respectively.
We have used $k=256$
in the experiments
for generating more succinct MPHFs.
We remark that it is still possible to obtain a more compact data structure by
using the results presented in \cite{os07,rrr02}, but at the cost of a much more
complex implementation.
We need to use an additional lookup table $T_r$
to guarantee the constant evaluation time of $\mathit{rank}(u)$.
Let us illustrate how $\mathit{rank}(u)$ is computed
using both the rankTable and the lookup table $T_r$.
We first look up
the rank of the largest precomputed index
$v\leq u$ in the rankTable,
and use $T_r$ to count the number of assigned vertices from position
$v$ to $u-1$.
The lookup table $T_r$ allows us to count in constant time
the number of assigned vertices in $\flat=\epsilon \log m$ bits,
where $0 < \epsilon < 1$. Thus the actual evaluation time is $O(1/\epsilon)$.
For simplicity and
without loss of generality we let $\flat$ be a multiple of the number of
bits $\beta$ used to encode each entry of $g$.
As the values in $g$ come from the range $[0,3]$,
then $\beta=2$ bits and we have tried $\flat = 8 \text{ and } 16$.
We would expect $\flat = 16$ to provide
a faster evaluation time because we would need to carry out fewer lookups
in $T_r$. However, for both values of $\flat$ the lookup table $T_r$ fits entirely in
the CPU cache and we did not observe any significant difference in
the evaluation times. Therefore we settled for $\flat=8$.
We remark that each $r \ge 2$ requires
a different lookup table $T_r$ that can be generated a priori.
% To do this in $O(1/\epsilon)$ time
% we use a lookup table $T_r$ that allows us to count
% the number of assigned vertices in $\flat=\epsilon \log m$ bits
% in constant time for any $0 < \epsilon < 1$.
% In general the PHFs or MPHFs are constructed based on random acyclic $r$-partite hypergraphs $G_r=(V,E)$,
% where $V= V_0 \cup V_1 \cup \dots \cup V_{r-1}$ and $|V_i| = \eta = \lceil \frac{m}{r} \rceil$, where $0\leq i < r$.
% The most efficient and compact functions are generated
% when $r=3$ and $m=1.23n$. The value $1.23n$ is required to generate a
% random acyclic $3$-partite hypergraph with high probability\footnote{Throughout this paper
% we write ``with high probability'' to mean with probability
% $1 - n^{-\delta}$ for $\delta > 0$.}~\cite{b08,bpz07}.
% the family of linear transformations
% presented in \cite{admp99}. A still faster option is the Jenkins function
% proposed in \cite{j97}, which was used for all methods considered in this paper.
The resulting
MPHFs have the following form:
$h(x) = \mathit{rank}(\mathit{phf}(x))$.
In this case, we cannot get rid of
the ranking information by replacing the values 3 with 0 in the entries of $g$.
% The array
% $g$ is now representing a function $g:V\to \{0,1,2,3\}$
% and $\mathit{rank}: V \to [0,n-1]$ is
% now the cardinality of
% $\{ u\in V \;\mid\; u\!> b_k + 1)$ $\delta$-bit entries, where $\delta = 32 \text{ or } 64$ depending on the architecture. The operator $>\!>$ denotes the right shift of bits.\\[2mm]@
% void @BDZ@ (@$S$@, @$\cal H$@, @$c$@, @$b_k$@, @$g$@, @rankTable@)@\\[2mm]@
% // Mapping step
% do
% @$G.E = \emptyset$@;
% select @$h'$@ at random from @$\cal H$@;
% for @{\bf each}@ @$x \in S$@ do
% @$H'$ = $h'(x)$@;
% @$e$@ = @$\{h_0(x), h_1(x), h_2(x)\}$@;
% addEdge (@$G$@, @$e$@);
% @$\cal L$@ = isAcyclic(@$G$@);
% while (@$G.E$@ is not empty);
%
% // Assigning step
% for (@$u = 0$@; @$u < m$@; @$u$++@)
% Visited[@$u$@] = @{\bf false}@;
% @$g[u]$@ = @$3$@;
% for (i = @$|{\cal L}|-1$@; i @$\ge 0$@; i@$--$@)
% @$e$@ = @$\cal L$@[i];
% sum = 0;
% for (@$v$@ = 2; @$v \ge 0$@; @$v$@@$--$@)
% if (not Visited[@$e[v]$@])
% Visited[@$e[v]$@] = @{\bf true}@;
% @$u$@ = @$e[v]$@;
% @$j$@ = @$v$@;
% else sum += @$g[e[v]]$@;
% @g[u]@ = @$(j - \mathrm{sum}) \bmod 3$@;
%
% // Ranking step
% sum = 0;
% kmask = @$(2^{b_k}-1)$@;
% for (i = 0; i < @$|g|$@; i++)
% if((i & kmask) @==@ 0)
% rankTable[i @$>\!> b_k$@] = sum;
% if(@$g$@[i] @$\not = 3$@) sum++;
%
% @{\bf PHF Algorithm}\\[1mm]@
% @{\bf Input:} a key $x \in S$, an array $g$ with $m = \lceil cn \rceil$ 2-bit entries, where $c \ge 1.23$, and the ``good'' hash functions $h'$ selected by the BDZ algorithm.\\[1mm]@
% @{\bf Output:} the perfect hash function value for the key $x \in S$.\\[2mm]@
% int phf (@$x$@, @$g$@, @$h'$@)
% @$H'$@ = @$h'(x)$@;
% @$e$@ = @$\{h_0(x), h_1(x), h_2(x)\}$@;
% @$v$@ = @$(g[e[0]] + g[e[1]] + g[e[2]]) \bmod 3$@;
% return @$e[v]$@;
%
% @{\bf Algorithm to Generate the Lookup Table}\\[1mm]@
% @{\bf Input:} none\\[1mm]@
% @{\bf Output:} the lookup table @$T_r$@ to be used by the mphf function. It counts the number of assigned
% vertices in a single byte. As each entry in the array $g$ is encoded by 2 bits, then a single byte can store at most four 2-bit values. LS($i'$,2) stands for the value of the 2 least significant bits of $i'$.\\[2mm]@
% void genLookupTable (@$T_r$@)
% for (i = 0; i < 256; i++)
% sum = 0;
% @$i'$@ = i;
% for (j = 0; j < 4; j++)
% if(@$\text{LS}(i',2) \not = 3$@) sum++;
% @$i'$@ = @$i' >\!> 2$@;
% @$T_r[i]$@ = sum;
%
% @{\bf MPHF Algorithm}\\[1mm]@
% @{\bf Input:} a key $x \in S$, an array $g$ with $m = \lceil cn \rceil$ 2-bit entries, where $c \ge 1.23$, the chosen ``good'' hash functions $h'$, a constant $b_k$ that makes $k=2^{b_k}$, the lookup table $T_r$ that counts the number of assigned vertices in a single byte, and a rankTable with $(m >\!> b_k + 1)$ $\delta$-bit entries, where $\delta = 32 \text{ or } 64$ depending on the architecture. The notation $g[i \to j]$ represents the values stored in the entries from $g[i]$ to $g[j]$ for $i\leq j$.\\[1mm]@
% @{\bf Output:} the minimal perfect hash function value for the key $x \in S$.\\[2mm]@
% int mphf (@$x$@, @$g$@, @$h'$@, @$b_k$@, @$T_r$@, @rankTable@)
% @$u$@ = phf(@$x$@, @$g$@, @$h'$@);
% j = @$u >\!> b_k$@; // @j@ = @$u$@/k
% rank = rankTable[j];
% i = j @$<\!< b_k$@; // @i@ = @j*k@
% for(j = i + 4; j < u; i = j, j += 4)
% rank += @$T_r[g[$@i @$\to$@ j@$]]$@;
% for(j = j - 4; j < u; j ++)
% if(@$g$@[j] @$\not =$@ 3) rank ++ ;
% return rank;
% \end{lstlisting}
% \end{center}
% \vspace{-6mm}
% \caption{The BDZ algorithm and the resulting PHFs and MPHFs.}
% \label{prog:ram}
% \vspace{-7mm}
% \end{figure}
$\eta$ ~~
$\epsilon$ ~~
$\varepsilon$ cmph-2.0.2/TABLE1.t2t 0000644 0001750 0001750 00000005652 13411542035 013355 0 ustar joseph joseph