]])
# --enable-all-static
# Do not use libtool if building all static
# STATIC_FLAGS is substituted as "-all-static" only when the option value is
# exactly "yes"; in every other case it is substituted as the empty string.
AC_ARG_ENABLE([all-static],
              [AS_HELP_STRING([--enable-all-static], [create statically linked executable])])
STATIC_FLAGS=
# The variable is quoted so an unusual option value cannot break the test.
AS_IF([test "x$enable_all_static" = xyes],
      [AC_SUBST([STATIC_FLAGS], [-all-static])])
#
# SWIG and bindings
#
maybe_swig=

# Each --enable-<lang>-binding option accepts an optional =PATH value; the
# value is interpreted further down, in the per-language binding setup.

# --enable-python-binding
AC_ARG_ENABLE([python-binding],
              [AS_HELP_STRING([--enable-python-binding@<:@=PATH@:>@], [create SWIG python module and install in PATH])])

# --enable-ruby-binding
AC_ARG_ENABLE([ruby-binding],
              [AS_HELP_STRING([--enable-ruby-binding@<:@=PATH@:>@], [create SWIG ruby module and install in PATH])])

# --enable-perl-binding
AC_ARG_ENABLE([perl-binding],
              [AS_HELP_STRING([--enable-perl-binding@<:@=PATH@:>@], [create SWIG perl module and install in PATH])])

# --enable-swig
# SWIG itself is only looked for when explicitly requested; it is then
# required to be at least version 3.0.0, otherwise configure aborts.
AC_ARG_ENABLE([swig],
              [AS_HELP_STRING([--enable-swig], [enable development of swig binding])])
AS_IF([test "x$enable_swig" = xyes],
      [AX_PKG_SWIG([3.0.0], [], [AC_MSG_ERROR([SWIG version 3 is required])])])

# C++ mode and the HAVE_SWIG conditional both key off $SWIG being non-empty.
AS_IF([test -n "$SWIG"],
      [SWIG_ENABLE_CXX])
AM_CONDITIONAL([HAVE_SWIG], [test -n "$SWIG"])
# Per-language binding setup.
#
# A binding is enabled when its --enable-<lang>-binding option is given with
# any value other than "no". A value other than "yes" is taken as the install
# PATH and stored in the corresponding variable before the detection macro
# runs. The two conditions are joined with "&&" rather than "test ... -a ..."
# (not portable), and the option value is quoted because it is a
# user-supplied path.

# Python binding setup
AM_CONDITIONAL([PYTHON_BINDING],
               [test -n "$enable_python_binding" && test "x$enable_python_binding" != xno])
AM_COND_IF([PYTHON_BINDING],
           [AS_IF([test "x$enable_python_binding" != xyes],
                  [PYTHON_SITE_PKG=$enable_python_binding])
            AX_PYTHON_DEVEL([], [$prefix])])

# Ruby binding setup
AM_CONDITIONAL([RUBY_BINDING],
               [test -n "$enable_ruby_binding" && test "x$enable_ruby_binding" != xno])
AM_COND_IF([RUBY_BINDING],
           [AS_IF([test "x$enable_ruby_binding" != xyes],
                  [RUBY_EXT_LIB=$enable_ruby_binding])
            AX_RUBY_EXT([$prefix])])

# Perl binding setup
AM_CONDITIONAL([PERL_BINDING],
               [test -n "$enable_perl_binding" && test "x$enable_perl_binding" != xno])
AM_COND_IF([PERL_BINDING],
           [AS_IF([test "x$enable_perl_binding" != xyes],
                  [PERL_EXT_LIB=$enable_perl_binding])
            AX_PERL_EXT([$prefix])])

AC_OUTPUT
Jellyfish-2.2.4/development.mk 0000664 0000000 0000000 00000001245 12613705377 0016340 0 ustar 00root root 0000000 0000000 AM_CXXFLAGS += -Werror
# Count lines of code
# All targets in this fragment are developer conveniences that name actions,
# not files, so every one of them is declared phony.
.PHONY: cloc cloc_jellyfish cloc_library unittests

# Whole tree: sources, headers, library, sub-commands, tests and build files.
# yaggo files are counted as Ruby, *.am and *.mk files as make.
cloc:
	cloc --force-lang="Ruby,yaggo" --force-lang="make,am" --force-lang="make,mk" \
	  --exclude-dir="gtest" --ignored=cloc_ignored_src_files \
	  $(srcdir)/jellyfish $(srcdir)/include $(srcdir)/lib $(srcdir)/sub_commands $(srcdir)/tests $(srcdir)/unit_tests \
	  $(srcdir)/Makefile.am $(srcdir)/*.mk

# Program and library sources only.
cloc_jellyfish:
	cloc $(srcdir)/jellyfish $(srcdir)/include $(srcdir)/lib $(srcdir)/sub_commands

# Library sources only.
cloc_library:
	cloc $(srcdir)/include $(srcdir)/lib

# Make a dependency on yaggo the software
$(YAGGO_SOURCES): $(YAGGO)

# Launch unit tests
# Re-invokes make through $(MAKE) with TESTS restricted to the unit-test
# driver script.
unittests:
	@$(MAKE) check TESTS=unit_tests/unit_tests.sh
Jellyfish-2.2.4/doc/ 0000775 0000000 0000000 00000000000 12613705377 0014230 5 ustar 00root root 0000000 0000000 Jellyfish-2.2.4/doc/Makefile 0000664 0000000 0000000 00000001022 12613705377 0015663 0 ustar 00root root 0000000 0000000 L2M = latex2man -t transfile
MI = makeinfo
PDFLATEX = pdflatex
M4 = m4
TARGETS = jellyfish.man jellyfish.html jellyfish.pdf

# Several recipes below create $@ through a shell redirection. Without this,
# a failing command would leave a truncated target that looks up to date.
.DELETE_ON_ERROR:

# These targets name actions, not files.
.PHONY: all man clean

all: $(TARGETS)

# Build the man page and display it.
man: jellyfish.man
	man ./$<

# LaTeX description of the command line options, produced by the helper script.
options.tex: generate_options_tex.sh
	sh $< > $@

# Run the hand-written source through m4; options.tex is listed as a
# prerequisite so the expanded file is refreshed when the options change.
%-full.tex: %.tex options.tex
	$(M4) $< > $@

# latex2man conversions of the expanded source: man page (default mode),
# HTML (-H), Texinfo (-T) and LaTeX (-L).
%.man: %-full.tex
	$(L2M) $< $@

%.html: %-full.tex
	$(L2M) -H $< $@

%.texi: %-full.tex
	$(L2M) -T $< $@

# makeinfo treats every non-option argument as an input file, so the output
# name has to be given with -o.
%.info: %.texi
	$(MI) --force -o $@ $<

%.latex: %-full.tex
	$(L2M) -L $< $@

# Relies on pdflatex naming its output after the input file.
%.pdf: %.latex
	$(PDFLATEX) $<

clean:
	rm -f $(TARGETS) *-full.tex options.tex
Jellyfish-2.2.4/doc/UserGuide.lyx 0000664 0000000 0000000 00000063255 12613705377 0016675 0 ustar 00root root 0000000 0000000 #LyX 2.0 created this file. For more info see http://www.lyx.org/
\lyxformat 413
\begin_document
\begin_header
\textclass report
\begin_preamble
\usepackage[letterpaper]{geometry}
\usepackage{siunitx}
\end_preamble
\use_default_options true
\begin_modules
theorems-ams
eqs-within-sections
figs-within-sections
\end_modules
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\use_mhchem 1
\use_mathdots 1
\cite_engine basic
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\use_refstyle 1
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 2
\tocdepth 2
\paragraph_separation indent
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Title
Jellyfish 2 User Guide
\end_layout
\begin_layout Standard
\begin_inset CommandInset toc
LatexCommand tableofcontents
\end_inset
\end_layout
\begin_layout Standard
\begin_inset FormulaMacro
\newcommand{\switch}[1]{\texttt{-\@-#1}}
\end_inset
\begin_inset FormulaMacro
\newcommand{\opt}[1]{\texttt{-#1}}
\end_inset
\end_layout
\begin_layout Chapter
Getting started
\end_layout
\begin_layout Section
Counting all
\begin_inset Formula $k$
\end_inset
-mers
\end_layout
\begin_layout Standard
The basic command to count all
\begin_inset Formula $k$
\end_inset
-mers is as follows:
\end_layout
\begin_layout LyX-Code
jellyfish count -m 21 -s 100M -t 10 -C reads.fasta
\end_layout
\begin_layout Standard
This will count canonical (
\begin_inset Formula $\opt C$
\end_inset
)
\begin_inset Formula $21$
\end_inset
-mers (
\family sans
\begin_inset Formula $\opt m\,21$
\end_inset
\family default
), using a hash with 100 million elements (
\family sans
\begin_inset Formula $\opt s\,\unit[100]{M}$
\end_inset
\family default
) and 10 threads (
\family sans
\begin_inset Formula $\opt t\,10$
\end_inset
\family default
) in the sequences in the file
\family sans
reads.fasta
\family default
.
The output is written in the file '
\family sans
mer_counts.jf
\family default
' by default (change with
\family sans
\begin_inset Formula $\opt o$
\end_inset
\family default
switch).
\end_layout
\begin_layout Standard
To compute the histogram of the
\begin_inset Formula $k$
\end_inset
-mer occurrences, use the
\family sans
histo
\family default
subcommand (see section
\begin_inset space ~
\end_inset
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:histo"
\end_inset
):
\end_layout
\begin_layout LyX-Code
jellyfish histo mer_counts.jf
\end_layout
\begin_layout Standard
To query the counts of a particular
\begin_inset Formula $k$
\end_inset
-mer, use the
\family sans
query
\family default
subcommand (see section
\begin_inset space ~
\end_inset
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:query"
\end_inset
):
\end_layout
\begin_layout LyX-Code
jellyfish query mer_counts.jf AACGTTG
\end_layout
\begin_layout Standard
To output all the counts for all the
\begin_inset Formula $k$
\end_inset
-mers in the file, use the
\family sans
dump
\family default
subcommand (see section
\begin_inset space ~
\end_inset
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:dump"
\end_inset
):
\end_layout
\begin_layout LyX-Code
jellyfish dump mer_counts.jf > mer_counts_dumps.fa
\end_layout
\begin_layout Standard
To get some information on how, when and where this jellyfish file was generated
, use the
\family sans
info
\family default
subcommand (see section
\begin_inset space ~
\end_inset
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:info"
\end_inset
):
\end_layout
\begin_layout LyX-Code
jellyfish info mer_counts.jf
\end_layout
\begin_layout Standard
For more detailed information, see the relevant sections in this document.
All commands understand
\begin_inset Formula $\switch{help}$
\end_inset
and will produce some information about the switches available.
\end_layout
\begin_layout Subsection
Counting
\begin_inset Formula $k$
\end_inset
-mers in sequencing reads
\end_layout
\begin_layout Standard
In sequencing reads, it is unknown which strand of the DNA is sequenced.
As a consequence, a
\begin_inset Formula $k$
\end_inset
-mer or its reverse complement are essentially equivalent.
The canonical representative of a
\begin_inset Formula $k$
\end_inset
-mer
\begin_inset Formula $m$
\end_inset
is by definition
\begin_inset Formula $m$
\end_inset
or the reverse complement of
\begin_inset Formula $m$
\end_inset
, whichever comes first lexicographically.
The
\family sans
\begin_inset Formula $\opt C$
\end_inset
\family default
switch instructs to save in the hash only canonical
\begin_inset Formula $k$
\end_inset
-mers, while the count is the number of occurrences of both a
\begin_inset Formula $k$
\end_inset
-mer and its reverse complement.
\end_layout
\begin_layout Standard
The size parameter (given with
\begin_inset Formula $\opt s$
\end_inset
) is an indication of the number of
\begin_inset Formula $k$
\end_inset
-mers that will be stored in the hash.
For sequencing reads, this size should be the size of the genome plus
the
\begin_inset Formula $k$
\end_inset
-mers generated by sequencing errors.
For example, if the error rate is
\begin_inset Formula $e$
\end_inset
(e.g.
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
@
\end_layout
\end_inset
Illumina reads, usually
\begin_inset Formula $e\approx1\%$
\end_inset
), with an estimated genome size of
\begin_inset Formula $G$
\end_inset
and a coverage of
\begin_inset Formula $c$
\end_inset
, the number of expected
\begin_inset Formula $k$
\end_inset
-mers is
\begin_inset Formula $G+Gcek$
\end_inset
.
This assume
\end_layout
\begin_layout Quote
NOTE: unlike in Jellyfish 1, this
\family sans
-s
\family default
parameter is only an estimation.
If the size given is too small to fit all the
\begin_inset Formula $k$
\end_inset
-mers, the hash size will be increased automatically or partial results
will be written to disk and finally merged automatically.
Running '
\family sans
jellyfish merge
\family default
' should never be necessary, as jellyfish now takes care of this task
on its own.
\end_layout
\begin_layout Standard
If the low frequency
\begin_inset Formula $k$
\end_inset
-mers (
\begin_inset Formula $k$
\end_inset
-mers occurring only once), which are mostly due to sequencing errors, are
not of interest, one might consider counting only high-frequency
\begin_inset Formula $k$
\end_inset
-mers (see section
\begin_inset space ~
\end_inset
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Counting-high-frequency--mers"
\end_inset
), which uses less memory and is potentially faster.
\end_layout
\begin_layout Subsection
Counting
\begin_inset Formula $k$
\end_inset
-mers in a genome
\end_layout
\begin_layout Standard
In an actual genome or finished sequence, a
\begin_inset Formula $k$
\end_inset
-mer and its reverse complement are not equivalent, hence using the
\family sans
\begin_inset Formula $\opt C$
\end_inset
\family default
switch does not make sense.
In addition, the size for the hash can be set directly to the size of the
genome.
\end_layout
\begin_layout Section
Counting high-frequency
\begin_inset Formula $k$
\end_inset
-mers
\begin_inset CommandInset label
LatexCommand label
name "sec:Counting-high-frequency--mers"
\end_inset
\end_layout
\begin_layout Standard
Jellyfish offers two ways to count only high-frequency
\begin_inset Formula $k$
\end_inset
-mers (meaning only
\begin_inset Formula $k$
\end_inset
-mers with count
\begin_inset Formula $>1$
\end_inset
), which reduces significantly the memory usage.
Both methods are based on using Bloom filters.
The first method is a one pass method, which provides approximate count
for some percentage of the
\begin_inset Formula $k$
\end_inset
-mers.
The second method is a two pass method which provides exact count.
In both methods, most of the low-frequency
\begin_inset Formula $k$
\end_inset
-mers are not reported.
\end_layout
\begin_layout Subsection
One pass method
\begin_inset CommandInset label
LatexCommand label
name "sub:One-pass-method"
\end_inset
\end_layout
\begin_layout Standard
Adding the
\family sans
\begin_inset Formula $\switch{bf-size}$
\end_inset
\family default
switch makes jellyfish insert all
\begin_inset Formula $k$
\end_inset
-mers first into a Bloom filter and only insert into the hash the
\begin_inset Formula $k$
\end_inset
-mers which have already been seen at least once.
The argument to
\begin_inset Formula $\switch{bf-size}$
\end_inset
should be the total number of
\begin_inset Formula $k$
\end_inset
-mer expected in the data set while the
\family sans
\begin_inset Formula $\switch{size}$
\end_inset
argument should be the number of
\begin_inset Formula $k$
\end_inset
-mers occurring more than once.
For example:
\end_layout
\begin_layout LyX-Code
jellyfish count -m 25 -s 3G --bf-size 100G -t 16 homo_sapiens.fa
\end_layout
\begin_layout Standard
would be appropriate for counting
\begin_inset Formula $25$
\end_inset
-mers in human reads at
\begin_inset Formula $30\times$
\end_inset
coverage.
The approximate memory usage is
\begin_inset Formula $\SI{9}{bits}$
\end_inset
per
\begin_inset Formula $k$
\end_inset
-mer in the Bloom filter.
\end_layout
\begin_layout Standard
The count reported for each
\begin_inset Formula $k$
\end_inset
-mer (by '
\family sans
jellyfish dump
\family default
' or '
\family sans
jellyfish query
\family default
') is one less than the actual count.
Meaning, the count
\begin_inset Formula $1$
\end_inset
\begin_inset Formula $k$
\end_inset
-mer are not reported, count
\begin_inset Formula $2$
\end_inset
\begin_inset Formula $k$
\end_inset
-mer are reported to have count
\begin_inset Formula $1$
\end_inset
, etc.
\end_layout
\begin_layout Standard
The drawback of this method is some percentage of the
\begin_inset Formula $k$
\end_inset
-mer that should not be reported (because they occur only once) are reported.
This is due to the random nature of the Bloom filter data structure.
The percentage is
\begin_inset Formula $<1\%$
\end_inset
by default and can be changed with the
\begin_inset Formula $\switch{bf-fp}$
\end_inset
switch.
\end_layout
\begin_layout Subsection
Two pass method
\end_layout
\begin_layout Standard
In the two pass method, first a Bloom counter is created from the reads
with '
\family sans
jellyfish bc
\family default
'.
Then this Bloom counter is given to the '
\family sans
jellyfish count
\family default
' command and only the
\begin_inset Formula $k$
\end_inset
-mers which have been seen twice in the first pass will be inserted in the
hash.
For example, with a human data set similar to that in section
\begin_inset space ~
\end_inset
\begin_inset CommandInset ref
LatexCommand ref
reference "sub:One-pass-method"
\end_inset
:
\end_layout
\begin_layout LyX-Code
jellyfish bc -m 25 -s 100G -t 16 -o homo_sapiens.bc homo_sapiens.fa
\end_layout
\begin_layout LyX-Code
jellyfish count -m 25 -s 3G -t 16 --bc homo_sapiens.bc homo_sapiens.fa
\end_layout
\begin_layout Standard
The advantage of this method is that the counts reported for the
\begin_inset Formula $k$
\end_inset
-mers are all correct.
Most count
\begin_inset Formula $1$
\end_inset
\begin_inset Formula $k$
\end_inset
-mer are not reported, except for a small percentage (set by the
\family sans
\begin_inset Formula $\opt f$
\end_inset
\family default
switch of the
\family sans
bc
\family default
subcommand) of them which are reported (correctly with count
\begin_inset Formula $1$
\end_inset
).
All other
\begin_inset Formula $k$
\end_inset
-mers are reported with the correct count.
\end_layout
\begin_layout Standard
The drawback of this method is that it requires to parse the entire reads
data set twice and the memory usage of the Bloom counter is greater than
that of the Bloom filter (slightly less than twice as much).
\end_layout
\begin_layout Chapter
FAQ
\end_layout
\begin_layout Section
How to read compressed files (or other format)?
\end_layout
\begin_layout Standard
Jellyfish only reads FASTA or FASTQ formatted input files.
By reading from pipes, jellyfish can read compressed files, like this:
\end_layout
\begin_layout LyX-Code
zcat *.fastq.gz | jellyfish count /dev/fd/0 ...
\end_layout
\begin_layout Standard
or by using the
\family sans
'<()
\family default
'
\family sans
redirection
\family default
provided by the shell (e.g.
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
@
\end_layout
\end_inset
bash, zsh):
\end_layout
\begin_layout LyX-Code
jellyfish count <(zcat file1.fastq.gz) <(zcat file2.fasta.gz) ...
\end_layout
\begin_layout Section
How to read multiple files at once?
\end_layout
\begin_layout Standard
Often, jellyfish can parse an input sequence file faster than
\family sans
gzip
\family default
or
\family sans
fastq-dump
\family default
(to parse SRA files) can output the sequence.
This leads to many threads in jellyfish going partially unused.
Jellyfish can be instructed to open multiple files at once.
For example, to read two short read archive files simultaneously:
\end_layout
\begin_layout LyX-Code
jellyfish count -F 2 <(fastq-dump -Z file1.sra) <(fastq-dump -Z file2.sra)
...
\end_layout
\begin_layout Standard
Another way is to use
\begin_inset Quotes eld
\end_inset
generators
\begin_inset Quotes erd
\end_inset
.
First, create a file containing, one per line, commands to generate sequence.
Then pass this file to jellyfish and the number of generators to run simultaneo
usly.
Jellyfish will spawn subprocesses running the commands passed and read
their standard output for sequence.
By default, the commands are run using the shell in the SHELL environment
variable, and this can be changed by the
\family sans
\begin_inset Formula $\opt S$
\end_inset
\family default
switch.
Multiple generators will be run simultaneously as specified by the
\family sans
\begin_inset Formula $\opt G$
\end_inset
\family default
switch.
For example:
\end_layout
\begin_layout LyX-Code
ls *.fasta.gz | xargs -n 1 echo gunzip -c > generators
\end_layout
\begin_layout LyX-Code
jellyfish count -g generators -G 4 ...
\end_layout
\begin_layout Standard
The first command created the command list into the '
\family sans
generators
\family default
' file, each command unzipping one FASTA file in the current directory.
The second command runs jellyfish with
\begin_inset Formula $4$
\end_inset
concurrent generators.
\end_layout
\begin_layout Section
How to reduce the output size?
\end_layout
\begin_layout Standard
The output file was designed to be easy to read, but the file generated can
be rather large.
By default, a
\begin_inset Formula $\SI{4}{bytes}$
\end_inset
counter value is saved for every
\begin_inset Formula $k$
\end_inset
-mer (i.e.
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
@
\end_layout
\end_inset
a maximum count of over 4 billion).
Instead, a counter size of
\begin_inset Formula $\SI{2}{bytes}$
\end_inset
or
\begin_inset Formula $\SI{1}{byte}$
\end_inset
can be used with the switch
\begin_inset Formula $\switch{out-counter-len}$
\end_inset
, which reduces significantly the output size.
\end_layout
\begin_layout Standard
The count of
\begin_inset Formula $k$
\end_inset
-mers which cannot be represented with the given number of bytes will have
a value equal to the maximum value that can be represented.
Meaning, if the counter field uses
\begin_inset Formula $\SI{1}{byte}$
\end_inset
, any
\begin_inset Formula $k$
\end_inset
-mers with count greater or equal to
\begin_inset Formula $255$
\end_inset
will be reported as having a count
\begin_inset Formula $255$
\end_inset
.
\end_layout
\begin_layout Standard
Also, low frequency and high frequency
\begin_inset Formula $k$
\end_inset
-mers can be skipped using the
\family sans
\begin_inset Formula $\opt L$
\end_inset
\family default
and
\family sans
\begin_inset Formula $\opt U$
\end_inset
\family default
switches respectively.
Although it might be more appropriate to filter out the low frequency
\begin_inset Formula $k$
\end_inset
-mers using Bloom filters, as shown in section
\begin_inset space ~
\end_inset
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Counting-high-frequency--mers"
\end_inset
.
\end_layout
\begin_layout Section
How much memory is needed?
\end_layout
\begin_layout Standard
The memory needed to count
\begin_inset Formula $k$
\end_inset
-mers, given the various parameters of the
\family sans
count
\family default
subcommand is obtained by using the
\family sans
mem
\family default
subcommand.
It understands all the same switches, but only the
\begin_inset Formula $\switch{mer-len}$
\end_inset
(
\begin_inset Formula $\opt m$
\end_inset
),
\begin_inset Formula $\switch{size}$
\end_inset
(
\begin_inset Formula $\opt s$
\end_inset
),
\begin_inset Formula $\switch{counter-len}$
\end_inset
(
\begin_inset Formula $\opt c$
\end_inset
) and
\begin_inset Formula $\switch{reprobes}$
\end_inset
(
\begin_inset Formula $\opt p$
\end_inset
) switches are taken into account.
\end_layout
\begin_layout Standard
For example, for
\begin_inset Formula $24$
\end_inset
-mers, with an initial hash size of 1 billion, and default parameters otherwise,
the memory usage is:
\end_layout
\begin_layout LyX-Code
$ jellyfish mem -m 24 -s 1G
\end_layout
\begin_layout LyX-Code
4521043056 (4G)
\end_layout
\begin_layout Standard
Conversely, if the
\begin_inset Formula $\switch{size}$
\end_inset
switch is not given but the
\begin_inset Formula $\switch{mem}$
\end_inset
switch is, then the maximum initial hash size that would fit in the given
memory is returned.
For example, this is the maximum hash size for
\begin_inset Formula $31$
\end_inset
-mers in
\begin_inset Formula $\unit[8]{Gb}$
\end_inset
of RAM:
\end_layout
\begin_layout LyX-Code
$ jellyfish mem -m 31 --mem 8g
\end_layout
\begin_layout LyX-Code
1073741824 (1G)
\end_layout
\begin_layout Chapter
Subcommands
\end_layout
\begin_layout Section
\family sans
histo
\begin_inset CommandInset label
LatexCommand label
name "sec:histo"
\end_inset
\end_layout
\begin_layout Standard
The
\family sans
histo
\family default
subcommand outputs the histogram of
\begin_inset Formula $k$
\end_inset
-mers frequencies.
The last bin, with value one above the high setting set by the
\family sans
\begin_inset Formula $\opt h$
\end_inset
\family default
switch (
\begin_inset Formula $10\,000$
\end_inset
by default), is a catch all: all
\begin_inset Formula $k$
\end_inset
-mers with a count greater than the high setting are tallied in that one
bin.
If the low setting is set (
\family sans
\begin_inset Formula $\opt l$
\end_inset
\family default
switch), then the first bin, with value one below the low setting, is also
similarly a catch all.
\end_layout
\begin_layout Standard
By default, the bins with a zero count are skipped.
This can be changed with the
\family sans
\begin_inset Formula $\opt f$
\end_inset
\family default
switch.
\end_layout
\begin_layout Section
\family sans
dump
\family default
\begin_inset CommandInset label
LatexCommand label
name "sec:dump"
\end_inset
\end_layout
\begin_layout Standard
The
\family sans
dump
\family default
subcommand outputs a list of all the
\begin_inset Formula $k$
\end_inset
-mers in the file associated with their count.
By default, the output is in FASTA format, where the header line contains
the count of the
\begin_inset Formula $k$
\end_inset
-mer and the sequence part is the sequence of the
\begin_inset Formula $k$
\end_inset
-mer.
This format has the advantage that the output contains the sequence of
\begin_inset Formula $k$
\end_inset
-mers and can be directly fed into another program expecting the very common
FASTA format.
A more convenient column format (for human beings) is selected with the
\family sans
-c
\family default
switch.
\end_layout
\begin_layout Standard
Low frequency and high frequency
\begin_inset Formula $k$
\end_inset
-mers can be skipped with the
\family sans
\begin_inset Formula $\opt L$
\end_inset
\family default
and
\family sans
\begin_inset Formula $\opt U$
\end_inset
\family default
switches respectively.
\end_layout
\begin_layout Standard
In the output of the
\family sans
dump
\family default
subcommand, the
\begin_inset Formula $k$
\end_inset
-mers are sorted according to the hash function used by Jellyfish.
The output can be considered to be
\begin_inset Quotes eld
\end_inset
fairly pseudo-random
\begin_inset Quotes erd
\end_inset
.
By
\begin_inset Quotes eld
\end_inset
fairly
\begin_inset Quotes erd
\end_inset
we mean that NO guarantee is made about the actual randomness of this order,
it is just good enough for the hash table to work properly.
And by
\begin_inset Quotes eld
\end_inset
pseudo-random
\begin_inset Quotes erd
\end_inset
we mean that the order is actually deterministic: given the same hash function,
the output will be always the same and two different files generated with
the same hash function can be merged easily.
\end_layout
\begin_layout Section
\family sans
query
\begin_inset CommandInset label
LatexCommand label
name "sec:query"
\end_inset
\end_layout
\begin_layout Standard
The
\family sans
query
\family default
subcommand outputs the
\begin_inset Formula $k$
\end_inset
-mers and their counts for some subset of
\begin_inset Formula $k$
\end_inset
-mers.
It will output the counts of all the
\begin_inset Formula $k$
\end_inset
-mers passed on the command line or of all the
\begin_inset Formula $k$
\end_inset
-mers in the sequence read from the FASTA or FASTQ formatted file passed
to the switch
\family sans
\begin_inset Formula $\opt s$
\end_inset
\family default
(this switch can be given multiple times).
\end_layout
\begin_layout Section
\family sans
info
\family default
\begin_inset CommandInset label
LatexCommand label
name "sec:info"
\end_inset
\end_layout
\begin_layout Standard
The
\family sans
info
\family default
subcommand outputs some information about the jellyfish file and the command
used to generate it, in which directory and at what time the command was
run.
Hopefully, the information given should be enough to rerun jellyfish under
the same conditions and reproduce the output file.
In particular, the
\family sans
\begin_inset Formula $\opt c$
\end_inset
\family default
switch outputs the command, properly escaped and ready to run in a shell.
\end_layout
\begin_layout Standard
The header is saved in JSON format and contains more information than is
written by default.
The full header in JSON format can be written out using the
\family sans
-j
\family default
switch.
\end_layout
\begin_layout Section
\family sans
merge
\family default
\begin_inset CommandInset label
LatexCommand label
name "sec:merge"
\end_inset
\end_layout
\begin_layout Standard
The
\family sans
merge
\family default
subcommand is of little direct use with version 2 of jellyfish.
When intermediary files were written to disk, because not all
\begin_inset Formula $k$
\end_inset
-mers would fit in memory, they can be merged into one file containing the
final result with the
\family sans
merge
\family default
subcommand.
The
\family sans
count
\family default
will merge intermediary files automatically as needed.
\end_layout
\begin_layout Section
stats
\begin_inset CommandInset label
LatexCommand label
name "sec:stats"
\end_inset
\end_layout
\begin_layout Standard
The
\family sans
stats
\family default
subcommand computes some statistics about the mers.
Although these statistics could be computed from the histogram, it provides
quick summary information.
The fields are:
\end_layout
\begin_layout Description
Unique The number of
\begin_inset Formula $k$
\end_inset
-mers occurring exactly once
\end_layout
\begin_layout Description
Distinct The number of
\begin_inset Formula $k$
\end_inset
-mers, ignoring their multiplicity (i.e.
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
@
\end_layout
\end_inset
the cardinality of the set of
\begin_inset Formula $k$
\end_inset
-mers)
\end_layout
\begin_layout Description
Total The number of
\begin_inset Formula $k$
\end_inset
-mers with multiplicity (i.e.
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
@
\end_layout
\end_inset
the sum of the number of occurrences of all the mers)
\end_layout
\begin_layout Description
Max_count The maximum of the number of occurrences
\end_layout
\begin_layout Section
mem
\begin_inset CommandInset label
LatexCommand label
name "sec:mem"
\end_inset
\end_layout
\begin_layout Standard
The
\family sans
mem
\family default
subcommand shows how much memory a count subcommand will need or conversely
how large of a hash size will fit in a given amount of memory.
\end_layout
\begin_layout Section
cite
\begin_inset CommandInset label
LatexCommand label
name "sec:cite"
\end_inset
\end_layout
\begin_layout Standard
The
\family sans
cite
\family default
subcommand prints the citation for the jellyfish paper.
With the
\begin_inset Formula $\opt b$
\end_inset
, it is formatted in Bibtex format.
How convenient!
\end_layout
\end_body
\end_document
Jellyfish-2.2.4/doc/generate_options_tex.sh 0000775 0000000 0000000 00000000266 12613705377 0021020 0 ustar 00root root 0000000 0000000 #! /bin/sh
# For each jellyfish subcommand, print a LaTeX \subsection heading followed by
# the subcommand's --help text converted by the option_to_tex Ruby script,
# then a blank separator line. Everything goes to standard output.
for i in count stats histo dump merge query qhisto qdump qmerge cite; do
    # printf rather than echo: how echo treats a backslash in its argument is
    # implementation-defined, and the heading must start with a literal one.
    printf '%s\n' "\\subsection{$i}"
    jellyfish "$i" --help | ruby option_to_tex /dev/fd/0
    echo
done
Jellyfish-2.2.4/doc/help_parse.rb 0000664 0000000 0000000 00000021536 12613705377 0016706 0 ustar 00root root 0000000 0000000 # Autogenerated from a Treetop grammar. Edits may be lost.
module HelpGrammar
include Treetop::Runtime
# Name of the rule parsing starts from. Memoized in @root; :line is used
# when nothing has been assigned yet.
def root
  @root = :line unless @root
  @root
end
# Positional accessors mixed into the node built by _nt_line. The positions
# follow the order in which _nt_line pushes its sub-results.
module Line0
  # Result of _nt_spaces (element 0).
  def spaces
    elements.first
  end

  # Node holding the repeated _nt_switches matches (element 1).
  def sws
    elements.at(1)
  end

  # Result of _nt_text (element 2).
  def text
    elements.at(2)
  end
end
# LaTeX rendering mixed into the node built by _nt_line.
module Line1
  # Render one help line as LaTeX and return the resulting string.
  #
  # opened - object exposing is_open / is_open=. When the line carries
  #          switches and is_open is false, "\begin{description}" is emitted
  #          first and the flag is set to true.
  #
  # A line with switches yields an "\item[...]" whose label is the
  # comma-joined to_tex of each switch. Non-empty text is prefixed with
  # "\noindent " and passed through quote.
  def to_tex(opened)
    out = ""
    if !sws.empty?
      unless opened.is_open
        out << "\\begin{description}\n"
        opened.is_open = true
      end
      out << "\\item["
      labels = sws.elements.map(&:to_tex)
      out << labels.join(",")
      out << "] "
    end
    out << "\\noindent " unless text.text_value.empty?
    out << quote(text.text_value)
    out
  end
end
# Parse rule `line`: spaces, zero or more switches, text, then zero or more
# newline characters, matched in sequence starting at the current index.
#
# Results are memoized per start position in node_cache[:line]. On success
# the returned node is extended with Line0 and Line1; on failure the index
# is restored to where the rule started and nil is returned (and cached).
def _nt_line
start_index = index
# Memoization: reuse a previous result for this position and, when it was a
# match, move the index to the end of the cached node.
if node_cache[:line].has_key?(index)
cached = node_cache[:line][index]
if cached
# A cached bare `true` is replaced by a one-character SyntaxNode.
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
@index = cached.interval.end
end
return cached
end
# s0 collects the sequence elements; i0 is the position to rewind to.
i0, s0 = index, []
r1 = _nt_spaces
s0 << r1
if r1
# Zero-or-more repetition of `switches`; stops at the first non-match.
s2, i2 = [], index
loop do
r3 = _nt_switches
if r3
s2 << r3
else
break
end
end
r2 = instantiate_node(SyntaxNode,input, i2...index, s2)
s0 << r2
if r2
r4 = _nt_text
s0 << r4
if r4
# Zero-or-more repetition of the character class [\n], one character per
# iteration.
s5, i5 = [], index
loop do
if has_terminal?('\G[\\n]', true, index)
r6 = true
@index += 1
else
r6 = nil
end
if r6
s5 << r6
else
break
end
end
r5 = instantiate_node(SyntaxNode,input, i5...index, s5)
s0 << r5
end
end
end
# The sequence matched only if its last pushed element is truthy.
if s0.last
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
r0.extend(Line0)
r0.extend(Line1)
else
@index = i0
r0 = nil
end
node_cache[:line][start_index] = r0
r0
end
# Parse rule `spaces`: zero or more characters matching the class [\s].
#
# Results are memoized per start position in node_cache[:spaces]. The
# repetition accepts zero matches, so a node is built even when no character
# is consumed.
def _nt_spaces
start_index = index
# Memoization: reuse a previous result for this position and, when it was a
# match, move the index to the end of the cached node.
if node_cache[:spaces].has_key?(index)
cached = node_cache[:spaces][index]
if cached
# A cached bare `true` is replaced by a one-character SyntaxNode.
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
@index = cached.interval.end
end
return cached
end
s0, i0 = [], index
# Consume one matching character per iteration until the class fails.
loop do
if has_terminal?('\G[\\s]', true, index)
r1 = true
@index += 1
else
r1 = nil
end
if r1
s0 << r1
else
break
end
end
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
node_cache[:spaces][start_index] = r0
r0
end
# Positional accessor mixed into the node built by _nt_switches.
module Switches0
  # The short or long switch node that follows the leading "-" (element 1).
  def sw
    elements.at(1)
  end
end
# LaTeX rendering mixed into the node built by _nt_switches.
module Switches1
  # Delegate rendering to the wrapped switch node and return its result.
  def to_tex
    inner = sw
    inner.to_tex
  end
end
# Parse rule `switches`: a literal "-" followed by either a short switch or a
# long switch (the short form is tried first).
#
# Results are memoized per start position in node_cache[:switches]. On
# success the returned node is extended with Switches0 and Switches1; on
# failure the index is restored to where the rule started and nil is
# returned (and cached).
def _nt_switches
start_index = index
# Memoization: reuse a previous result for this position and, when it was a
# match, move the index to the end of the cached node.
if node_cache[:switches].has_key?(index)
cached = node_cache[:switches][index]
if cached
# A cached bare `true` is replaced by a one-character SyntaxNode.
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
@index = cached.interval.end
end
return cached
end
i0, s0 = index, []
# First sequence element: the literal "-".
if has_terminal?("-", false, index)
r1 = instantiate_node(SyntaxNode,input, index...(index + 1))
@index += 1
else
terminal_parse_failure("-")
r1 = nil
end
s0 << r1
if r1
# Ordered choice: short_switch, then long_switch. When both fail the index
# is rewound to i2, the position just after the "-".
i2 = index
r3 = _nt_short_switch
if r3
r2 = r3
else
r4 = _nt_long_switch
if r4
r2 = r4
else
@index = i2
r2 = nil
end
end
s0 << r2
end
# The sequence matched only if its last pushed element is truthy.
if s0.last
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
r0.extend(Switches0)
r0.extend(Switches1)
else
@index = i0
r0 = nil
end
node_cache[:switches][start_index] = r0
r0
end
# Accessor mixin for nodes produced by _nt_short_switch.
module ShortSwitch0
  # First child of the sequence: the switch letter.
  def name
    elements.first
  end

  # Third child of the sequence: the result of the trailing `spaces` rule.
  def spaces
    elements.at(2)
  end
end
# Rendering mixin for nodes produced by _nt_short_switch.
module ShortSwitch1
  # Wraps the switch letter, prefixed with a dash, in an \Opt macro call.
  def to_tex
    letter = name.text_value
    "\\Opt{-#{letter}}"
  end
end
# Parses the `short_switch` rule: one character matching '\G[a-zA-Z]', an
# optional ",", then the `spaces` rule.
# Results are memoized in node_cache[:short_switch], keyed by the starting index.
# Returns a SyntaxNode extended with ShortSwitch0 and ShortSwitch1 on
# success; on failure restores @index to where parsing started and returns nil.
def _nt_short_switch
start_index = index
# Memoization hit: reuse the earlier result and move @index past it.
if node_cache[:short_switch].has_key?(index)
cached = node_cache[:short_switch][index]
if cached
# A cached `true` placeholder is turned into a one-character SyntaxNode.
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
@index = cached.interval.end
end
return cached
end
i0, s0 = index, []
# First element: the single switch letter.
if has_terminal?('\G[a-zA-Z]', true, index)
r1 = true
@index += 1
else
r1 = nil
end
s0 << r1
if r1
# Optional comma: when absent, an empty-interval node stands in for it so
# the sequence keeps the same number of elements.
if has_terminal?(",", false, index)
r3 = instantiate_node(SyntaxNode,input, index...(index + 1))
@index += 1
else
terminal_parse_failure(",")
r3 = nil
end
if r3
r2 = r3
else
r2 = instantiate_node(SyntaxNode,input, index...index)
end
s0 << r2
if r2
r4 = _nt_spaces
s0 << r4
end
end
# The sequence succeeded only if its last collected result is truthy.
if s0.last
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
r0.extend(ShortSwitch0)
r0.extend(ShortSwitch1)
else
@index = i0
r0 = nil
end
node_cache[:short_switch][start_index] = r0
r0
end
# Empty marker module: defines no methods. _nt_long_switch extends the node
# built for the optional "=" value sub-sequence with it.
module LongSwitch0
end
# Accessor mixin for nodes produced by _nt_long_switch.
module LongSwitch1
  # Second child of the sequence: the switch name (after the dash).
  def name
    elements.at(1)
  end

  # Third child of the sequence: the optional "=" value part.
  def val
    elements.at(2)
  end

  # Fifth child of the sequence: the result of the trailing `spaces` rule.
  def spaces
    elements.at(4)
  end
end
# Rendering mixin for nodes produced by _nt_long_switch.
module LongSwitch2
  # Renders the switch as \LOpt when it has no value part, and as \LOptArg
  # with the quoted value text otherwise. Name and value go through `quote`.
  def to_tex
    if val.empty?
      "\\LOpt{#{quote(name.text_value)}}"
    else
      "\\LOptArg{#{quote(name.text_value)}}{#{quote(val.text_value)}}"
    end
  end
end
# Parses the `long_switch` rule: a literal "-", one or more characters
# matching '\G[^,\\s=]' (the name), an optional value part made of "=" plus
# one or more characters matching '\G[^,\\s]', an optional ",", then the
# `spaces` rule.
# Results are memoized in node_cache[:long_switch], keyed by the starting index.
# Returns a SyntaxNode extended with LongSwitch1 and LongSwitch2 on success;
# on failure restores @index to where parsing started and returns nil.
def _nt_long_switch
start_index = index
# Memoization hit: reuse the earlier result and move @index past it.
if node_cache[:long_switch].has_key?(index)
cached = node_cache[:long_switch][index]
if cached
# A cached `true` placeholder is turned into a one-character SyntaxNode.
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
@index = cached.interval.end
end
return cached
end
i0, s0 = index, []
# First element: the dash that opens the long switch.
if has_terminal?("-", false, index)
r1 = instantiate_node(SyntaxNode,input, index...(index + 1))
@index += 1
else
terminal_parse_failure("-")
r1 = nil
end
s0 << r1
if r1
# Switch name: one-or-more repetition, so an empty collection is a failure.
s2, i2 = [], index
loop do
if has_terminal?('\G[^,\\s=]', true, index)
r3 = true
@index += 1
else
r3 = nil
end
if r3
s2 << r3
else
break
end
end
if s2.empty?
@index = i2
r2 = nil
else
r2 = instantiate_node(SyntaxNode,input, i2...index, s2)
end
s0 << r2
if r2
# Optional value part: "=" followed by one or more value characters.
i5, s5 = index, []
if has_terminal?("=", false, index)
r6 = instantiate_node(SyntaxNode,input, index...(index + 1))
@index += 1
else
terminal_parse_failure("=")
r6 = nil
end
s5 << r6
if r6
s7, i7 = [], index
loop do
if has_terminal?('\G[^,\\s]', true, index)
r8 = true
@index += 1
else
r8 = nil
end
if r8
s7 << r8
else
break
end
end
if s7.empty?
@index = i7
r7 = nil
else
r7 = instantiate_node(SyntaxNode,input, i7...index, s7)
end
s5 << r7
end
if s5.last
r5 = instantiate_node(SyntaxNode,input, i5...index, s5)
r5.extend(LongSwitch0)
else
@index = i5
r5 = nil
end
# When the value part is absent, an empty-interval node stands in for it.
if r5
r4 = r5
else
r4 = instantiate_node(SyntaxNode,input, index...index)
end
s0 << r4
if r4
# Optional comma, again replaced by an empty-interval node when absent.
if has_terminal?(",", false, index)
r10 = instantiate_node(SyntaxNode,input, index...(index + 1))
@index += 1
else
terminal_parse_failure(",")
r10 = nil
end
if r10
r9 = r10
else
r9 = instantiate_node(SyntaxNode,input, index...index)
end
s0 << r9
if r9
r11 = _nt_spaces
s0 << r11
end
end
end
end
# The sequence succeeded only if its last collected result is truthy.
if s0.last
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
r0.extend(LongSwitch1)
r0.extend(LongSwitch2)
else
@index = i0
r0 = nil
end
node_cache[:long_switch][start_index] = r0
r0
end
# Parses the `text` rule: zero or more characters matching '\G[^\\n]'.
# Results are memoized in node_cache[:text], keyed by the starting index.
# Returns whatever instantiate_node builds for the consumed interval; there
# is no failure branch in this method, so an empty match is still a result.
def _nt_text
start_index = index
# Memoization hit: reuse the earlier result and move @index past it.
if node_cache[:text].has_key?(index)
cached = node_cache[:text][index]
if cached
# A cached `true` placeholder is turned into a one-character SyntaxNode.
cached = SyntaxNode.new(input, index...(index + 1)) if cached == true
@index = cached.interval.end
end
return cached
end
s0, i0 = [], index
# Consume one character per iteration until the class stops matching.
loop do
if has_terminal?('\G[^\\n]', true, index)
r1 = true
@index += 1
else
r1 = nil
end
if r1
s0 << r1
else
break
end
end
r0 = instantiate_node(SyntaxNode,input, i0...index, s0)
node_cache[:text][start_index] = r0
r0
end
end
# Parser class for the help grammar: subclasses Treetop's CompiledParser and
# mixes in the rule methods defined in the HelpGrammar module.
class HelpGrammarParser < Treetop::Runtime::CompiledParser
include HelpGrammar
end
Jellyfish-2.2.4/doc/help_parse.treetop 0000664 0000000 0000000 00000002032 12613705377 0017753 0 ustar 00root root 0000000 0000000 grammar HelpGrammar
# One line of help output: leading whitespace, any number of switches,
# free text, and trailing newlines.
rule line
  spaces sws:switches* text [\n]* {
    # Renders the line as LaTeX. When the line has switches, the description
    # environment is opened first if opened.is_open is false (and the flag is
    # then set), and the switches are emitted as a comma-joined item label.
    # Non-empty text is prefixed with \noindent and passed through quote.
    def to_tex(opened)
      out = ""
      if !sws.empty?
        unless opened.is_open
          out << "\\begin{description}\n"
          opened.is_open = true
        end
        out << "\\item[" << sws.elements.map(&:to_tex).join(",") << "] "
      end
      out << "\\noindent " unless text.text_value.empty?
      out << quote(text.text_value)
    end
  }
end
# Zero or more characters from the regex class \s; always matches, possibly
# consuming nothing.
rule spaces
[\s]*
end
# A single switch: a dash followed by a short switch or, failing that, a
# long switch.
rule switches
  "-" sw:( short_switch / long_switch ) {
    # Rendering is delegated to whichever alternative matched.
    def to_tex
      inner = sw
      inner.to_tex
    end
  }
end
# The body of a short switch: one ASCII letter, an optional comma, then
# optional whitespace.
rule short_switch
  name:[a-zA-Z] ","? spaces {
    # Wraps the letter, prefixed with a dash, in an \Opt macro call.
    def to_tex
      letter = name.text_value
      "\\Opt{-#{letter}}"
    end
  }
end
# The body of a long switch: a second dash, a name, an optional "=value"
# part, an optional comma, then optional whitespace.
rule long_switch
  "-" name:[^,\s=]+ val:("=" [^,\s]+)? ","? spaces {
    # Renders as \LOpt when there is no value part, otherwise as \LOptArg
    # with the matched value text. Name and value go through quote.
    def to_tex
      if val.empty?
        "\\LOpt{#{quote(name.text_value)}}"
      else
        "\\LOptArg{#{quote(name.text_value)}}{#{quote(val.text_value)}}"
      end
    end
  }
end
# The rest of the line: zero or more characters other than a newline.
rule text
[^\n]*
end
end
Jellyfish-2.2.4/doc/jellyfish.html 0000664 0000000 0000000 00000044667 12613705377 0017130 0 ustar 00root root 0000000 0000000
JELLYFISH
Jellyfish: A fast k-mer counter
G. Marcais and C. Kingsford
2010/10/1
Version 1.1.4
Jellyfish
is a software to count k-mers in DNA sequences.
Table of Contents
jellyfish count
[-oprefix]
[-mmerlength]
[-tthreads]
[-shashsize]
[--both-strands]
fasta
[fasta ...]
jellyfish merge
hash1
hash2
...
jellyfish dump
hash
jellyfish stats
hash
jellyfish histo
[-hhigh]
[-llow]
[-iincrement]
hash
jellyfish query
hash
jellyfish cite
Plus equivalent version for Quake
mode: qhisto,
qdump
and qmerge.
Jellyfish
is a k-mer counter based on a multi-threaded hash
table implementation.
To count k-mers, use a command like:
jellyfish count -m 22 -o output -c 3 -s 10000000 -t 32 input.fasta
This will count the 22-mers in input.fasta with 32 threads. The
counter field in the hash uses only 3 bits and the hash has at least
10 million entries.
The output files will be named output_0, output_1, etc. (the prefix
is specified with the -o
switch). If the hash is large enough
(as specified by the -s
switch) to fit all the k-mers, there
will be only one output file named output_0. If the hash filled up
before all the mers were read, the hash is dumped to disk, zeroed out
and reading in mers resumes. Multiple intermediary files will be
present on the disks, named output_0, output_1, etc.
To obtain correct results from the other sub-commands (such as histo,
stats, etc.), the multiple output files, if any, need to be merged into one
with the merge command. For example with the following command:
jellyfish merge -o output.jf output\_*
Should you get many intermediary output files (say hundreds), the size
of the hash table is too small. Rerunning Jellyfish
with a
larger size (option -s)
is probably faster than merging all the
intermediary files.
When the orientation of the sequences in the input fasta file is not
known, e.g. in sequencing reads, using --both-strands
(-C)
makes the most sense.
For any k-mer m, its canonical representation is m itself or its
reverse-complement, whichever comes first lexicographically. With the
option -C,
only the canonical representation of the mers are
stored in the hash and the count value is the number of occurrences of
both the mer and its reverse-complement.
To achieve the best performance, a minimum number of intermediary
files should be written to disk. So the parameter -s
should be
chosen to fit as many k-mers as possible (ideally all of them) while
still fitting in memory.
We consider two examples: counting mers in sequencing reads and in a
finished genome.
First, suppose we count k-mers in short sequencing reads:
there are n reads and there is an average of 1 error per read where
each error generates k unique mers. If the genome size is G, the
size of the hash (option -s)
to fit all k-mers at once is estimated to: $(G +
k*n)/0.8$. The division by 0.8 compensates for the maximum usage of
approximately $80%$ of the hash table.
On the other hand, when counting k-mers in an assembled sequence of
length G, setting -s
to G is appropriate.
As a matter of convenience, Jellyfish understands ISO suffixes for the
size of the hash. Hence '-s 10M' stands for 10 million entries while '-s
50G' stands for 50 billion entries.
The actual memory usage of the hash table can be computed as
follows. The actual size of the hash will be rounded up to the next
power of 2: s=2^l. The parameter r is such that the maximum
reprobe value (-p)
plus one is less than 2^r. Then the memory usage per
entry in the hash is (in bits, not bytes) 2k-l+r+1. The total memory
usage of the hash table in bytes is: 2^l*(2k-l+r+1)/8.
To save space, the hash table supports variable length counter, i.e. a
k-mer occurring only a few times will use a small counter, a k-mer
occurring many times will use multiple entries in the hash.
Important: the size of the counting field does NOT change the result,
it only impacts the amount of memory used. In particular, there is no
maximum value in the hash. Even if the counting field uses 5 bits, a
k-mer occurring 2 million times will have a value reported of 2
million (i.e., it is not capped at 2^5).
The -c
specifies the length (in bits) of the counting field. The
trade off is as follows: a low value will save space per entry in the
hash but can potentially increase the number of entries used, hence
maybe requiring a larger hash.
In practice, use a value for -c
so that most of your k-mers
require only 1 entry. For example, to count k-mers in a genome,
where most of the sequence is unique, use -c1
or
-c2.
For sequencing reads, use a value for -c
large
enough to count up to twice the coverage. For example, if the
coverage is 10X, choose a counter length of 5 (-c5)
as $2^5 > 20$.
Usage: jellyfish count [options] file:path+
Count k-mers or qmers in fasta or fastq files
Options (default value in (), *required):
- -m,
--mer-len=uint32
- *Length of mer
- -s,
--size=uint64
- *Hash size
- -t,
--threads=uint32
- Number of threads (1)
- -o,
--output=string
- Output prefix (mer_counts)
- -c,
--counter-len=Length
- in bits Length of counting field (7)
- --out-counter-len=Length
- in bytes Length of counter field in output (4)
- -C,--both-strands
- Count both strand, canonical representation (false)
- -p,
--reprobes=uint32
- Maximum number of reprobes (62)
- -r,--raw
- Write raw database (false)
- -q,--quake
- Quake compatibility mode (false)
- --quality-start=uint32
- Starting ASCII for quality values (64)
- --min-quality=uint32
- Minimum quality. A base with lesser quality becomes an N (0)
- -L,
--lower-count=uint64
- Don't output k-mer with count < lower-count
- -U,
--upper-count=uint64
- Don't output k-mer with count > upper-count
- --matrix=Matrix
- file Hash function binary matrix
- --timing=Timing
- file Print timing information
- --stats=Stats
- file Print stats
- --usage
- Usage
- -h,--help
- This message
- --full-help
- Detailed help
- -V,--version
- Version
Usage: jellyfish stats [options] db:path
Statistics
Display some statistics about the k-mers in the hash:
Unique: Number of k-mers which occur only once.
Distinct: Number of k-mers, not counting multiplicity.
Total: Number of k-mers, including multiplicity.
Max_count: Maximum number of occurrence of a k-mer.
Options (default value in (), *required):
- -L,
--lower-count=uint64
- Don't consider k-mer with count < lower-count
- -U,
--upper-count=uint64
- Don't consider k-mer with count > upper-count
- -v,--verbose
- Verbose (false)
- -o,
--output=string
- Output file
- --usage
- Usage
- -h,--help
- This message
- --full-help
- Detailed help
- -V,--version
- Version
Usage: jellyfish histo [options] db:path
Create an histogram of k-mer occurrences
Create an histogram with the number of k-mers having a given
count. In bucket 'i' are tallied the k-mers which have a count 'c'
satisfying 'low+i*inc <= c < low+(i+1)*inc'. Buckets in the output are
labeled by the low end point (low+i*inc).
The last bucket in the output behaves as a catchall: it tallies all
k-mers with a count greater or equal to the low end point of this
bucket.
Options (default value in (), *required):
- -l,
--low=uint64
- Low count value of histogram (1)
- -h,
--high=uint64
- High count value of histogram (10000)
- -i,
--increment=uint64
- Increment value for buckets (1)
- -t,
--threads=uint32
- Number of threads (1)
- -f,--full
- Full histo. Don't skip count 0. (false)
- -o,
--output=string
- Output file
- -v,--verbose
- Output information (false)
- --usage
- Usage
- --help
- This message
- --full-help
- Detailed help
- -V,--version
- Version
Usage: jellyfish dump [options] db:path
Dump k-mer counts
By default, dump in a fasta format where the header is the count and
the sequence is the sequence of the k-mer. The column format is a 2
column output: k-mer count.
Options (default value in (), *required):
- -c,--column
- Column format (false)
- -t,--tab
- Tab separator (false)
- -L,
--lower-count=uint64
- Don't output k-mer with count < lower-count
- -U,
--upper-count=uint64
- Don't output k-mer with count > upper-count
- -o,
--output=string
- Output file
- --usage
- Usage
- -h,--help
- This message
- -V,--version
- Version
Usage: jellyfish merge [options] input:string+
Merge jellyfish databases
Options (default value in (), *required):
- -s,
--buffer-size=Buffer
- length Length in bytes of input buffer (10000000)
- -o,
--output=string
- Output file (mer_counts_merged.jf)
- --out-counter-len=uint32
- Length (in bytes) of counting field in output (4)
- --out-buffer-size=uint64
- Size of output buffer per thread (10000000)
- -v,--verbose
- Be verbose (false)
- --usage
- Usage
- -h,--help
- This message
- -V,--version
- Version
Usage: jellyfish query [options] db:path
Query from a compacted database
Query a hash. It reads k-mers from the standard input and writes the counts on the standard output.
Options (default value in (), *required):
- -C,--both-strands
- Both strands (false)
- -c,--cary-bit
- Value field as the cary bit information (false)
- -i,
--input=file
- Input file
- -o,
--output=file
- Output file
- --usage
- Usage
- -h,--help
- This message
- -V,--version
- Version
Usage: jellyfish qhisto [options] db:string
Create a histogram of k-mer occurrences
Options (default value in (), *required):
- -l,
--low=double
- Low count value of histogram (0.0)
- -h,
--high=double
- High count value of histogram (10000.0)
- -i,
--increment=double
- Increment value for buckets (1.0)
- -f,--full
- Full histo. Don't skip count 0. (false)
- --usage
- Usage
- --help
- This message
- -V,--version
- Version
Usage: jellyfish qdump [options] db:path
Dump k-mer from a qmer database
By default, dump in a fasta format where the header is the count and
the sequence is the sequence of the k-mer. The column format is a 2
column output: k-mer count.
Options (default value in (), *required):
- -c,--column
- Column format (false)
- -t,--tab
- Tab separator (false)
- -L,
--lower-count=double
- Don't output k-mer with count < lower-count
- -U,
--upper-count=double
- Don't output k-mer with count > upper-count
- -v,--verbose
- Be verbose (false)
- -o,
--output=string
- Output file
- --usage
- Usage
- -h,--help
- This message
- -V,--version
- Version
Usage: jellyfish merge [options] db:string+
Merge quake databases
Options (default value in (), *required):
- -s,
--size=uint64
- *Merged hash table size
- -m,
--mer-len=uint32
- *Mer length
- -o,
--output=string
- Output file (merged.jf)
- -p,
--reprobes=uint32
- Maximum number of reprobes (62)
- --usage
- Usage
- -h,--help
- This message
- --full-help
- Detailed help
- -V,--version
- Version
Usage: jellyfish cite [options]
How to cite Jellyfish's paper
Citation of paper
Options (default value in (), *required):
- -b,--bibtex
- Bibtex format (false)
- -o,
--output=string
- Output file
- --usage
- Usage
- -h,--help
- This message
- -V,--version
- Version
Version: 1.1.4 of 2010/10/1
- jellyfish merge has not been parallelized and is
relatively slow.
- The hash table does not grow in memory automatically and
jellyfish merge
is not called automatically on the
intermediary files (if any).
- Copyright
- © 2010, Guillaume Marcais guillaume@marcais.net
and Carl Kingsford carlk@umiacs.umd.edu.
- License
- This program is free software: you can redistribute it
and/or modify it under the terms of the GNU General Public License
as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see
http://www.gnu.org/licenses/.
Guillaume Marcais
University of Maryland
gmarcais@umd.edu
Carl Kingsford
University of Maryland
carlk@umiacs.umd.edu
Jellyfish-2.2.4/doc/jellyfish.man 0000664 0000000 0000000 00000034717 12613705377 0016732 0 ustar 00root root 0000000 0000000 '\" t
.\" Manual page created with latex2man on Wed Feb 29 10:58:48 EST 2012
.\" NOTE: This file is generated, DO NOT EDIT.
.de Vb
.ft CW
.nf
..
.de Ve
.ft R
.fi
..
.TH "JELLYFISH" "1" "2010/10/1" "k\-mer counter " "k\-mer counter "
.SH NAME
.PP
Jellyfish
is a software to count k\-mers in DNA sequences.
.PP
.SH SYNOPSIS
jellyfish count
[\fB\-o\fP\fIprefix\fP]
[\fB\-m\fP\fImerlength\fP]
[\fB\-t\fP\fIthreads\fP]
[\fB\-s\fP\fIhashsize\fP]
[\fB--both\-strands\fP]
\fIfasta\fP
[\fIfasta \&...
\fP]
.br
jellyfish merge
\fIhash1\fP
\fIhash2\fP
\&...
.br
jellyfish dump
\fIhash\fP
.br
jellyfish stats
\fIhash\fP
.br
jellyfish histo
[\fB\-h\fP\fIhigh\fP]
[\fB\-l\fP\fIlow\fP]
[\fB\-i\fP\fIincrement\fP]
\fIhash\fP
.br
jellyfish query
\fIhash\fP
.br
jellyfish cite
.br
.PP
Plus equivalent version for Quake
mode: qhisto,
qdump
and qmerge\&.
.PP
.SH DESCRIPTION
.PP
Jellyfish
is a k\-mer counter based on a multi\-threaded hash
table implementation.
.PP
.SS COUNTING AND MERGING
.PP
To count k\-mers, use a command like:
.PP
.Vb
jellyfish count \-m 22 \-o output \-c 3 \-s 10000000 \-t 32 input.fasta
.Ve
.PP
This will count the 22\-mers in input.fasta with 32 threads. The
counter field in the hash uses only 3 bits and the hash has at least
10 million entries.
.PP
The output files will be named output_0, output_1, etc. (the prefix
is specified with the \fB\-o\fP
switch). If the hash is large enough
(as specified by the \fB\-s\fP
switch) to fit all the k\-mers, there
will be only one output file named output_0. If the hash filled up
before all the mers were read, the hash is dumped to disk, zeroed out
and reading in mers resumes. Multiple intermediary files will be
present on the disks, named output_0, output_1, etc.
.PP
To obtain correct results from the other sub\-commands (such as histo,
stats, etc.), the multiple output files, if any, need to be merged into one
with the merge command. For example with the following command:
.PP
.Vb
jellyfish merge \-o output.jf output\\_*
.Ve
.PP
Should you get many intermediary output files (say hundreds), the size
of the hash table is too small. Rerunning Jellyfish
with a
larger size (option \fB\-s\fP)
is probably faster than merging all the
intermediary files.
.PP
.SS ORIENTATION
When the orientation of the sequences in the input fasta file is not
known, e.g. in sequencing reads, using \fB--both\-strands\fP
(\fB\-C\fP)
makes the most sense.
.PP
For any k\-mer m, its canonical representation is m itself or its
reverse\-complement, whichever comes first lexicographically. With the
option \fB\-C\fP,
only the canonical representation of the mers are
stored in the hash and the count value is the number of occurrences of
both the mer and its reverse\-complement.
.PP
.SS CHOOSING THE HASH SIZE
.PP
To achieve the best performance, a minimum number of intermediary
files should be written to disk. So the parameter \fB\-s\fP
should be
chosen to fit as many k\-mers as possible (ideally all of them) while
still fitting in memory.
.PP
We consider two examples: counting mers in sequencing reads and in a
finished genome.
.PP
First, suppose we count k\-mers in short sequencing reads:
there are n reads and there is an average of 1 error per read where
each error generates k unique mers. If the genome size is G, the
size of the hash (option \fB\-s\fP)
to fit all k\-mers at once is estimated to: $(G +
k*n)/0.8$. The division by 0.8 compensates for the maximum usage of
approximately $80%$ of the hash table.
.PP
On the other hand, when counting k\-mers in an assembled sequence of
length G, setting \fB\-s\fP
to G is appropriate.
.PP
As a matter of convenience, Jellyfish understands ISO suffixes for the
size of the hash. Hence \&'\-s 10M\&' stands for 10 million entries while \&'\-s
50G\&' stands for 50 billion entries.
.PP
The actual memory usage of the hash table can be computed as
follows. The actual size of the hash will be rounded up to the next
power of 2: s=2^l\&. The parameter r is such that the maximum
reprobe value (\fB\-p\fP)
plus one is less than 2^r\&. Then the memory usage per
entry in the hash is (in bits, not bytes) 2k\-l+r+1\&. The total memory
usage of the hash table in bytes is: 2^l*(2k\-l+r+1)/8\&.
.PP
.SS CHOOSING THE COUNTING FIELD SIZE
To save space, the hash table supports variable length counter, i.e. a
k\-mer occurring only a few times will use a small counter, a k\-mer
occurring many times will use multiple entries in the hash.
.PP
Important: the size of the counting field does NOT change the result,
it only impacts the amount of memory used. In particular, there is no
maximum value in the hash. Even if the counting field uses 5 bits, a
k\-mer occurring 2 million times will have a value reported of 2
million (i.e., it is not capped at 2^5).
.PP
The \fB\-c\fP
specifies the length (in bits) of the counting field. The
trade off is as follows: a low value will save space per entry in the
hash but can potentially increase the number of entries used, hence
maybe requiring a larger hash.
.PP
In practice, use a value for \fB\-c\fP
so that most of your k\-mers
require only 1 entry. For example, to count k\-mers in a genome,
where most of the sequence is unique, use \fB\-c\fP\fI1\fP
or
\fB\-c\fP\fI2\fP\&.
For sequencing reads, use a value for \fB\-c\fP
large
enough to count up to twice the coverage. For example, if the
coverage is 10X, choose a counter length of 5 (\fB\-c\fP\fI5\fP)
as $2^5 > 20$.
.PP
.SH SUBCOMMANDS AND OPTIONS
.SS COUNT
Usage: jellyfish count [options] file:path+
.PP
Count k\-mers or qmers in fasta or fastq files
.PP
Options (default value in (), *required):
.TP
\fB\-m\fP,
\fB--mer\-len\fP\fI=uint32\fP
*Length of mer
.TP
\fB\-s\fP,
\fB--size\fP\fI=uint64\fP
*Hash size
.TP
\fB\-t\fP,
\fB--threads\fP\fI=uint32\fP
Number of threads (1)
.TP
\fB\-o\fP,
\fB--output\fP\fI=string\fP
Output prefix (mer_counts)
.TP
\fB\-c\fP,
\fB--counter\-len\fP\fI=Length\fP
in bits Length of counting field (7)
.TP
\fB--out\-counter\-len\fP\fI=Length\fP
in bytes Length of counter field in output (4)
.TP
\fB\-C\fP,\fB--both\-strands\fP
Count both strand, canonical representation (false)
.TP
\fB\-p\fP,
\fB--reprobes\fP\fI=uint32\fP
Maximum number of reprobes (62)
.TP
\fB\-r\fP,\fB--raw\fP
Write raw database (false)
.TP
\fB\-q\fP,\fB--quake\fP
Quake compatibility mode (false)
.TP
\fB--quality\-start\fP\fI=uint32\fP
Starting ASCII for quality values (64)
.TP
\fB--min\-quality\fP\fI=uint32\fP
Minimum quality. A base with lesser quality becomes an N (0)
.TP
\fB\-L\fP,
\fB--lower\-count\fP\fI=uint64\fP
Don\&'t output k\-mer with count < lower\-count
.TP
\fB\-U\fP,
\fB--upper\-count\fP\fI=uint64\fP
Don\&'t output k\-mer with count > upper\-count
.TP
\fB--matrix\fP\fI=Matrix\fP
file Hash function binary matrix
.TP
\fB--timing\fP\fI=Timing\fP
file Print timing information
.TP
\fB--stats\fP\fI=Stats\fP
file Print stats
.TP
\fB--usage\fP
Usage
.TP
\fB\-h\fP,\fB--help\fP
This message
.TP
\fB--full\-help\fP
Detailed help
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS STATS
Usage: jellyfish stats [options] db:path
.PP
Statistics
.PP
Display some statistics about the k\-mers in the hash:
.PP
Unique: Number of k\-mers which occur only once.
Distinct: Number of k\-mers, not counting multiplicity.
Total: Number of k\-mers, including multiplicity.
Max_count: Maximum number of occurrence of a k\-mer.
.PP
Options (default value in (), *required):
.TP
\fB\-L\fP,
\fB--lower\-count\fP\fI=uint64\fP
Don\&'t consider k\-mer with count < lower\-count
.TP
\fB\-U\fP,
\fB--upper\-count\fP\fI=uint64\fP
Don\&'t consider k\-mer with count > upper\-count
.TP
\fB\-v\fP,\fB--verbose\fP
Verbose (false)
.TP
\fB\-o\fP,
\fB--output\fP\fI=string\fP
Output file
.TP
\fB--usage\fP
Usage
.TP
\fB\-h\fP,\fB--help\fP
This message
.TP
\fB--full\-help\fP
Detailed help
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS HISTO
Usage: jellyfish histo [options] db:path
.PP
Create an histogram of k\-mer occurrences
.PP
Create an histogram with the number of k\-mers having a given
count. In bucket \&'i\&' are tallied the k\-mers which have a count \&'c\&'
satisfying \&'low+i*inc <= c < low+(i+1)*inc\&'\&. Buckets in the output are
labeled by the low end point (low+i*inc).
.PP
The last bucket in the output behaves as a catchall: it tallies all
k\-mers with a count greater or equal to the low end point of this
bucket.
.PP
Options (default value in (), *required):
.TP
\fB\-l\fP,
\fB--low\fP\fI=uint64\fP
Low count value of histogram (1)
.TP
\fB\-h\fP,
\fB--high\fP\fI=uint64\fP
High count value of histogram (10000)
.TP
\fB\-i\fP,
\fB--increment\fP\fI=uint64\fP
Increment value for buckets (1)
.TP
\fB\-t\fP,
\fB--threads\fP\fI=uint32\fP
Number of threads (1)
.TP
\fB\-f\fP,\fB--full\fP
Full histo. Don\&'t skip count 0. (false)
.TP
\fB\-o\fP,
\fB--output\fP\fI=string\fP
Output file
.TP
\fB\-v\fP,\fB--verbose\fP
Output information (false)
.TP
\fB--usage\fP
Usage
.TP
\fB--help\fP
This message
.TP
\fB--full\-help\fP
Detailed help
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS DUMP
Usage: jellyfish dump [options] db:path
.PP
Dump k\-mer counts
.PP
By default, dump in a fasta format where the header is the count and
the sequence is the sequence of the k\-mer. The column format is a 2
column output: k\-mer count.
.PP
Options (default value in (), *required):
.TP
\fB\-c\fP,\fB--column\fP
Column format (false)
.TP
\fB\-t\fP,\fB--tab\fP
Tab separator (false)
.TP
\fB\-L\fP,
\fB--lower\-count\fP\fI=uint64\fP
Don\&'t output k\-mer with count < lower\-count
.TP
\fB\-U\fP,
\fB--upper\-count\fP\fI=uint64\fP
Don\&'t output k\-mer with count > upper\-count
.TP
\fB\-o\fP,
\fB--output\fP\fI=string\fP
Output file
.TP
\fB--usage\fP
Usage
.TP
\fB\-h\fP,\fB--help\fP
This message
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS MERGE
Usage: jellyfish merge [options] input:string+
.PP
Merge jellyfish databases
.PP
Options (default value in (), *required):
.TP
\fB\-s\fP,
\fB--buffer\-size\fP\fI=Buffer\fP
length Length in bytes of input buffer (10000000)
.TP
\fB\-o\fP,
\fB--output\fP\fI=string\fP
Output file (mer_counts_merged.jf)
.TP
\fB--out\-counter\-len\fP\fI=uint32\fP
Length (in bytes) of counting field in output (4)
.TP
\fB--out\-buffer\-size\fP\fI=uint64\fP
Size of output buffer per thread (10000000)
.TP
\fB\-v\fP,\fB--verbose\fP
Be verbose (false)
.TP
\fB--usage\fP
Usage
.TP
\fB\-h\fP,\fB--help\fP
This message
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS QUERY
Usage: jellyfish query [options] db:path
.PP
Query from a compacted database
.PP
Query a hash. It reads k\-mers from the standard input and writes the counts on the standard output.
.PP
Options (default value in (), *required):
.TP
\fB\-C\fP,\fB--both\-strands\fP
Both strands (false)
.TP
\fB\-c\fP,\fB--cary\-bit\fP
Value field as the cary bit information (false)
.TP
\fB\-i\fP,
\fB--input\fP\fI=file\fP
Input file
.TP
\fB\-o\fP,
\fB--output\fP\fI=file\fP
Output file
.TP
\fB--usage\fP
Usage
.TP
\fB\-h\fP,\fB--help\fP
This message
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS QHISTO
Usage: jellyfish qhisto [options] db:string
.PP
Create a histogram of k\-mer occurrences
.PP
Options (default value in (), *required):
.TP
\fB\-l\fP,
\fB--low\fP\fI=double\fP
Low count value of histogram (0.0)
.TP
\fB\-h\fP,
\fB--high\fP\fI=double\fP
High count value of histogram (10000.0)
.TP
\fB\-i\fP,
\fB--increment\fP\fI=double\fP
Increment value for buckets (1.0)
.TP
\fB\-f\fP,\fB--full\fP
Full histo. Don\&'t skip count 0. (false)
.TP
\fB--usage\fP
Usage
.TP
\fB--help\fP
This message
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS QDUMP
Usage: jellyfish qdump [options] db:path
.PP
Dump k\-mer from a qmer database
.PP
By default, dump in a fasta format where the header is the count and
the sequence is the sequence of the k\-mer. The column format is a 2
column output: k\-mer count.
.PP
Options (default value in (), *required):
.TP
\fB\-c\fP,\fB--column\fP
Column format (false)
.TP
\fB\-t\fP,\fB--tab\fP
Tab separator (false)
.TP
\fB\-L\fP,
\fB--lower\-count\fP\fI=double\fP
Don\&'t output k\-mer with count < lower\-count
.TP
\fB\-U\fP,
\fB--upper\-count\fP\fI=double\fP
Don\&'t output k\-mer with count > upper\-count
.TP
\fB\-v\fP,\fB--verbose\fP
Be verbose (false)
.TP
\fB\-o\fP,
\fB--output\fP\fI=string\fP
Output file
.TP
\fB--usage\fP
Usage
.TP
\fB\-h\fP,\fB--help\fP
This message
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS QMERGE
Usage: jellyfish merge [options] db:string+
.PP
Merge quake databases
.PP
Options (default value in (), *required):
.TP
\fB\-s\fP,
\fB--size\fP\fI=uint64\fP
*Merged hash table size
.TP
\fB\-m\fP,
\fB--mer\-len\fP\fI=uint32\fP
*Mer length
.TP
\fB\-o\fP,
\fB--output\fP\fI=string\fP
Output file (merged.jf)
.TP
\fB\-p\fP,
\fB--reprobes\fP\fI=uint32\fP
Maximum number of reprobes (62)
.TP
\fB--usage\fP
Usage
.TP
\fB\-h\fP,\fB--help\fP
This message
.TP
\fB--full\-help\fP
Detailed help
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SS CITE
Usage: jellyfish cite [options]
.PP
How to cite Jellyfish\&'s paper
.PP
Citation of paper
.PP
Options (default value in (), *required):
.TP
\fB\-b\fP,\fB--bibtex\fP
Bibtex format (false)
.TP
\fB\-o\fP,
\fB--output\fP\fI=string\fP
Output file
.TP
\fB--usage\fP
Usage
.TP
\fB\-h\fP,\fB--help\fP
This message
.TP
\fB\-V\fP,\fB--version\fP
Version
.PP
.SH VERSION
.PP
Version: 1.1.4 of 2010/10/1
.PP
.SH BUGS
.PP
.TP
.B *
jellyfish merge has not been parallelized and is
relatively slow.
.TP
.B *
The hash table does not grow in memory automatically and
jellyfish merge
is not called automatically on the
intermediary files (if any).
.PP
.SH COPYRIGHT & LICENSE
.TP
Copyright
(C)2010, Guillaume Marcais \fBguillaume@marcais.net\fP
and Carl Kingsford \fBcarlk@umiacs.umd.edu\fP\&.
.PP
.TP
License
This program is free software: you can redistribute it
and/or modify it under the terms of the GNU General Public License
as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
.br
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
.br
You should have received a copy of the GNU General Public License
along with this program. If not, see
\fBhttp://www.gnu.org/licenses/\fP\&.
.PP
.SH AUTHORS
Guillaume Marcais
.br
University of Maryland
.br
\fBgmarcais@umd.edu\fP
.PP
Carl Kingsford
.br
University of Maryland
.br
\fBcarlk@umiacs.umd.edu\fP
.PP
.\" NOTE: This file is generated, DO NOT EDIT.
Jellyfish-2.2.4/doc/jellyfish.pdf 0000664 0000000 0000000 00000730726 12613705377 0016733 0 ustar 00root root 0000000 0000000 %PDF-1.4
%ÐÔÅØ
1 0 obj
<< /S /GoTo /D (section.1) >>
endobj
4 0 obj
(1 Synopsis)
endobj
5 0 obj
<< /S /GoTo /D (section.2) >>
endobj
8 0 obj
(2 Description)
endobj
9 0 obj
<< /S /GoTo /D (subsection.2.1) >>
endobj
12 0 obj
(2.1 Counting and merging)
endobj
13 0 obj
<< /S /GoTo /D (subsection.2.2) >>
endobj
16 0 obj
(2.2 Orientation)
endobj
17 0 obj
<< /S /GoTo /D (subsection.2.3) >>
endobj
20 0 obj
(2.3 Choosing the hash size)
endobj
21 0 obj
<< /S /GoTo /D (subsection.2.4) >>
endobj
24 0 obj
(2.4 Choosing the counting field size)
endobj
25 0 obj
<< /S /GoTo /D (section.3) >>
endobj
28 0 obj
(3 Subcommands and options)
endobj
29 0 obj
<< /S /GoTo /D (subsection.3.1) >>
endobj
32 0 obj
(3.1 count)
endobj
33 0 obj
<< /S /GoTo /D (subsection.3.2) >>
endobj
36 0 obj
(3.2 stats)
endobj
37 0 obj
<< /S /GoTo /D (subsection.3.3) >>
endobj
40 0 obj
(3.3 histo)
endobj
41 0 obj
<< /S /GoTo /D (subsection.3.4) >>
endobj
44 0 obj
(3.4 dump)
endobj
45 0 obj
<< /S /GoTo /D (subsection.3.5) >>
endobj
48 0 obj
(3.5 merge)
endobj
49 0 obj
<< /S /GoTo /D (subsection.3.6) >>
endobj
52 0 obj
(3.6 query)
endobj
53 0 obj
<< /S /GoTo /D (subsection.3.7) >>
endobj
56 0 obj
(3.7 qhisto)
endobj
57 0 obj
<< /S /GoTo /D (subsection.3.8) >>
endobj
60 0 obj
(3.8 qdump)
endobj
61 0 obj
<< /S /GoTo /D (subsection.3.9) >>
endobj
64 0 obj
(3.9 qmerge)
endobj
65 0 obj
<< /S /GoTo /D (subsection.3.10) >>
endobj
68 0 obj
(3.10 cite)
endobj
69 0 obj
<< /S /GoTo /D (section.4) >>
endobj
72 0 obj
(4 Version)
endobj
73 0 obj
<< /S /GoTo /D (section.5) >>
endobj
76 0 obj
(5 Bugs)
endobj
77 0 obj
<< /S /GoTo /D (section.6) >>
endobj
80 0 obj
(6 Copyright \046 License)
endobj
81 0 obj
<< /S /GoTo /D (section.7) >>
endobj
84 0 obj
(7 Authors)
endobj
85 0 obj
<< /S /GoTo /D [86 0 R /Fit ] >>
endobj
88 0 obj <<
/Length 2072
/Filter /FlateDecode
>>
stream
xÚXIsã¶¾ûWðHUYö%·É2¯Þ¤’z©¸r™Ì’(‹1%Ê"•‰óë_7¤(‰öØ5ñÁ„°t7zùðßÝÝ|ó^˜L8&…ÑÙÝ:“œ3§Bfµa¨ìn•}Ì?”uýôºÝ|;›knów³¹â2_mG‡ù¶Ý}
6‚c$jP‚q)³¹ôÌA
þÃ`),ù¹8,‹ªÅ./v+êý>þTíîÛusX¡ØÖíAŽbA$9ïg^æåâp,O$D†[Z,¹dŽ4™gÁÚ´_¶y$ã÷™·yyh«f+µÊL§¥v´Ô0\™äií»EÛŠe—f»Ñì¹0àQg’2;xv67*'÷ÎDÞn&ŒT– e2Nk¢À®?2o›u7“>ÿ<“!/%
v
R0|Þ›äGb}`NöR&ôÆè'`€“æjGÂøåu´åã±Ü-Ë–‘%2¡™ÒVÒÖm`VÛè+%ÒÖÅ