e-PCR-2.3.12/0000755001137700010620000000000011745334032012401 5ustar rotmistrcontige-PCR-2.3.12/stand/0000755001137700010620000000000011745334032013512 5ustar rotmistrcontige-PCR-2.3.12/stand/Makefile0000644001137700010620000000545111745334032015157 0ustar rotmistrcontig## $Id: Makefile,v 1.7 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## LIBS = epcr BINS = cmd_epcr cmd_famap cmd_fahash cmd_rpcr srcdir = . 
all links dirs clean dist clean-all install install-lib dist-clean depend: for i in $(LIBS:%=lib%) $(BINS) ; do \ $(MAKE) -ef $(srcdir)/stand/Makefile.$$i $@ ; \ done # ######################################################################## ## $Log: Makefile,v $ ## Revision 1.7 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.6 2004/05/27 20:36:03 rotmistr ## Version 2.1.0 with appropriate changes (see Changes) is ready for tests. ## ## Revision 1.5 2004/04/06 16:44:57 rotmistr ## *** empty log message *** ## ## Revision 1.4 2004/03/29 03:16:47 rotmistr ## *** empty log message *** ## ## Revision 1.3 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.2 2003/12/23 21:30:57 rotmistr ## - gaps/mismatches reporting ## - lo/hi fixup ## - reverse sts in re-PCR_main ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.3 2003/11/20 05:56:02 rotmistr ## Loading looks working ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/Makefile.cmd_epcr0000644001137700010620000000572311745334032016734 0ustar rotmistrcontig## $Id: Makefile.cmd_epcr,v 1.4 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. 
It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## srcdir = . include $(srcdir)/stand/config.mk SRC_FILES = e-PCR_main SRC = $(SRC_FILES:%=$(srcdir)/%.cpp) OBJ = $(SRC_FILES:%=$(objdir)/%.o) HDR = LIB = epcr #LIBNAME = epcr TARGET = $(tgtdir)/e-PCR target: $(TARGET) install: cp $(TARGET) $(BINDIR) install-lib: $(TARGET): $(OBJ) $(LIB:%=$(tgtdir)/lib%.a) -rm -f $@ $(CXX) $(LDFLAGS) -o $@ $(OBJ) $(LIB:%=-l%) $(objdir)/e-PCR_main.o: $(srcdir)/e-PCR_main.cpp depend: $(CXX) $(CXXFLAGS) -M $(SRC) | $(FIXCMD) > $(tgtdir)/cmdepcr_depend.mk -include $(tgtdir)/cmdepcr_depend.mk ######################################################################## ## $Log: Makefile.cmd_epcr,v $ ## Revision 1.4 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.3 2004/03/30 21:39:30 rotmistr ## Fixed build arguments usage ## ## Revision 1.2 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## 
## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.5 2003/12/01 12:29:36 rotmistr ## Reverse PCR wordhash builder in progress ## ## Revision 1.4 2003/11/21 23:26:36 rotmistr ## Almost compilable ## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/Makefile.cmd_famap0000644001137700010620000000572711745334032017073 0ustar rotmistrcontig## $Id: Makefile.cmd_famap,v 1.4 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## srcdir = . 
include $(srcdir)/stand/config.mk SRC_FILES = famap_main SRC = $(SRC_FILES:%=$(srcdir)/%.cpp) OBJ = $(SRC_FILES:%=$(objdir)/%.o) HDR = LIB = epcr #LIBNAME = epcr TARGET = $(tgtdir)/famap target: $(TARGET) install: cp $(TARGET) $(BINDIR) install-lib: $(TARGET): $(OBJ) $(LIB:%=$(tgtdir)/lib%.a) -rm -f $@ $(CXX) $(LDFLAGS) -o $@ $(OBJ) $(LIB:%=-l%) $(objdir)/famap_main.o: $(srcdir)/famap_main.cpp depend: $(CXX) $(CXXFLAGS) -M $(SRC) | $(FIXCMD) > $(tgtdir)/cmdfamap_depend.mk -include $(tgtdir)/cmdfamap_depend.mk ######################################################################## ## $Log: Makefile.cmd_famap,v $ ## Revision 1.4 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.3 2004/03/30 21:39:30 rotmistr ## Fixed build arguments usage ## ## Revision 1.2 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.5 2003/12/01 12:29:36 rotmistr ## Reverse PCR wordhash builder in progress ## ## Revision 1.4 2003/11/21 23:26:36 rotmistr ## Almost compilable ## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/Makefile.cmd_fahash0000644001137700010620000000624311745334032017233 0ustar rotmistrcontig## $Id: Makefile.cmd_fahash,v 1.2 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## 
## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## srcdir = . include $(srcdir)/stand/config.mk SRC_FILES = fahash_main SRC = $(SRC_FILES:%=$(srcdir)/%.cpp) OBJ = $(SRC_FILES:%=$(objdir)/%.o) HDR = LIB = epcr #LIBNAME = epcr TARGET = $(tgtdir)/fahash target: $(TARGET) install: cp $(TARGET) $(BINDIR) install-lib: $(TARGET): $(OBJ) $(LIB:%=$(tgtdir)/lib%.a) -rm -f $@ $(CXX) $(LDFLAGS) -o $@ $(OBJ) $(LIB:%=-l%) $(objdir)/fahash_main.o: $(srcdir)/fahash_main.cpp depend: $(CXX) $(CXXFLAGS) -M $(SRC) | $(FIXCMD) > $(tgtdir)/cmdfahash_depend.mk -include $(tgtdir)/cmdfahash_depend.mk ######################################################################## ## $Log: Makefile.cmd_fahash,v $ ## Revision 1.2 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.1 2004/05/27 20:36:03 rotmistr ## Version 2.1.0 with appropriate changes (see Changes) is ready for tests. ## ## Revision 1.1 2004/04/27 00:11:57 rotmistr ## Forgot to add... 
## ## Revision 1.3 2004/03/30 21:39:30 rotmistr ## Fixed build arguments usage ## ## Revision 1.2 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.5 2003/12/01 12:29:36 rotmistr ## Reverse PCR wordhash builder in progress ## ## Revision 1.4 2003/11/21 23:26:36 rotmistr ## Almost compilable ## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/Makefile.cmd_rpcr0000644001137700010620000000572711745334032016755 0ustar rotmistrcontig## $Id: Makefile.cmd_rpcr,v 1.4 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. 
The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## srcdir = . include $(srcdir)/stand/config.mk SRC_FILES = re-PCR_main SRC = $(SRC_FILES:%=$(srcdir)/%.cpp) OBJ = $(SRC_FILES:%=$(objdir)/%.o) HDR = LIB = epcr #LIBNAME = epcr TARGET = $(tgtdir)/re-PCR target: $(TARGET) install: cp $(TARGET) $(BINDIR) install-lib: $(TARGET): $(OBJ) $(LIB:%=$(tgtdir)/lib%.a) -rm -f $@ $(CXX) $(LDFLAGS) -o $@ $(OBJ) $(LIB:%=-l%) $(objdir)/re-PCR_main.o: $(srcdir)/re-PCR_main.cpp depend: $(CXX) $(CXXFLAGS) -M $(SRC) | $(FIXCMD) > $(tgtdir)/cmdrpcr_depend.mk -include $(tgtdir)/cmdrpcr_depend.mk ######################################################################## ## $Log: Makefile.cmd_rpcr,v $ ## Revision 1.4 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.3 2004/03/30 21:39:30 rotmistr ## Fixed build arguments usage ## ## Revision 1.2 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.5 2003/12/01 12:29:36 rotmistr ## Reverse PCR wordhash builder in progress ## ## Revision 1.4 2003/11/21 23:26:36 rotmistr ## Almost compilable ## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## 
######################################################################## e-PCR-2.3.12/stand/Makefile.cmd_seqcmp0000644001137700010620000000602111745334032017263 0ustar rotmistrcontig## $Id: Makefile.cmd_seqcmp,v 1.3 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## srcdir = . 
include $(srcdir)/stand/config.mk SRC_FILES = seqcmp_main SRC = $(SRC_FILES:%=$(srcdir)/%.cpp) OBJ = $(SRC_FILES:%=$(objdir)/%.o) HDR = LIB = epcr #LIBNAME = epcr TARGET = $(tgtdir)/seqcmp target: $(TARGET) install: install-lib: $(TARGET): $(OBJ) $(LIB:%=$(tgtdir)/lib%.a) -rm -f $@ $(CXX) $(LDFLAGS) -o $@ $(OBJ) $(LIB:%=-l%) $(objdir)/seqcmp_main.o: $(srcdir)/seqcmp_main.cpp depend: $(CXX) $(CXXFLAGS) -M $(SRC) | $(FIXCMD) > $(tgtdir)/cmdseqcmp_depend.mk -include $(tgtdir)/cmdseqcmp_depend.mk ######################################################################## ## $Log: Makefile.cmd_seqcmp,v $ ## Revision 1.3 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.2 2004/03/30 21:39:30 rotmistr ## Fixed build arguments usage ## ## Revision 1.1 2004/02/12 21:38:55 rotmistr ## Added seqcmp binary ## ## Revision 1.2 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.5 2003/12/01 12:29:36 rotmistr ## Reverse PCR wordhash builder in progress ## ## Revision 1.4 2003/11/21 23:26:36 rotmistr ## Almost compilable ## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/Makefile.libepcr0000644001137700010620000001051211745334032016570 0ustar rotmistrcontig## $Id: Makefile.libepcr,v 1.7 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## 
National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## srcdir = . 
include $(srcdir)/stand/config.mk SRC_FILES = \ faread \ fast_seqio_read \ fast_seqio_write \ hashset \ mmap \ align \ stsmatch_i \ stsmatch_m \ stsfilter \ stsmatch \ fahash_create \ fahash_create1 \ fahash_create2 \ fahash_lookup SRC = $(SRC_FILES:%=$(srcdir)/%.cpp) OBJ = $(SRC_FILES:%=$(objdir)/%.o) HDR = defaults.h build_cfg.h \ bin-io.hpp \ strref.hpp \ faread.hpp \ fast_seqio.hpp \ hashset.hpp \ minilcs.hpp \ align.hpp \ sts_i.hpp \ stsmatch_i.hpp \ stsmatch_m.hpp \ stsmatch.hpp \ fahash.hpp \ sts.hpp LIBNAME = epcr TARGET = $(tgtdir)/libepcr.a target: $(TARGET) install: install-lib: cp $(TARGET) $(LIBDIR) mkdir $(INCDIR)/$(LIBNAME) cp $(HDR) $(INCDIR)/$(LIBNAME) $(tgtdir)/libepcr.a: $(OBJ) -rm -f $@ ar cru $@ $(OBJ) ranlib $@ $(objdir)/fahash_lookup.o: $(srcdir)/fahash_lookup.cpp $(objdir)/fahash_create.o: $(srcdir)/fahash_create.cpp $(objdir)/fahash_create1.o: $(srcdir)/fahash_create1.cpp $(objdir)/fahash_create2.o: $(srcdir)/fahash_create2.cpp $(objdir)/faread.o: $(srcdir)/faread.cpp $(objdir)/fast_seqio_read.o: $(srcdir)/fast_seqio_read.cpp $(objdir)/fast_seqio_write.o: $(srcdir)/fast_seqio_write.cpp $(objdir)/hashset.o: $(srcdir)/hashset.cpp $(objdir)/mmap.o: $(srcdir)/mmap.cpp $(objdir)/align.o: $(srcdir)/align.cpp $(objdir)/stsmatch_i.o: $(srcdir)/stsmatch_i.cpp $(objdir)/stsmatch_m.o: $(srcdir)/stsmatch_m.cpp $(objdir)/stsfilter.o: $(srcdir)/stsfilter.cpp $(objdir)/stsmatch.o: $(srcdir)/stsmatch.cpp depend: $(CXX) $(CXXFLAGS) -M $(SRC) | $(FIXCMD) > $(tgtdir)/libepcr_depend.mk -include $(tgtdir)/libepcr_depend.mk ######################################################################## ## $Log: Makefile.libepcr,v $ ## Revision 1.7 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.6 2004/06/03 23:37:29 rotmistr ## New aligner added. ## ## Revision 1.5 2004/05/27 20:36:04 rotmistr ## Version 2.1.0 with appropriate changes (see Changes) is ready for tests. 
## ## Revision 1.4 2004/03/30 21:06:58 rotmistr ## Fixes for setting default STS size range. ## ## Revision 1.3 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.2 2004/01/28 23:27:09 rotmistr ## "Best of overlapping" hit selection postprocessor added. ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.5 2003/12/01 12:29:36 rotmistr ## Reverse PCR wordhash builder in progress ## ## Revision 1.4 2003/11/21 23:26:36 rotmistr ## Almost compilable ## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/Makefile.bcc55-w320000644001137700010620000001075411745334032016472 0ustar rotmistrcontig## $Id: Makefile.bcc55-w32,v 1.7 2004/09/03 21:28:56 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. 
## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## srcdir = . !include $(srcdir)/stand/version.mk BCC = bcc32 -I./ -DSTANDALONE -DUSE_WIN=1 -DVERSION=\"$(VERSION)\" -O2 \ -w-8026 -w-8027 LINK = ilink32 .c.obj: $(BCC) -c $< -o $@ .cpp.obj: $(BCC) -c $< -o $@ C_FILES = \ getopt.c SRC_FILES = \ faread.cpp \ fast_seqio_read.cpp \ fast_seqio_write.cpp \ hashset.cpp \ mmap.cpp \ align.cpp \ stsmatch_i.cpp \ stsmatch_m.cpp \ stsfilter.cpp \ stsmatch.cpp \ fahash_create.cpp \ fahash_create1.cpp \ fahash_create2.cpp \ fahash_lookup.cpp OBJ = $(SRC_FILES:.cpp=.obj) $(C_FILES:.c=.obj) HDR = defaults.h build_cfg.h \ bin-io.hpp \ strref.hpp \ mmap.hpp \ faread.hpp \ fast_seqio.hpp \ hashset.hpp \ sts_i.hpp \ align.hpp \ minilcs.hpp \ stsmatch_i.hpp \ stsmatch_m.hpp \ stsmatch.hpp \ fahash.hpp \ sts.hpp \ mswin.h all: includes objects e-PCR.exe famap.exe fahash.exe re-PCR.exe clean: -del *.obj -del epcr/* -rmdir epcr clean-all: clean -rm e-PCR.exe -rm re-PCR.exe -rm famap.exe -rm fahash.exe e-PCR.exe: e-PCR_main.obj $(OBJ) $(LINK) c0x32.obj $** , e-PCR.exe, , cw32.lib import32.lib , , famap.exe: famap_main.obj $(OBJ) $(LINK) c0x32.obj $** , famap.exe, , cw32.lib import32.lib , , fahash.exe: fahash_main.obj $(OBJ) $(LINK) c0x32.obj $** , fahash.exe, , cw32.lib import32.lib , , re-PCR.exe: re-PCR_main.obj $(OBJ) $(LINK) c0x32.obj $** , re-PCR.exe, , cw32.lib import32.lib , , # $(BCC) $(srcdir)/e-PCR_main.cpp $(OBJ) -tWC includes: $(HDR) -mkdir epcr © $** epcr objects: $(OBJ) ######################################################################## ## $Log: Makefile.bcc55-w32,v $ ## 
Revision 1.7 2004/09/03 21:28:56 rotmistr ## Fixes to compile with Borland C++ 5.5 ## ## Revision 1.6 2004/09/03 19:07:06 rotmistr ## List of files changed ## ## Revision 1.5 2004/05/27 21:18:54 rotmistr ## Changes from ../Changes v.2.1.0 added ## ## Revision 1.4 2004/04/02 15:44:01 rotmistr ## *** empty log message *** ## ## Revision 1.3 2004/04/01 17:24:46 rotmistr ## *** empty log message *** ## ## Revision 1.2 2004/04/01 16:37:54 rotmistr ## Added getopt ## ## Revision 1.1 2004/04/01 05:59:11 rotmistr ## Compilable with Bcc5.5/win32 ## ## Revision 1.4 2004/03/30 21:06:58 rotmistr ## Fixes for setting default STS size range. ## ## Revision 1.3 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.2 2004/01/28 23:27:09 rotmistr ## "Best of overlapping" hit selection postprocessor added. 
## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.5 2003/12/01 12:29:36 rotmistr ## Reverse PCR wordhash builder in progress ## ## Revision 1.4 2003/11/21 23:26:36 rotmistr ## Almost compilable ## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/Makefile.vc80000644001137700010620000001103011745334032015644 0ustar rotmistrcontig## $Id: Makefile.vc8,v 1.1 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## srcdir = . 
!include $(srcdir)/stand/version.mk CC = cl -I../ -D_WIN32 -D_CONSOLE -DSTANDALONE -DUSE_WIN=1 -DVERSION=\"$(VERSION)\" -O2 /EHsc /GR LINK = cl .c.obj: $(CC) -c $< -o $@ .cpp.obj: $(CC) -c $< -o $@ C_FILES = \ getopt.c SRC_FILES = \ faread.cpp \ fast_seqio_read.cpp \ fast_seqio_write.cpp \ hashset.cpp \ mmap.cpp \ align.cpp \ stsmatch_i.cpp \ stsmatch_m.cpp \ stsfilter.cpp \ stsmatch.cpp \ fahash_create.cpp \ fahash_create1.cpp \ fahash_create2.cpp \ fahash_lookup.cpp OBJ = $(SRC_FILES:.cpp=.obj) $(C_FILES:.c=.obj) HDR = defaults.h build_cfg.h \ bin-io.hpp \ strref.hpp \ mmap.hpp \ faread.hpp \ fast_seqio.hpp \ hashset.hpp \ sts_i.hpp \ align.hpp \ minilcs.hpp \ stsmatch_i.hpp \ stsmatch_m.hpp \ stsmatch.hpp \ fahash.hpp \ sts.hpp \ mswin.h all: includes objects e-PCR.exe famap.exe fahash.exe re-PCR.exe clean: -del *.obj -del epcr/* -rmdir epcr clean-all: clean -rm e-PCR.exe -rm re-PCR.exe -rm famap.exe -rm fahash.exe e-PCR.exe: e-PCR_main.obj $(OBJ) $(LINK) e-PCR_main.obj /Fee-PCR.exe /ML /link /SUBSYSTEM:CONSOLE $(OBJ) famap.exe: famap_main.obj $(OBJ) $(LINK) famap_main.obj /Fefamap.exe /ML /link /SUBSYSTEM:CONSOLE $(OBJ) fahash.exe: fahash_main.obj $(OBJ) $(LINK) fahash_main.obj /Fefahash.exe /ML /link /SUBSYSTEM:CONSOLE $(OBJ) re-PCR.exe: re-PCR_main.obj $(OBJ) $(LINK) re-PCR_main.obj /Fere-PCR.exe /ML /link /SUBSYSTEM:CONSOLE $(OBJ) includes: $(HDR) objects: $(OBJ) ######################################################################## ## $Log: Makefile.vc8,v $ ## Revision 1.1 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.7 2004/09/03 21:28:56 rotmistr ## Fixes to compile with Borland C++ 5.5 ## ## Revision 1.6 2004/09/03 19:07:06 rotmistr ## List of files changed ## ## Revision 1.5 2004/05/27 21:18:54 rotmistr ## Changes from ../Changes v.2.1.0 added ## ## Revision 1.4 2004/04/02 15:44:01 rotmistr ## *** empty log message *** ## ## Revision 1.3 2004/04/01 17:24:46 rotmistr ## *** empty log message *** ## ## 
Revision 1.2 2004/04/01 16:37:54 rotmistr ## Added getopt ## ## Revision 1.1 2004/04/01 05:59:11 rotmistr ## Compilable with Bcc5.5/win32 ## ## Revision 1.4 2004/03/30 21:06:58 rotmistr ## Fixes for setting default STS size range. ## ## Revision 1.3 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.2 2004/01/28 23:27:09 rotmistr ## "Best of overlapping" hit selection postprocessor added. ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.5 2003/12/01 12:29:36 rotmistr ## Reverse PCR wordhash builder in progress ## ## Revision 1.4 2003/11/21 23:26:36 rotmistr ## Almost compilable ## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/config.mk0000644001137700010620000001151711745334032015315 0ustar rotmistrcontig## $Id: config.mk,v 1.7 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. 
##
## Although all reasonable efforts have been taken to ensure the accuracy
## and reliability of the software and data, the NLM and the U.S.
## Government do not and cannot warrant the performance or results that
## may be obtained by using this software or data. The NLM and the U.S.
## Government disclaim all warranties, express or implied, including
## warranties of performance, merchantability or fitness for any particular
## purpose.
##
## Please cite the author in any work or product based on this material.
##
########################################################################

# Shared GNU make configuration: directory layout, compiler/linker flags
# and generic maintenance targets.
# NOTE(review): SRC, HDR, OBJ, TARGET, LIBNAME, PART_* and the "target"
# goal are not defined here; presumably the including makefile sets them.

# Where targets, objects and sources live (override on the command line).
tgtdir = .
objdir = .
srcdir = .

COMMON_CC_FLAGS =

# Build flavour: OPTIMIZE=<n> gives -O<n>; otherwise PROFILE gives a
# profiling build; otherwise a debug build is produced.
ifdef OPTIMIZE
CC_FLAGS = $(COMMON_CC_FLAGS) -O$(OPTIMIZE)
LD_FLAGS =
else
ifdef PROFILE
CC_FLAGS = $(COMMON_CC_FLAGS) -g -pg
LD_FLAGS = -g -pg
else
CC_FLAGS = $(COMMON_CC_FLAGS) -g2
LD_FLAGS = -g2
endif
endif

#arch = $(shell echo `uname -s`-`uname -m`)
# Installation root.  The disabled "$(arch)" suffix is kept on its own
# line: a comment trailing the value would leave a space at the end of
# $(prefix) and break the directories derived from it.
prefix = /usr/local/
#$(arch)
BINDIR = $(prefix)/bin/
INCDIR = $(prefix)/include/
LIBDIR = $(prefix)/lib/

# Perl filter that prefixes the first word of each input line with $(objdir)/.
FIXCMD = perl -ne's/^([^\s\#]+)/\$$(objdir)\/$$1/;print'

include $(srcdir)/stand/version.mk

#########################################################################
# GNU compiler flags

CC = gcc
CXX = g++
CXXFLAGS = -I$(srcdir) -I$(INCDIR) $(CC_FLAGS) $(PART_CXXFLAGS) \
	-DDEALLOCATE=0 $(LF64CCFLAGS) $(VERSION_FLAGS) -DSTANDALONE=1

LDFLAGS = $(LD_FLAGS) $(LF64LDFLAGS) -L$(tgtdir) -L$(LIBDIR) $(PART_LDFLAGS)
# $(PART_PRELIBS) $(LIBS:%=-l%) $(PART_POSTLIBS)

# Large file support flags, queried from the system at compile/link time.
LF64CCFLAGS = `getconf LFS_CFLAGS`
LF64LDFLAGS = `getconf LFS_LDFLAGS` `getconf LFS_LIBS`

## Use following lines if you don't have getconf but need to
## explicitly turn on largefile support
# LF64CCFLAGS = -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
# LF64LDFLAGS =

## Use following lines for Mac OS X and other systems that lack *64 functions
# LF64CCFLAGS = -DNATIVE_LARGEFILE
# LF64LDFLAGS =

VERSION_FLAGS = -DVERSION=\"$(VERSION)\" \
	-DVER_MAJOR=$(VER_MAJOR) \
	-DVER_MINOR=$(VER_MINOR) \
	-DVER_BUILD=$(VER_BUILD)

LIBS = seq epcr

# Source and header lists with the source directory prepended.
src = $(SRC:%=$(srcdir)/%)
hdr = $(HDR:%=$(srcdir)/%)

# None of these targets name real files.
.PHONY: all links dirs clean clean-all dist-clean

all: links target

# Create a symlink named $(LIBNAME) pointing at the source directory
# (only when LIBNAME is set and the link does not exist yet).
links:
	if test -n "$(LIBNAME)" ; then \
		test -L $(LIBNAME) || ln -s $(srcdir) $(LIBNAME) ; \
	fi

# Create the installation directories.
dirs:
	for i in $(INCDIR)/$(LIBNAME) $(BINDIR) $(LIBDIR) ; do \
		test -d $$i || mkdir -p $$i ; \
	done

# Remove objects and editor backup files; -f keeps this quiet when the
# files are already gone, "-" tolerates an empty file list.
clean:
	-rm -f $(OBJ) $(HDR:%=%~) $(SRC:%=%~)

clean-all: clean
	-rm -f $(TARGET)

# Additionally remove stray backups and the symlink created by "links".
# The link is only touched when LIBNAME is set and really is a symlink.
dist-clean: clean-all
	-rm -f *~
	-if test -n "$(LIBNAME)" && test -L "$(LIBNAME)" ; then \
		rm -f "$(LIBNAME)" ; \
	fi

# Compile one C++ source from $(srcdir) into an object in $(objdir).
$(objdir)/%.o: $(srcdir)/%.cpp
	$(CXX) $(CXXFLAGS) -c $< -o $@

########################################################################
## $Log: config.mk,v $
## Revision 1.7 2007/07/05 16:06:04 rotmistr
## Made things compileable by MS Visual C++ 8.0
##
## Revision 1.6 2004/09/08 18:30:59 rotmistr
## Fixed typo
##
## Revision 1.5 2004/09/03 15:54:48 rotmistr
## Compilation for Mac OS/X
##
## Revision 1.4 2004/06/03 23:37:29 rotmistr
## New aligner added.
##
## Revision 1.3 2004/03/31 05:04:11 rotmistr
## Search range fix
##
## Revision 1.2 2004/02/04 21:23:46 rotmistr
## - gcc-3.3.2 compatible
## - better postfiltering for reverse-e-PCR for discontiguos words
## - cgi added, that supports:
## -- contig to chromosome mapping
## -- simple mapviewer links
## -- unists links
## -- discontiguos words
##
## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr
## Package that includes e-PCR, reverse e-PCR, and sequence data preparation
## program for reverse e-PCR looks ready
##
## Revision 1.8 2003/12/10 19:55:48 rotmistr
## Plain fasta interface is about to be substituted to blastdb interface
##
## Revision 1.7 2003/12/04 21:29:34 rotmistr
## Looks like faindex branch works better!
##
## Revision 1.6 2003/11/24 19:33:40 rotmistr
## Optimised. Added OneTimeRun flag.
##
## Revision 1.5 2003/11/23 03:40:53 rotmistr
## Looks like working, requires optimisation.
##
## Revision 1.4 2003/11/20 23:05:58 rotmistr
## Contiguos words work.
## Discontiguos need to be modified.
## ## Revision 1.3 2003/11/20 18:27:32 rotmistr ## Sample files updated ## Program does not crush ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/stand/version.mk0000644001137700010620000001022111745334032015524 0ustar rotmistrcontig## $Id: version.mk,v 1.26 2008/06/18 14:48:20 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## #set -x #echo "HELLO!!!" 
# Package version components.  This file is pulled in by both
# stand/config.mk (GNU make "include") and stand/Makefile.vc8 (NMAKE
# "!include"), so only plain "name=value" macro syntax is used here.
VER_MAJOR=2
VER_MINOR=3
VER_BUILD=12
# Full dotted version string, MAJOR.MINOR.BUILD.
VERSION = $(VER_MAJOR).$(VER_MINOR).$(VER_BUILD)

########################################################################
## $Log: version.mk,v $
## Revision 1.26 2008/06/18 14:48:20 rotmistr
## *** empty log message ***
##
## Revision 1.25 2008/04/28 16:39:19 rotmistr
## Applied patch to build with gcc-4.3
##
## Revision 1.24 2008/03/26 16:04:35 rotmistr
## Added support for blastdb files
##
## Revision 1.23 2007/07/11 20:49:33 rotmistr
## Made 64bit-compatible
##
## Revision 1.22 2007/07/05 16:06:04 rotmistr
## Made things compileable by MS Visual C++ 8.0
##
## Revision 1.21 2005/06/14 16:46:51 rotmistr
## Changed report format for floppy tails
##
## Revision 1.20 2005/02/11 20:42:59 rotmistr
## Fixed "margin" bug, added primer search from file
##
## Revision 1.19 2004/10/26 17:16:41 rotmistr
## Added 5'-end masking for primers
##
## Revision 1.18 2004/09/03 15:54:48 rotmistr
## Compilation for Mac OS/X
##
## Revision 1.17 2004/06/08 16:14:59 rotmistr
## *** empty log message ***
##
## Revision 1.16 2004/06/07 16:25:03 rotmistr
## Bug fixes to previos version.
##
## Revision 1.15 2004/06/03 23:37:29 rotmistr
## New aligner added.
##
## Revision 1.14 2004/04/27 00:01:55 rotmistr
## Second version of reverse hash file started
##
## Revision 1.13 2004/04/06 04:53:18 rotmistr
## All is compileable with BCC5.5 and runnable on WIndows
##
## Revision 1.12 2004/04/01 17:24:46 rotmistr
## *** empty log message ***
##
## Revision 1.11 2004/03/30 19:08:08 rotmistr
## default STS size is tunnable now
##
## Revision 1.10 2004/03/26 17:02:18 rotmistr
## Compat-options are now allowed everywhere, and multiple fasta files can be used.
## ## Revision 1.9 2004/03/25 19:36:52 rotmistr ## API: separate left and right primers mism/gaps in forward API ## ## Revision 1.8 2004/03/23 22:36:02 rotmistr ## 2.0 release ## ## Revision 1.7 2004/02/18 05:44:40 rotmistr ## Changes in CGI: sort order, separate misalignments for l and r primers, reload button ## ## Revision 1.6 2004/02/12 21:39:29 rotmistr ## New version ## ## Revision 1.5 2004/01/28 23:27:09 rotmistr ## "Best of overlapping" hit selection postprocessor added. ## ## Revision 1.4 2004/01/08 23:22:47 rotmistr ## Fixed init error in faread, ## Adjusted output to standard, ## Added output format style and output file to parameters. ## ## Revision 1.3 2004/01/07 16:57:48 rotmistr ## Fragment size is now configurable. ## ## Revision 1.2 2004/01/06 21:54:28 rotmistr ## Statistics for word repetitions API added ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.2 2003/11/20 02:12:32 rotmistr ## Fixed id, log tags and copyright notice ## ######################################################################## e-PCR-2.3.12/BUILD.html0000644001137700010620000002206711745334031014134 0ustar rotmistrcontig Electronic PCR commandline tools: build instructions

Electronic PCR commandline tools: build instructions

Version: 2.3.12

  1. Build
    1. Unix/gcc instructions
      1. Make arguments
      2. Mac OS/X with gcc
    2. Windows/BorlandC++ instructions
    3. Windows/MS-VisualC++8.0 instructions
  2. Files
    1. Sources common to forward and reverse e-PCR binaries
    2. Forward e-PCR source files
    3. Reverse e-PCR source files
    4. Extra files
    5. Build files

Build e-PCR and reverse e-PCR (re-PCR, famap) binaries with GNU make and GCC.

Build

Unix/gcc instructions

  1. Unpack archive
  2. Edit stand/config.mk if necessary
  3. run gmake links depend all OPTIMIZE=6

Make arguments

One can use following arguments to make:

OPTIMIZE=[0-9]
optimization level to pass to the compiler with the -O argument (default is to build a debug version)
srcdir={path-to-src}
to set path to sources
objdir={path-to-obj}
to set path where to place .o
tgtdir={target-path}
to set path where to put targets (libepcr.a and executable files)

Mac OS/X with gcc

Use LF64LDFLAGS= LF64CCFLAGS=-DNATIVE_LARGEFILES (yes, space after first "=") arguments with gmake, since Mac OS X does not have (and does not need) *64 file functions and off64_t

Windows/BorlandC++ instructions

  1. Unpack archive
  2. Edit stand/Makefile.bcc55-w32 if necessary
  3. run make all -f stand/Makefile.bcc55-w32

Windows/MS-VisualC++8.0 instructions

  1. Unpack archive
  2. Edit stand/Makefile.vc8 if necessary
  3. run nmake all -f stand/Makefile.vc8

Files

e-PCR package includes two tool sets: forward e-PCR (e-PCR) and reverse e-PCR (re-PCR, fahash and famap). These binaries share some source files, that are compiled as libepcr.a library.

Directory stand/ contains makefiles to use with GCC. Change stand/config.mk to update compiler and compiling options. Makefiles should be OK for GNU make.

Sources common to forward and reverse e-PCR binaries

build_cfg.h
macros that control compilation with/without NCBI toolkit
defaults.h
defaults for e-PCR program (used also in library)
mswin.h
windows compatibility defines and declarations
native64.h
native 64bit file access compatibility defines and declarations
strref.hpp
class for passing reference to string data
sts_i.hpp
generic STS interface class
hashset.hpp
hash calculating class declaration (allows set of discontiguous words)
hashset.cpp
hash calculating class implementation
align.hpp
align or compare two sequences allowing mismatches and gaps, declarations
align.cpp
align or compare two sequences allowing mismatches and gaps, implementation
minilcs.hpp
template class for aligning two sequences
faread.hpp
read fasta files, declarations
faread.cpp
read fasta files, implementation
mmap.hpp
mmap(2) wrapper for huge files and no page boundary restriction, declarations
mmap.cpp
mmap(2) wrapper for huge files and no page boundary restriction, implementation
getopt.c
getopt implementation -- to compile for windows

Forward e-PCR source files

e-PCR_main.cpp
main for e-PCR commandline program
stsmatch_i.hpp
STS lookup algorithm declarations
stsmatch_i.cpp
STS lookup algorithm implementation
stsfilter.cpp
Postprocessor for STS lookup
stsmatch_m.hpp
STS and STS hash table implementation for mmapable UniSTS file, declarations
stsmatch_m.cpp
STS and STS hash table implementation for mmapable UniSTS file, implementation
stsmatch.hpp
Some useful callbacks declarations
stsmatch.cpp
Some useful callbacks, implementation

Reverse e-PCR source files

bin-io.hpp
Generic read/write integers and strings
fahash_defines.h
Internal defines for fahash
fahash_internal.hpp
Internal defines for fahash
fahash.hpp
Hash sequence words in file, declarations
fahash_create.cpp
Hash sequence words in file, creating hash file, abstract
fahash_create1.cpp
Hash sequence words in file, creating hash file version 1 implementation
fahash_create2.cpp
Hash sequence words in file, creating hash file version 2 implementation
fahash_lookup.cpp
Hash sequence words in file, lookup algorithm implementation
fast_seqio.hpp
Fast access to regions of sequences, declarations
fast_seqio_read.cpp
Fast access to regions of sequences, implementation
fast_seqio_write.cpp
Fast access to regions of sequences, create sequence file implementation
famap_main.cpp
main for commandline tool to create/dump mmapable file
fahash_main.cpp
main for commandline tool to create hash file
sts.hpp
simple implementation for STS class
re-PCR_main.cpp
main for reverse e-PCR commandline tool

Extra files

seqcmp_main.cpp
Main file for align.?pp test

Build files

Makefile
Master makefile
stand/version.mk
Version definitions
stand/config.mk
Compiler options
stand/Makefile.libepcr
libepcr makefile
stand/Makefile.cmd_epcr
e-PCR makefile
stand/Makefile.cmd_repcr
re-PCR makefile
stand/Makefile.cmd_famap
famap makefile
stand/Makefile.cmd_fahash
fahash makefile
stand/Makefile.cmd_seqcmp
seqcmp makefile
stand/Makefile.bcc55-w32
makefile for BorlandC++/win32
stand/Makefile.vc8
makefile for MS Visual C++ 8.0

e-PCR-2.3.12/BUILD.txt0000644001137700010620000001335511745334031014007 0ustar rotmistrcontig Electronic PCR commandline tools: build instructions Version: 2.3.12 _________________________________________________________________ Build e-PCR and reverse e-PCR (re-PCR, famap) binaries with GNU make and GCC. _________________________________________________________________ Build Unix/gcc instructions 1. Unpack archive 2. Edit stand/config.mk if nesessary 3. run gmake links depend all OPTIMIZE=6 Make arguments One can use following arguments to make: OPTIMIZE=[0-9] to pass with -O argument to compiler (default is build debug version) srcdir={path-to-src} to set path to sources objdir={path-to-obj} to set path where to place .o tgtdir={target-path} to set path where to put targets (libepcr.a and executable files) Mac OS/X with gcc Use LF64LDFLAGS= LF64CCFLAGS=-DNATIVE_LARGEFILES (yes, space after first "=") argument with gmake since Mac OS/2 does not have (and does not need) *64 file functions and off64_t Windows/BorlandC++ instructions 1. Unpack archive 2. Edit stand/Makefile.bcc55-w32 if nesessary 3. run make all -f stand/Makefile.bcc55-w32 Windows/MS-VisualC++8.0 instructions 1. Unpack archive 2. Edit stand/Makefile.vc8 if nesessary 3. run nmake all -f stand/Makefile.vc8 _________________________________________________________________ Files e-PCR package includes two tool sets: forward e-PCR (e-PCR) and reverse e-PCR (re-PCR, fahash and famap). These binaries share some source files, that are compiled as libepcr.a library. Directory stand/ contains makefiles to use with GCC. Change stand/config.mk to update compiler and compiling options. Makefiles should be OK for GNU make. 
Sources common to forward and reverse e-PCR binaries build_cfg.h macroses that control compilation with/without NCBI toolkit defaults.h defaults for e-PCR program (used also in library) mswin.h windows compatibility defines and declarations native64.h native 64bit file access compatibility defines and declarations strref.hpp class for passing reference to string data sts_i.hpp generic STS interface class hashset.hpp hash calculating class declaration (allows set of discontiguos words) hashset.cpp hash calculating class implementation align.hpp align or compare two sequences allowing mismatches and gaps, declarations align.cpp align or compare two sequences allowing mismatches and gaps, implementation minilcs.hpp align two sequences templat class faread.hpp read fasta files, declarations faread.cpp read fasta files, implementation mmap.hpp mmap(2) wrapper for huge files and no page boundary restriction, declarations mmap.cpp mmap(2) wrapper for huge files and no page boundary restriction, implementation getopt.c getopt implementation -- to compile for windows Forward e-PCR source files e-PCR_main.cpp main for e-PCR commandline program stsmatch_i.hpp STS lookup algorithm declarations stsmatch_i.cpp STS lookup algorithm implementation stsfilter.cpp Postprocessor for STS lookup stsmatch_m.hpp STS and STS hash table implementation for mmapable UniSTS file, declarations stsmatch_m.cpp STS and STS hash table implementation for mmapable UniSTS file, implementation stsmatch.hpp Some useful callbacks declarations stsmatch.cpp Some useful callbacks, implementation Reverse e-PCR source files bin-io.hpp Generic read/write integers and strings fahash_defines.h Internal defines for fahash fahash_internal.hpp Internal defines for fahash fahash.hpp Hash sequence words in file, declarations fahash_create.cpp Hash sequence words in file, creating hash file, abstract fahash_create1.cpp Hash sequence words in file, creating hash file version 1 implementation fahash_create2.cpp Hash 
sequence words in file, creating hash file version 2 implementation fahash_lookup.cpp Hash sequence words in file, lookup algorithm implementation fast_seqio.hpp Fast access to regions of sequences, declarations fast_seqio_read.cpp Fast access to regions of sequences, implementation fast_seqio_write.cpp Fast access to regions of sequences, create sequence file implementation famap_main.cpp main for commandline tool to create/dump mmapable file fahash_main.cpp main for commandline tool to create hash file sts.hpp simple implementation for STS class re-PCR_main.cpp main for reverse e-PCR commandline tool Extra files seqcmp_main.cpp Main file for align.?pp test Build files Makefile Master makefile stand/version.mk Version definitions stand/config.mk Compiler options stand/Makefile.libepcr libepcr makefile stand/Makefile.cmd_epcr e-PCR makefile stand/Makefile.cmd_repcr re-PCR makefile stand/Makefile.cmd_famap famap makefile stand/Makefile.cmd_fahash fahash makefile stand/Makefile.cmd_seqcmp seqcmp makefile stand/Makefile.bcc55-w32 makefile for BorlandC++/win32 stand/Makefile.vc8 makefile for MS Visual C++ 8.0 _________________________________________________________________ e-PCR-2.3.12/README.html0000644001137700010620000005125011745334031014226 0ustar rotmistrcontig Electronic PCR commandline tools: operating instructions

Electronic PCR commandline tools: operating instructions

Version: 2.3.12

  1. Forward e-PCR
    1. Example
    2. Synopsis
    3. Description
    4. Options
      1. General options
      2. Hash building options
      3. Hit quality options
      4. Alignment algorithms options
      5. Report options
    5. Output formats
    6. Exit codes
  2. Reverse e-PCR
    1. Example
    2. Synopsis
    3. Description
    4. Options
      1. Common options
      2. famap options
      3. Fahash
      4. Commands
      5. Search options
    5. Output format
      1. For primer lookup
      2. For STS lookup
    6. Exit codes
    7. Bugs and features
  3. File formats

Use e-PCR to map sequences using STS database

Use re-PCR to map STSes or short primers in sequence database

Use famap and fahash to prepare sequence database for re-PCR searches.


Forward e-PCR

Example

work> e-PCR -w9 -f 1 -m100 mystsdb.sts D=100-400 myfastafile.fa N=1 G=1 T=3

Synopsis


e-PCR [-hV] [posix-options] stsfile [fasta ...] [compat-options]
where posix-options are:
	-m ##	Margin (default 50)
	-w ##	Wordsize  (default 7)
	-n ##	Max mismatches allowed (default 0)
	-g ##	Max indels allowed (default 0)
	-f ##	Use ## discontiguos words
	-o ##	Set output file
	-t ##	Set output format:
		1 - classic, range (pos1..pos2)
		2 - classic, midpoint
		3 - tabular
                4 - tabular with alignment in comments (slow)
        -d ##-## Set default sts size 
	-p +-	Turn hits postprocess on/off
	-v +-	Verbose on/Off
        -a a|f  Use presize alignmens (only if gaps>0), slow
                 a - Allways or f - as Fallback
        -x +-   Use 5'-end lowercase masking of primers (default -)
        -u +-   Uppercase all primers (default -)
and compat-options (duplicate posix-options) are:
	M=##	Margin (default 50)
	W=##	Wordsize  (default 7)
	N=##	Max mismatches allowed (default 0)
	G=##	Max indels allowed (default 0)
	F=##	Use ## discontinuos words
	O=##	Set output file to ##
	T=##	Set output format (1..4)
        D=##-## Set default sts size 
	P=+-	Postprocess hits on/off
	V=+-	Verbose on/Off
        A=a|f   Use presize alignmens (only if gaps>0), slow
                 a - Allways or f - as Fallback
        X=+-    Use 5'-end lowercase masking of primers (default -)
        U=+-    Uppercase all primers (default -)
	-mid	Same as T=2

Description

e-PCR parses stsfile in unists format, then reads nucleotide sequence data in FASTA format from files listed in commandline if any, or from stdin otherwise. For input sequences e-PCR finds matches and prints output in one of four formats.

Options

Two sets of options are used: POSIX-compatible and old-style provided for compatibility with previous versions of e-PCR.

Posix-style options can appear only before first parameter not starting with '-'. Argument '--' explicitly stops parsing arguments as posix options.

Compatibility options can appear anywhere in commandline. '-mid' can appear anywhere and does not stop posix options recognition.

General options

-V
Print version, exit after parsing commandline
-h
Print help, exit after parsing commandline

Hash building options

-w wordsize | W=wordsize
Set word size for primers hash (nucleotide positions). Longer word size decreases hash collision rate, but increases memory usage. Also no mismatches are allowed within word size near "inner" boundary of primers unless one uses discontiguous words, and no gaps are ever allowed in that region.
-f wordcnt | F=wordcnt
Set discontiguous word count for primers hash (1 means "use contiguous words"). Discontiguous words increase number of hash tables and decrease "effective" word size (thus increasing hash collision rate), so they make search significantly slower, but increase sensitivity by allowing mismatches within word size. Reasonable values are 1 (contiguous words) and 3.
-d lo-hi | D=lo-hi
Set default STS size range - values used for STSs that have no size associated in file.

Hit quality options

-m margin | M=margin
Set maximal allowed deviation of hit product size from expected STS size.
-n mism | N=mism
Set maximal number of mismatches allowed in primer-to-sequence alignment (per primer!).
-g gaps | G=gaps
Set maximal number of gaps allowed in primer-to-sequence alignment (per primer!).

Alignment algorithms options

-a a|f | A=a|f
Use NW algorithm to align primers to sequence: a - always, f - as fallback if fast algorithm gives no hit at this position.
-x +|- | X=+|-
Turn on/off recognising of lowercase characters at 5'-ends of primers as nucleotides that don't need to be aligned to sequence (floppy tails).
-u +|- | U=+|-
Uppercase primers. To use with files prepared for ``-x=+'' mode, but requiring full primer alignment.
If STS file contains primers with lowercase characters, you have to use either -x+ or -u+ flag.

Report options

-o output | O=output
Set output file.
-t 1|2|3|4 | T=1|2|3|4
Set output format.
-p +|- | P=+|-
Set hit grouping on/off: when using discontiguous words and gaps, some hits may be reported multiple times with slightly different quality. This option controls reporting only the best hit of a group of overlapping hits. Default depends on F and G values.
-v +|- | V=+|-
Report sequence ids to stderr on/off.

Output formats

1: Traditional: reports whitespace-separated
  • Sequence FASTA identifier
  • POS1..POS2 -- start and end positions of hit (includes length of floppy tail)
  • STS identifier (col. 1 from STS file)
  • STS description (columns 5..last from STS file)

In this format product size equals POS2-POS1+1

2: Traditional midpoint: reports whitespace-separated
  • Sequence FASTA identifier
  • POS -- middle point position of hit
  • STS identifier (col. 1 from STS file)
  • STS description (columns 5..last from STS file)
3: Tab-separated detailed
  • Sequence FASTA identifier
  • STS identifier (col. 1 from STS file)
  • +|- -- strand of hit (order of primers in hit)
  • POS1 -- start position of hit (does not include floppy tail if any)
  • POS2 -- end position of hit (does not include floppy tail)
  • SIZE/MIN..MAX -- observed size of hit/expected size range of STS
  • MISM -- Total number of mismatches for two primers
  • GAPS -- Total number of gaps for two primers
  • STS description (columns 5..last from STS file)

In this format product size may be greater than POS2-POS1+1 for probes with floppy tails

4: Tab-separated detailed with alignment
Is same as format 3, but also containing visualisations of alignments in comment lines (lines starting with ``#'')

Exit codes

Zero on success, nonzero on fail


Reverse e-PCR

Example

work> famap -tN -b genome.famap org/chr_*.fa
work> fahash -b genome.hash -w 12 -f3 ${PWD}/genome.famap
work> re-PCR -s genome.hash -n1 -g1 ACTATTGATGATGA AGGTAGATGTTTTT 120-200

Synopsis


famap [-hV]
famap -b mmapped-file [-t cvt] [fasta-file ...]
famap -d mmapped-file [ord ...]
famap -l mmapped-file [ord ...]
where cvt is one of: off n N nx NX

fahash [-hV]
fahash -b hash-file [build-options] mmapped-file ...
fahash -T hash-file [-o output]

where:
	-b hash-file	Build hash tables (hash-file) from sequence files,
	-T hash-file	Print word usage statistics for hash-file
	-o outfile   	Set output file name for -T

build-options:
	-w wordsize 	Set word size when building hash tables
	-f period   	Set discontiguity when building hash tables
	-k          	Skip repeats when building indexfile
	-F min,max  	Set watermarks for fragment size (in Mb) for -v1
        -v 1|2          Build file of format version 1 or 2
        -c cachesize    Use cache size cachesize (for -v2)

re-PCR [-hV]
re-PCR -p hash-file [-g gaps] [-n mism] [primer ...]
re-PCR -P hash-file [-g gaps] [-n mism] [primer-file ...]
re-PCR -s hash-file [search-options] [-O output] [left right lo hi [...]]
re-PCR -S hash-file [search-options] [-O output] [-C bcnt] [stsfile ...]

where:
	-p hash-file	Perform primer lookup using hash-file
	-P hash-file	Perform primer lookup using hash-file
	-s hash-file	Perform STS lookup using hash-file, STSs in cmdline
	-S hash-file	Perform STS lookup using hash-file, STSs in file


search-options:
	-n mism      	Set max allowed mismatches per primer for lookup
	-g gaps      	Set max allowed indels per primer for lookup
	-m margin    	Set variability for STS size for lookup
        -d min-max      Set default STS size (for STSs without size set)
	-r +|-       	Enable/disable reverse STS lookup
	-O +|-       	Enable/disable syscall optimisation

	-C batchcnt  	Set number of STSes per batch
	-o outfile   	Set output file name

Description

Reverse e-PCR (re-PCR) performs STS or primer lookup against sequence database. Two files are required for database: mmapped-file with sequence data in fast random-accessible format and hash-file, that keeps precalculated positions of all words of sequence database.

Use famap to build mmapped-file from FASTA files.

Use fahash to build hash-file, and output word usage statistics.

Use re-PCR to perform STS and primer searches.

Discontiguous words are supported by re-PCR as well as contiguous.

Options

Common options

-V
Print version, exit after parsing commandline
-h
Print help, exit after parsing commandline

famap options

-b mmapped-file
Build famap-file from input fasta file(s). If no fasta files are set in commandline, use stdin as input.
-d mmapped-file
Dump famap-file contents in fasta format. If ord number(s) are set, print only sequences with given ordinals.
-l mmapped-file
List famap-file sequence identifiers. If ord number(s) are set, print only sequences with given ordinals.
-t cvt-table
Use compiled-in table to convert input.
n
Nucleotides. Allowed characters are [actgACTGnN]. Other letters are converted to n or N. Rest of symbols are ignored. Case is preserved.
nx
Nucleotides with extended ambiguity codes iupac_na, lowercase are allowed. Other letters are converted to n or N. Rest of symbols are ignored. Case is preserved.
N
Nucleotides. Allowed characters are [ACTGN]. [actgn] are converted to uppercase. Other letters are converted to N. Rest of symbols are ignored.
NX
Nucleotides with extended ambiguity codes iupac_na, lowercase are converted to uppercase. Other letters are converted to N. Rest of symbols are ignored.

Fahash

-b hash-file
Build hash-file for mmapped-file(s).
-T hash-file
Dump word usage statistics for hash-file.
-v version
Build hash-file of version 1 or 2 (2 is default).
-w wordsize
Build hash-file for word wordsize nucleotides long.
-f wordcnt
Build hash-file for wordcnt discontiguous words. 1 stands for contiguous words.
-F min,max
Use memory watermarks (Mbytes) for hash table size (for -v 1).
-c cachesize
Set cache size for -v 2.
-o output-file
Use output-file for output result of -T.

Commands

-p hash-file
Perform lookup for primers given in commandline.
-s hash-file
Perform lookup for STSes given in commandline.
-S hash-file
Perform lookup for STSes taken from unists file(s) given in commandline.

Search options

-n mism
Number of mismatches allowed per primer.
-g gaps
Number of gaps allowed per primer.
-m margin
Maximal deviation of observed product size to expected STS size.
-d lo-hi
Set default STS size range - values used for STSs that have no size associated in file.
-r +|-
Enable|disable flipped STS lookup (default is "enabled").
-O +|-
Enable|disable syscall optimisation. Since lookup is i/o expensive, enabling this parameter may improve search performance diskwise. On the other hand, it takes significantly more memory and CPU.
-C batchcount
How many STSs from input file to look up in one pass. May affect performance, especially when used with -O +.
-o output-file
Use output-file for output.

Output format

Is tab-separated file with following fields:

For primer lookup

  • Primer ID
  • Sequence ID
  • Strand
  • Hit start
  • Hit end
  • Mismatches
  • Gaps
  • Size

For STS lookup

  • STS ID
  • Sequence ID
  • Strand
  • Hit start
  • Hit end
  • Mismatches
  • Gaps
  • Observed Size/Expected size range

Exit codes

Zero on success, non-zero on errors

Bugs and features


File formats

STS database
Is single-tab (i.e. two tabs in a row mean "empty field") separated file with following fields:
  • STS id (required).
  • First (left) primer (required).
  • Second (right) primer (required).
  • Product size (optional): can be number for strict size, or two numbers separated by dash for size range.
  • Additional info, that can be used by applications (optional).
Primers should be in iupac_na encoding, everything that is not ACTG or actg is translated to N or n. Primer sequences should be uppercase, unless you want to use the file with the e-PCR -x+ flag - then the first several nucleotides of primers may be lowercase-masked. If primers are not fully uppercase and you don't use -x+ flag, you have to use -u+ flag with e-PCR.
Primers file
Is single-tab (i.e. two tabs in a row mean "empty field") separated file with following fields:
  • Primer id (required).
  • Primer sequence.

e-PCR-2.3.12/README.txt0000644001137700010620000003612011745334031014100 0ustar rotmistrcontig Electronic PCR commandline tools: operating instructions Version: 2.3.12 _________________________________________________________________ Use e-PCR to map sequences using STS database Use re-PCR to map STSes or short primers in sequence database Use famap and fahash to prepare sequence database for re-PCR searches. _________________________________________________________________ Forward e-PCR Example work> e-PCR -w9 -f 1 -m100 mystsdb.sts D=100-400 myfastafile.fa N=1 G=1 T=3 Synopsis e-PCR [-hV] [posix-options] stsfile [fasta ...] [compat-options] where posix-options are: -m ## Margin (default 50) -w ## Wordsize (default 7) -n ## Max mismatches allowed (default 0) -g ## Max indels allowed (default 0) -f ## Use ## discontiguos words -o ## Set output file -t ## Set output format: 1 - classic, range (pos1..pos2) 2 - classic, midpoint 3 - tabular 4 - tabular with alignment in comments (slow) -d ##-## Set default sts size -p +- Turn hits postprocess on/off -v +- Verbose on/Off -a a|f Use presize alignmens (only if gaps>0), slow a - Allways or f - as Fallback -x +- Use 5'-end lowercase masking of primers (default -) -u +- Uppercase all primers (default -) and compat-options (duplicate posix-options) are: M=## Margin (default 50) W=## Wordsize (default 7) N=## Max mismatches allowed (default 0) G=## Max indels allowed (default 0) F=## Use ## discontinuos words O=## Set output file to ## T=## Set output format (1..4) D=##-## Set default sts size P=+- Postprocess hits on/off V=+- Verbose on/Off A=a|f Use presize alignmens (only if gaps>0), slow a - Allways or f - as Fallback X=+- Use 5'-end lowercase masking of primers (default -) U=+- Uppercase all primers (default -) -mid Same as T=2 Description e-PCR parses stsfile in unists format, then reads nucleotide sequence data in FASTA format from files listed in commandline if any, or from stdin otherwise. 
For input sequences e-PCR finds matches and prints output in one of four formats. Options Two sets of options are used: POSIX-compatible and old-style provided for compatibility with previous versions of e-PCR. Posix-style options can appear only before first parameter not starting with '-'. Argument '--' explicitly stops parsing arguments as posix options. Compatibility options can appear anywhere in commandline. '-mid' can appear anywhere and does not stop posix options recognition. General options -V Print version, exit after parsing commandline -h Print help, exit after parsing commandline Hash building options -w wordsize | W=wordsize Set word size for primers hash (nucleotide positions). Longer word size decreases hash collision rate, but increases memory usage. Also no mismatches are allowed within word size near "inner" boundary of primers unless one uses discontiguous words, and no gaps are ever allowed in that region. -f wordcnt | F=wordcnt Set discontiguous word count for primers hash (1 means "use contiguous words"). Discontiguous words increase number of hash tables and decrease "effective" word size (thus increasing hash collision rate), so make search significantly slower, but increase sensitivity by allowing mismatches within word size. Reasonable values are 1 (contiguous words) and 3. -d lo-hi | D=lo-hi Set default STS size range - values used for STSs that have no size associated in file. Hit quality options -m margin | M=margin Set maximal allowed deviation of hit product size from expected STS size. -n mism | N=mism Set maximal number of mismatches allowed in primer-to-sequence alignment (per primer!). -g gaps | G=gaps Set maximal number of gaps allowed in primer-to-sequence alignment (per primer!). Alignment algorithms options -a a|f | A=a|f Use NW algorithm to align primers to sequence: a - always, f - as fallback if fast algorithm gives no hit at this position.
-x +|- | X=+|- Turn on/off recognising of lowercase characters at 5'-ends of primers as nucleotides that don't need to be aligned to sequence (floppy tails). -u +|- | U=+|- Uppercase primers. To use with files prepared for ``-x=+'' mode, but requiring full primer alignment. If STS file contains primers with lowercase characters, you have to use either -x+ or -u+ flag. Report options -o output | O=output Set output file. -t 1|2|3|4 | T=1|2|3|4 Set output format. -p +|- | P=+|- Set hit grouping on/off: when using discontiguous words and gaps, some hits may be reported multiple times with slightly different quality. This option controls reporting only best hit of group of overlapping hits. Default depends on F and G values. -v +|- | V=+|- Report sequence ids to stderr on/off. Output formats 1: Traditional: reports whitespace-separated + Sequence FASTA identifier + POS1..POS2 -- start and end positions of hit (includes length of floppy tail) + STS identifier (col. 1 from STS file) + STS description (columns 5..last from STS file) In this format product size equals POS2-POS1+1 2: Traditional midpoint: reports whitespace-separated + Sequence FASTA identifier + POS -- middle point position of hit + STS identifier (col. 1 from STS file) + STS description (columns 5..last from STS file) 3: Tab-separated detailed + Sequence FASTA identifier + STS identifier (col.
1 from STS file) + +|- -- strand of hit (order of primers in hit) + POS1 -- start position of hit (does not include floppy tail if any) + POS2 -- end position of hit (does not include floppy tail) + SIZE/MIN..MAX -- observed size of hit/expected size range of STS + MISM -- Total number of mismatches for two primers + GAPS -- Total number of gaps for two primers + STS description (columns 5..last from STS file) In this format product size may be greater than POS2-POS1+1 for probes with floppy tails 4: Tab-separated detailed with alignment Same as format 3, but also contains visualisations of alignments in comment lines (lines starting with ``#'') Exit codes Zero on success, nonzero on failure _________________________________________________________________ Reverse e-PCR Example work> famap -tN -b genome.famap org/chr_*.fa work> fahash -b genome.hash -w 12 -f3 ${PWD}/genome.famap work> re-PCR -s genome.hash -n1 -g1 ACTATTGATGATGA AGGTAGATGTTTTT 120-200 Synopsis famap [-hV] famap -b mmapped-file [-t cvt] [fasta-file ...] famap -d mmapped-file [ord ...] famap -l mmapped-file [ord ...] where cvt is one of: off n N nx NX fahash [-hV] fahash -b hash-file [build-options] mmapped-file ... fahash -T hash-file [-o output] where: -b hash-file Build hash tables (hash-file) from sequence files, -T hash-file Print word usage statistics for hash-file -o outfile Set output file name for -T build-options: -w wordsize Set word size when building hash tables -f period Set discontiguity when building hash tables -k Skip repeats when building indexfile -F min,max Set watermarks for fragment size (in Mb) for -v1 -v 1|2 Build file of format version 1 or 2 -c cachesize Use cache size cachesize (for -v2) re-PCR [-hV] re-PCR -p hash-file [-g gaps] [-n mism] [primer ...] re-PCR -P hash-file [-g gaps] [-n mism] [primer-file ...] re-PCR -s hash-file [search-options] [-o output] [left right lo hi [...]] re-PCR -S hash-file [search-options] [-o output] [-C bcnt] [stsfile ...]
where: -p hash-file Perform primer lookup using hash-file, primers in cmdline -P hash-file Perform primer lookup using hash-file, primers in file -s hash-file Perform STS lookup using hash-file, STSs in cmdline -S hash-file Perform STS lookup using hash-file, STSs in file search-options: -n mism Set max allowed mismatches per primer for lookup -g gaps Set max allowed indels per primer for lookup -m margin Set variability for STS size for lookup -d min-max Set default STS size (for STSs without size set) -r +|- Enable/disable reverse STS lookup -O +|- Enable/disable syscall optimisation -C batchcnt Set number of STSes per batch -o outfile Set output file name Description Reverse e-PCR (re-PCR) performs STS or primer lookup against sequence database. Two files are required for database: mmapped-file with sequence data in fast random-accessible format and hash-file, that keeps precalculated positions of all words of sequence database. Use famap to build mmapped-file from FASTA files. Use fahash to build hash-file, and output word usage statistics. Use re-PCR to perform STS and primer searches. Discontiguous words are supported by re-PCR as well as contiguous. Options Common options -V Print version, exit after parsing commandline -h Print help, exit after parsing commandline famap options -b mmapped-file Build famap-file from input fasta file(s). If no fasta files are set in commandline, use stdin as input. -d mmapped-file Dump famap-file contents in fasta format. If ord number(s) are set, print only sequences with given ordinals. -l mmapped-file List famap-file sequence identifiers. If ord number(s) are set, print only sequences with given ordinals. -t cvt-table Use compiled-in table to convert input. n Nucleotides. Allowed characters are [actgACTGnN]. Other letters are converted to n or N. Rest of symbols are ignored. Case is preserved. nx Nucleotides with extended ambiguity codes iupac_na, lowercase are allowed. Other letters are converted to n or N. Rest of symbols are ignored. Case is preserved.
N Nucleotides. Allowed characters are [ACTGN]. [actgn] are converted to uppercase. Other letters are converted to N. Rest of symbols are ignored. NX Nucleotides with extended ambiguity codes iupac_na, lowercase are converted to uppercase. Other letters are converted to N. Rest of symbols are ignored. Fahash -b hash-file Build hash-file for mmapped-file(s). -T hash-file Dump word usage statistics for hash-file. -v version Build hash-file of version 1 or 2 (2 is default). -w wordsize Build hash-file for word wordsize nucleotides long. -f wordcnt Build hash-file for wordcnt discontiguous words. 1 stands for contiguous words. -F min,max Use memory watermarks (Mbytes) for hash table size (for -v 1). -c cachesize Set cache size for -v 2. -o output-file Use output-file for output result of -T. Commands -p hash-file Perform lookup for primers given in commandline. -s hash-file Perform lookup for STSes given in commandline. -S hash-file Perform lookup for STSes taken from unists file(s) given in commandline. Search options -n mism Number of mismatches allowed per primer. -g gaps Number of gaps allowed per primer. -m margin Maximal deviation of observed product size from expected STS size. -d lo-hi Set default STS size range - values used for STSs that have no size associated in file. -r +|- Enable|disable flipped STS lookup (default is "enabled"). -O +|- Enable|disable syscall optimisation. Since lookup is i/o expensive, enabling this parameter may improve search performance diskwise. On the other hand, it takes significantly more memory and CPU. -C batchcount How many STSs from input file to look up in one pass. May affect performance, especially when used with -O +. -o output-file Use output-file for output.
Output format Is tab-separated file with following fields: For primer lookup * Primer ID * Sequence ID * Strand * Hit start * Hit end * Mismatches * Gaps * Size For STS lookup * STS ID * Sequence ID * Strand * Hit start * Hit end * Mismatches * Gaps * Observed Size/Expected size range Exit codes Zero on success, non-zero on errors Bugs and features * Mmapped-file path is hardcoded to hash-file as it is in commandline when hash-file is being built, which means that when one performs searches mmapped-file should be accessible with same name from current directory, as it is hardcoded. * Mmapped-file is a proprietary format, that could be substituted with megablast database format, but is not (yet?) for performance reasons. * If sequence sizes are large, it may be tricky to create database with discontiguous words because of memory usage requirements. Changing parameter -F (for -v 1) or -c (for -v 2) may help. _________________________________________________________________ File formats STS database Is single-tab (i.e. two tabs in a row mean "empty field") separated file with following fields: + STS id (required). + First (left) primer (required). + Second (right) primer (required). + Product size (optional): can be number for strict size, or two numbers separated by dash for size range. + Additional info, that can be used by applications (optional). Primers should be in iupac_na encoding, everything that is not ACTG or actg is translated to N or n. Primers sequences should be uppercase, unless you want to use file with e-PCR -x+ flag - then several first nucleotides of primers may be lowercase-masked. If primers are not fully uppercase and you don't use -x+ flag, you have to use -u+ flag with e-PCR. Primers file Is single-tab (i.e. two tabs in a row mean "empty field") separated file with following fields: + Primer id (required). + Primer sequence. 
_________________________________________________________________ e-PCR-2.3.12/Changes0000644001137700010620000000750511745334031013702 0ustar rotmistrcontigversion 2.3.12 affected: few *.cpp files description: - Added include of cstdio to make it compilable with gcc 4.4 (thanks to Ubuntu and Debian-med teams) version 2.3.11 affected: stsmatch_i.?pp description: - When parsing commandline if -d was used before -w or -f default size was reset to program's default version 2.3.10 affected: includes in a few files description: - Applied user-provided patch to add includes for gcc-4.3 compatibility version 2.3.9 affected: fahash_lookup.cpp description: - Fixed a bug leading to false negatives when run without optimization version 2.3.8 affected: e-PCR_main.cpp build_cfg.h added: ncbi/Makefile ncbi/Makefile.INCLUDE description: - Added possibility to compile with NCBI C++ toolkit and read blastdb instead of fasta version 2.3.7 affected: build_cfg.h fast_seqio*.?pp fahash*.?pp stsmatch_m.cpp description: - Fixed reverse e-pcr code to work on Linux 64 bit architecture version 2.3.6 added: stand/Makefile.vc8 affected: mswin.h *_main.cpp description: - Fixed some warnings and made it compileable by MS Visual C++ 8.0 affected: version 2.3.5 affected: fahash_create2.cpp description: - Made more details to report seek failure version 2.3.4 affected: re-PCR_main.cpp, fahash_main.cpp description: - Minor changes to progress reporting version 2.3.3 affected: e-PCR_main.cpp (forward e-PCR) output description: - Attention: Hit positions do not include floppy tails anymore - TODO: same with reverse e-PCR - Fixed typo in BUILD.* version 2.3.2 affected: stsmatch_i.?pp, e-PCR_main.cpp (forward e-PCR) description: - Progress indication is more detailed and controlled version 2.3.1 affected: sts_i.hpp, stsmatch_m.?pp (forward e-PCR) description: - Bugfix: some hits were lost with large margin - Bugfix: re-PCR for win32 now works again - Can read primers from file version 2.3.0 affected: 
sts_i.hpp, stsmatch_m.?pp (forward e-PCR) description: - Support for overhang nucleotides in primers as masked characters version 2.2.3 affected: Makefiles description: - Compiling under Mac OS X version 2.2.2 affected: API, re-PCR, e-PCR, commandline description: - Better consistency for misalignments found and alignments shown version 2.2.1 affected: API, re-PCR, commandline description: - New output for re-PCR is available (with alignments) version 2.2.0 affected: Building, API, data format, e-PCR, re-PCR, commandline description: - API: Alignment now is performed by IAlign interface - seqcmp class is obsolete - Added new alignment method, more presize but slow (fixes bug) - New output is available, with graphic representation of alignments version 2.1.0 affected: Building, API, data format, e-PCR, commandline description: - e-PCR, re-PCR: fixed bug, that forbidded overlapping primers - e-PCR : fixed `empty description is shown as to "?"' - re-PCR : hash file version 2, new build algorithm new "fahash" program generates hash files re-PCR is for lookup only version 2.0.5 affected: Building, e-PCR, famap, commandline description: All tools are working for windows. Fasta conversion is added. 
version 2.0.4 affected: Building description: Now compileable with Borlang C++ Builder 5.5 for win32 version 2.0.3 affected: commandline parsing, API, defaults description: Changed defaults for M (Margin); now can be changed from commandline version 2.0.2 affected: e-PCR commandline parsing description: Fixed commandline processing -- compat options are allowed anywhere version 2.0.1 affected: API description: independent report for left and right primer misalignments in forward e-pcr API e-PCR-2.3.12/defaults.h0000644001137700010620000000435511745334031014367 0ustar rotmistrcontig// $Id: defaults.h,v 1.2 2004/03/30 18:52:19 rotmistr Exp $ /* =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= */
#ifndef EPCR_DEFAULTS__H
#define EPCR_DEFAULTS__H

/* Compile-time defaults and permitted ranges for the e-PCR command-line
 * parameters. Each group supplies a DEFAULT value together with the MIN/MAX
 * bounds for one parameter. */

//// Hash word size (nucleotide positions)
#define ePCR_WDSIZE_DEFAULT 7
#define ePCR_WDSIZE_MIN 3
#define ePCR_WDSIZE_MAX 8

//// Number of mismatches allowed
#define ePCR_MMATCH_DEFAULT 0
#define ePCR_MMATCH_MIN 0
#define ePCR_MMATCH_MAX 10

//// Number of gaps (indels) allowed
#define ePCR_GAPS_DEFAULT 0
#define ePCR_GAPS_MIN 0
#define ePCR_GAPS_MAX 5

//// Margin (allowed deviation in product size)
#define ePCR_MARGIN_DEFAULT 50
#define ePCR_MARGIN_MIN 0
#define ePCR_MARGIN_MAX 10000

//// Default STS size range (lo-hi), printed by the help text as the default for -d
#define ePCR_DEFAULT_size_lo 100
#define ePCR_DEFAULT_size_hi 350

#endif
/* * $Log: defaults.h,v $ * Revision 1.2 2004/03/30 18:52:19 rotmistr * Updated default STS size * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * * Revision 1.2 2003/11/20 02:12:29 rotmistr * Fixed id, log tags and copyright notice * */ e-PCR-2.3.12/fahash_defines.h0000644001137700010620000000364311745334031015506 0ustar rotmistrcontig// $Id: fahash_defines.h,v 1.2 2004/04/27 00:01:54 rotmistr Exp $ /* =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data.
The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= */
#ifndef FAHASH_DEFINES__HPP
#define FAHASH_DEFINES__HPP

/* Constants describing the reverse e-PCR index (hash) file. */

/* Signature string: 15 visible characters plus an explicit NUL, i.e. 16 bytes
 * before the literal's implicit terminator. */
#define SIGNATURE "Rev e-PCR index\0"
/* File format version words; FILE_VERSION2 is the "second version of reverse
 * hash file" mentioned in the revision log below. */
#define FILE_VERSION 0x00010000U
#define FILE_VERSION2 0x00020000U
/* NOTE(review): presumably stored in the file so a reader can detect a
 * byte-order mismatch -- confirm against the reader/writer code. */
#define BYTE_ORDER_WORD 0x01234567U
/* NOTE(review): presumably the size in bytes reserved for the file header --
 * confirm against the writer. */
#define HEADER_SIZE 4096

#endif
/* * $Log: fahash_defines.h,v $ * Revision 1.2 2004/04/27 00:01:54 rotmistr * Second version of reverse hash file started * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * * Revision 1.2 2003/11/20 02:12:29 rotmistr * Fixed id, log tags and copyright notice * */ e-PCR-2.3.12/fahash_internal.hpp0000644001137700010620000000436011745334031016242 0ustar rotmistrcontig/* $Id: fahash_internal.hpp,v 1.3 2007/07/11 20:49:29 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S.
* Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */
#ifndef FAHASH_INTERNAL__HPP
#define FAHASH_INTERNAL__HPP

// Throws runtime_error carrying the message `a`, a ": " separator and the
// strerror() text for the current errno.
// NOTE(review): `a` is pasted unparenthesized into `a+": "`, so it presumably
// must already be a std::string expression -- confirm at call sites.
#define SYSERROR(a) throw runtime_error(a+": "+strerror(errno))

// Shorthand for the indexer's hash element type.
typedef AFaIndexerBase::THashElement THashElement;

// Mask with only the most significant bit of THashElement set.
static const THashElement kHighBit =
    THashElement( 1 ) << ( 8 * sizeof(THashElement) - 1 );

// Powers of 1024.
static const unsigned kKilobyte = 1024;
static const unsigned kMegabyte = 1024 * kKilobyte;
static const unsigned kGigabyte = 1024 * kMegabyte;

// Fragment size bounds: 3/16 and 6/16 of kGigabyte (201326592 and 402653184).
// NOTE(review): presumably the default watermarks behind fahash's -F option;
// the unit (bytes vs. elements) is not visible here -- confirm.
static const unsigned kMinFragSize = kGigabyte / 4 / 4 * 3;
static const unsigned kMaxFragSize = kGigabyte / 4 / 4 * 6;

//static const unsigned min_frag_size=100*Megabyte/4*3;
//static const unsigned max_frag_size=100*Megabyte/4*6;

#endif
/* * $Log: fahash_internal.hpp,v $ * Revision 1.3 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.2 2004/09/03 19:06:41 rotmistr * Code formatting changes * */ e-PCR-2.3.12/build_cfg.h0000644001137700010620000001115611745334032014474 0ustar rotmistrcontig/* $Id: build_cfg.h,v 1.13 2008/03/26 16:04:29 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction.
* * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_BUILD__HPP #define EPCR_BUILD__HPP #ifdef STANDALONE #include namespace std {} #define BEGIN_SCOPE(a) namespace a { #define END_SCOPE(a) } #define BEGIN_NCBI_SCOPE BEGIN_SCOPE(ncbi) USING_SCOPE(std); #define END_NCBI_SCOPE END_SCOPE(ncbi) #define USING_SCOPE(a) using namespace a #define USING_NCBI_SCOPE USING_SCOPE(ncbi) #ifdef _WIN32 #include #define FILE_BINARY "b" #define FILE_TEXT "t" //#warning "Using Borland C/C++ Builder config" #define madvise(a,b,c) // no madvise #define MADV_SEQUENTIAL 0 #define MADV_DONTNEED 0 #ifdef __cplusplus BEGIN_NCBI_SCOPE #endif typedef char Int1; typedef short Int2; typedef int Int4; typedef long long Int8; typedef unsigned char Uint1; typedef unsigned short Uint2; typedef unsigned int Uint4; typedef unsigned long long Uint8; #ifdef __cplusplus END_NCBI_SCOPE #endif #else // _WIN32 #include #include #include #ifdef NATIVE_LARGEFILES #include #endif // NATIVE_LARGEFILES #include #ifdef __cplusplus BEGIN_NCBI_SCOPE #endif typedef int8_t Int1; typedef int16_t Int2; typedef int32_t Int4; typedef int64_t Int8; typedef uint8_t Uint1; typedef uint16_t Uint2; typedef uint32_t Uint4; typedef uint64_t Uint8; #ifdef __cplusplus END_NCBI_SCOPE #endif #ifdef __cplusplus extern "C" { #endif // __cplusplus int madvise(void* addr, size_t len, 
int advice); #ifdef __cplusplus } #endif // __cplusplus #endif // _WIN32 #else // STANDALONE #include #include #include #include #endif // STANDALONE #define EPCR_SCOPE pcr_tools #ifndef FILE_BINARY #define FILE_BINARY #endif #ifndef FILE_TEXT #define FILE_TEXT #endif #endif /* * $Log: build_cfg.h,v $ * Revision 1.13 2008/03/26 16:04:29 rotmistr * Added support for blastdb files * * Revision 1.12 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.11 2007/07/05 16:23:08 rotmistr * Forgot two changes * * Revision 1.10 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.9 2004/09/03 21:28:49 rotmistr * Fixes to compile with Borland C++ 5.5 * * Revision 1.8 2004/09/03 15:54:43 rotmistr * Compilation for Mac OS/X * * Revision 1.7 2004/05/27 20:35:46 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. * * Revision 1.6 2004/04/15 14:18:22 rotmistr * Fix to compile with NCBI toolkit (CGI) * * Revision 1.5 2004/04/06 04:53:17 rotmistr * All is compileable with BCC5.5 and runnable on WIndows * * Revision 1.4 2004/04/01 16:37:40 rotmistr * Cleaned after adding windows capabilities * * Revision 1.3 2004/04/01 05:57:52 rotmistr * Compilable with borland C++ * * Revision 1.2 2004/02/04 21:23:21 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/getopt.c0000644001137700010620000000747511745334032014064 0ustar rotmistrcontig/* $Id: getopt.c,v 1.2 2004/09/03 19:59:25 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * 
This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include int optind=1; int optopt=-1; int opterr=0; const char* optarg=0; static char * nextarg=0; int getopt(int argc, char ** argv, const char* optstring) { while(optind=argc) { fprintf(stderr, "getopt: need argument for %c\n", optopt); ++opterr; return -1; } optarg=argv[optind]; } nextarg=0; ++optind; } else { optarg=0; if(nextarg[1]) ++nextarg; else { nextarg=0; ++optind; } } return optopt; } else { fprintf(stderr, "getopt: invalid option %c\n", optopt); if(nextarg[1]) ++nextarg; else { nextarg=0; ++optind; } ++opterr; } } while(0); break; case ':': // ':' should not be used as option fprintf(stderr,"getopt: bad option %c\n",optopt); if(nextarg[1]) ++nextarg; else { nextarg=0; ++optind; } ++opterr; break; } } return optopt=-1; } /* * $Log: getopt.c,v $ * Revision 1.2 2004/09/03 19:59:25 rotmistr * *** empty log message *** * * Revision 1.1 
2004/04/02 15:43:55 rotmistr * *** empty log message *** * */ e-PCR-2.3.12/bin-io.hpp0000644001137700010620000001117511745334032014274 0ustar rotmistrcontig/* $Id: bin-io.hpp,v 1.5 2008/04/28 16:38:45 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_BIN_IO__HPP #define EPCR_BIN_IO__HPP #include #include #include #include #include #include #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) // This code is being used INTERNALLY by e-PCR library enum EByteOrder { eHiEndian = 0x78563412, eLoEndian = 0x12345678 }; template inline T BoCvt( T x, bool do_swap ) { if( do_swap ) { unsigned char * a = (unsigned char*) &x; unsigned char * b = a + sizeof( x ) - 1; for( ; a < b ; ++a, --b ) swap( *a, *b ); } return x; } template inline void Write( int fd, const T& t, unsigned sz = 1 ) { if( (size_t) write( fd, &t, sz * sizeof(T) ) != sz * sizeof( T ) ) throw runtime_error( "write failed: " + string( std::strerror( errno ) ) ); } template<> inline void Write( int fd, const string& t, unsigned ) { Write( fd, Uint4( t.length() ) ); if( (size_t)write( fd, t.data(), t.length() ) != t.length() ) throw runtime_error( "write failed: " + string( strerror( errno ) ) ); } template inline void Write( FILE* f, const T& t, unsigned sz = 1 ) { if( (size_t)fwrite( &t, sizeof( T ), sz, f ) != sz ) throw runtime_error( "write failed: " + string( strerror( errno ) ) ); } template<> inline void Write(FILE* f, const string& t, unsigned ) { Write( f, Uint4( t.length() ) ); if((size_t)fwrite( t.data(), 1, t.length(), f ) != t.length() ) throw runtime_error( "write failed: " + string( strerror( errno ) ) ); } template inline T Read( int fd ) { T t( -1 ); if( (size_t)read( fd, &t, sizeof( T ) ) != sizeof( T ) && errno ) throw runtime_error( "read failed: " + string( strerror( errno ) ) ); return t; } template<> inline string Read( int fd ) { vector t( Read( fd ) ); if( t.size() && (size_t)read( fd, &t[0], t.size() ) != t.size() && errno ) throw runtime_error( "read failed: " + string( strerror( errno ) ) ); return string( &t[0], t.size() ); } 
inline off64_t SeekAlign( int fd, unsigned page = 4096 ) { off64_t c = lseek64( fd, 0, SEEK_CUR ); if( c % page ) c = lseek64( fd, page - c % page, SEEK_CUR ); return c; } inline off64_t SeekAlign( FILE* f, unsigned page = 4096 ) { off64_t c = ftello64( f ); if( c % page ) { fseeko64( f, page - c % page, SEEK_CUR ); c = ftello64( f ); } return c; } END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: bin-io.hpp,v $ * Revision 1.5 2008/04/28 16:38:45 rotmistr * Applied patch to build with gcc-4.3 * * Revision 1.4 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.3 2004/04/01 05:57:52 rotmistr * Compilable with borland C++ * * Revision 1.2 2004/02/04 21:23:21 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/e-PCR_main.cpp0000644001137700010620000007421011745334032014763 0ustar rotmistrcontig/* $Id: e-PCR_main.cpp,v 1.25 2008/06/18 14:45:33 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. 
* Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include #include #include #include #include #include #ifndef STANDALONE //#include //#include #include #include #endif USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); class CMain { public: enum EAlignMode { eNever, eAlways, eFallback }; CMain(int c, char ** v): argc(c),argv(v),done(false),ofmt(1), postprocess(true),have_postprocess(false),verbose(0), m_MaxMismatch(ePCR_MMATCH_DEFAULT), m_MaxGaps(ePCR_GAPS_DEFAULT), m_AlignL(0), m_AlignR(0), m_AlignMode(eNever) #ifndef STANDALONE , m_blastdbs( false ) #endif { stsFileHash.SetHash(CHashSet(ePCR_WDSIZE_DEFAULT,0)); pcrMachine.SetMargin(ePCR_MARGIN_DEFAULT); } int Run(); protected: int Execute(); int ParseCmdline(); int Help(FILE* = stdout); int Version(); void ParseVerbose(const char * opt); protected: int argc; char ** argv; bool done; protected: // CPcrMachineCompat pcrMachine; CPcrMachine pcrMachine; CStsFileHash stsFileHash; string stsfile; list fafiles; int ofmt; string ofile; bool postprocess, have_postprocess; int verbose; int m_MaxMismatch, m_MaxGaps; IAlign * m_AlignL, * m_AlignR; EAlignMode m_AlignMode; #ifndef STANDALONE bool m_blastdbs; string m_gilist; #endif }; int CMain::Help(FILE* out) { done=true; fprintf(out, "usage: [-hV] [posix-options] stsfile [fasta ...] 
" "[compat-options]\n" "where posix-options are:\n"); fprintf(out,"\t-m ##\tMargin (default %d)\n",ePCR_MARGIN_DEFAULT); fprintf(out,"\t-w ##\tWordsize (default %d)\n",ePCR_WDSIZE_DEFAULT); fprintf(out,"\t-n ##\tMax mismatches allowed (default %d)\n", ePCR_MMATCH_DEFAULT); fprintf(out,"\t-g ##\tMax indels allowed (default %d)\n", ePCR_GAPS_DEFAULT); fprintf(out,"\t-f ##\tUse ## discontiguos words, slow if ##>1\n"); fprintf(out,"\t-o ##\tSet output file\n"); fprintf(out,"\t-t ##\tSet output format:\n" "\t\t1 - classic, range (pos1..pos2)\n" "\t\t2 - classic, midpoint\n" "\t\t3 - tabular\n" "\t\t4 - tabular with alignment in comments (slow)\n" ); fprintf(out,"\t-d##-##\tSet default size range (default %d-%d)\n", ePCR_DEFAULT_size_lo,ePCR_DEFAULT_size_hi); fprintf(out,"\t-p +-\tTurn hits postprocess on/off\n"); fprintf(out,"\t-v ##\tVerbosity flags\n"); fprintf(out,"\t-a a|f\tUse presize alignmens (only if gaps>0), slow\n" "\t\t a - Allways or f - as Fallback\n"); fprintf(out,"\t-x +-\tUse 5'-end lowercase masking of primers " "(default %s)\n",stsFileHash.AllowOverhang()?"+":"-"); fprintf(out,"\t-u +-\tUppercase all primers " "(default %s)\n",stsFileHash.UnmaskPrimers()?"+":"-"); #ifndef STANDALONE fprintf(out,"\t-b +-\tInput sequences are in blastdb\n"); fprintf(out,"\t-l file\tLimit blastdb sequences to list of gis from the file\n"); #endif fprintf(out,"and compat-options (duplicate posix-options) are:\n"); fprintf(out,"\tM=##\tMargin (default %d)\n",ePCR_MARGIN_DEFAULT); fprintf(out,"\tW=##\tWordsize (default %d)\n",ePCR_WDSIZE_DEFAULT); fprintf(out,"\tN=##\tMax mismatches allowed (default %d)\n", ePCR_MMATCH_DEFAULT); fprintf(out,"\tG=##\tMax indels allowed (default %d)\n", ePCR_GAPS_DEFAULT); fprintf(out,"\tF=##\tUse ## discontinuos words\n"); fprintf(out,"\tO=##\tSet output file to ##\n"); fprintf(out,"\tT=##\tSet output format (1..3)\n"); fprintf(out,"\tD=##-##\tSet default size range\n"); fprintf(out,"\tP=+-\tPostprocess hits on/off\n"); 
fprintf(out,"\tV=##\tVerbosity flags\n"); fprintf(out,"\tA=a|f\tUse presize alignmens (only if gaps>0), slow\n" "\t\t a - Allways or f - as Fallback\n"); fprintf(out,"\tX=+-\tUse 5'-end lowercase masking of primers " "(default %s)\n",stsFileHash.AllowOverhang()?"+":"-"); fprintf(out,"\tU=+-\tUppercase all primers " "(default %s)\n",stsFileHash.UnmaskPrimers()?"+":"-"); #ifndef STANDALONE fprintf(out,"\tB=+-\tInput sequences are in blastdb\n"); fprintf(out,"\tL=file\tLimit blastdb sequences to list of gis from the file\n"); #endif fprintf(out,"\t-mid\tSame as T=2\n"); fprintf(out,"verbosity flags are (flags may be changed in future):\n" "\t- set all progress reporting off (default)\n" "\t+ switch error reporting to basic (same as Sl)\n" "\tt display time\n" "\tl display fasta identifiers\n" "\to display sequence offset (currently: 3' position of first primer)\n" "\tp display percent of sequence processed\n" "\ts report every sequence start\n" "\te report every sequence end\n" "\tS newline after sequence start report\n" "\tE newline after sequence end report\n" "\tP newline after sequence progress report\n"); return 0; } void SetDefaultSize(CStsFileHash& stsFileHash, const char * str) { char * x=0; int hi=0, lo=strtol(str,&x,10); if(x!=0 && *x=='-') { hi=atoi(x+1); } else hi=lo; if(lo>0 && hi>=lo) stsFileHash.SetDefaultSize(lo,hi); else throw runtime_error("bad range: "+string(str)); } int CMain::ParseCmdline() { int optchar; while((optchar=getopt(argc,argv,"+hVf:m:n:w:g:o:t:p:v:d:a:x:u:b:l:"))!=-1) { switch(optchar) { case 'h': Help(); break; case 'V': Version(); break; case 'a': m_AlignMode=(optarg[0]=='a'?eAlways: optarg[0]=='f'?eFallback:eNever); if(m_AlignMode==eNever) { fprintf(stderr,"? 
Unknown alignment mode `%s' ignored\n", optarg); } break; case 'm': if(strcmp(optarg,"id")==0) ofmt=2; else pcrMachine.SetMargin(atoi(optarg)); break; case 'w': stsFileHash.SetHash(CHashSet(atoi(optarg), stsFileHash.GetWordCount())); break; case 'n': m_MaxMismatch=atoi(optarg); break; case 'g': m_MaxGaps=atoi(optarg); break; case 'f': stsFileHash.SetHash(CHashSet(stsFileHash.GetWordSize(), atoi(optarg))); break; case 'd': SetDefaultSize(stsFileHash,optarg); break; case 'o': ofile=optarg; break; case 't': ofmt=atoi(optarg); break; case 'p': have_postprocess=true; postprocess=*optarg=='+'?true:*optarg=='-'?false:postprocess; break; #ifndef STANDALONE case 'b': m_blastdbs=*optarg=='+'?true:*optarg=='-'?false:m_blastdbs; break; case 'l': m_gilist = optarg; break; #endif case 'v': ParseVerbose(optarg); break; case 'x': if(*optarg=='+') stsFileHash.SetFlags(CStsFileHash::fAllowOverhang,true); else if(*optarg=='-') stsFileHash.SetFlags(CStsFileHash::fAllowOverhang,false); break; case 'u': if(*optarg=='+') stsFileHash.SetFlags(CStsFileHash::fUnmaskPrimers,true); else if(*optarg=='-') stsFileHash.SetFlags(CStsFileHash::fUnmaskPrimers,false); break; } } if(done) return 0; if(optind >= argc) { Help(stderr); return 1; } // Parse compat options for(; optindGetOverhangChars(ISts::eLeft); //int ovhg2 = sts->GetOverhangChars(ISts::eRight); // pos1 += ovhg1; // pos2 -= ovhg2; if (show_midpt) sprintf(position,"%d", 1 + (pos1+pos2-1)/2); else sprintf(position,"%d..%d",pos1+1,pos2); fprintf(m_Out,"%-10s %-16s %-14.*s %.*s\n", seq_label,position, sts->GetName().length(),sts->GetName().data(), sts->GetDescription().length(), sts->GetDescription().data()); return 1; } class CPcrMachineCallbackTabular:public CPcrMachineCallback { public: CPcrMachineCallbackTabular(FILE * out, bool showalign, int gaps): CPcrMachineCallback(out), m_ShowAlign(showalign),m_Matrix(127,gaps),m_SeqData(0),m_SeqLength(0) {} virtual ~CPcrMachineCallbackTabular() throw () {} virtual void CbkMatch(const ISts * sts, 
unsigned pos1, unsigned pos2, const SScore* score) ; virtual void CbkSequenceData(const char * data, unsigned size) { // delete[] m_SeqData; // m_SeqData=new char[(m_SeqLength=size)+1]; // memcpy(m_SeqData,data,size); // m_SeqData[size]=0; m_SeqData=data; m_SeqLength=size; } protected: bool m_ShowAlign; CLcsMatrix m_Matrix; const char * m_SeqData; unsigned m_SeqLength; }; void CPcrMachineCallbackTabular::CbkMatch ( const ISts * sts, unsigned pos1, unsigned pos2, const SScore* score) { int mism=score->mism_l+score->mism_r; int gaps=score->gaps_l+score->gaps_r; int len1=sts->GetPrimerLength(ISts::eLeft); int len2=sts->GetPrimerLength(ISts::eRight); int ovhg1=sts->GetOverhangChars(ISts::eLeft); int ovhg2=sts->GetOverhangChars(ISts::eRight); const char * data1=sts->GetPrimerData(ISts::eLeft); const char * data2=sts->GetPrimerData(ISts::eRight); if(m_ShowAlign && m_SeqData && pos2<=m_SeqLength) { vector left, right; m_Matrix.Build( m_SeqData+pos2-len2-ovhg2,m_SeqData+m_SeqLength-ovhg2,data2,len2); m_Matrix.Graph( m_SeqData+pos2-len2-ovhg2,m_SeqData+m_SeqLength-ovhg2,data2,len2, right); m_Matrix.Stat( m_SeqData+pos2-len2-ovhg2,m_SeqData+m_SeqLength-ovhg2,data2,len2); mism = m_Matrix.GetMismatches(); gaps = m_Matrix.GetGaps(); m_Matrix.Build >( m_SeqData+pos1+len1+ovhg1-1,m_SeqData,data1+len1-1,len1); m_Matrix.Graph >( m_SeqData+pos1+len1+ovhg1-1,m_SeqData,data1+len1-1,len1,left); m_Matrix.Stat >( m_SeqData+pos1+len1+ovhg1-1,m_SeqData,data1+len1-1,len1); mism += m_Matrix.GetMismatches(); gaps += m_Matrix.GetGaps(); int l=max(int(m_SeqId.length()),int(sts->GetName().length())); int d=score->actlen-len1-len2; string stsname(sts->GetName().data(),sts->GetName().length()); fprintf(m_Out, "#####################################" "#####################################\n" "# STS %*s %s...%d...%s\n" "# %.*s %s %d %s\n" "# Seq %*s %s...%d...%s\n", l,stsname.c_str(), left[0].c_str(),d,right[0].c_str(), l," ", left[2].c_str(),d,right[2].c_str(), l,m_SeqId.c_str(), 
left[1].c_str(),d,right[1].c_str()); } pos1 += ovhg1; pos2 -= ovhg2; fprintf(m_Out,"%s\t%.*s\t%c\t%d\t%d\t%d/%d-%d\t%d\t%d\t%.*s\n", m_SeqId.c_str(), sts->GetName().length(),sts->GetName().data(), sts->GetDirection(), pos1+1,pos2, score->actlen, sts->GetSizeLo(),sts->GetSizeHi(), mism, gaps, sts->GetDescription().length(), sts->GetDescription().data()); } class CPcrFastaProcessor:public IFastaReaderCallback { public: virtual ~CPcrFastaProcessor() throw () { if( !m_NoCopySeq ) free(m_Sequence); } CPcrFastaProcessor(CPcrMachine* pmachine, bool noCopySeq = false ): m_Sequence(0),m_Size(0),m_Capacity(0), m_NoCopySeq( noCopySeq ) { m_PcrMachine=pmachine; } virtual void CbkDefline(const char * , unsigned ) {} virtual void CbkIdent(const char * ident, unsigned length); virtual void CbkSeqline(const char * data, unsigned length); virtual void CbkEntryEnd(); protected: CPcrMachine * m_PcrMachine; string m_Ident; char * m_Sequence; unsigned m_Size; unsigned m_Capacity; bool m_NoCopySeq; }; void CPcrFastaProcessor::CbkIdent(const char * ident, unsigned length) { m_Ident.assign(ident,length); } void CPcrFastaProcessor::CbkSeqline(const char * seq, unsigned length) { if( m_NoCopySeq ) { assert( m_Size == 0 && m_Sequence == 0 ); m_Size = length; m_Sequence = const_cast(seq); } else { while(m_Size+length>=m_Capacity) m_Sequence=(char*)realloc(m_Sequence,m_Capacity+=16192); memcpy(m_Sequence+m_Size,seq,length); m_Size+=length; } } void CPcrFastaProcessor::CbkEntryEnd() { if( !m_NoCopySeq ) { if(m_Sequence) m_Sequence[m_Size]=0; } m_PcrMachine->ProcessSequence(m_Ident.c_str(),m_Sequence,m_Size); m_Size=0; if( m_NoCopySeq ) { m_Sequence = 0; } m_Ident.clear(); } int CMain::Execute() { stsFileHash.SetOneTimeRun(true); do { CStsFileCallbackDefault cbk; stsFileHash.ReadStsFile(stsfile, &cbk); } while(0); do { FILE * out=ofile.length()?fopen64(ofile.c_str(),"w"):stdout; if(out==0) throw runtime_error(ofile+": "+strerror(errno)); auto_ptr cbk(0); switch(ofmt) { case 4: case 3: 
cbk.reset(new CPcrMachineCallbackTabular(out,ofmt==4,m_MaxGaps)); break; case 2: case 1: default: cbk.reset(new CPcrMachineCallbackClassic(out,ofmt==2)); break; } // if(!have_postprocess) { // if(pcrMachine.GetMaxIndels() || // stsFileHash.GetHash().GetWordCount()>1) { // postprocess=true; // } else { // postprocess=false; // } // } CPcrMachinePostprocess post(cbk.get()); if(postprocess) pcrMachine.SetCallback(&post); else pcrMachine.SetCallback(cbk.get()); CPcrProgressCallback pgscbk(verbose); if(verbose) pcrMachine.SetProgressCallback(&pgscbk); pcrMachine.SetStsHash(&stsFileHash); if(m_MaxGaps) { switch(m_AlignMode) { case eNever: m_AlignL=new CAlignFast(m_MaxMismatch,m_MaxGaps); m_AlignR=new CAlignFast(m_MaxMismatch,m_MaxGaps); break; case eAlways: m_AlignL=new CAlignLCS(m_MaxMismatch,m_MaxGaps); m_AlignR=new CAlignLCS(m_MaxMismatch,m_MaxGaps); break; case eFallback: m_AlignL=new CAlignCompromise(m_MaxMismatch,m_MaxGaps); m_AlignR=new CAlignCompromise(m_MaxMismatch,m_MaxGaps); break; default: throw logic_error("Invalig align mode"); } } else if(m_MaxMismatch) { m_AlignL=new CAlignNoGaps(m_MaxMismatch); m_AlignR=new CAlignNoGaps(m_MaxMismatch); } else { m_AlignL=new CAlignExact(); m_AlignR=new CAlignExact(); } pcrMachine.SetAligner(m_AlignL,m_AlignR); CPcrFastaProcessor processor(&pcrMachine #ifndef STANDALONE , m_blastdbs #endif ); if(fafiles.size()) { for(list::const_iterator f=fafiles.begin(); f!=fafiles.end(); ++f) { #ifndef STANDALONE if( m_blastdbs ) { vector volumes; try { CSeqDB::FindVolumePaths( *f, CSeqDB::eNucleotide, volumes, 0, true ); } catch(exception& e) { cerr << "? Warning: CSeqDB::FindVolumePaths( \"" << *f << "\", CSeqDB::eNucleotide, volumes, 0, true ); failed with error: " << e.what() << "\n"; volumes.clear(); volumes.push_back( *f ); } catch(...) { cerr << "? 
Warning: CSeqDB::FindVolumePaths( \"" << *f << "\", CSeqDB::eNucleotide, volumes, 0, true ); failed with unknown exception\n"; volumes.clear(); volumes.push_back( *f ); } for( vector::const_iterator v = volumes.begin(); v != volumes.end(); ++v ) { auto_ptr seqDB( 0 ); try { CSeqDBGiList * lst = (m_gilist.length() ? new CSeqDBFileGiList( m_gilist ) : 0 ); seqDB.reset( new CSeqDB( *v, CSeqDB::eNucleotide, lst ) ); } catch(exception& e) { throw runtime_error( "Failed to open blastdb volume " + *v + ": " + e.what() ); } catch(...) { throw runtime_error( "Failed to open blastdb volume " + *v + ": unknown error" ); } processor.CbkFileBegin(); for( CSeqDBIter i = seqDB->Begin(); i; ++i ) { list > ids = seqDB->GetSeqIDs( i.GetOID() ); if( ids.size() == 0 ) { ostringstream err; err << "Bad entry in " << *f << " (" << *v << ") " << " oid " << i.GetOID() << ": no seqids\n"; throw runtime_error( err.str() ); //cerr << "? Warning: " << err.str(); //continue; } string ident; for( list >::const_iterator x = ids.begin(); x != ids.end(); ++x ) { if( x != ids.begin() ) ident += "|"; ident += (*x)->AsFastaString(); } processor.CbkEntryBegin(); processor.CbkIdent( ident.c_str(), ident.length() ); string seq; seqDB->GetSequenceAsString( i.GetOID(), seq ); processor.CbkSeqline( seq.c_str(), seq.length() ); processor.CbkEntryEnd(); } } processor.CbkFileEnd(); } else { #endif if(*f=="-") { CFastaReader reader("/dev/stdin"); reader.SetCvtTable(CFastaReader::sm_NucleotidesUc); reader.ReadFile(&processor); } else { CFastaReader reader(*f); reader.SetCvtTable(CFastaReader::sm_NucleotidesUc); reader.ReadFile(&processor); } #ifndef STANDALONE } #endif } } else { CFastaReader reader("/dev/stdin"); reader.SetCvtTable(CFastaReader::sm_NucleotidesUc); reader.ReadFile(&processor); } delete m_AlignR; delete m_AlignL; fclose(out); } while(0); return 0; } int CMain::Version() { done=true; puts("e-PCR cmdline tool version " VERSION); return 0; } int CMain::Run() { if(int rc=ParseCmdline() ) return rc; 
if(done) return 0; return Execute(); } int main(int argc, char ** argv) { // try { CMain app(argc,argv); return app.Run(); /* } catch(logic_error& e) { fprintf(stderr,"! Fatal: Internal error %s\n",e.what()); } catch(exception& e) { fprintf(stderr,"! Fatal: %s\n",e.what()); } catch(...) { fprintf(stderr,"! Fatal: Unknown error\n"); } return 100; */ } /* * $Log: e-PCR_main.cpp,v $ * Revision 1.25 2008/06/18 14:45:33 rotmistr * Fixed problem with -d x-X parameter being reset if -w or some others are used after it. * * Revision 1.24 2008/06/16 16:02:40 rotmistr * *** empty log message *** * * Revision 1.23 2008/04/28 16:38:45 rotmistr * Applied patch to build with gcc-4.3 * * Revision 1.22 2008/03/27 14:36:58 rotmistr * Added assert.h to make it compiling with VC8 * * Revision 1.21 2008/03/26 16:04:29 rotmistr * Added support for blastdb files * * Revision 1.20 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.19 2005/06/14 16:46:41 rotmistr * Changed report format for floppy tails * * Revision 1.18 2004/10/26 17:16:33 rotmistr * Added 5'-end masking for primers * * Revision 1.17 2004/06/08 20:32:51 rotmistr * Fixup for gap+insert special case * * Revision 1.16 2004/06/08 16:14:55 rotmistr * *** empty log message *** * * Revision 1.15 2004/06/03 23:37:19 rotmistr * New aligner added. * * Revision 1.14 2004/04/06 04:53:17 rotmistr * All is compileable with BCC5.5 and runnable on WIndows * * Revision 1.13 2004/04/01 16:37:41 rotmistr * Cleaned after adding windows capabilities * * Revision 1.12 2004/04/01 05:57:52 rotmistr * Compilable with borland C++ * * Revision 1.11 2004/03/30 21:06:53 rotmistr * Fixes for setting default STS size range. * * Revision 1.10 2004/03/30 19:11:18 rotmistr * STS default size * * Revision 1.9 2004/03/30 19:08:03 rotmistr * default STS size is tunnable now * * Revision 1.8 2004/03/26 17:02:13 rotmistr * Compat-options are now allowed everywhere, and multiple fasta files can be used. 
* * Revision 1.7 2004/03/25 19:36:52 rotmistr * API: separate left and right primers mism/gaps in forward API * * Revision 1.6 2004/03/23 22:35:25 rotmistr * Fixed processing of -mid flag in cmdline * Fixed destructor for fasta reader * Removed cgi * * Revision 1.5 2004/03/07 06:35:59 rotmistr * Many bugfixes and optimisations -- cgi is to go to production * * Revision 1.4 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.3 2004/01/28 23:27:02 rotmistr * "Best of overlapping" hit selection postprocessor added. * * Revision 1.2 2004/01/08 23:22:41 rotmistr * Fixed init error in faread, * Adjusted output to standard, * Added output format style and output file to parameters. * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/fahash.hpp0000644001137700010620000002432711745334032014354 0ustar rotmistrcontig/* $Id: fahash.hpp,v 1.15 2007/07/11 20:49:29 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. 
* Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_HASH__HPP #define EPCR_HASH__HPP #include #include #include #include #include #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class CFaData; class CFaIndexer; class CFaLookup; class IFaLookupCallback; class CFaData { public: struct SSeqDesc { Int2 fid; off64_t data; SSeqDesc( Int2 f, off64_t o ) : fid( f ), data( o ) {} SSeqDesc() : fid( -1 ), data( 0 ) {} }; typedef std::string string; struct SFileDesc { Int2 flags; string path; SFileDesc() : flags( 0 ) {} SFileDesc( Int2 fl, const string& s ) : flags( fl ), path( s ) {} }; typedef std::vector TPathLst; typedef std::vector TSeqLst; typedef Int4 THashElement; // typedef util::CChunkArray<4092,1,THashElement> THashList; typedef std::vector THashList; // (-)seq-id, offset, offset, (-)seq-id, offset, ..., -1 typedef std::vector THashTable; typedef std::vector TData; typedef Uint8 TBigCount; typedef vector TBranchStat; typedef vector > TStatVector; TPathLst m_Fapath; // is accumulated and appended to file TSeqLst m_Seqlst; // is accumulated and appended to file CHashSet m_Hash; // hash itself std::vector< std::pair > m_Tabloc; const CHashSet& GetHash() const { return m_Hash; } // hash itself }; class IFaIndexerCallback { public: virtual ~IFaIndexerCallback() {} virtual void CbkSequence( const char * id ) = 0; virtual void CbkProgress( unsigned pos, unsigned length ) = 0; virtual void CbkFile( const char * name ) =0; virtual void CbkDumpProgress( 
unsigned pos, unsigned length ) = 0; virtual void CbkDumpStart() {}; virtual void CbkDumpEnd() {}; virtual void CbkResetStart() {}; virtual void CbkResetEnd() {}; }; class AFaIndexerBase : public CFaData { public: virtual void AttachFile( const string& path ) = 0; virtual void SetHash( const CHashSet& hs ) = 0; virtual void AddFile( const string& path ); virtual void Finish(); virtual Uint4 GetFlags() const { return m_Flags; } virtual void SetFlags( unsigned f ) { m_Flags = f; } virtual void SetCallback( IFaIndexerCallback * cbk ) { m_Cbk = cbk; } virtual ~AFaIndexerBase() throw() {} AFaIndexerBase() : m_File( 0 ), m_Count( 0 ), m_Cbk( 0 ), m_Flags( 0 ) {} protected: // writes default header virtual void AttachHeaderDefault( const string& path, Uint4 version ); virtual void AddSequence( const char * seq, unsigned len, off64_t off ) = 0; virtual void DumpTables() = 0; protected: FILE * m_File; Uint4 m_Count; IFaIndexerCallback * m_Cbk; Uint4 m_Flags; }; class CFaIndexer1 : public AFaIndexerBase { public: enum EFlags { fSkipRepeatitive = 0x01, fStatTable = 0x02 }; void AttachFile( const string& path ); void SetHash( const CHashSet& hs ); void SetFragmentSizeRange( unsigned lo, unsigned hi ) { m_Lo = lo; m_Hi = hi; } unsigned GetFragmentLo() const { return m_Lo; } unsigned GetFragmentHi() const { return m_Hi; } ~CFaIndexer1() throw() {} CFaIndexer1() : m_Lo( 512 * 1024 * 1024 ), m_Hi( 1536 * 1024 * 1024 ) {} protected: void AddSequence( const char * seq, unsigned len, off64_t off ); void DumpTables(); protected: TData m_Data; // vector of hash tables (one per word) unsigned m_Lo, m_Hi; }; class CFaIndexer2 : public AFaIndexerBase { public: void AttachFile( const string& path ); void SetHash( const CHashSet& hs ); void SetCacheSize( unsigned cs ) { m_CacheSize = cs; } ~CFaIndexer2() throw() {} CFaIndexer2() : m_CacheSize( 200000000 ) {} protected: void AddSequence( const char * seq, unsigned len, off64_t off ); void DumpTables(); void StoreSequence( unsigned sid, const 
char * seq, unsigned len ); void WriteCache(); protected: TStatVector m_Data; // vector of hash lists TStatVector m_Seqs; // vector of seq counts TStatVector m_LastSid; // vector of last sids TStatVector m_Cursor; // vectpr of "current positions" for 2nd pass TData m_Cache; unsigned m_CacheSize; // cache size }; class CFaLookup : public CFaData { public: CFaLookup() : m_Fd( -1 ), m_AlignL( 0 ), m_AlignR( 0 ) {} typedef list TStsList; enum EPrimerEnd { eLeft = 'l', eRight = 'r' }; void AttachFile( const string& path ); void Find( IFaLookupCallback * cbk, const string& label, char report_strand, // which strand to report const string& primer ); void Find( IFaLookupCallback * cbk, ISts* sts, int window = 0 ); void Stat(); TBigCount CalcStat( const char * primer, unsigned wd = 0 ) { m_Hash.Begin( primer ); return m_Hash.Good( wd ) ? CalcStat( m_Hash.GetValue( wd ), wd ) : 0 ; } TBigCount CalcStat( THashElement hashval, unsigned wd = 0 ); TBigCount GetStat( const char * primer, unsigned wd = 0 ) { m_Hash.Begin( primer ); return m_Hash.Good( wd ) ? GetStat( m_Hash.GetValue( wd ), wd ) : 0 ; } TBigCount GetStat( THashElement hashval, unsigned wd = 0 ) { return m_Counts.size() ? 
m_Counts[ wd ][ hashval ] : 0 ; } const TStatVector& GetStat() const { return m_Counts; } void Find( IFaLookupCallback * cbk, const TStsList& sts, bool syscall_optimize = true, int window = 0); void SetAligner( IAlign * left, IAlign * right ) { m_AlignL = left; m_AlignR = right; } protected: off64_t GetHashEntries( const char * table, off64_t tab_off, unsigned word, THashElement value, unsigned& size ) const; void InitTableOffsets(); protected: int m_Fd; Uint4 m_Version; TStatVector m_Counts; vector m_TableOffset; // offsets of tables for words Uint4 m_ElSize; IAlign * m_AlignL, * m_AlignR; }; struct SFaMatchBlock { enum EStrand { eUnkn='0', ePos = '+', eNeg = '-' }; enum EType { eSTS, ePrimer }; typedef std::string string; EType type; EStrand strand; string seq_label; string sts_label; unsigned from, to; unsigned char mism, gaps; unsigned char mism_l, mism_r; unsigned char gaps_l, gaps_r; const char * sequence; unsigned seqlen; }; class IFaLookupCallback { public: virtual ~IFaLookupCallback() {} virtual bool Start() = 0; virtual bool Done() = 0; virtual bool Fail( const std::string& msg ) = 0; virtual bool Warn( const std::string& msg, const ISts * ) = 0; virtual bool Match( const SFaMatchBlock * info) = 0; virtual bool Match( const SFaMatchBlock * info, const ISts * sts) = 0; virtual void Fragment( unsigned i, unsigned total ) {}; virtual void Progress( unsigned i, unsigned total ) {}; }; END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: fahash.hpp,v $ * Revision 1.15 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.14 2004/06/07 16:24:56 rotmistr * Bug fixes to previos version. * * Revision 1.13 2004/06/03 23:37:19 rotmistr * New aligner added. * * Revision 1.12 2004/05/27 20:35:46 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. 
* * Revision 1.11 2004/04/28 14:35:35 rotmistr * hashfile ver2 build/search works now * * Revision 1.10 2004/04/27 00:01:54 rotmistr * Second version of reverse hash file started * * Revision 1.9 2004/04/01 05:57:52 rotmistr * Compilable with borland C++ * * Revision 1.8 2004/03/07 06:35:59 rotmistr * Many bugfixes and optimisations -- cgi is to go to production * * Revision 1.7 2004/02/18 05:44:24 rotmistr * Changes in CGI: sort order, separate misalignments for l and r primers, reload button * * Revision 1.6 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.5 2004/01/07 16:57:42 rotmistr * Fragment size is now configurable. * * Revision 1.4 2004/01/06 21:54:19 rotmistr * Statistics for word repetitions API added * * Revision 1.3 2003/12/30 21:36:31 rotmistr * Syscall optimisation mode added. * * Revision 1.2 2003/12/23 21:30:50 rotmistr * - gaps/mismatches reporting * - lo/hi fixup * - reverse sts in re-PCR_main * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/fahash_main.cpp0000644001137700010620000002442311745334032015350 0ustar rotmistrcontig/* $Id: fahash_main.cpp,v 1.5 2008/04/28 16:38:45 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. 
* Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include //#include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); class CMain { public: CMain(int c, char ** v): argc(c),argv(v),done(false), wdsize(8),period(0), lo(0),hi(0),skip_repetative(false),cachesize(0), command(eNone),version(eVer2) {} int Run(); protected: int Execute(); int ParseCmdline(); int Help(FILE* = stdout); int Version(); int Build(); int Stat(); protected: int argc; char ** argv; bool done; protected: string findex; int wdsize, period; int lo, hi; bool skip_repetative; unsigned cachesize; enum ECommand { eNone, eBuild, eStat } command; enum EVersion { eVer1=1, eVer2=2 } version; string outfile; }; int CMain::Help(FILE* out) { fprintf(out,"usage: [-hV] -b hash-file [-w wdsize] [-f period] " "[-F fragment_min,fragment_max] [-k] [-c cachesize] [-v 1|2] " "famap-file ...\n"); fprintf(out," or: [-hV] -T hash-file [-o outfile]\n"); fprintf(out,"where:\n" "\t-T hash-file\tPrint word usage statistics for hash-file\n" "\t-b hash-file\tBuild hash tables (hash-file) " "from sequence files,\n" "\t-w wordsize \tSet word size when building hash tables\n" "\t-f period \tSet discontiguity when building hash tables\n" "\t-k \tSkip 
repeats when building hash-file\n" "\t-F min,max \tSet watermarks for fragment size (in Mb) " "(version 1 only)\n" "\t-c cachesize\tSet cache size (version 2 only)\n" "\t-v ver \tUse format version (1|2, 2 is default)\n" "\t-o outfile \tWrite output to file `outfile'\n"); done=true; return 0; } int CMain::ParseCmdline() { int optchar; while((optchar=getopt(argc,argv,"hVw:F:f:b:kT:v:o:c:"))!=-1) { switch(optchar) { case 'h': Help(); break; case 'V': Version(); break; case 'w': wdsize=atoi(optarg); break; case 'f': period=atoi(optarg); break; case 'c': cachesize=atoi(optarg); break; case 'k': skip_repetative=true; break; case 'b': findex=optarg; command=eBuild; break; case 'T': findex=optarg; command=eStat; break; case 'o': outfile=optarg; break; case 'F': do { char * x=0; lo=strtol(optarg,&x,10); if(x==0 || strchr(":,-;/",*x)==0) throw runtime_error("fragment size should be RANGE"); hi=strtol(x+1,0,10); } while(0); break; case 'v': do { int v=atoi(optarg); switch(v) { case 1: version=eVer1; break; case 2: version=eVer2; break; default: throw runtime_error("only file versions 1 and 2 " "are supported"); } } while(0); break; } } if(done) return 0; if(command !=eStat && (command==eNone || optind >= argc)) { Help(stderr); return 1; } return 0; } #ifndef USE_WIN #define CLREOL "\r\x1b[K" #else #define CLREOL "\r" #endif class CFaIndexerCallback:public IFaIndexerCallback { time_t t0; public: CFaIndexerCallback():t0(time(0)) {} virtual void CbkSequence(const char * id) { fprintf(stderr,CLREOL " - Adding Sequence %s\n",id); fflush(stderr); } virtual void CbkProgress(unsigned pos, unsigned length) { time_t t1=time(0); if(t0==t1) return; if( length > 10000 ) length/=100; else pos*=100; if(length) { fprintf(stderr,CLREOL " %50s\r %3d%% %.*s", "..................................................", pos/length,pos/length/2, "##################################################"); fflush(stderr); } t0=t1; } virtual void CbkFile(const char * name) { fprintf(stderr,CLREOL "* Adding 
File %s\n",name); fflush(stderr); } virtual void CbkDumpProgress(unsigned pos, unsigned length) { time_t t1=time(0); if(t0==t1) return; if( length > 10000 ) length/=100; else pos*=100; if(length) { fprintf(stderr,CLREOL " %50s\r %3d%% %.*s", "oooooooooooooooooooooooooooooooooooooooooooooooooo", pos/length,pos/length/2, "OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO"); fflush(stderr); } t0=t1; } virtual void CbkDumpStart() { fprintf(stderr,CLREOL "= Dumping \n"); fflush(stderr); } virtual void CbkDumpEnd() { fprintf(stderr,CLREOL "= Dumping OK\n"); fflush(stderr); } virtual void CbkResetStart() { fprintf(stderr,CLREOL "= Resetting..."); fflush(stderr); }; virtual void CbkResetEnd() { fprintf(stderr,"\r\r\r OK"); }; }; int CMain::Build() { auto_ptr indexer( version==eVer1?new CFaIndexer1(): version==eVer2?new CFaIndexer2(): (AFaIndexerBase*)0); switch(version) { case eVer1: if(CFaIndexer1* ind=dynamic_cast(indexer.get())) { if(lo && hi) ind->SetFragmentSizeRange(unsigned(lo)*1024U*1024U, unsigned(hi)*1024U*1024U); if(skip_repetative) ind->SetFlags(ind->GetFlags()&CFaIndexer1::fSkipRepeatitive); } break; case eVer2: if(CFaIndexer2* ind=dynamic_cast(indexer.get())) { if(cachesize) ind->SetCacheSize(cachesize); } break; default: throw logic_error("Unknown index version"); } CFaIndexerCallback cbk; indexer->SetCallback(&cbk); indexer->SetHash(CHashSet(wdsize,period)); indexer->AttachFile(findex+"~"); for(int i=optind; iAddFile(argv[i]); } indexer->Finish(); if(rename((findex+"~").c_str(),findex.c_str())) { throw runtime_error("Rename failed for "+findex+": "+strerror(errno)); } return 0; } int CMain::Execute() { switch(command) { case eBuild: return Build(); case eStat: return Stat(); } } int CMain::Stat() { CFaLookup lookup; lookup.AttachFile(findex); fputs("* Calculating statistics...\n",stderr); lookup.Stat(); fputs("* Done\n",stderr); FILE * out=outfile.length()?fopen64(outfile.c_str(),"w"):stdout; if(out==0) throw runtime_error("Failed to open "+outfile+": 
"+strerror(errno)); const CFaLookup::TStatVector& st=lookup.GetStat(); int count=0; double average=0; for(unsigned wd=0; wd>(k*2)) & 3]; } } else { for(int k=0; ptr>wtype; ++k) { if(((ptr-wtype+wd)%lookup.GetHash().GetWordCount())==0) *--ptr='*'; *--ptr="ACGT"[(i>>(k*2)) & 3]; } } fprintf(out,"%d\t%d\t%s\t%lld\t%lf\n",wd,i,wtype, x[i],x[i]/average); } } if(outfile.length()) fclose(out); return 0; } int CMain::Version() { done=true; puts("Reverse e-PCR: sequence hash builder version " VERSION); return 0; } int CMain::Run() { if(int rc=ParseCmdline() ) return rc; if(done) return 0; return Execute(); } int main(int argc, char ** argv) { try { CMain app(argc,argv); return app.Run(); } catch(logic_error& e) { fprintf(stderr,"! Fatal: Internal error %s\n",e.what()); } catch(exception& e) { fprintf(stderr,"! Fatal: %s\n",e.what()); } catch(...) { fprintf(stderr,"! Fatal: Unknown error\n"); } return 100; } /* * $Log: fahash_main.cpp,v $ * Revision 1.5 2008/04/28 16:38:45 rotmistr * Applied patch to build with gcc-4.3 * * Revision 1.4 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.3 2004/09/03 21:28:49 rotmistr * Fixes to compile with Borland C++ 5.5 * * Revision 1.2 2004/06/03 23:37:20 rotmistr * New aligner added. * * Revision 1.1 2004/05/27 20:35:47 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. * * */ e-PCR-2.3.12/fahash_create.cpp0000644001137700010620000001262711745334032015672 0ustar rotmistrcontig/* $Id: fahash_create.cpp,v 1.14 2007/07/11 20:49:29 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. 
This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include "fahash_defines.h" #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #include "fahash_internal.hpp" //////////////////////////////////////////////////////////////////////// void AFaIndexerBase::AddFile( const string& path ) { CFastaMap fmap( path ); m_Fapath.push_back( SFileDesc( 0, path ) ); if( m_Cbk ) m_Cbk->CbkFile( path.c_str() ); for( unsigned i = 0; i < fmap.SequenceCount(); ++i ) { if( m_Cbk ) m_Cbk->CbkSequence( fmap.GetIdent(i).c_str() ); CMmSequence seq( fmap, i ); AddSequence( seq.GetData(), seq.GetSize(), i ); } } void AFaIndexerBase::AttachHeaderDefault( const string& path, Uint4 version ) { m_File = fopen64( path.c_str(), "w" FILE_BINARY ); if( m_File == 0 ) SYSERROR( path ); setvbuf( m_File, 0, _IOFBF, 1024 * 1024 ); if( fwrite( SIGNATURE, 1, 16, m_File ) != 16 ) SYSERROR( path ); // signature Write( m_File, Uint4( version ) ); // version Write( m_File, Uint4( BYTE_ORDER_WORD ) ); // architecture Write( m_File, Uint4( HEADER_SIZE ) ) ; // header size Write( m_File, Uint4( m_Hash.GetWordSize() ) ); // word size Write( 
m_File, Uint4( m_Hash.GetWordCount() ) ); // period fseeko64( m_File, HEADER_SIZE, SEEK_SET ); m_Count = 0; } void AFaIndexerBase::Finish() { DumpTables(); off64_t seq_tbl = SeekAlign( m_File ); Write( m_File, Uint4( m_Seqlst.size() ) ); for( unsigned i = 0; i < m_Seqlst.size(); ++i ) { Write( m_File, m_Seqlst[i].fid ); Write( m_File, m_Seqlst[i].data ); } off64_t fil_tbl = SeekAlign( m_File ); Write( m_File, Uint4( m_Fapath.size() ) ); for( unsigned i = 0; i < m_Fapath.size(); ++i ) { Write( m_File, m_Fapath[i].flags ); Write( m_File, m_Fapath[i].path ); } off64_t off_tbl = SeekAlign( m_File ); Write( m_File, Uint4( m_Tabloc.size() ) ); for( unsigned i = 0; i < m_Tabloc.size(); ++i ) { Write( m_File, m_Tabloc[i].first ); Write( m_File, m_Tabloc[i].second ); } off64_t endoffile = SeekAlign( m_File ); Write( m_File, seq_tbl ); Write( m_File, fil_tbl ); Write( m_File, off_tbl ); Write( m_File, endoffile ); fclose( m_File ); m_File = 0; } /* * $Log: fahash_create.cpp,v $ * Revision 1.14 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.13 2004/05/27 20:35:46 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. 
* * Revision 1.12 2004/04/27 00:01:54 rotmistr * Second version of reverse hash file started * * Revision 1.11 2004/04/06 04:53:17 rotmistr * All is compileable with BCC5.5 and runnable on WIndows * * Revision 1.10 2004/04/01 05:57:52 rotmistr * Compilable with borland C++ * * Revision 1.9 2004/03/07 06:35:59 rotmistr * Many bugfixes and optimisations -- cgi is to go to production * * Revision 1.8 2004/02/18 05:44:25 rotmistr * Changes in CGI: sort order, separate misalignments for l and r primers, reload button * * Revision 1.7 2004/02/12 21:38:20 rotmistr * Fixed typo in seqcmp * Optimized and fixed lookup * Better look for reverse.cgi * * Revision 1.6 2004/02/11 04:34:55 rotmistr * Optimised lookup speed and memory usage * Fixed bug with end of sequence in stsmatch * Changing CGI look * * Revision 1.5 2004/02/05 23:41:21 rotmistr * Better reload, fixed margin report in commandline, unists tab in CGI form. * * Revision 1.4 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.3 2004/01/07 16:57:42 rotmistr * Fragment size is now configurable. * * Revision 1.2 2003/12/30 15:27:22 rotmistr * Fixed bug with sequence end * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/fahash_create1.cpp0000644001137700010620000001517311745334032015752 0ustar rotmistrcontig/* $Id: fahash_create1.cpp,v 1.2 2007/07/11 20:49:29 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include "fahash_defines.h" #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #include "fahash_internal.hpp" //////////////////////////////////////////////////////////////////////// void CFaIndexer1::AddSequence( const char * sequence, unsigned len, off64_t pos ) { Uint8 exp_count = len * m_Hash.GetWordCount(); if( exp_count > m_Hi ) throw runtime_error( "Too large sequence" ); if( ( m_Count + exp_count ) > m_Hi ) DumpTables(); m_Seqlst.push_back( SSeqDesc( m_Fapath.size() - 1, pos ) ); m_Hash.Begin( sequence ); assert( m_Hash.GetWordCount() == m_Data.size() ); for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { assert( m_Hash.GetTableSize( wd ) == m_Data[ wd ].size() ); for( unsigned u = 0; u < m_Hash.GetTableSize( wd ); ++u ) { THashElement sid = ( m_Seqlst.size() - 1 ) | kHighBit; if( m_Data[ wd ][ u ].size() && m_Data[ wd ][ u ][ m_Data[ wd ][ u ].size() - 1 ] & kHighBit ) // subst last sid m_Data[ wd ][ u ][ 
m_Data[ wd ][ u ].size() - 1 ] = sid; else { m_Data[ wd ][ u ].push_back( sid ); m_Count++; } } } for( ; m_Hash.GetPosition() < len/*!m_Hash.End()*/; m_Hash.Next() ) { if( m_Hash.GetPosition() % 100 == 0 && m_Cbk ) m_Cbk->CbkProgress( m_Hash.GetPosition(), len ); for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { if( m_Hash.Good(wd) ) { m_Data[ wd ][ m_Hash.GetValue(wd) ].push_back( THashElement( m_Hash.GetPosition() ) ); m_Count++; } } } if( m_Count > m_Lo ) DumpTables(); } void CFaIndexer1::AttachFile( const string& path ) { AttachHeaderDefault( path, FILE_VERSION ); } void CFaIndexer1::DumpTables() { // INDICATE("DUMPING..."); if( m_Cbk ) m_Cbk->CbkDumpStart(); off64_t start_off = SeekAlign( m_File ); Uint4 pos = 0; for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { pos += m_Hash.GetTableSize(wd) * 2 * sizeof(Uint4); } Uint4 cur=pos; long double avg=0, dev=0; Uint8 cnt=0; if( m_Flags & fSkipRepeatitive ) { for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { for( unsigned el = 0; el < m_Hash.GetTableSize( wd ); ++el, ++cnt ) { avg += m_Data[ wd ][ el ].size(); dev += m_Data[ wd ][ el ].size() * m_Data[ wd ][ el ].size(); } } if( cnt ) { avg /= cnt; dev /= cnt; } dev -= avg; } for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { for(unsigned el = 0; el < m_Hash.GetTableSize( wd ); ++el ) { Write( m_File, pos ); if( m_Flags & fSkipRepeatitive && m_Data[wd][el].size() > avg + 3 * dev ) { Write( m_File, ~0 ); } else { Write( m_File, m_Data[wd][el].size() ); } pos += m_Data[wd][el].size() * sizeof( THashElement ); } } for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { for( unsigned el = 0; el < m_Hash.GetTableSize( wd ); ++el ) { if( m_Cbk ) m_Cbk->CbkDumpProgress( cur, pos ); if( m_Flags & fSkipRepeatitive && m_Data[wd][el].size() > avg + 3 * dev ) { } else { if( fwrite( &m_Data[wd][el][0], sizeof(THashElement), m_Data[wd][el].size(), m_File ) != m_Data[wd][el].size() ) SYSERROR( string( "Block write" ) ); cur += 
m_Data[wd][el].size() * sizeof(THashElement); } } } off64_t end_off = ftello64(m_File); if( m_Cbk ) m_Cbk->CbkDumpEnd(); // fprintf(stderr, // "DUMP: start offset = %lld\n" // "DUMP: end offset = %lld\n" // "DUMP: distance = %lld\n" // "DUMP: pos calculated = %u\n" // "DUMP: pos accumulated = %u\n", // start_off, end_off, end_off-start_off,pos, cur); if( end_off - start_off != pos || pos != cur ) throw runtime_error( "File format error!" ); if( m_Cbk ) m_Cbk->CbkResetStart(); m_Tabloc.push_back( make_pair( start_off, end_off - start_off ) ); // INDICATE("RESETTING..."); for( unsigned i = 0; i < m_Hash.GetWordCount(); ++i ) { for( unsigned j = 0; j < m_Hash.GetTableSize(i); ++j ) { m_Data[i][j].clear(); } } m_Count = 0; // SetHash(m_Hash); if( m_Cbk ) m_Cbk->CbkResetEnd(); // INDICATE("DUMPING...DONE"); } void CFaIndexer1::SetHash( const CHashSet& hs ) { // for(unsigned i=0; i #include #include "fahash_defines.h" #include #include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #include "fahash_internal.hpp" //////////////////////////////////////////////////////////////////////// void CFaIndexer2::AddSequence( const char * sequence, unsigned len, off64_t pos ) { m_Seqlst.push_back( SSeqDesc( m_Fapath.size() - 1, pos ) ); m_Hash.Begin( sequence ); THashElement sid = ( m_Seqlst.size() - 1 ) | kHighBit; assert( m_Hash.GetWordCount() == m_Data.size() ); for( ; m_Hash.GetPosition() < len/*!m_Hash.End()*/; m_Hash.Next() ) { if( m_Cbk && m_Hash.GetPosition() % 100 == 0 ) m_Cbk->CbkProgress( m_Hash.GetPosition(), len ); for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { assert( m_Hash.GetTableSize( wd ) == m_Data[wd].size() ); if( m_Hash.Good( wd ) ) { THashElement val = m_Hash.GetValue( wd ); if( m_LastSid[wd][val] != sid ) { m_LastSid[wd][val] = sid; m_Seqs[wd][val]++; } m_Data[wd][val]++; } } } } void CFaIndexer2::AttachFile( const string& path ) { AttachHeaderDefault( path, FILE_VERSION2 ); } void CFaIndexer2::DumpTables() 
{ if( m_Cbk ) m_Cbk->CbkResetStart(); // Allocate Space unsigned entries = 0; for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { entries += m_Hash.GetTableSize(wd); } off64_t start_off = SeekAlign( m_File ); off64_t curr = start_off; TStatVector& pos = m_Cursor; pos.resize( m_Hash.GetWordCount() ); m_Cache.resize( m_Hash.GetWordCount() ); for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { pos[wd].resize( m_Hash.GetTableSize(wd) ); m_Cache[wd].resize( m_Hash.GetTableSize(wd) ); curr += m_Hash.GetTableSize( wd ) * ( sizeof(off64_t) + sizeof(THashElement) ); } m_Tabloc.push_back( make_pair( start_off, curr - start_off ) ); off64_t totalseq = 0, maxseq = 0, nullseq = 0, totalwd = 0, maxwd = 0; for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { for( unsigned el = 0; el < m_Hash.GetTableSize( wd ); ++el ) { // Set starting position Write( m_File, curr ); Write( m_File, m_Seqs[wd][el] + m_Data[wd][el] ); pos[wd][el] = curr; curr += ( m_Seqs[wd][el] + m_Data[wd][el] ) * sizeof(Uint4); // Reset lastsid m_LastSid[wd][el] = ~0U; // Statistics totalseq += m_Seqs[wd][el]; if( m_Seqs[wd][el] > maxseq ) maxseq = m_Seqs[wd][el]; if( m_Seqs[wd][el] == 0 ) nullseq++; totalwd += m_Data[wd][el]; if( m_Data[wd][el] > maxwd ) maxwd = m_Data[wd][el]; } } if( m_Cbk ) m_Cbk->CbkResetEnd(); // Print stat cerr << "Total number of sequences: " << m_Seqlst.size() << endl; cerr << "Number of hash entries: " << entries << endl; cerr << "Average number of sequences per hash entry: " << double(totalseq) / entries << endl; cerr << "Maximal number of sequences per hash entry: " << maxseq << endl; cerr << "Null hash entries: " << nullseq << endl; cerr << "Total number of hits: " << totalwd << endl; cerr << "Average number of hits per hash entry: " << double(totalwd) / entries << endl; cerr << "Maximal number of hits per hash entry: " << maxwd << endl; // Second pass auto_ptr fmap( 0 ); Uint2 oldfid = ~Uint2(0); m_Count = 0; for( int sid = 0; sid < m_Seqlst.size(); ++sid ) { 
const SSeqDesc& sd = m_Seqlst[sid]; if( oldfid != sd.fid ) { if( fmap.get() != 0 ) delete fmap.get(); fmap.reset( new CFastaMap( m_Fapath[ oldfid = sd.fid ].path ) ); } if( m_Cbk ) m_Cbk->CbkSequence( fmap->GetIdent( sd.data ).c_str() ); CMmSequence seq( *fmap, sd.data ); StoreSequence( sid, seq.data(), seq.size() ); } WriteCache(); } void CFaIndexer2::StoreSequence( unsigned sid, const char * seq, unsigned len ) { m_Hash.Begin( seq ); // THashElement maxsid=(m_Seqlst.size()-1)|kHighBit; assert( m_Hash.GetWordCount() == m_Data.size() ); for( ; m_Hash.GetPosition() < len/*!m_Hash.End()*/; m_Hash.Next() ) { if( m_Cbk && m_Hash.GetPosition() % 100 == 0 ) m_Cbk->CbkProgress( m_Hash.GetPosition(), len ); for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { if( m_Hash.Good(wd) ) { THashElement val = m_Hash.GetValue( wd ); if( m_LastSid[wd][val] != sid ) { m_LastSid[wd][val] = sid; m_Cache[wd][val].push_back( sid | kHighBit ); // Write(m_File,sid|kHighBit); // m_Cursor[wd][val]+=sizeof(unsigned); ++m_Count; } m_Cache[wd][val].push_back( m_Hash.GetPosition() ); // Write(m_File,m_Hash.GetPosition()); // m_Cursor[wd][val]+=sizeof(unsigned); ++m_Count; } } if( m_Count > m_CacheSize ) { // this value should be calculated based on statistics WriteCache(); m_Count = 0; } } } void CFaIndexer2::WriteCache() { if( m_Cbk ) m_Cbk->CbkDumpStart(); // fprintf(stderr,"\nWriting cache..."); for( unsigned wd = 0, cnt = 0; wd < m_Hash.GetWordCount(); ++wd ) { TData::value_type & wdCache( m_Cache[wd] ); for( unsigned val = 0; val < wdCache.size(); ++val ) { THashList & cache( wdCache[val] ); if( cache.size() ) { Uint8 & cursor = m_Cursor[wd][val]; if( fseeko64( m_File, cursor, SEEK_SET ) ) { ostringstream err; err << "Failed to seek to pos " << cursor << "(uint" << ( 8 * sizeof( cursor ) ) << "): " << strerror(errno) << "\n"; throw runtime_error( err.str() ); } Write( m_File, cache[0], cache.size() ); cursor += sizeof( cache[0] ) * cache.size(); cnt += cache.size(); cache.resize(0); 
if(m_Cbk) m_Cbk->CbkDumpProgress( cnt, m_Count ); } } } // fprintf(stderr,"...Done\n"); if( m_Cbk ) m_Cbk->CbkDumpEnd(); } void CFaIndexer2::SetHash( const CHashSet& hs ) { // for(unsigned i=0; i #include #include "fahash_defines.h" #include #include #include #include #include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #include "fahash_internal.hpp" //////////////////////////////////////////////////////////////////////// #define DBG fprintf(stderr,__FILE__":%d\n",__LINE__) #define HEX(a) fprintf(stderr,__FILE__":%d\t"#a"\t= 0x%08x\n",__LINE__,a) #define INT(a) fprintf(stderr,__FILE__":%d\t"#a"\t= %d\n",__LINE__,a) #define STR(a) fprintf(stderr,__FILE__":%d\t"#a"\t= %s\n",__LINE__,a.c_str()) class CHashListIterator { protected: Uint4 * cur, * end; Uint4 sid; public: enum { eHighBit = ( Uint4(1) << ( 8 * sizeof( Uint4 ) - 1 ) ), eNullSid = ( 0 ^ eHighBit ), eSidMask = ~eHighBit }; CHashListIterator( void * ptr = 0, unsigned cnt = 0 ) { cur = (Uint4 *) ptr; end = cur + cnt; sid = eNullSid; if( cur && cur < end ) { while( cur < end && *cur & eHighBit ) sid = *cur++ & eSidMask; if( sid == eNullSid ) throw runtime_error( "hash list format error: " "does not start with sid" ); } } unsigned GetSid() const { return sid; } operator bool () const { return cur && cur < end; } int operator * () const { return int(*cur); } // !!! 
CHashListIterator& operator ++ () { if( cur ) { if( cur < end ) ++cur; while( cur < end && *cur & eHighBit ) sid = *cur++ & eSidMask; } return *this; } CHashListIterator operator ++ (int) { CHashListIterator i( *this ); ++*this; return i; } bool operator < ( const CHashListIterator& i ) const { return sid < i.sid || ( sid == i.sid && cur < i.cur ); } bool operator > ( const CHashListIterator& i ) const { return sid > i.sid || ( sid == i.sid && cur > i.cur ); } bool operator == ( const CHashListIterator& i ) const { return sid == i.sid && cur == i.cur; } bool operator != ( const CHashListIterator& i ) const { return sid != i.sid || cur != i.cur; } bool operator <= ( const CHashListIterator& i ) const { return sid <= i.sid || ( sid == i.sid && cur <= i.cur ); } bool operator >= ( const CHashListIterator& i ) const { return sid >= i.sid || ( sid == i.sid && cur >= i.cur ); } }; class CHashCoIterator { protected: CHashListIterator x, y, z; int lo, hi; public: CHashCoIterator( CHashListIterator& _x, CHashListIterator& _y, int l, int h ) : x( _x ), y( _y ), z( _x ), lo( l ), hi( h ) { fit(); } void report( const string& state, FILE * f = stdout ) { fprintf( stdout ,"#%s# x=%d:%d; y=%d:%d; z=%d:%d; " "lo=%d; len=%d; hi=%d;\n",state.c_str(), x.GetSid(), *x, y.GetSid(), *y, z.GetSid(), *z, lo, *y - *x, hi ); } void fit() { if( ! *this ) return; while( true ) { do { while( x.GetSid() < y.GetSid() ) if( ! ++x ) return; while( x.GetSid() > y.GetSid() ) if( ! ++y ) return; } while( x.GetSid() != y.GetSid() ); while( (*y - *x) > hi) { if( ! ++x ) return; if( x.GetSid() != y.GetSid() ) goto nextsid; } if( ( *y - *x ) >= lo ) { z = x; return; } nextsid: if( ! 
++y ) return; } } operator bool () const { return x && y && z; } const CHashListIterator& GetX() const { return z; } const CHashListIterator& GetY() const { return y; } CHashCoIterator& IncX() { return ++*this; } CHashCoIterator& IncY() { if(++y) fit(); return *this; } CHashCoIterator& operator ++ () { if( ++z && z.GetSid() == y.GetSid() && ( *y - *z ) >= lo ) return *this; if( ++y ) fit(); return *this; } }; //////////////////////////////////////////////////////////////////////// void CFaLookup::AttachFile( const string& path ) { m_Fd = open( path.c_str(), O_RDONLY | O_LARGEFILE ); if( m_Fd == -1 ) SYSERROR( path ); // TODO: here signature, version, byte order and similar // stuff will be written char buff[16]; read( m_Fd, buff, 16 ); // signature if( memcmp( buff, SIGNATURE, 16 ) ) throw runtime_error( "Wrong signature in " + path ); m_Version = Read( m_Fd ); if( Read( m_Fd ) != BYTE_ORDER_WORD ) throw runtime_error( "Wrong byte order in " + path ); if( m_Version != FILE_VERSION && m_Version != FILE_VERSION2 ) throw runtime_error( "Wrong version in " + path ); if( Read( m_Fd ) != HEADER_SIZE ) throw runtime_error( "Wrong header size order in " + path ); Uint4 wdsize = Read( m_Fd ); Uint4 period = Read( m_Fd ); m_Hash = CHashSet( wdsize, period ); off64_t last_off = lseek64( m_Fd, 0, SEEK_END ); if( last_off == -1 ) SYSERROR( string( "seek-end(0) failed" ) ); last_off = lseek64( m_Fd, -1 * int(sizeof(off64_t)), SEEK_CUR ); if( last_off == -1 ) SYSERROR( string( "seek-cur(-8) failed" ) ); // sizeof(off64_t) off64_t tail_off = Read( m_Fd ); // fprintf(stderr, // "ATTACH: last_off = %lld\n" // "ATTACH: tail_off = %lld\n", // last_off, // tail_off); if( last_off != off64_t(tail_off + 3 * sizeof( off64_t )) ) { throw runtime_error( "Wrong file trailer in " + path ); } lseek64( m_Fd, tail_off, SEEK_SET ); off64_t seq_tbl = Read( m_Fd ); off64_t fil_tbl = Read( m_Fd ); off64_t off_tbl = Read( m_Fd ); lseek64( m_Fd, seq_tbl, SEEK_SET ); m_Seqlst.resize( Read( m_Fd ) ); 
for( unsigned i = 0; i < m_Seqlst.size(); ++i ) { Int2 file = Read(m_Fd); m_Seqlst[i] = SSeqDesc( file, Read( m_Fd ) ); } lseek64( m_Fd, fil_tbl, SEEK_SET ); m_Fapath.resize( Read( m_Fd ) ); for( unsigned i = 0; i < m_Fapath.size(); ++i ) { Int2 flags = Read( m_Fd ); m_Fapath[i] = SFileDesc( flags, Read( m_Fd ) ); } lseek64( m_Fd, off_tbl, SEEK_SET ); m_Tabloc.resize( Read( m_Fd ) ); for( unsigned i = 0; i < m_Tabloc.size(); ++i ) { m_Tabloc[i].first = Read( m_Fd ); m_Tabloc[i].second = Read( m_Fd ); } InitTableOffsets(); } void CFaLookup::InitTableOffsets() { m_TableOffset.resize( m_Hash.GetWordCount() + 1 ); m_ElSize= m_Version == FILE_VERSION ? 2 * sizeof(Uint4) : m_Version == FILE_VERSION2 ? sizeof(off64_t) + sizeof(Uint4): 0; unsigned i = 0, pos = 0; for( ; i < m_Hash.GetWordCount(); ++i ) { m_TableOffset[i] = pos; pos += m_Hash.GetTableSize( i ); } m_TableOffset[i] = pos; } off64_t CFaLookup::GetHashEntries( const char * table, off64_t tab_off, unsigned wd, THashElement val, unsigned& size ) const { const char * entry = table + ( m_TableOffset[wd] + val ) * m_ElSize; switch( m_Version ) { case FILE_VERSION2: size = ( (const Uint4*) entry )[2]; return ( (const off64_t*) entry )[0]; case FILE_VERSION: size = ( (const Uint4*) entry )[1]; return ( (const Uint4*) entry )[0] + tab_off; default: throw logic_error( "Unknown file version" ); } } void CFaLookup::Find( IFaLookupCallback * cbk, const string& label, const char report_strand, const string& primer) { vector famap( m_Fapath.size() ); SFaMatchBlock info; info.type = SFaMatchBlock::ePrimer; info.strand = report_strand == '+' ? SFaMatchBlock::ePos : report_strand == '-' ? 
SFaMatchBlock::eNeg : SFaMatchBlock::eUnkn; info.sts_label = label; if( cbk && !cbk->Start() ) return; if( primer.length() < m_Hash.GetWordSize() ) { cbk && cbk->Fail( "primer " + label + " is too short" ); return; } unsigned htbl_size = m_TableOffset[ m_Hash.GetWordCount() ] * m_ElSize; m_Hash.Begin( primer.c_str() ); bool warned = false; for( unsigned frag = 0; frag < m_Tabloc.size(); ++frag ) { CMMap table( htbl_size, CMMap::fProtRead, CMMap::fMapPrivate, m_Fd, m_Tabloc[frag].first ); for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { if( cbk ) cbk->Progress( wd, m_Hash.GetWordCount() ); if( ! m_Hash.Good( wd ) ) continue; unsigned count; off64_t start = GetHashEntries( table.data(), m_Tabloc[frag].first, wd, m_Hash.GetValue( wd ), count ); if( count == ~0U ) { if( cbk && ! warned ) cbk->Warn( "Repeated word in primer " + label, 0 ); warned = true; continue; } CMMap xmap( count * sizeof(Uint4), CMMap::fProtRead, CMMap::fMapPrivate, m_Fd, start ); madvise( xmap.data(), xmap.size(), MADV_SEQUENTIAL | MADV_DONTNEED ); for( CHashListIterator u( xmap.data(), count ); u; ++u ) { Int4 pos1 = *u - m_Hash.GetWordSize(); Int4 sid = u.GetSid(); Int2 fil = m_Seqlst[sid].fid; if( ! famap[fil].IsOpen() ) famap[fil].Open( m_Fapath[fil].path ); CMmSequence seq( famap[fil], m_Seqlst[sid].data ); if( m_AlignR->Forward( seq.GetData() + pos1, seq.GetData() + seq.Length(), primer.c_str(), primer.length() ) ) { info.seq_label = famap[fil].GetIdent( m_Seqlst[sid].data ); info.mism = m_AlignR->GetMismatches(); info.gaps = m_AlignR->GetGaps(); info.mism_l = info.mism_r = 0; info.gaps_l = info.gaps_r = 0; info.from = pos1; info.to = info.from + primer.length(); info.sequence = seq.GetData(); info.seqlen = seq.Length(); if( cbk && ! 
( cbk->Match( &info ) ) ) return; } } } } cbk && cbk->Done(); } void CFaLookup::Find( IFaLookupCallback * cbk, ISts* sts, int window ) { TStsList lst; lst.push_back( sts ); Find( cbk, lst, false, window ); } struct SMatchPos { unsigned sid; unsigned pos; SMatchPos( unsigned s, unsigned p ) : sid( s ), pos( p ) {} bool operator < ( const SMatchPos& s ) const { return sid < s.sid || sid == s.sid && pos < s.pos; } }; struct SStsPreMatch { ISts * sts; int le, rb; SStsPreMatch( ISts * s, int p1, int p2 ) : sts( s ), le( p1 ), rb( p2 ) {} }; class CMatchSts { public: struct SHit { int pos1, pos2; unsigned char mism_l, mism_r, gaps_l, gaps_r; int length() const { return pos2 - pos1; } bool operator == (const SHit& o) const { return pos2 == o.pos2 && pos1 == o.pos1 && mism_l == o.mism_l && gaps_l == o.gaps_l && mism_r == o.mism_r && gaps_r == o.gaps_r; } SHit( int p1, int p2, char ml, char mr, char gl, char gr ) : pos1( p1 ), pos2( p2 ), mism_l( ml ), mism_r( mr ), gaps_l( gl ), gaps_r( gr ) {} SHit() {} unsigned char mism() const { return mism_l + mism_r; } unsigned char gaps() const { return gaps_l + gaps_r; } static bool OrderByPos2Pos1( const SHit&, const SHit& ); static int Compare( const SHit&, const SHit&, unsigned, unsigned); static bool Overlap( unsigned, unsigned, const SHit&, const SHit&); bool operator < ( const SHit& h ) const { return pos1 < h.pos1 || pos1 == h.pos1 && ( pos2 < h.pos2 || pos2 == h.pos2 && ( gaps() < h.gaps() || gaps() == h.gaps() && mism() < h.mism() ) ); } }; enum { fLeftNoMatch = 2, fRightNoMatch = 1, }; typedef vector TStsHits; typedef map TAllHits; typedef map TSidHits; typedef TStsHits::iterator TStsHits_I; typedef TAllHits::iterator TAllHits_I; typedef TSidHits::iterator TSidHits_I; typedef TStsHits::const_iterator TStsHits_CI; typedef TAllHits::const_iterator TAllHits_CI; typedef TSidHits::const_iterator TSidHits_CI; public: ~CMatchSts() { delete m_Seq; } void Flush(); CMatchSts( const CFaData::TPathLst& fapath, const CFaData::TSeqLst& 
seqlst, IFaLookupCallback * cbk, IAlign * alignl, IAlign * alignr): m_Fapath( fapath ), m_Seqlst( seqlst ), m_Famap( fapath.size() ), m_OldSid( -1 ), m_Seq( 0 ), m_Cbk( cbk ), m_AlignL( alignl ), m_AlignR( alignr ) { m_Info.type = SFaMatchBlock::eSTS; } // returns false if no processing should continue bool Match( ISts * sts, int sid, int posle, int posrb ); int MatchEx( ISts * sts, int sid, int posle, int posrb ); protected: TSidHits m_OutQueues; const CFaData::TPathLst& m_Fapath; const CFaData::TSeqLst& m_Seqlst; vector m_Famap; int m_OldSid; CMmSequence * m_Seq; IFaLookupCallback *m_Cbk; // int m_Mism, m_Gaps; IAlign * m_AlignL, * m_AlignR; SFaMatchBlock m_Info; }; bool CMatchSts::SHit::OrderByPos2Pos1( const SHit& o1, const SHit& o2 ) { return ( o1.pos2 < o2.pos2 || o1.pos2 == o2.pos2 && o1.pos1 < o2.pos1 ); } int CMatchSts::SHit::Compare( const SHit& a, const SHit& b, unsigned min_l, unsigned max_l ) { if( a.gaps() > b.gaps() ) return -1; if( a.gaps() < b.gaps() ) return +1; if( a.mism() > b.mism() ) return -1; if( a.mism() < b.mism() ) return +1; int lb = ( b.pos2 - b.pos1 ); int la = ( a.pos2 - a.pos1 ); int min_len = min_l; int max_len = max_l; if( la >= min_len && la <= max_len ) { if( lb >= min_len && lb <= max_len ) { return lb - la; } else return 1; } else { if( lb >= min_len && lb <= max_len ) { return -1; } else { int da = ( la < min_len ) ? min_len - la : la - max_len; int db = ( lb < min_len ) ? 
min_len - lb : lb - max_len; return db - da; } } } bool CMatchSts::SHit::Overlap( unsigned l1, unsigned l2, const SHit& a, const SHit& b ) { return ( (unsigned)abs( a.pos2 - b.pos2 ) <= l2 && (unsigned)abs( a.pos1 - b.pos1 ) <= l1 ); } void CMatchSts::Flush() { for( TSidHits_I s = m_OutQueues.begin(); s != m_OutQueues.end(); ++s ) { int sid = s->first; int fil = m_Seqlst[sid].fid; m_Info.seq_label = m_Famap[fil].GetIdent( m_Seqlst[sid].data ); CMmSequence seq( m_Famap[fil], m_Seqlst[sid].data ); TAllHits& all = s->second; for( TAllHits_I i = all.begin(); i != all.end(); ++ i ) { // 1st: cluster // 2nd: select best per cluster if( i->second.size() == 0 ) continue; int l1 = i->first->GetPrimerLength( 0 ); int l2 = i->first->GetPrimerLength( 1 ); int lo = i->first->GetSizeLo(); int hi = i->first->GetSizeHi(); TStsHits& hits = i->second; sort( hits.begin(), hits.end() ); while( hits.size() ) { TStsHits todo; TStsHits cluster; cluster.push_back( hits.back() ); SHit best = cluster.front(); hits.pop_back(); for( TStsHits_CI j = hits.begin(); j != hits.end(); ++j ) { bool overlaps = false; for( TStsHits_CI h = cluster.begin(); h != cluster.end(); ++h ) if( SHit::Overlap( l1, l2, *j, *h ) ) { overlaps = true; break; } if( overlaps ) { if( SHit::Compare( *j, best, lo, hi ) >= 0 ) best = *j; cluster.push_back( *j ); } else { todo.push_back( *j ); } } m_Info.strand = i->first->GetDirection() == '+' ? SFaMatchBlock::ePos : i->first->GetDirection() == '-' ? 
SFaMatchBlock::eNeg : SFaMatchBlock::eUnkn; m_Info.sts_label = i->first->GetName(); m_Info.from = best.pos1; m_Info.to = best.pos2; m_Info.mism = best.mism(); m_Info.gaps = best.gaps(); m_Info.mism_l = best.mism_l; m_Info.mism_r = best.mism_r; m_Info.gaps_l = best.gaps_l; m_Info.gaps_r = best.gaps_r; m_Info.sequence = seq.GetData(); m_Info.seqlen = seq.Length(); m_Cbk->Match( &m_Info, i->first ); hits = todo; } } all.clear(); } m_OutQueues.clear(); } bool CMatchSts::Match( ISts * sts, int sid, int posle, int posrb ) { CMatchSts::MatchEx( sts, sid, posle, posrb ) ; return true; } int CMatchSts::MatchEx( ISts * sts, int sid, int posle, int posrb ) { if( sid != m_OldSid ) { // Flush(); int fil = m_Seqlst[sid].fid; if( ! m_Famap[fil].IsOpen() ) { m_Famap[fil].Open( m_Fapath[fil].path ); } delete m_Seq; m_Seq = new CMmSequence( m_Famap[fil], m_Seqlst[sid].data ); if( m_Seq->GetSize() < (unsigned)posrb || m_Seq->GetSize() < (unsigned)posle ) { ostringstream o; o << "range error for hash word, sid=" << sid << ": 0 < " << posle << " < " << posrb << " < " << m_Seq->GetSize(); throw range_error( o.str() ); } m_OldSid = sid; } CStrRef lp( sts->GetPrimer( ISts::eLeft ) ); CStrRef rp( sts->GetPrimer( ISts::eRight ) ); int poslb = posle-lp.length(); int posre = posrb+rp.length(); int flags = 0; if( ! m_AlignR->Forward( m_Seq->GetData() + posrb, m_Seq->GetData() + m_Seq->Length(), rp.data(), rp.length() ) ) flags |= fRightNoMatch; if( ! m_AlignL->Reverse( m_Seq->GetData(), m_Seq->GetData() + posle, lp.data(), lp.length())) flags |= fLeftNoMatch; if( ! flags ) { m_OutQueues[m_OldSid][sts].push_back( (sts->GetDirection() == ISts::ePlus)? 
SHit( poslb, posre, m_AlignL->GetMismatches(), m_AlignR->GetMismatches(), m_AlignL->GetGaps(), m_AlignR->GetGaps()) : SHit( poslb, posre, m_AlignR->GetMismatches(), m_AlignL->GetMismatches(), m_AlignR->GetGaps(), m_AlignL->GetGaps() ) ); } return flags; } void CFaLookup::Stat() { m_Counts.clear(); m_Counts.resize( m_Hash.GetWordCount() ); for( unsigned i = 0; i < m_Hash.GetWordCount(); ++i ) { m_Counts[i].resize( m_Hash.GetTableSize(i) ); } unsigned htbl_size = m_TableOffset[m_Hash.GetWordCount()] * m_ElSize; for( unsigned frag = 0; frag < m_Tabloc.size(); ++frag ) { CMMap loc( htbl_size, CMMap::fProtRead, CMMap::fMapPrivate, m_Fd, m_Tabloc[frag].first ); for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { for( unsigned e = 0; e < m_Hash.GetTableSize(wd); ++e ) { unsigned count; GetHashEntries( loc.data(), m_Tabloc[frag].first, wd, e, count ); if( count != ~0U ) m_Counts[wd][e] += count; } } } } CFaLookup::TBigCount CFaLookup::CalcStat( THashElement hash, unsigned wd ) { TBigCount cnt = 0; unsigned base = 0; for( unsigned i = 0; i < wd; ++i ) base += 2 * sizeof(THashElement) * m_Hash.GetTableSize( wd ); base += m_ElSize * hash; for( unsigned frag = 0; frag < m_Tabloc.size(); ++frag ) { CMMap loc( m_Tabloc[frag].second, CMMap::fProtRead, CMMap::fMapPrivate, m_Fd, m_Tabloc[frag].first ); Uint4 * entry = (Uint4*)( loc.data() + base ); // unsigned start=entry[0]; Uint4 size = entry[1]; if( size != ~0U ) cnt += size; } return cnt; } void CFaLookup::Find( IFaLookupCallback * cbk, const TStsList& lsts, bool syscall_optimize, int window ) { if( cbk && ! 
cbk->Start() ) return; CMatchSts stsmatch( m_Fapath, m_Seqlst, cbk, m_AlignL, m_AlignR ); for( TStsList::const_iterator i = lsts.begin(); i != lsts.end(); ++i ) { ISts * const& sts = *i; if( sts->GetPrimerLength( ISts::eLeft ) < m_Hash.GetWordSize() || sts->GetPrimerLength( ISts::eRight ) < m_Hash.GetWordSize() ) { cbk && cbk->Fail( "sts " + string( sts->GetName() ) + " is too short\n" ); return; } } map warned; CHashSet r_Hash( m_Hash ); unsigned htbl_size = m_TableOffset[m_Hash.GetWordCount()] * m_ElSize; unsigned overall = m_Tabloc.size() * lsts.size() * m_Hash.GetWordCount(); unsigned progress = 0; for( unsigned frag = 0; frag < m_Tabloc.size(); ++frag ) { if(cbk) cbk->Fragment( frag, m_Tabloc.size() ); CMMap table( htbl_size, //m_Tabloc[frag].second, CMMap::fProtRead, CMMap::fMapPrivate, m_Fd, m_Tabloc[frag].first ); typedef multimap TPrematch; typedef TPrematch::const_iterator TPrematch_CI; TPrematch prematch; // walk around stupidestest MS Visual C++ 8.x (.NET) which can't convert const_iterator to iterator :-/ const TPrematch& cprematch( prematch ); for( TStsList::const_iterator i = lsts.begin(); i != lsts.end(); ++i ) { ISts * const& sts = *i; m_Hash.Begin( sts->GetPrimerData( ISts::eLeft ) + sts->GetPrimerLength( ISts::eLeft ) - m_Hash.GetWordSize() ); r_Hash.Begin( sts->GetPrimerData( ISts::eRight ) ); int fixup= sts->GetPrimerLength( ISts::eLeft ) + sts->GetPrimerLength( ISts::eRight ) - m_Hash.GetWordSize(); int limitsize = max( sts->GetPrimerLength( ISts::eLeft ), sts->GetPrimerLength( ISts::eRight ) ); int lo = sts->GetSizeLo() - fixup - window; int hi = sts->GetSizeHi() - fixup + window; int minlo = limitsize - fixup; if( lo < minlo ) lo = minlo; if( hi < lo ) hi = lo; for( unsigned wd = 0; wd < m_Hash.GetWordCount(); ++wd ) { if( cbk ) cbk->Progress( progress++, overall ); if( ! m_Hash.Good(wd) || ! 
r_Hash.Good(wd) ) continue; unsigned ucount, vcount; off64_t ustart, vstart; {{ ustart = GetHashEntries( table.data(), m_Tabloc[frag].first, wd, m_Hash.GetValue(wd), ucount ); vstart = GetHashEntries( table.data(), m_Tabloc[frag].first, wd, r_Hash.GetValue(wd), vcount ); if( cbk && ! warned[sts] ) { if( ucount == ~0U ) cbk->Warn( "STS " + string( sts->GetName() ) + " has repeatable word for left primer ", sts ); if( vcount == ~0U ) cbk->Warn( "STS " + string( sts->GetName() ) + " has repeatable word for right primer ", sts ); if( ucount == ~0U || vcount == ~0U ) { warned[sts] = true; continue; } } if( ucount == ~0U || vcount == ~0U) continue; if( ucount == 0 || vcount == 0) continue; }} CMMap umap( ucount * sizeof( Uint4 ), CMMap::fProtRead, CMMap::fMapPrivate, m_Fd, ustart ); CMMap vmap( vcount * sizeof( Uint4 ), CMMap::fProtRead, CMMap::fMapPrivate, m_Fd, vstart ); madvise( umap.data(), umap.size(), MADV_SEQUENTIAL | MADV_DONTNEED ); madvise( vmap.data(), vmap.size(), MADV_SEQUENTIAL | MADV_DONTNEED ); CHashListIterator u( umap.data(), ucount ); CHashListIterator v( vmap.data(), vcount ); for( CHashCoIterator i( u, v, lo, hi ); i; ) { int posrb = *i.GetY() - m_Hash.GetWordSize(); int posle = *i.GetX(); int sid = i.GetX().GetSid(); if( (Uint4)sid != i.GetY().GetSid()) { throw runtime_error( "CoIterator error!" ); } if( syscall_optimize ) { prematch.insert( make_pair( sid, SStsPreMatch( sts, posle, posrb ) ) ); if( prematch.size() > 10000 ) { for(TPrematch_CI im = cprematch.begin(); im != cprematch.end(); ++im) { // int sid=im->first; if( ! stsmatch.Match(im->second.sts, im->first, im->second.le, im->second.rb ) ) return; } prematch.clear(); } } else { int rc = stsmatch.MatchEx( sts, sid, posle, posrb ); if(rc) { if( rc & CMatchSts::fRightNoMatch ) i.IncY(); else if( rc & CMatchSts::fLeftNoMatch ) i.IncX(); continue; } } ++i; } } } for( TPrematch_CI im = cprematch.begin(); im != cprematch.end(); ++im ) { if( ! 
stsmatch.Match( im->second.sts, im->first, im->second.le, im->second.rb ) ) return; } stsmatch.Flush(); } cbk && cbk->Done(); } //////////////////////////////////////////////////////////////////////// /* * $Log: fahash_lookup.cpp,v $ * Revision 1.24 2008/03/26 16:03:16 rotmistr * Fixed bug with false negatives in unoptimized mode * * Revision 1.23 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.22 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.21 2005/02/11 21:53:50 rotmistr * Fixed one more "margin" bug * * Revision 1.20 2005/02/11 20:42:54 rotmistr * Fixed "margin" bug, added primer search from file * * Revision 1.19 2004/09/03 21:28:49 rotmistr * Fixes to compile with Borland C++ 5.5 * * Revision 1.18 2004/06/07 16:24:56 rotmistr * Bug fixes to previos version. * * Revision 1.17 2004/06/03 23:37:20 rotmistr * New aligner added. * * Revision 1.16 2004/05/27 21:12:46 rotmistr * Some warnings fixed. * * Revision 1.15 2004/05/27 20:35:47 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. 
* * Revision 1.14 2004/04/28 14:35:36 rotmistr * hashfile ver2 build/search works now * * Revision 1.13 2004/04/27 00:01:55 rotmistr * Second version of reverse hash file started * * Revision 1.12 2004/04/01 05:57:52 rotmistr * Compilable with borland C++ * * Revision 1.11 2004/02/18 05:44:25 rotmistr * Changes in CGI: sort order, separate misalignments for l and r primers, reload button * * Revision 1.10 2004/02/12 21:38:20 rotmistr * Fixed typo in seqcmp * Optimized and fixed lookup * Better look for reverse.cgi * * Revision 1.9 2004/02/11 04:34:55 rotmistr * Optimised lookup speed and memory usage * Fixed bug with end of sequence in stsmatch * Changing CGI look * * Revision 1.8 2004/02/04 21:37:40 rotmistr * Optimized filter for labeling sequences * * Revision 1.7 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.6 2004/01/28 23:27:02 rotmistr * "Best of overlapping" hit selection postprocessor added. * * Revision 1.5 2004/01/07 16:57:42 rotmistr * Fragment size is now configurable. * * Revision 1.4 2004/01/06 21:54:19 rotmistr * Statistics for word repetitions API added * * Revision 1.3 2003/12/30 21:36:32 rotmistr * Syscall optimisation mode added. 
* * Revision 1.2 2003/12/23 21:30:50 rotmistr * - gaps/mismatches reporting * - lo/hi fixup * - reverse sts in re-PCR_main * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/famap_main.cpp0000644001137700010620000001663711745334032015212 0ustar rotmistrcontig/* $Id: famap_main.cpp,v 1.7 2008/04/28 16:38:45 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #ifndef VERSION #define VERSION "(devel:" __DATE__ ")" #endif class CMain { public: CMain(int c, char ** v): argc(c),argv(v),done(false),command(eStat),cvt(0) {} int Run(); protected: int Execute(); int ParseCmdline(); int Help(FILE* = stdout); int Version(); int Build(); int Stat(); int Dump(); int List(); protected: int argc; char ** argv; bool done; protected: string famap; enum ECommand { eBuild, eStat, eDump, eList } command; const char * cvt; }; int CMain::Help(FILE* out) { fprintf(out,"usage: [-hV] -b mmapped-file [-t cvt] [fafile ...]\n"); fprintf(out," or: [-hV] -d mmapped-file [ord ...]\n"); fprintf(out," or: [-hV] -l mmapped-file [ord ...]\n"); fprintf(out,"where cvt (convertion table) is one of:\n" "\toff - as is (default)\n" "\tn - nucleotide [acgtnACGTN] allowed,\n" "\tN - nucleotide uppercase allowed [ACGTN]\n" "\tnx - nucleotide with ambiquity codes allowed\n" "\tNX - nucleotide with ambiquity codes uppercase\n"); done=true; return 0; } int CMain::ParseCmdline() { int optchar; while((optchar=getopt(argc,argv,"hVb:d:l:t:"))!=-1) { switch(optchar) { case 'h': Help(); break; case 'V': Version(); break; case 'b': command=eBuild; famap=optarg; break; case 'd': command=eDump; famap=optarg; break; case 'l': command=eList; famap=optarg; break; case 't': if(strcmp(optarg,"off")==0||strcmp(optarg,"-")==0) cvt=0; else if(strcmp(optarg,"n")==0) cvt=CFastaReader::sm_Nucleotides; else if(strcmp(optarg,"N")==0) cvt=CFastaReader::sm_NucleotidesUc; else if(strcmp(optarg,"nx")==0) cvt=CFastaReader::sm_NucleotidesExt; else if(strcmp(optarg,"NX")==0) cvt=CFastaReader::sm_NucleotidesExtUc; else fprintf(stderr,"Unknown table %s\n",optarg); break; } } if(done) return 0; if(command == 
eBuild && optind >= argc) { Help(stderr); return 1; } return 0; } int CMain::Build() { do { CFastaMapPrepare prepare(famap+"~"); for(int i=optind; i= argc) { for(unsigned i=0; i=fmap.SequenceCount()) fprintf(stderr,"? Id %d is out of range\n",x); else { printf("%s\n",fmap.GetDefline(x).c_str()); CMmSequence seq(fmap,x); for(unsigned j=0; j= argc) { for(unsigned i=0; i=fmap.SequenceCount()) fprintf(stderr,"? Id %d is out of range\n",x); else { printf("%d\t%s\t%d\n",x,fmap.GetIdent(x).c_str(), fmap.GetSize(x)); } } } return 0; } int CMain::Version() { done=true; puts("Fasta converter for e-PCR version " VERSION); return 0; } int CMain::Run() { if(int rc=ParseCmdline() ) return rc; if(done) return 0; return Execute(); } int main(int argc, char ** argv) { try { CMain app(argc,argv); return app.Run(); } catch(logic_error& e) { fprintf(stderr,"! Fatal: Internal error %s\n",e.what()); } catch(exception& e) { fprintf(stderr,"! Fatal: %s\n",e.what()); } catch(...) { fprintf(stderr,"! Fatal: Unknown error\n"); } return 100; } /* * $Log: famap_main.cpp,v $ * Revision 1.7 2008/04/28 16:38:45 rotmistr * Applied patch to build with gcc-4.3 * * Revision 1.6 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.5 2004/04/06 04:53:17 rotmistr * All is compileable with BCC5.5 and runnable on WIndows * * Revision 1.4 2004/03/07 06:35:59 rotmistr * Many bugfixes and optimisations -- cgi is to go to production * * Revision 1.3 2004/02/12 21:38:20 rotmistr * Fixed typo in seqcmp * Optimized and fixed lookup * Better look for reverse.cgi * * Revision 1.2 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse 
e-PCR looks ready * */ e-PCR-2.3.12/faread.cpp0000644001137700010620000002004211745334032014325 0ustar rotmistrcontig/* $Id: faread.cpp,v 1.6 2004/04/06 04:53:17 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); struct SFile { bool m_pipe; FILE * f; SFile(bool pipe, const char * name) { #ifdef NO_POPEN if(pipe) throw runtime_error("Pipes are not supported in windows!"); f=fopen(name,"r"); #else f=pipe?popen(name,"r"):fopen64(name,"r"); #endif if(f==0) throw runtime_error(string(name)+": "+strerror(errno)); setvbuf(f,0,_IOFBF,16192); m_pipe=pipe; } ~SFile() { #ifndef NO_POPEN if(m_pipe) pclose(f); else #endif fclose(f); } }; void CFastaReader::Open(const string& fname) { if(IsOpen()) Close(); // FILE * f=fopen64(fname.c_str(),"r"); // if(f==0) throw runtime_error(fname+": "+strerror(errno)); // setvbuf(f,0,_IOFBF,16192); m_Fptr=new SFile(false,fname.c_str()); } void CFastaReader::PipeIn(const string& command) { if(IsOpen()) Close(); m_Fptr=new SFile(true,command.c_str()); } void CFastaReader::Close() { if(m_Fptr) { // fclose((FILE*)m_Fptr); delete (SFile*)m_Fptr; m_Fptr=0; } } void CFastaReader::ReadFile(IFastaReaderCallback * cbk) { if(!cbk) return; if(!IsOpen()) return; FILE * f=((SFile*)m_Fptr)->f; char buffer[16192]; bool is_defline=false; bool is_newline=true; bool beginning_of_file=true; cbk->CbkFileBegin(); string defline; while(!feof(f)) { if(!fgets(buffer,sizeof(buffer),f)) break; char * eol=strlen(buffer)+buffer; bool complete_line=eol>buffer && (eol[-1]=='\r' || eol[-1]=='\n'); while(eol>buffer && isspace(eol[-1])) --eol; *eol=0; if(!is_newline) { if(is_defline) { defline.append(buffer); } else SeqlineCbk(cbk,buffer,eol-buffer); } else { if(*buffer=='>') { if(!beginning_of_file) cbk->CbkEntryEnd(); cbk->CbkEntryBegin(); beginning_of_file=false; is_defline=true; defline.assign(buffer); } else { if(beginning_of_file) throw runtime_error("Leading garbage in fasta file!"); 
if(defline.length()) { // should have at least '>' cbk->CbkDefline(defline.c_str(),defline.length()); const char * s=defline.c_str()+1; for(;*s && isspace(*s);++s); const char * ss=s; for(;*s && !isspace(*s);++s); cbk->CbkIdent(ss,s-ss); defline.clear(); } is_defline=false; SeqlineCbk(cbk,buffer,eol-buffer); } } is_newline=complete_line; } if(!beginning_of_file) cbk->CbkEntryEnd(); cbk->CbkFileEnd(); } void CFastaReader::SeqlineCbk(IFastaReaderCallback* cbk, char * buffer, unsigned length) { char * eol=buffer+length; if(m_CvtTable) { for(char * x=buffer; x=eol) break; c=m_CvtTable[*x]; } *x=c; } } cbk->CbkSeqline(buffer,eol-buffer); } char CFastaReader::sm_NucleotidesExt[]= "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0ABCDNNGHNNKNMNNNNRNTTNWNYN\0\0\0\0\0" //.abcdefghijklmnopqrstuvwxyz..... "\0abcdnnghnnknmnnnnrnttnwnyn\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" ; char CFastaReader::sm_NucleotidesExtUc[]= "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0ABCDNNGHNNKNMNNNNRNTTNWNYN\0\0\0\0\0" //.abcdefghijklmnopqrstuvwxyz..... 
"\0ABCDNNGHNNKNMNNNNRNTTNWNYN\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" ; char CFastaReader::sm_Nucleotides[]= "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0ANCNNNGNNNNNNNNNNNNTNNNNNN\0\0\0\0\0" //.abcdefghijklmnopqrstuvwxyz..... "\0ancnnngnnnnnnnnnnnntnnnnnn\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" ; char CFastaReader::sm_NucleotidesUc[]= "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0ANCNNNGNNNNNNNNNNNNTNNNNNN\0\0\0\0\0" //.abcdefghijklmnopqrstuvwxyz..... 
"\0ANCNNNGNNNNNNNNNNNNTNNNNNN\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" ; /* * $Log: faread.cpp,v $ * Revision 1.6 2004/04/06 04:53:17 rotmistr * All is compileable with BCC5.5 and runnable on WIndows * * Revision 1.5 2004/04/01 05:57:52 rotmistr * Compilable with borland C++ * * Revision 1.4 2004/03/23 22:35:25 rotmistr * Fixed processing of -mid flag in cmdline * Fixed destructor for fasta reader * Removed cgi * * Revision 1.3 2004/03/07 06:35:59 rotmistr * Many bugfixes and optimisations -- cgi is to go to production * * Revision 1.2 2004/01/08 23:22:41 rotmistr * Fixed init error in faread, * Adjusted output to standard, * Added output format style and output file to parameters. * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/faread.hpp0000644001137700010620000000760411745334032014343 0ustar rotmistrcontig/* $Id: faread.hpp,v 1.5 2007/07/11 20:49:29 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. 
* Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
 * Government disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for any particular
 * purpose.
 *
 * Please cite the author in any work or product based on this material.
 *
 * =========================================================================
 *
 * Author: Kirill Rotmistrovsky
 *
 * =========================================================================
 */

#ifndef EPCR_FAREAD__HPP
#define EPCR_FAREAD__HPP

// NOTE(review): the header names of the two #include directives below are
// not present in this copy of the file.
#include
#include

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(EPCR_SCOPE)

class CFastaReader;
class IFastaReaderCallback;

/*
 * Event-style fasta parser: ReadFile() walks the opened source and reports
 * what it finds through an IFastaReaderCallback.
 *
 * NOTE(review): the class holds an owning pointer (m_Fptr is deleted in
 * Close()) but copy construction/assignment are not disabled; a copied
 * reader would release the same source twice -- confirm no caller copies.
 */
class CFastaReader
{
public:
    // Closes the source if one is still open.
    virtual ~CFastaReader() throw () { Close(); }
    CFastaReader() : m_Fptr(0), m_CvtTable(0) {}
    // Opens fname immediately; Open() reports failures by exception.
    explicit CFastaReader( const string& fname ): m_Fptr( 0 ), m_CvtTable( 0 ) { Open( fname ); }
    // True while a source (file or pipe) is attached.
    bool IsOpen() const { return m_Fptr; }
    // Attach a regular file; closes the current source first.
    void Open( const string& fname );
    // Attach the output of a shell command; closes the current source first.
    void PipeIn( const string& cmd );
    void Close();
    // Parse the attached source to its end, reporting to cbk.
    virtual void ReadFile( IFastaReaderCallback* cbk );
    // virtual bool ReadEntry(IFastaReaderCallback* cbk);
    // Select a character conversion table for sequence data
    // (0 = pass sequence data through unchanged). The table is not copied.
    void SetCvtTable( const char * table = 0 ) { m_CvtTable = table; }
    // These tables should have '\0' in all positions to be "eaten"
    // (f.e. for space and tab), everything else will be xlated
    static char sm_Nucleotides[];
    static char sm_NucleotidesUc[];
    static char sm_NucleotidesExt[];
    static char sm_NucleotidesExtUc[];
protected:
    // Applies the conversion table (if set) and forwards to CbkSeqline().
    void SeqlineCbk( IFastaReaderCallback * cbk, char * data, unsigned length );
protected:
    void * m_Fptr;              // opaque handle of the attached source, 0 when closed
    const char * m_CvtTable;    // current conversion table, 0 when none
};

/*
 * Receiver of CFastaReader::ReadFile() events. Defline, identifier,
 * sequence data and end-of-entry handlers are mandatory; the begin/end
 * of file and begin of entry hooks default to no-ops.
 */
class IFastaReaderCallback
{
public:
    virtual ~IFastaReaderCallback() throw () {}
    virtual void CbkDefline( const char * defline, unsigned length ) = 0;
    virtual void CbkIdent( const char * ident, unsigned length ) = 0;
    virtual void CbkSeqline( const char * data, unsigned length ) = 0;
    virtual void CbkEntryBegin() {}
    virtual void CbkEntryEnd() = 0;
    virtual void CbkFileBegin() {}
    virtual void CbkFileEnd() {}
};

END_SCOPE(EPCR_SCOPE)
END_NCBI_SCOPE

#endif

/*
 * $Log: faread.hpp,v $
 * Revision 1.5 2007/07/11 20:49:29 rotmistr
 * Made 64bit-compatible
 *
 * Revision 1.4 2004/03/07 06:35:59 rotmistr
 * Many bugfixes and optimisations -- cgi is to go to production
 *
 * Revision 1.3 2004/02/04 21:23:22 rotmistr
 * - gcc-3.3.2 compatible
 * - better postfiltering for reverse-e-PCR for discontiguos words
 * - cgi added, that supports:
 * -- contig to chromosome mapping
 * -- simple mapviewer links
 * -- unists links
 * -- discontiguos words
 *
 * Revision 1.2 2004/01/08 23:22:41 rotmistr
 * Fixed init error in faread,
 * Adjusted output to standard,
 * Added output format style and output file to parameters.
 *
 * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr
 * Package that includes e-PCR, reverse e-PCR, and sequence data preparation
 * program for reverse e-PCR looks ready
 *
 */
e-PCR-2.3.12/fast_seqio.hpp0000644001137700010620000001073011745334032015250 0ustar rotmistrcontig/* $Id: fast_seqio.hpp,v 1.3 2004/04/06 04:53:17 rotmistr Exp $
 * ===========================================================================
 *
 * PUBLIC DOMAIN NOTICE
 * National Center for Biotechnology Information
 *
 * This software/database is a "United States Government Work" under the
 * terms of the United States Copyright Act. It was written as part of
 * the author's official duties as a United States Government employee and
 * thus cannot be copyrighted. This software/database is freely available
 * to the public for use. The National Library of Medicine and the U.S.
 * Government have not placed any restriction on its use or reproduction.
 *
 * Although all reasonable efforts have been taken to ensure the accuracy
 * and reliability of the software and data, the NLM and the U.S.
 * Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
 * Government disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for any particular
 * purpose.
 *
 * Please cite the author in any work or product based on this material.
*
 * =========================================================================
 *
 * Author: Kirill Rotmistrovsky
 *
 * =========================================================================
 */

#ifndef EPCR_FAST_SEQIO__HPP
#define EPCR_FAST_SEQIO__HPP

// NOTE(review): the header names of the five #include directives below are
// not present in this copy of the file.
#include
#include
#include
#include
#include

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(EPCR_SCOPE)

class CFastaMapData;
class CFastaMapPrepare;
class CFastaMap;
class CMmSequence;

/*
 * Per-sequence directory shared by the FastaMap writer and reader:
 * four parallel vectors, one element per sequence.
 * NOTE(review): the vector element types are missing in this copy;
 * presumably string for m_Ident/m_Defline, TOffset for m_Offset and
 * TSize for m_Size -- confirm against the original header.
 */
class CFastaMapData
{
public:
    typedef off64_t TOffset;
    typedef off64_t TSize;
protected:
    vector m_Ident;
    vector m_Defline;
    vector m_Offset;
    vector m_Size;
};

/*
 * FastaMap writer. Acts as the callback of a CFastaReader: sequence data
 * is written to the output file as it arrives and the directory is
 * written when the file is closed.
 */
class CFastaMapPrepare:public IFastaReaderCallback,protected CFastaMapData
{
public:
    virtual void CbkDefline(const char * defline, unsigned length);
    virtual void CbkIdent(const char * ident, unsigned length);
    virtual void CbkSeqline(const char * data, unsigned length);
    virtual void CbkEntryBegin();
    virtual void CbkEntryEnd();
    virtual void CbkFileBegin();
    virtual void CbkFileEnd();

    // Create the output file; closes the current one first.
    void Open(const string& fname);
    // Write the directory and close the output file.
    void Close();
    bool IsOpen() const { return m_Fptr!=0; }
    // Append all entries of a fasta file, optionally converting
    // sequence characters through cvtTable.
    void AddFile(const string& fname, const char * cvtTable=0);
    // NOTE(review): Close() writes to the file; if that can throw, the
    // empty exception specification here turns it into terminate().
    virtual ~CFastaMapPrepare() throw () { Close(); }
    CFastaMapPrepare():m_Fptr(0) {}
    explicit CFastaMapPrepare(const string& fname):m_Fptr(0) { Open(fname); }
protected:
    void WritePrologue();
    void WriteEpilogue();
protected:
    void * m_Fptr;  // output stream handle, 0 when closed
};

/*
 * FastaMap reader: opens the file, loads the directory and gives
 * CMmSequence access to the descriptor and the per-sequence offsets.
 */
class CFastaMap:public CFastaMapData
{
public:
    friend class CMmSequence;
    ~CFastaMap() throw ();
    CFastaMap();
    CFastaMap(const string& file);
    bool IsOpen() const { return m_Fd!=-1; }
    void Open(const string& file);
    void Close();
    unsigned SequenceCount() const { return m_Size.size(); }
    // No range checking is done on i in the accessors below.
    const string& GetDefline(unsigned i) const { return m_Defline[i]; }
    const string& GetIdent(unsigned i) const { return m_Ident[i]; }
    // NOTE(review): returns unsigned although TSize is off64_t, so sizes
    // that do not fit into unsigned are truncated.
    unsigned GetSize(unsigned i) const { return m_Size[i]; }
    // Same value as SequenceCount().
    unsigned GetCount() const { return m_Size.size(); }
protected:
    void ReadPrologue();
    void ReadEpilogue();
protected:
    int m_Fd;           // file descriptor, -1 when closed
    bool m_SwapBytes;   // set from the byte order mark read in ReadPrologue()
};

/*
 * Helper base class: its constructor throws range_error(msg) unless
 * index < size. Listed first among CMmSequence's bases.
 */
class CIndexAssert
{
public:
    CIndexAssert(unsigned index, unsigned size, const string& msg="") {
        if(index >= size) throw range_error(msg);
    }
};

/*
 * One sequence of a CFastaMap, exposed through the CMMap base class.
 */
class CMmSequence:public CIndexAssert, public CMMap
{
public:
    explicit CMmSequence(CFastaMap&, unsigned);
    ~CMmSequence() throw ();
    operator const char * () const { return GetData(); }
//    const char * Data() const { return GetData(); }
    unsigned Length() const { return GetSize(); }
protected:
//    const char * m_Data;
//    unsigned m_Size;
};

END_SCOPE(EPCR_SCOPE)
END_NCBI_SCOPE

#endif

/*
 * $Log: fast_seqio.hpp,v $
 * Revision 1.3 2004/04/06 04:53:17 rotmistr
 * All is compileable with BCC5.5 and runnable on WIndows
 *
 * Revision 1.2 2003/12/30 15:27:22 rotmistr
 * Fixed bug with sequence end
 *
 * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr
 * Package that includes e-PCR, reverse e-PCR, and sequence data preparation
 * program for reverse e-PCR looks ready
 *
 */
e-PCR-2.3.12/fast_seqio_read.cpp0000644001137700010620000001254411745334032016243 0ustar rotmistrcontig/* $Id: fast_seqio_read.cpp,v 1.6 2007/07/11 20:49:29 rotmistr Exp $
 * ===========================================================================
 *
 * PUBLIC DOMAIN NOTICE
 * National Center for Biotechnology Information
 *
 * This software/database is a "United States Government Work" under the
 * terms of the United States Copyright Act. It was written as part of
 * the author's official duties as a United States Government employee and
 * thus cannot be copyrighted. This software/database is freely available
 * to the public for use. The National Library of Medicine and the U.S.
 * Government have not placed any restriction on its use or reproduction.
 *
 * Although all reasonable efforts have been taken to ensure the accuracy
 * and reliability of the software and data, the NLM and the U.S.
 * Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
* Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); CFastaMap::~CFastaMap() throw() { Close(); } CFastaMap::CFastaMap():m_Fd(-1) {} CFastaMap::CFastaMap(const string& path):m_Fd(-1) { Open(path); } void CFastaMap::Open(const string& path) { if(m_Fd!=-1) Close(); int fd=open(path.c_str(),O_RDONLY|O_LARGEFILE); if(fd==-1) throw runtime_error(path+": "+strerror(errno)); m_Fd=fd; ReadPrologue(); ReadEpilogue(); } void CFastaMap::Close() { if(m_Fd!=-1) { m_Size.clear(); m_Offset.clear(); m_Defline.clear(); m_Ident.clear(); close(m_Fd); } } void CFastaMap::ReadPrologue() { if( sizeof( Int2 ) != 2 ) throw logic_error( "Bad compile options: sizeof( Int2 ) != 2" ); if( sizeof( Int4 ) != 4 ) throw logic_error( "Bad compile options: sizeof( Int4 ) != 4" ); if( sizeof( Int8 ) != 8 ) throw logic_error( "Bad compile options: sizeof( Int8 ) != 8" ); char buff[128]; if(read(m_Fd,buff,8) != 8) throw runtime_error(strerror(errno)); if(memcmp(buff,"FASTAMAP",8)) throw runtime_error("FastaMap Signature is not found"); switch(Read(m_Fd)){ case eHiEndian: m_SwapBytes=true; break; case eLoEndian: m_SwapBytes=false; break; default: throw runtime_error("Bad format: wrong byteorder signature"); } // if(m_SwapBytes) fprintf(stderr,"* FaMap: Swapping bytes\n"); short ver=BoCvt(Read(m_Fd),m_SwapBytes); short rev=BoCvt(Read(m_Fd),m_SwapBytes); if(ver!=1 && rev!=0) throw runtime_error("Wrong FastaMap file version"); } void CFastaMap::ReadEpilogue() { /*TOffset filesize=*/lseek64(m_Fd,0,SEEK_END); TOffset 
directory_pos=lseek64(m_Fd,-(TOffset)sizeof(TOffset),SEEK_CUR); TOffset directory=BoCvt(Read(m_Fd),m_SwapBytes); if(TOffset(directory+3*sizeof(TOffset)) != directory_pos) throw runtime_error("Wrong directory record"); lseek64(m_Fd,directory,SEEK_SET); TOffset epilogue=BoCvt(Read(m_Fd),m_SwapBytes); TOffset ident =BoCvt(Read(m_Fd),m_SwapBytes); TOffset defline =BoCvt(Read(m_Fd),m_SwapBytes); lseek64(m_Fd,epilogue,SEEK_SET); char buff[128]; if(read(m_Fd,buff,8)!=8) throw runtime_error("read failed"); if(memcmp(buff,"EPILOGUE",8)) throw runtime_error("epilogue is not found!"); unsigned sz=BoCvt(Read(m_Fd),m_SwapBytes); m_Size.resize(sz); m_Offset.resize(sz); m_Defline.resize(sz); m_Ident.resize(sz); for(unsigned i=0; i(m_Fd),m_SwapBytes); } for(unsigned i=0; i(m_Fd),m_SwapBytes); } lseek64(m_Fd,ident,SEEK_SET); for(unsigned i=0; i(m_Fd); lseek64(m_Fd,defline,SEEK_SET); for(unsigned i=0; i(m_Fd); } CMmSequence::CMmSequence(CFastaMap& fm, unsigned i): CIndexAssert(i,fm.GetCount()), CMMap(fm.m_Size[i]+1,CMMap::fProtRead,CMMap::fMapPrivate,fm.m_Fd,fm.m_Offset[i]) { if(m_Size>1) m_Size--; } CMmSequence::~CMmSequence() throw () { if(m_Size) ++m_Size; } /* * $Log: fast_seqio_read.cpp,v $ * Revision 1.6 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.5 2004/04/01 16:37:41 rotmistr * Cleaned after adding windows capabilities * * Revision 1.4 2004/04/01 05:57:53 rotmistr * Compilable with borland C++ * * Revision 1.3 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.2 2003/12/30 15:27:22 rotmistr * Fixed bug with sequence end * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ 
e-PCR-2.3.12/fast_seqio_write.cpp0000644001137700010620000001233611745334032016461 0ustar rotmistrcontig/* $Id: fast_seqio_write.cpp,v 1.5 2007/07/11 20:49:29 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); void CFastaMapPrepare::Open( const string& fname ) { if( IsOpen() ) Close(); FILE * f = fopen64( fname.c_str(), "w"FILE_BINARY ); if( f == 0 ) throw runtime_error( fname + ": " + strerror( errno ) ); setvbuf( f, 0, _IOFBF, 16192 ); m_Fptr = f; WritePrologue(); } void CFastaMapPrepare::Close() { if( m_Fptr ) { WriteEpilogue(); m_Offset.clear(); m_Size.clear(); m_Defline.clear(); m_Ident.clear(); fclose( (FILE*)m_Fptr ); m_Fptr = 0; } } void CFastaMapPrepare::AddFile( const string& fname, const char * cvtTable ) { if( IsOpen() ) { CFastaReader reader( fname ); if( cvtTable ) reader.SetCvtTable( cvtTable ); reader.ReadFile( this ); } } void CFastaMapPrepare::WritePrologue() { if( sizeof( Int2 ) != 2 ) throw logic_error( "Bad compile options: sizeof( Int2 ) != 2" ); if( sizeof( Int4 ) != 4 ) throw logic_error( "Bad compile options: sizeof( Int4 ) != 4" ); if( sizeof( Int8 ) != 8 ) throw logic_error( "Bad compile options: sizeof( Int8 ) != 8" ); if( fwrite( "FASTAMAP", 1, 8, (FILE*)m_Fptr ) != 8 ) throw runtime_error( strerror( errno ) + string(" while writing signature") ); Write( (FILE*)m_Fptr, Uint4( eLoEndian ) ); Write( (FILE*)m_Fptr, Uint2( 1 )); Write( (FILE*)m_Fptr, Uint2( 0 )); Write( (FILE*)m_Fptr, Uint4( 20 )); // header size } void CFastaMapPrepare::WriteEpilogue() { TOffset epilogue = SeekAlign( (FILE*)m_Fptr ); fwrite( "EPILOGUE", 1, 8, (FILE*)m_Fptr ); Write( (FILE*)m_Fptr, Uint4( m_Offset.size() ) ); Write( (FILE*)m_Fptr, m_Offset[0], m_Offset.size() ); Write( (FILE*)m_Fptr, m_Size[0], m_Size.size() ); TOffset ident = ftello64( (FILE*)m_Fptr ); for( unsigned i = 0; i < m_Ident.size(); ++i ) Write( (FILE*)m_Fptr, m_Ident[i] ); TOffset defline = ftello64( (FILE*)m_Fptr ); for( 
unsigned i = 0; i < m_Defline.size(); ++i ) Write( (FILE*)m_Fptr, m_Defline[i] ); TOffset directory = SeekAlign( (FILE*)m_Fptr ); Write( (FILE*)m_Fptr, epilogue ); Write( (FILE*)m_Fptr, ident ); Write( (FILE*)m_Fptr, defline ); Write( (FILE*)m_Fptr, directory ); } void CFastaMapPrepare::CbkDefline( const char * defline, unsigned length ) { m_Defline.back().assign( defline, length ); } void CFastaMapPrepare::CbkIdent( const char * ident, unsigned length ) { m_Ident.back().assign( ident, length ); } void CFastaMapPrepare::CbkSeqline( const char * seqline, unsigned length ) { Write( (FILE*)m_Fptr, *seqline, length ); m_Size.back() += length; } void CFastaMapPrepare::CbkEntryBegin() { m_Defline.push_back( string("") ); m_Ident.push_back( string("") ); m_Size.push_back( 0 ); m_Offset.push_back( SeekAlign( (FILE*)m_Fptr, 2 ) ); } void CFastaMapPrepare::CbkEntryEnd() { Write( (FILE*)m_Fptr, char(0) ); } void CFastaMapPrepare::CbkFileBegin() {} void CFastaMapPrepare::CbkFileEnd() {} /* * $Log: fast_seqio_write.cpp,v $ * Revision 1.5 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.4 2004/04/06 04:53:17 rotmistr * All is compileable with BCC5.5 and runnable on WIndows * * Revision 1.3 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.2 2003/12/30 15:27:22 rotmistr * Fixed bug with sequence end * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/hashset.cpp0000644001137700010620000001672111745334032014553 0ustar rotmistrcontig/* $Id: hashset.cpp,v 1.2 2008/06/16 16:02:40 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for 
Biotechnology Information
 *
 * This software/database is a "United States Government Work" under the
 * terms of the United States Copyright Act. It was written as part of
 * the author's official duties as a United States Government employee and
 * thus cannot be copyrighted. This software/database is freely available
 * to the public for use. The National Library of Medicine and the U.S.
 * Government have not placed any restriction on its use or reproduction.
 *
 * Although all reasonable efforts have been taken to ensure the accuracy
 * and reliability of the software and data, the NLM and the U.S.
 * Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
 * Government disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for any particular
 * purpose.
 *
 * Please cite the author in any work or product based on this material.
 *
 * =========================================================================
 *
 * Author: Kirill Rotmistrovsky
 *
 * =========================================================================
 */

// NOTE(review): the header names of the two include directives below were
// lost when this archive member was flattened to text -- restore them from a
// pristine copy of hashset.cpp before building.
#include #include

USING_NCBI_SCOPE;
USING_SCOPE(EPCR_SCOPE);

// Byte -> 2-bit nucleotide code, 8 rows of 32 entries (256 in total).
// 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3; every other byte
// maps to '*', whose value is greater than 3 and is therefore treated as
// ambiguous by s_IsAmbiq() below.
unsigned char CHashSet::sm_HashId[]=
    "********************************"
    "********************************"
    "*\0*\1***\2************\3***********"
    //=Ab=Cdef=Ghijklmnopqrs=Tuvwxyz*****//
    "*\0*\1***\2************\3***********"
    "********************************"
    "********************************"
    "********************************"
    "********************************";

// True when a code taken from sm_HashId is not one of the four plain
// nucleotide codes 0..3.
inline bool s_IsAmbiq(unsigned char a) { return a>3; }

// File-local shorthands for the CHashSet member typedefs.
typedef CHashSet::hash_type hash_type;
typedef CHashSet::size_type size_type;
typedef CHashSet::bits_type bits_type;

/*
static char * s_BitsRepr(unsigned u, int w=32)
{
    static char ret[65];
    if(w>32) w=31;
    if(w<1) w=1;
    for(unsigned m=(1<<(w-1)), i=0; m; ++i,(m>>=1)) {
        ret[i]=(m&u)?'1':'0';
    }
    ret[w]=0;
    return ret;
}
*/

/*
************************************************************************ m_mask[*] 0000111111111 m_word[0] 0000110110110 m_word[1] 0000101101101 m_word[2] 0000011011011 m_ambiq 0000001000000 str(rev) acgtACNGTCATGcgatagat ^ ptr, offset ************************************************************************ */ bool CHashSet::Begin(const char * ptr) { m_Ptr=ptr; m_Offset=0; m_AmbMask= ~0U; // all are ambiquos in the very beginning // for(size_type t=0; t3) { m_AmbMask|=1; nt=0; } // Circular 'rotation' with update of values m_Hash[0]=m_Mask[0]&((m_Hash[0]<<2)|nt); } } else { for(;m_Offset < m_WdSize && *m_Ptr; ++m_Ptr, ++m_Offset) { m_AmbMask<<=1; unsigned char nt=sm_HashId[(int)*m_Ptr]; if(nt>3) { m_AmbMask|=1; nt=0; } hash_type x=m_Hash[m_Period-1]; for(size_type t=m_Period-1; t>0; --t) { m_Hash[t]=m_Mask[t]&((m_Hash[t-1]<<2)|nt); } m_Hash[0]=x&m_Mask[0]; } } return !End(); } bool CHashSet::Good() const { for(size_type t=0; t< m_Period; ++t) if(!Good(t)) return false; return true; } CHashSet::~CHashSet() throw () { _intl_free(); } void CHashSet::_intl_free() { delete[] m_Word; delete[] m_Mask; delete[] m_Hash; delete[] m_TbSize; } CHashSet::CHashSet(TSize wdsize, TSize period) :m_Ptr(0),m_Offset(0),m_AmbMask(0) { if(period==0 || period>wdsize) period=1; #if 0 // Assure that hash ranged are equal if(wdsize%period) throw logic_error("Uneven hash masks"); #endif m_TbSize = new TSize[period]; m_Word = new TBitMask[period]; m_Mask = new THashValue[period]; m_Hash = new THashValue[period]; m_WdSize = wdsize; m_Period = period; if(period<=1) { m_TbSize[0] = 1<<(2*wdsize); m_Mask[0]=m_TbSize[0]-1; m_Word[0]=(1< #include #include #include using namespace std; int main(int argc, char ** argv) { try { int optchar; int period=0; int wdsize=7; while((optchar=getopt(argc,argv,"hVp:w:"))!=-1) { switch(optchar){ case 'V': case 'h': cout<< "usage: [-hV] [-p period] [-w wordsize] [string ...]\n"; return 0; case 'p': period=atoi(optarg); break; case 'w': wdsize=atoi(optarg); break; 
} } CHashSet hset(wdsize,period); cout << "WordSize = " << hset.GetWordSize() << endl << "TabCount = " << hset.GetWordCount() << endl << "Id\t" << setw(16) << "Word" << "\t" << setw(16) << "Mask" << "\t" << "Table size\n"; for(size_type i=0; i BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class CHashSet; class CHashSet { public: // stl-style typedefs typedef Uint4 hash_type; typedef Uint4 size_type; typedef Uint4 bits_type; // ncbi-style typedefs typedef hash_type THashValue; typedef size_type TSize; typedef bits_type TBitMask; TSize GetWordSize() const { return m_WdSize; } TSize GetWordCount() const { return m_Period; } TBitMask GetWord( TSize id ) const { return m_Word[id]; } TBitMask GetMask( TSize id ) const { return m_Mask[id]; } THashValue GetValue( TSize id ) const { return m_Hash[id]; } THashValue GetTableSize( TSize id ) const { return m_TbSize[id]; } THashValue GetAmbiquityMask() const { return m_AmbMask; } THashValue operator [] ( TSize id ) const { return GetValue( id ); } bool Begin( const char * ptr ); bool Next() { // inline if( End() ) return false; m_AmbMask <<= 1; unsigned char nt = sm_HashId[ int(*m_Ptr) ]; if( nt > 3 ) /* ambiquos */ { m_AmbMask |= 1; nt = 0; } // Circular 'rotation' with update of values if( m_Period == 1 ) { m_Hash[0] = m_Mask[0] & ( ( m_Hash[0] << 2 ) | nt ); } else { hash_type x = m_Hash[ m_Period - 1 ]; for( size_type t = m_Period - 1; t > 0; --t ) { m_Hash[t] = m_Mask[t] & ( ( m_Hash[ t - 1 ] << 2 ) | nt ); } m_Hash[0] = x & m_Mask[0]; } ++m_Offset; ++m_Ptr; return true; }; bool Good() const; bool Good( TSize id ) const { return ( m_Word[id] & m_AmbMask ) == 0; } bool End() const { return m_Ptr == 0 || *m_Ptr == 0; } TSize GetPosition() const { return m_Offset; } const char * GetPtr() const { return m_Ptr; } virtual ~CHashSet() throw (); CHashSet( TSize wdsize = 0, TSize period = 0 ); CHashSet( const CHashSet& ); CHashSet& operator = ( const CHashSet& s ); protected: void _intl_free(); void _intl_copy( const CHashSet& s ); 
protected: const char * m_Ptr; // Pointer to current position // of string being processed TSize m_Offset; // Current offset in original string TSize m_WdSize; // Wordsize TSize m_Period; // Number of words to be used TSize * m_TbSize; // Hash table sizes for each word; // used by users TBitMask m_AmbMask; // Mask showing ambiquity positions TBitMask * m_Word; // Words; needed for ambiquity test THashValue * m_Hash; // Current hash values for each word THashValue * m_Mask; // Masks for each word; // used for value update protected: static unsigned char sm_HashId[]; }; END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: hashset.hpp,v $ * Revision 1.3 2007/07/11 20:49:29 rotmistr * Made 64bit-compatible * * Revision 1.2 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/mmap.cpp0000644001137700010620000000771011745334032014044 0ustar rotmistrcontig/* $Id: mmap.cpp,v 1.7 2007/07/05 16:05:58 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
* * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include //#include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #ifndef USE_WIN CMMap::CMMap(unsigned size, unsigned prot, unsigned flags, TFileHandle fd, TOffset offset) { static unsigned page=sysconf(_SC_PAGESIZE); offset-=(m_Delta=(offset%page)); m_Data=(char*)mmap64(0,(m_Size=size)+m_Delta,prot,flags,fd,offset); if(m_Data==0 || m_Data==(char*)-1) { throw runtime_error("mmap failed: "+string(strerror(errno))); } else m_Data+=m_Delta; } CMMap::~CMMap() throw() { if(m_Data != (char *)-1) munmap(m_Data-m_Delta,m_Size+m_Delta); } #else static string errmsg() { char * x; if(!FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER| FORMAT_MESSAGE_FROM_SYSTEM| FORMAT_MESSAGE_IGNORE_INSERTS, NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (char*)&x,0,NULL)) return "UNKNOWN"; return x?x:"null"; } CMMap::CMMap(unsigned size, unsigned prot, unsigned flags, TFileHandle fd, TOffset offset) { SYSTEM_INFO si; GetSystemInfo(&si); static unsigned page=si.dwAllocationGranularity; offset-=(m_Delta=(offset%page)); m_Size=size; m_Map=CreateFileMapping((HANDLE)_get_osfhandle(fd),0, prot&fProtWrite?PAGE_READWRITE:PAGE_READONLY, 0,0,0); if(m_Map) { m_Data=(char*)MapViewOfFile( m_Map, flags&fMapPrivate?FILE_MAP_COPY:FILE_MAP_ALL_ACCESS, 
offset>>32,offset,size+m_Delta+1); } else m_Data=0; if(m_Map==0 || m_Data==0 || m_Data==(char*)-1) { string err( "mmap failed: " ); err.append( errmsg() ); throw runtime_error( err ); } else m_Data+=m_Delta; } CMMap::~CMMap() throw() { if(m_Data != (char *)-1 && m_Data != 0) { UnmapViewOfFile(m_Data); CloseHandle(m_Map); } } #endif /* * $Log: mmap.cpp,v $ * Revision 1.7 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.6 2005/01/27 19:09:13 rotmistr * Fixed mmap for win32 * * Revision 1.5 2004/04/06 04:53:18 rotmistr * All is compileable with BCC5.5 and runnable on WIndows * * Revision 1.4 2004/04/01 16:37:41 rotmistr * Cleaned after adding windows capabilities * * Revision 1.3 2004/04/01 05:57:53 rotmistr * Compilable with borland C++ * * Revision 1.2 2004/01/06 21:54:19 rotmistr * Statistics for word repetitions API added * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/mmap.hpp0000644001137700010620000000725411745334032014054 0ustar rotmistrcontig/* $Id: mmap.hpp,v 1.4 2004/04/01 16:37:41 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. 
 * Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
 * Government disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for any particular
 * purpose.
 *
 * Please cite the author in any work or product based on this material.
 *
 * =========================================================================
 *
 * Author: Kirill Rotmistrovsky
 *
 * =========================================================================
 */

#ifndef EPCR_MMAP__HPP
#define EPCR_MMAP__HPP

// NOTE(review): the header name of the include directive below was lost when
// this archive member was flattened to text -- restore it from a pristine
// copy of mmap.hpp before building.
#include

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(EPCR_SCOPE)

class CMMap;

// Memory-mapped window onto a file. The constructor maps the region and
// throws on failure; the destructor unmaps it.
// NOTE(review): the class owns the mapping but declares no copy constructor
// or assignment operator, so a copied instance would unmap the same region
// twice -- confirm that no caller copies it.
class CMMap
{
public:
    typedef off64_t TOffset;
    typedef int TFileHandle;
#ifndef USE_WIN
    // Protection and mapping flags: the native mmap constants.
    enum EProtFlags {
        fProtRead = PROT_READ,
        fProtWrite = PROT_WRITE,
        fProtExec = PROT_EXEC,
        fProtNone = PROT_NONE,
        fProtFlagsNONE = 0
    };
    enum EAccessFlags {
        fMapPrivate = MAP_PRIVATE,
        fMapShared = MAP_SHARED,
        fMapNoReserve = MAP_NORESERVE,
        fMapAnon = MAP_ANON,
        fAccessFlagsNONE = 0
    };
#else
    // Windows build: private bit values. fProtExec and fProtNone are both 0
    // here, i.e. they cannot be told apart from "no flag".
    enum EProtFlags {
        fProtRead = 0x001,
        fProtWrite = 0x002,
        fProtExec = 0x000,
        fProtNone = 0x000,
        fProtFlagsNONE = 0
    };
    enum EAccessFlags {
        fMapPrivate = 0x001,
        fMapShared = 0x002,
        fMapNoReserve = 0x004,
        fMapAnon = 0x008,
        fAccessFlagsNONE = 0
    };
#endif
    // size   - number of bytes to map
    // prot   - combination of EProtFlags
    // flags  - combination of EAccessFlags
    // fd     - descriptor of the file to map
    // offset - byte offset of the first mapped byte within the file
    explicit CMMap(unsigned size, unsigned prot, unsigned flags,
                   TFileHandle fd, off64_t offset=0);
    ~CMMap() throw ();
    // stl-style methods
    char * data() const { return m_Data; }
    unsigned size() const { return m_Size; }
    // ncbi-style methods
    char * GetData() const { return m_Data; }
    unsigned GetSize() const { return m_Size; }
    // operators
    operator char * () const { return m_Data; }
    operator unsigned () const { return m_Size; }
protected:
    char * m_Data;    // address of the first requested byte
    unsigned m_Size;  // size as passed to the constructor
    unsigned m_Delta; // bytes between the aligned start of the mapping
                      // and m_Data
#ifdef USE_WIN
    HANDLE m_Map;     // file-mapping object backing the view
#endif
};

END_SCOPE(EPCR_SCOPE)
END_NCBI_SCOPE

#endif

/*
 * $Log: mmap.hpp,v $
 * Revision 1.4 2004/04/01 16:37:41 rotmistr
 * Cleaned after adding windows capabilities
 *
 * Revision 1.3
2004/04/01 05:57:53 rotmistr * Compilable with borland C++ * * Revision 1.2 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/re-PCR_main.cpp0000644001137700010620000005075211745334032015152 0ustar rotmistrcontig/* $Id: re-PCR_main.cpp,v 1.21 2008/04/28 16:38:45 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include #include //#include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); class CMain { public: CMain(int c, char ** v): argc(c),argv(v),done(false), margin(50),gaps(0),mism(0),rev_lookup(true),presize(false), command(eNone),mode(eCmdline),optimize(false),batchcnt(1000), defLo(ePCR_DEFAULT_size_lo),defHi(ePCR_DEFAULT_size_hi), m_PrintAlignments(false),m_Quiet(false) {} int Run(); protected: int Execute(); int ParseCmdline(); int Help(FILE* = stdout); int Version(); int PrimerLookup(); int STSLookup(); protected: int argc; char ** argv; bool done; protected: string findex; int margin; int gaps, mism; bool rev_lookup; bool presize; enum ECommand { eNone, ePrimerLookup, eSTSLookup } command; enum EMode { eCmdline, eFile } mode; bool optimize; string outfile; int batchcnt; int defLo, defHi; IAlign * m_AlignL, * m_AlignR; bool m_PrintAlignments; bool m_Quiet; }; int CMain::Help(FILE* out) { fprintf(out,"usage: [-hV] -p hash-file [-g gaps] [-n mism] [-lq] " "[primer ...]\n"); fprintf(out," or: [-hV] -P hash-file [-g gaps] [-n mism] [-l] " "[-m margin] [-O+|-] [-C batchcnt] [-o outfile] [-r+|-] " "[primers-file ...]\n"); fprintf(out," or: [-hV] -s hash-file [-g gaps] [-n mism] [-lq] " "[-m margin] [-o outfile] [-r+|-] " "[left right lo[-hi] [...]]\n"); fprintf(out," or: [-hV] -S hash-file [-g gaps] [-n mism] [-lq] " "[-m margin] [-O+|-] [-C batchcnt] [-o outfile] [-r+|-] " "[stsfile ...]\n"); fprintf(out,"where:\n" "\t-p hash-file\tPerform primer lookup using hash-file\n" "\t-P hash-file\tPerform primer lookup using hash-file\n" "\t-s hash-file\tPerform STS lookup using hash-file\n" "\t-S hash-file\tPerform STS lookup using hash-file\n" "\t-n mism \tSet max allowed mismatches per primer " "for lookup\n" "\t-g gaps 
\tSet max allowed indels per primer for lookup\n" "\t-m margin \tSet variability for STS size for lookup\n" "\t-l \tUse presize alignments (only if gaps>0)\n" "\t-G \tPrint alignments in comments\n" "\t-d min-max \tSet default STS size\n" "\t-r +|- \tEnable/disable reverse STS lookup\n" "\t-O +|- \tEnable/disable syscall optimisation\n" "\t-C batchcnt \tSet number of STSes per batch\n" "\t-o outfile \tSet output file name\n" "\t-q \tQuiet (no progress indicator)\n" "Use famap and fahash to generate hash files\n"); done=true; return 0; } int CMain::ParseCmdline() { int optchar; while((optchar=getopt(argc,argv,"hVp:P:s:S:d:m:r:g:n:O:C:o:lGq"))!=-1) { switch(optchar) { case 'h': Help(); break; case 'V': Version(); break; case 'p': findex=optarg; command=ePrimerLookup; break; case 's': findex=optarg; command=eSTSLookup; break; case 'm': margin=strtol(optarg,0,10); break; case 'r': rev_lookup=*optarg=='+'?true:*optarg=='-'?false:rev_lookup; break; case 'g': gaps=strtol(optarg,0,10); break; case 'o': outfile=optarg; break; case 'n': mism=strtol(optarg,0,10); break; case 'l': presize=true; break; case 'q': m_Quiet=true; break; case 'G': m_PrintAlignments=true; break; case 'd': do { char * x=const_cast(optarg); int lo=strtol(x,&x,10); int hi=(x && *x=='-')?strtol(x+1,&x,10):lo; if(lo>0 && lo= argc) { Help(stderr); return 1; } if(!gaps) presize=false; return 0; } #ifndef USE_WIN #define CLREOL "\r\x1b[K" #else #define CLREOL "\r" #endif class CFaLookupCallbackBase:public IFaLookupCallback { protected: FILE * out; bool quiet; int old; public: CFaLookupCallbackBase(FILE * o=stdout, bool q=false):out(o),quiet(q),old(-1) {} virtual bool Fail(const std::string& msg) { fprintf(out,"#- Error: %s\n",msg.c_str()); return true; } virtual bool Warn(const std::string& msg, const ISts *) { fprintf(out,"#- Warning: %s\n",msg.c_str()); return true; } virtual bool Done(){ if((!quiet) && isatty(fileno(stderr))) { fprintf(stderr,CLREOL ""); fflush(out); } fprintf(out,"#- Done\n"); return 
true; } virtual void Progress(unsigned i, unsigned total) { if((!quiet) && isatty(fileno(stderr))) { int I=(+i*50)/total; if( I == old ) return; old = I; fprintf(stderr,CLREOL "- Progress: %3d%% %.*sO%.*s\r", int(0.5+100*(i+0.5)/total),I, "==================================================" "==================================================", 50-I-1, "--------------------------------------------------" "--------------------------------------------------"); fflush(out); } }; // virtual void Fragment(unsigned i, unsigned total) { // if(isatty(fileno(stderr))) { // fprintf(stderr,CLREOL "- Fragment %5d of %-5d %.*sO%.*s\r", // i+1,total,i, // "==================================================" // "==================================================", // total-i-1, // "--------------------------------------------------" // "--------------------------------------------------"); // fflush(out); // } // }; }; class CFaLookupPrimerCallback:public CFaLookupCallbackBase { public: CFaLookupPrimerCallback(FILE * o=stdout, bool q=false):CFaLookupCallbackBase(o,q) {} virtual bool Start() { fprintf(out,"#- sts\tseq\tstrand\tfrom\tto\tmism\tgaps\n"); return true; } virtual bool Match(const SFaMatchBlock * info, const ISts *) { return Match(info); return true; } virtual bool Match(const SFaMatchBlock * info) { fprintf(out,"%s\t%s\t%c\t%d\t%d\t%d\t%d\t%d/%d\n", info->sts_label.c_str(), info->seq_label.c_str(), char(info->strand), info->from+1,info->to, info->mism,info->gaps, info->to-info->from,0); return true; } }; class CFaLookupStsCallback:public CFaLookupCallbackBase { public: CFaLookupStsCallback(FILE * o=stdout, bool pa=false, bool q=false): CFaLookupCallbackBase(o,q),m_PrintAlignments(pa),m_Matrix(127,2) {} virtual bool Start() { fprintf(out,"#- sts\tseq\tstrand\tfrom\tto\tmism\tgaps\t" "act_len/exp_len\n"); return true; } virtual bool Match(const SFaMatchBlock * info) { return Match(info,0); } virtual bool Match(const SFaMatchBlock * info, const ISts * sts) { 
fprintf(out,"%s\t%s\t%c\t%d\t%d\t%d\t%d\t%d/%d-%d\n", info->sts_label.c_str(), info->seq_label.c_str(), char(info->strand), info->from+1,info->to, info->mism,info->gaps, info->to-info->from, sts?sts->GetSizeLo():0, sts?sts->GetSizeHi():0); if(m_PrintAlignments && info->sequence && info->seqlen>=info->to) { vector left, right; m_Matrix.Build( info->sequence+info->to-sts->GetPrimerLength(ISts::eRight), info->sequence+info->seqlen, sts->GetPrimerData(ISts::eRight), sts->GetPrimerLength(ISts::eRight)); m_Matrix.Graph( info->sequence+info->to-sts->GetPrimerLength(ISts::eRight), info->sequence+info->seqlen, sts->GetPrimerData(ISts::eRight), sts->GetPrimerLength(ISts::eRight), right); m_Matrix.Build >( info->sequence+info->from+sts->GetPrimerLength(ISts::eLeft)-1, info->sequence, sts->GetPrimerData(ISts::eLeft)+ sts->GetPrimerLength(ISts::eLeft)-1, sts->GetPrimerLength(ISts::eLeft)); m_Matrix.Graph >( info->sequence+info->from+sts->GetPrimerLength(ISts::eLeft)-1, info->sequence, sts->GetPrimerData(ISts::eLeft)+ sts->GetPrimerLength(ISts::eLeft)-1, sts->GetPrimerLength(ISts::eLeft), left); int l=max(int(info->seq_label.length()),int(sts->GetName().length())); int d=info->to-info->from- sts->GetPrimerLength(ISts::eLeft)- sts->GetPrimerLength(ISts::eRight); string stsname(sts->GetName().data(),sts->GetName().length()); fprintf(out, "# STS %*s %s...%d...%s\n" "# %.*s %s %d %s\n" "# Seq %*s %s...%d...%s\n" "#############################################" "#############################\n", l,stsname.c_str(), left[0].c_str(),d,right[0].c_str(), l," ", left[2].c_str(),d,right[2].c_str(), l,info->seq_label.c_str(), left[1].c_str(),d,right[1].c_str()); } return true; } protected: bool m_PrintAlignments; CLcsMatrix m_Matrix; }; int CMain::Execute() { if(presize) { m_AlignL = new CAlignLCS(mism,gaps); m_AlignR = new CAlignLCS(mism,gaps); } else if(gaps) { m_AlignL = new CAlignFast(mism,gaps); m_AlignR = new CAlignFast(mism,gaps); } else if(mism) { m_AlignL = new CAlignNoGaps(mism); 
m_AlignR = new CAlignNoGaps(mism); } else { m_AlignL = new CAlignExact(); m_AlignR = new CAlignExact(); } switch(command) { case ePrimerLookup: return PrimerLookup(); case eSTSLookup: return STSLookup(); } } int CMain::PrimerLookup() { CFaLookup lookup; lookup.SetAligner(m_AlignL, m_AlignR); lookup.AttachFile(findex); CFaLookupPrimerCallback cbk(stdout,m_Quiet); if(mode == eCmdline) { for(int i=optind; i rev(FlipSequence(argv[i])); lookup.Find(&cbk,buffer,'-',string(rev.get())); } } } else { for(int i=optind; i rev(FlipSequence(primer.c_str())); lookup.Find(&cbk,pname,'-',string(rev.get())); } } fclose(in); } else { fprintf(stderr,"! Could not open file %s -- ignored\n", argv[i]); } } } return 0; } int CMain::STSLookup() { CFaLookup lookup; lookup.SetAligner(m_AlignL, m_AlignR); lookup.AttachFile(findex); FILE * out=outfile.length()?fopen64(outfile.c_str(),"w"):stdout; CFaLookupStsCallback cbk(out,m_PrintAlignments,m_Quiet); if(mode==eCmdline) { list stslist; for(int i=optind; i<=argc-3; i+=3) { char buffer[1024]; snprintf(buffer,sizeof(buffer),"STS-%d",(i-optind+1)); char * x=0; int lo=strtol(argv[i+2],&x,10); int hi=(x&&(*x=='-'))?max(atoi(x+1),lo):lo; if(lo==hi && lo==0) { lo=defLo; hi=defHi; } char* rev2(FlipSequence(argv[i+1],strlen(argv[i+1]))); stslist.push_back(new CSimpleSTS(argv[i],rev2,lo,hi, ISts::ePlus,buffer)); delete[] rev2; if(rev_lookup) { char* rev1(FlipSequence(argv[i+0],strlen(argv[i+0]))); stslist.push_back(new CSimpleSTS(argv[i+1],rev1,lo,hi, ISts::eMinus,buffer)); } } lookup.Find(&cbk,stslist,optimize,margin); } else { unsigned tcount=0; for(int i=optind; i stslist; // fprintf(stderr,"* File %s\n",argv[i]); unsigned fcount=0; while(fgets(buffer,sizeof(buffer),f)) { if(feof(f)) break; if(*buffer=='#') continue; CStrRef fld[5]; int cnt=CMmFileSts::Parse(buffer,fld); if(cnt<3) throw runtime_error("format error in file "+ string(argv[i])); int lo=defLo, hi=defHi; if(cnt>3) CMmFileSts::ParseRange(fld[3],lo,hi); char* 
rev2(FlipSequence(fld[2].data(), fld[2].length())); string name(fld[0]); string lprimer(fld[1]); string rprimer(rev2); delete[] rev2; stslist.push_back(new CSimpleSTS(lprimer,rprimer, lo,hi,ISts::ePlus,name)); if(rev_lookup) { char* rev1(FlipSequence(fld[1].data(), fld[1].length())); lprimer.assign(fld[2]); rprimer.assign(rev1); delete[] rev1; stslist.push_back(new CSimpleSTS(lprimer,rprimer, lo,hi,ISts::eMinus,name)); } if(stslist.size()>=batchcnt) { fprintf(stderr,"= File %s, STSes %u-%u\n", argv[i],fcount+1,fcount+stslist.size()); lookup.Find(&cbk,stslist,optimize,margin); for(list::const_iterator i=stslist.begin(); i!=stslist.end(); ++i) { delete *i; } fcount+=stslist.size(); stslist.clear(); } } fclose(f); fprintf(stderr,"= File %s, STSes %u-%u\n", argv[i],fcount+1,fcount+stslist.size()); lookup.Find(&cbk,stslist,optimize,margin); for(list::const_iterator i=stslist.begin(); i!=stslist.end(); ++i) { delete *i; } stslist.clear(); } } return 0; } int CMain::Version() { done=true; puts("Reverse e-PCR search cmdline tool version " VERSION); return 0; } int CMain::Run() { if(int rc=ParseCmdline() ) return rc; if(done) return 0; return Execute(); } int main(int argc, char ** argv) { try { CMain app(argc,argv); return app.Run(); } catch(logic_error& e) { fprintf(stderr,"! Fatal: Internal error %s\n",e.what()); } catch(exception& e) { fprintf(stderr,"! Fatal: %s\n",e.what()); } catch(...) { fprintf(stderr,"! 
Fatal: Unknown error\n"); } return 100; } /* * $Log: re-PCR_main.cpp,v $ * Revision 1.21 2008/04/28 16:38:45 rotmistr * Applied patch to build with gcc-4.3 * * Revision 1.20 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.19 2005/04/11 18:17:52 rotmistr * Fixed range-parsing code for commandline * * Revision 1.18 2005/02/11 20:42:54 rotmistr * Fixed "margin" bug, added primer search from file * * Revision 1.17 2004/09/03 21:28:50 rotmistr * Fixes to compile with Borland C++ 5.5 * * Revision 1.16 2004/06/07 16:24:57 rotmistr * Bug fixes to previos version. * * Revision 1.15 2004/06/03 23:37:21 rotmistr * New aligner added. * * Revision 1.14 2004/05/27 20:35:47 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. * * Revision 1.13 2004/04/06 04:53:18 rotmistr * All is compileable with BCC5.5 and runnable on WIndows * * Revision 1.12 2004/03/30 21:06:53 rotmistr * Fixes for setting default STS size range. * * Revision 1.11 2004/03/29 21:25:40 rotmistr * Dist files are prepared * * Revision 1.10 2004/02/05 23:41:21 rotmistr * Better reload, fixed margin report in commandline, unists tab in CGI form. * * Revision 1.9 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.8 2004/01/28 23:27:02 rotmistr * "Best of overlapping" hit selection postprocessor added. * * Revision 1.7 2004/01/08 23:22:41 rotmistr * Fixed init error in faread, * Adjusted output to standard, * Added output format style and output file to parameters. * * Revision 1.6 2004/01/07 16:57:42 rotmistr * Fragment size is now configurable. * * Revision 1.5 2004/01/06 21:54:19 rotmistr * Statistics for word repetitions API added * * Revision 1.4 2003/12/30 21:36:32 rotmistr * Syscall optimisation mode added. 
* * Revision 1.3 2003/12/30 15:27:22 rotmistr * Fixed bug with sequence end * * Revision 1.2 2003/12/23 21:30:50 rotmistr * - gaps/mismatches reporting * - lo/hi fixup * - reverse sts in re-PCR_main * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/align.cpp0000644001137700010620000002174311745334032014206 0ustar rotmistrcontig/* $Id: align.cpp,v 1.5 2004/10/26 17:16:32 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ //#include //#include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); static char s_Compl[]= "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NTBGHNNCDNNMNKNNNNYNAABWXRNNNNNN" //ABCDEFGHIJKLMNOPQRSTUVWXYZNNNNN "NTBGHNNCDNNMNKNNNNYNAABWXRNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" ; static char s_Lc2Uc[]= "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NABCDNNGHNNKNMNNNNRNTTVWXYNNNNNN" //ABCDEFGHIJKLMNOPQRSTUVWXYZNNNNN "NABCDNNGHNNKNMNNNNRNTTVWXYNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" ; BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) inline char UCase(char c) { return s_Lc2Uc[int((unsigned char)c)]; } char * UCaseSequence(const char * seq, unsigned len) { if(len == ~0U) len=strlen(seq); char * ret=new char[len+1]; ret[len]=0; for(char * x=ret; len!=0; --len) *x++=s_Lc2Uc[int(*seq++)]; return ret; } char * FlipSequence(const char * seq, unsigned len) { if(len == ~0U) len=strlen(seq); char * ret=new char[len+1]; ret[len]=0; for(char * x=ret+len; len!=0; --len) *--x=s_Compl[int(*seq++)]; return ret; } END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE bool CAlignExact::Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length) { if(strncmp(seq_ptr,primer,length)==0) { m_Identities=length; return true; } else { m_Identities=0; return false; } } bool CAlignExact::Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length) { seq_ptr-=length; if(seq_ptr>=seq_start && strncmp(seq_ptr,primer,length)==0) { m_Identities=length; return true; } else { m_Identities=0; return false; } } 
bool CAlignNoGaps::Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length) { return x_Compare(seq_ptr,primer,length); } bool CAlignNoGaps::Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length) { seq_ptr-=length; if(seq_ptr>=seq_start) { return x_Compare(seq_ptr,primer,length); } m_Identities=0; return false; } bool CAlignNoGaps::x_Compare(const char * a, const char * b, int l) { m_Identities=0; if(l<=0) return false; int mism=m_Mism=m_MaxMismatch; for(;l; --l) { if(*a++ != *b++) { if(a[-1]==0 || --mism<0) return false; } } m_Mism-=mism; m_Identities=l-m_Mism; return l==0; } bool CAlignFast::Forward(const char * a, const char * A, const char * b, int l) { const char * B=b+l; int gaps=m_Gaps=m_MaxGaps; int mism=m_Mism=m_MaxMismatch; m_Identities=0; for(;b=2 ) { mism-=2; a+=2; b+=2; continue; } } if(--gaps<0) return false; ++a; } else if(gaps && a[0] == b[1]) { if(--gaps<0) return false; ++b; } else { if(--mism<0) return false; } } for(;*b && b=A && b>=B; --a, --b) { if(*a == *b) { m_Identities++; continue; } if(a>A && b>B && a[-1] == b[-1]) { if(--mism<0) return false; --a,--b; } else if(gaps && a>A && a[-1] == b[0]) { // mismatches are preferred to gaps if(b>B && a[0] == b[-1]) { if(mism >=2 ) { mism-=2; a-=2; b-=2; continue; } } if(--gaps<0) return false; --a; } else if(gaps && b>B && a[0] == b[-1]) { if(--gaps<0) return false; --b; } else { if(--mism<0) return false; } } for(;b>=B; --b) if(--gaps<0) return false; m_Gaps-=gaps; m_Mism-=mism; return true; } bool CAlignLCS::Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length) { m_Matrix.Build(seq_ptr,seq_end,primer,length); m_Matrix.Stat(seq_ptr,seq_end,primer,length); return m_Matrix.GetMismatches()<=m_MaxMismatch && m_Matrix.GetGaps()<=m_MaxGaps; } bool CAlignLCS::Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length) { m_Matrix.Build >( seq_ptr-1,seq_start-1,primer+length-1,length); 
m_Matrix.Stat >( seq_ptr-1,seq_start-1,primer+length-1,length); return m_Matrix.GetMismatches()<=m_MaxMismatch && m_Matrix.GetGaps()<=m_MaxGaps; } CAlignLCS::CAlignLCS(int mm, int gg): m_MaxMismatch(mm),m_MaxGaps(gg),m_Matrix(256,gg) {} CAlignCompromise::CAlignCompromise(int mm, int gg): CAlignLCS(mm,gg), CAlignNoGaps(mm) {} bool CAlignCompromise::Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length) { if(CAlignNoGaps::Forward(seq_ptr,seq_end,primer,length)) { m_Gaps=CAlignNoGaps::GetGaps(); m_Mismatches=CAlignNoGaps::GetMismatches(); m_Identities=CAlignNoGaps::GetIdentities(); return true; } if(CAlignLCS::Forward(seq_ptr,seq_end,primer,length)) { m_Gaps=CAlignLCS::GetGaps(); m_Mismatches=CAlignLCS::GetMismatches(); m_Identities=CAlignLCS::GetIdentities(); return true; } return false; } bool CAlignCompromise::Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length) { if(CAlignNoGaps::Reverse(seq_start,seq_ptr,primer,length)) { m_Gaps=CAlignNoGaps::GetGaps(); m_Mismatches=CAlignNoGaps::GetMismatches(); m_Identities=CAlignNoGaps::GetIdentities(); return true; } if(CAlignLCS::Reverse(seq_start,seq_ptr,primer,length)) { m_Gaps=CAlignLCS::GetGaps(); m_Mismatches=CAlignLCS::GetMismatches(); m_Identities=CAlignLCS::GetIdentities(); return true; } m_Identities=0; return false; } /* * $Log: align.cpp,v $ * Revision 1.5 2004/10/26 17:16:32 rotmistr * Added 5'-end masking for primers * * Revision 1.4 2004/06/08 20:32:50 rotmistr * Fixup for gap+insert special case * * Revision 1.3 2004/06/08 16:14:55 rotmistr * *** empty log message *** * * Revision 1.2 2004/06/07 16:24:56 rotmistr * Bug fixes to previos version. * * Revision 1.1 2004/06/03 23:37:19 rotmistr * New aligner added. 
* * */ e-PCR-2.3.12/align.hpp0000644001137700010620000001371711745334032014215 0ustar rotmistrcontig/* $Id: align.hpp,v 1.5 2008/03/26 16:04:29 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_ALIGN__HPP #define EPCR_ALIGN__HPP #include #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class IAlign; class CAlignExact; class CAlignNoGaps; class CAlignFast; class CAlignLCS; class CAlignCompromise; char * FlipSequence(const char * seq, unsigned len= ~0U); char * UCaseSequence(const char * seq, unsigned len= ~0U); class IAlign { public: virtual ~IAlign() throw () {}; virtual bool Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length) = 0; virtual bool Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length) = 0; virtual int GetIdentities() const { return 0; } virtual int GetMismatches() const { return 0; } virtual int GetGaps() const { return 0; } virtual int GetFloppy() const { return 0; } }; class CAlignExact:public virtual IAlign { public: CAlignExact():m_Identities(0) {} virtual ~CAlignExact() throw () {}; bool Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length); bool Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length); int GetIdentities() const { return m_Identities; } protected: int m_Identities; }; class CAlignNoGaps:public CAlignExact { public: virtual ~CAlignNoGaps() throw () {}; bool Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length); bool Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length); CAlignNoGaps(int mm=1):m_MaxMismatch(mm),m_Mism(0) {} int GetMismatches() const { return m_Mism; } protected: bool x_Compare(const char * a, const char * b, int l); protected: int m_MaxMismatch; int m_Mism; }; class CAlignFast:public CAlignNoGaps { public: virtual ~CAlignFast() throw () {}; bool Forward(const char * seq_ptr, const char * seq_end, const char * primer, 
int length); bool Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length); CAlignFast(int mm=1, int gg=1):CAlignNoGaps(mm),m_MaxGaps(gg),m_Gaps(0) {} int GetGaps() const { return m_Gaps; } protected: int m_MaxGaps; int m_Gaps; }; class CAlignLCS:public virtual IAlign { public: virtual ~CAlignLCS() throw () {}; bool Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length); bool Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length); CAlignLCS(int mm=1, int gg=1); int GetIdentities() const { return m_Matrix.GetMatches(); } int GetMismatches() const { return m_Matrix.GetMismatches(); } int GetGaps() const { return m_Matrix.GetGaps(); } protected: int m_MaxMismatch; int m_MaxGaps; CLcsMatrix m_Matrix; }; class CAlignCompromise:public virtual CAlignLCS, public virtual CAlignNoGaps { public: virtual ~CAlignCompromise() throw () {}; bool Forward(const char * seq_ptr, const char * seq_end, const char * primer, int length); bool Reverse(const char * seq_start, const char * seq_ptr, const char * primer, int length); CAlignCompromise(int mm=1, int gg=1); int GetIdentities() const { return m_Identities; } int GetMismatches() const { return m_Mismatches; } int GetGaps() const { return m_Gaps; } protected: int m_Identities, m_Mismatches, m_Gaps; }; //class CAlignFloppy : public virtual CAlignLCS END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: align.hpp,v $ * Revision 1.5 2008/03/26 16:04:29 rotmistr * Added support for blastdb files * * Revision 1.4 2004/10/26 17:16:33 rotmistr * Added 5'-end masking for primers * * Revision 1.3 2004/06/08 20:32:50 rotmistr * Fixup for gap+insert special case * * Revision 1.2 2004/06/08 16:14:55 rotmistr * *** empty log message *** * * Revision 1.1 2004/06/03 23:37:19 rotmistr * New aligner added. 
* */ e-PCR-2.3.12/minilcs.cpp0000644001137700010620000002203011745334032014540 0ustar rotmistrcontig/* $Id: minilcs.cpp,v 1.5 2004/09/03 19:06:41 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include "minilcs.hpp" #include using namespace std; using namespace ncbi; using namespace EPCR_SCOPE; int main(int argc, char ** argv) { try { bool done=false; bool reverse=false; int optopt; int maxgaps=2; while((optopt=getopt(argc,argv,"hVg:r"))!=-1) { switch(optopt) { case 'h': cout << "Usage: [-hV] [-r] [-g gaps] sequence primer\n"; done=true; break; case 'V': cout << "Version " __DATE__ "\n"; done=true; break; case 'g': maxgaps=atoi(optarg); break; case 'r': reverse=true; break; } } if(done) return 0; if(argc-optind<2) throw runtime_error("Need primer and sequence"); CLcsMatrix matrix(256,maxgaps); vector rc; double pcti; string sequence=argv[optind]; string primer=argv[optind+1]; if(!reverse) { pcti=matrix.Build( sequence.c_str(), sequence.c_str()+sequence.length(), primer.c_str(),primer.length()); matrix.Graph( sequence.c_str(), sequence.c_str()+sequence.length(), primer.c_str(),primer.length(),rc); matrix.Stat( sequence.c_str(), sequence.c_str()+sequence.length(), primer.c_str(),primer.length()); } else { pcti=matrix.Build >( sequence.c_str()+sequence.length()-1, sequence.c_str()-1, primer.c_str()+primer.length()-1, primer.length()); matrix.Graph >( sequence.c_str()+sequence.length()-1, sequence.c_str()-1, primer.c_str()+primer.length()-1, primer.length(),rc); matrix.Stat >( sequence.c_str()+sequence.length()-1, sequence.c_str()-1, primer.c_str()+primer.length()-1, primer.length()); } cout << "PCTI: " << pcti << "\n"; cout << "BestX: " << matrix.GetBestX() << endl << "BestY: " << matrix.GetBestY() << endl << "BestVal:" << matrix.GetBestVal() << endl; cout << "Mism: " << matrix.GetMismatches() << endl << "Gaps: " << matrix.GetGaps() << endl << "Match: " << matrix.GetMatches() << endl; cout << "primer: " << rc[0] << endl << " " << rc[2] << 
endl << "seqnce: " << rc[1] << endl; cout << " "; for(int j=0; j class CReverseConstSeqIterator { public: typedef CReverseConstSeqIterator TClass; typedef CReverseConstSeqIterator class_type; typedef T data_type; typedef T TDataType; CReverseConstSeqIterator(const T* ptr=0):m_Ptr(ptr) {} CReverseConstSeqIterator(const TClass& i):m_Ptr(i.m_Ptr) {} T* Get() const { return m_Ptr; } TClass& Set(const T* ptr=0) { m_Ptr=ptr; return *this; } T operator * () const { return *m_Ptr; } T operator [] (int i) const { return m_Ptr[-i]; } TClass& operator ++ () { --m_Ptr; return *this; } TClass& operator -- () { ++m_Ptr; return *this; } TClass operator ++ (int) { TClass i(this); ++*this; return i; } TClass operator -- (int) { TClass i(this); --*this; return i; } TClass& operator += (int i) { m_Ptr-=i; return *this; } TClass& operator -= (int i) { m_Ptr+=i; return *this; } TClass& operator = (const TClass& i) { m_Ptr=i.m_Ptr; return *this; } bool operator == (const TClass& i) const { return m_Ptr==i.m_Ptr; } bool operator != (const TClass& i) const { return m_Ptr!=i.m_Ptr; } bool operator >= (const TClass& i) const { return m_Ptr<=i.m_Ptr; } bool operator <= (const TClass& i) const { return m_Ptr>=i.m_Ptr; } bool operator < (const TClass& i) const { return m_Ptr> i.m_Ptr; } bool operator > (const TClass& i) const { return m_Ptr< i.m_Ptr; } friend TClass operator + (const TClass& c, int i) { return TClass(c.m_Ptr-i); } friend TClass operator - (const TClass& c, int i) { return TClass(c.m_Ptr+i); } protected: TDataType * m_Ptr; }; template class CLcsMatrix { public: typedef T data_type; typedef data_type TDataType; CLcsMatrix(int length, int maxgaps); ~CLcsMatrix() throw (); typedef const char * TForwardStr; typedef const char * TReverseStr; template int Build(CSeqIterator genome, CSeqIterator gend, CSeqIterator primer, int length); protected: int m_Size; int m_MaxGaps; TDataType ** m_Data; }; template template inline int CLcsMatrix::Build(CSeqIterator genome, CSeqIterator gend, 
CSeqIterator primer, int len) { double best_pcti=0; for(int x=1; x <= len; ++x, ++genome, ++primer) { cout << *primer << ":\t"; for(int y=-min(x-1,m_MaxGaps); y<=m_MaxGaps&&genome+ylen-m_MaxGaps) { double pcti=double(m_Data[y][x])/max(x,x+y); if(pcti>best_pcti) best_pcti=pcti; } cout.width(5); cout << (int)m_Data[y][x] << ", "; } cout << "\n"; } cout << "pcti: " << 100*best_pcti << endl; return 0; } template inline CLcsMatrix::CLcsMatrix(int length, int maxgaps): m_Size(length), m_MaxGaps(maxgaps), m_Data((new (TDataType*)[(1+m_MaxGaps)*2+1])+m_MaxGaps+1) { for(int i=-m_MaxGaps-1; i<=m_MaxGaps+1; ++i) { m_Data[i]=new TDataType[m_Size+m_MaxGaps+1]; memset(m_Data[i],0,(m_Size+m_MaxGaps+1)*sizeof(TDataType)); } } template inline CLcsMatrix::~CLcsMatrix() throw () { for(int i=-m_MaxGaps-1; i<=m_MaxGaps+1; ++i) { delete[] m_Data[i]; } m_Data-=m_MaxGaps+1; delete[] m_Data; } #endif /* * $Log: minilcs.cpp,v $ * Revision 1.5 2004/09/03 19:06:41 rotmistr * Code formatting changes * *. e-PCR-2.3.12/minilcs.hpp0000644001137700010620000002537511745334032014564 0ustar rotmistrcontig/* $Id: minilcs.hpp,v 1.9 2007/07/05 16:23:08 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. 
The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_MINILCS__HPP #define EPCR_MINILCS__HPP #include #include #include #include #include #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) template class CReverseConstSeqIterator { public: typedef CReverseConstSeqIterator TClass; typedef CReverseConstSeqIterator class_type; typedef T data_type; typedef T TDataType; protected: TDataType * m_Ptr; public: CReverseConstSeqIterator(const T* ptr=0):m_Ptr(ptr) {} CReverseConstSeqIterator(const TClass& i):m_Ptr(i.m_Ptr) {} T* Get() const { return m_Ptr; } TClass& Set(const T* ptr=0) { m_Ptr=ptr; return *this; } const T& operator * () const { return *m_Ptr; } const T& operator [] (int i) const { return m_Ptr[-i]; } TClass& operator ++ () { --m_Ptr; return *this; } TClass& operator -- () { ++m_Ptr; return *this; } TClass operator ++ (int) { TClass i(this); ++*this; return i; } TClass operator -- (int) { TClass i(this); --*this; return i; } TClass& operator += (int i) { m_Ptr-=i; return *this; } TClass& operator -= (int i) { m_Ptr+=i; return *this; } TClass& operator = (const TClass& i) { m_Ptr=i.m_Ptr; return *this; } bool operator == (const TClass& i) const { return m_Ptr==i.m_Ptr; } bool operator != (const TClass& i) const { return m_Ptr!=i.m_Ptr; } bool operator >= (const TClass& i) const { return m_Ptr<=i.m_Ptr; } bool operator <= (const TClass& i) const { return m_Ptr>=i.m_Ptr; } bool operator < (const TClass& i) const { return m_Ptr> i.m_Ptr; } bool operator > (const TClass& i) const { return m_Ptr< i.m_Ptr; } friend TClass operator + (const TClass& c, int i) { return 
TClass(c.m_Ptr-i); } friend TClass operator - (const TClass& c, int i) { return TClass(c.m_Ptr+i); } }; template class CLcsMatrix { public: typedef T data_type; typedef data_type TDataType; typedef TDataType * TDataTypePtr; CLcsMatrix(int length, int maxgaps); ~CLcsMatrix() throw (); typedef const char * TForwardStr; typedef CReverseConstSeqIterator TReverseStr; template double Build(CSeqIterator genome, CSeqIterator gend, CSeqIterator primer, int length); template void Graph(CSeqIterator genome, CSeqIterator gend, CSeqIterator primer, int length, vector& dest, int extra=2, int intra=0); template void Stat(CSeqIterator genome, CSeqIterator gend, CSeqIterator primer, int length); int GetMatches() const { return m_Matches; } int GetMismatches() const { return m_Mismatches; } int GetSeqInsertions() const { return m_Insertions; } int GetSeqDeletions() const { return m_Deletions; } int GetGaps() const { return m_Insertions+m_Deletions; } int GetResultLength() const { return m_ResultLength; } int GetIdentities() const { return m_Identities; } int GetBestX() const { return m_BestX; } int GetBestY() const { return m_BestY; } int GetBestVal() const { return m_BestVal; } int Get(int i, int j) { int y=j-i; return abs(y)>m_MaxGaps?0:m_Data[y][i]; } char Sym(int i, int j) { int y=j-i; char c=abs(y)>m_MaxGaps?0:m_Path[y][i]; return c?c:'.'; } protected: int m_Size; int m_MaxGaps; TDataType ** m_Data; TDataType ** m_Path; int m_BestX, m_BestY, m_BestVal; int m_PrimerLength; int m_Matches, m_Mismatches, m_Insertions, m_Deletions; int m_Identities, m_ResultLength; }; template template inline double CLcsMatrix::Build(CSeqIterator genome, CSeqIterator gend, CSeqIterator primer, int len) { m_PrimerLength=len; m_ResultLength=m_Identities=m_BestY=m_BestX=0; CSeqIterator gg=genome; for(int x=1; x <= len; ++x, ++gg, ++primer) { int start=-min(x-1,m_MaxGaps); CSeqIterator g=gg+start; for(int y=start; y<=m_MaxGaps && gcy) { m_Data[y][x]=cx; m_Path[y][x]='i'; } else if(cy>cx) { 
m_Data[y][x]=cy; m_Path[y][x]='d'; } else { // char cz=m_Data[y][x-1]; m_Data[y][x]=cx;//max(cx,cz); m_Path[y][x]='!'; } } } } int z = m_BestX = len; CSeqIterator g = genome + len - m_MaxGaps; m_BestVal = 0; for(int y = -m_MaxGaps; y <= m_MaxGaps && g < gend; ++y, ++g) { switch( m_Path[y][z] ) { case 'i': case 'd': continue; } char val = m_Data[y][z]; if( val > m_BestVal ) { // || (val==m_BestVal && y<=0)) { m_BestVal = val; m_BestY = y; } } return m_BestVal / max( len, m_BestY + m_BestX - 1); } template template inline void CLcsMatrix::Stat(CSeqIterator genome, CSeqIterator gend, CSeqIterator primer, int length) { int x=m_BestX; int y=m_BestY; m_Mismatches=length>x?length-x:0; m_Matches=m_Insertions=m_Deletions=0; while(x>0) { switch(m_Path[y][x]) { default: case '=': m_Matches++; x--; break; case '!': mismatch: m_Mismatches++; x--; break; case 'd': if(m_Path[y-1][x]=='i') goto mismatch; m_Insertions++; y--; break; case 'i': if(m_Path[y+1][x-1]=='d') goto mismatch; m_Deletions++; y++; x--; break; } } } template template inline void CLcsMatrix::Graph(CSeqIterator genome, CSeqIterator gend, CSeqIterator primer, int length, vector& dest, int extra, int intra) { dest.clear(); dest.resize(3); int x=m_BestX; int y=m_BestY; m_Mismatches=length>x?length-x:0; m_Matches=m_Insertions=m_Deletions=0; for(int pos=y+x-1+extra; extra>0; --pos, --extra) { dest[0].push_back(' '); dest[1].push_back(genome+pos>=gend?' 
':tolower(genome[pos])); dest[2].push_back(' '); } while(x>0) { switch(m_Path[y][x]) { default: case '=': x--; dest[0].push_back(primer[x]); dest[1].push_back(genome[x+y]); dest[2].push_back('|'); break; case '!': mismatch: x--; dest[0].push_back(primer[x]); dest[1].push_back(genome[x+y]); dest[2].push_back(' '); break; case 'd': if(m_Path[y-1][x]=='i') goto mismatch; y--; dest[0].push_back('-'); dest[1].push_back(genome[x+y]); dest[2].push_back(' '); break; case 'i': if(m_Path[y+1][x-1]=='d') goto mismatch; y++; x--; dest[0].push_back(primer[x]); dest[1].push_back('-'); dest[2].push_back(' '); break; } } for(int pos=1; pos<=intra; ++pos) { dest[0].push_back(' '); dest[1].push_back(tolower(genome[-pos])); dest[2].push_back(' '); } if(&genome[0]<&genome[1]) { reverse(dest[0].begin(),dest[0].end()); reverse(dest[1].begin(),dest[1].end()); reverse(dest[2].begin(),dest[2].end()); } } template inline CLcsMatrix::CLcsMatrix(int length, int maxgaps): m_Size(length), m_MaxGaps(maxgaps), m_Data((new TDataTypePtr[(1+m_MaxGaps)*2+1])+m_MaxGaps+1), m_Path((new TDataTypePtr[(1+m_MaxGaps)*2+1])+m_MaxGaps+1) { for(int i=-m_MaxGaps-1; i<=m_MaxGaps+1; ++i) { m_Data[i]=new TDataType[m_Size+m_MaxGaps+1]; memset(m_Data[i],0,(m_Size+m_MaxGaps+1)*sizeof(TDataType)); m_Path[i]=new TDataType[m_Size+m_MaxGaps+1]; memset(m_Path[i],0,(m_Size+m_MaxGaps+1)*sizeof(TDataType)); } } template inline CLcsMatrix::~CLcsMatrix() throw () { for(int i=-m_MaxGaps-1; i<=m_MaxGaps+1; ++i) { delete[] m_Data[i]; } m_Data-=m_MaxGaps+1; delete[] m_Data; } END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: minilcs.hpp,v $ * Revision 1.9 2007/07/05 16:23:08 rotmistr * Forgot two changes * * Revision 1.8 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.7 2004/09/03 19:06:41 rotmistr * Code formatting changes * * Revision 1.6 2004/07/22 20:40:08 rotmistr * Fixed to work with gcc-3.4.0 * * Revision 1.5 2004/06/08 20:32:51 rotmistr * Fixup for gap+insert special case * * 
Revision 1.4 2004/06/08 16:14:55 rotmistr * *** empty log message *** * * Revision 1.3 2004/06/07 16:24:56 rotmistr * Bug fixes to previos version. * * Revision 1.2 2004/06/03 23:37:20 rotmistr * New aligner added. * * Revision 1.1 2004/06/02 21:37:54 rotmistr * Added minilcs function * */ e-PCR-2.3.12/seqcmp_main.cpp0000644001137700010620000001552711745334032015413 0ustar rotmistrcontig/* $Id: seqcmp_main.cpp,v 1.2 2004/06/03 23:37:21 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #ifndef VERSION #define VERSION "(devel:" __DATE__ ")" #endif class CMain { public: CMain(int c, char ** v): argc(c),argv(v),done(false),mism(0),gaps(0),fwd(true), aligner(eAlignerLcs) { rev[0]=rev[1]=false; } int Run(); enum EAligner { eAlignerExact, eAlignerNoGaps, eAlignerFast, eAlignerLcs } aligner; protected: int Execute(); int ParseCmdline(); int Help(FILE* = stdout); int Version(); protected: int argc; char ** argv; bool done; protected: int mism, gaps; bool fwd, rev[2]; }; int CMain::Help(FILE* out) { fprintf(out,"usage: [-hV] [-n mism] [-g gaps] [-s +|-] [-r 1|2] " "[-a aligner] " "{sequence} {primer}\n"); fprintf(out,"where:\n" "\t-s +|- - side (begin/end)\n" "\t-r 1|2 - flip sequence 1|2\n" "\t-a aligner - aligner to use\n" "aligeners available are:\n" "\texact - exact match\n" "\tnogaps - allow n mismatches\n" "\tfast - allow n mismatches and g gaps,\n" "\t some patterns are incorrecly recognised\n" "\tlcs - use lcs aligner\n"); done=true; } struct { const char * label; CMain::EAligner aligner; } aligners[]={ {"exact",CMain::eAlignerExact}, {"ex",CMain::eAlignerExact}, {"e",CMain::eAlignerExact}, {"nogaps",CMain::eAlignerNoGaps}, {"ngaps",CMain::eAlignerNoGaps}, {"ng",CMain::eAlignerNoGaps}, {"n",CMain::eAlignerNoGaps}, {"fast",CMain::eAlignerFast}, {"f",CMain::eAlignerFast}, {"lcs",CMain::eAlignerLcs}, {"l",CMain::eAlignerLcs}, {0,CMain::eAlignerLcs} }; int CMain::ParseCmdline() { int optchar; while((optchar=getopt(argc,argv,"hVn:g:s:r:a:"))!=-1) { switch(optchar) { case 'h': Help(); break; case 'V': Version(); break; case 'n': mism=atoi(optarg); break; case 'g': gaps=atoi(optarg); break; case 's': fwd=*optarg=='+'?true:*optarg=='-'?false:fwd; break; case 'r': 
optarg[0]=='1'?rev[0]=true:optarg[0]=='2'?rev[1]=true:false; break; case 'a': for(int i=0; aligners[i].label; ++i) { if(strcmp(optarg,aligners[i].label)==0) { aligner=aligners[i].aligner; goto next; } } throw runtime_error("Unknown aligner: "+string(optarg)); } next:; } if(done) return 0; if(optind >= argc+2) { Help(stderr); return 1; } return 0; } int CMain::Execute() { IAlign * align; switch(aligner) { case eAlignerExact: align=new CAlignExact(); break; case eAlignerNoGaps: align=new CAlignNoGaps(mism); break; case eAlignerFast: align=new CAlignFast(mism,gaps); break; case eAlignerLcs: align=new CAlignLCS(mism,gaps); break; } char * seq=argv[optind]; if(rev[0]) seq=FlipSequence(seq); char * end=seq+strlen(seq); char * primer=argv[optind+1]; if(rev[1]) primer=FlipSequence(primer); int len=strlen(primer); bool rc=fwd? align->Forward(seq,end,primer,len): align->Reverse(seq,end,primer,len); printf("seq: %s\npri: %s\n%s [%d mism, %d gaps]\n", seq,primer,rc?"OK":"fail", align->GetMismatches(),align->GetGaps()); if(gaps) { CLcsMatrix matrix(256,gaps); vector graph; if(fwd) { matrix.Build(seq,end,primer,len); matrix.Graph(seq,end,primer,len,graph); matrix.Stat(seq,end,primer,len); } else { matrix.Build >( end-1,seq-1,primer+len-1,len); matrix.Graph >( end-1,seq-1,primer+len-1,len,graph); matrix.Stat >( end-1,seq-1,primer+len-1,len); } printf("Primer: %s\n",graph[0].c_str()); printf(" %s\n",graph[2].c_str()); printf("Seqnce: %s\n",graph[1].c_str()); } delete align; if(rev[0]) delete[] seq; if(rev[1]) delete[] primer; } int CMain::Version() { done=true; puts("Fasta converter for e-PCR version " VERSION); } int CMain::Run() { if(int rc=ParseCmdline() ) return rc; if(done) return 0; return Execute(); } int main(int argc, char ** argv) { try { CMain app(argc,argv); return app.Run(); } catch(logic_error& e) { fprintf(stderr,"! Fatal: Internal error %s\n",e.what()); } catch(exception& e) { fprintf(stderr,"! Fatal: %s\n",e.what()); } catch(...) { fprintf(stderr,"! 
Fatal: Unknown error\n"); } return 100; } /* * $Log: seqcmp_main.cpp,v $ * Revision 1.2 2004/06/03 23:37:21 rotmistr * New aligner added. * * Revision 1.1 2004/02/12 21:38:20 rotmistr * Fixed typo in seqcmp * Optimized and fixed lookup * Better look for reverse.cgi * * Revision 1.2 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/strref.hpp0000644001137700010620000000510411745334032014417 0ustar rotmistrcontig/* $Id: strref.hpp,v 1.3 2008/04/28 16:38:45 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_STRREF__HPP #define EPCR_STRREF__HPP #include #include #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class CStrRef; // This class has string interface, but does NOT create copy of data // Not safe, but VERY useful class CStrRef { public: CStrRef():m_Data(0),m_Length(0) {} CStrRef(const char * s):m_Data(s),m_Length(strlen(s)) {} CStrRef(const char * s, unsigned l):m_Data(s),m_Length(s?l:0) {} CStrRef(const string& s):m_Data(s.c_str()),m_Length(s.length()) {} unsigned length() const { return m_Length; } void resize(unsigned u) { m_Length=u; } const char * data() const { return m_Data; } operator string () const { return string(data(),length()); } protected: const char * m_Data; unsigned m_Length; }; END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: strref.hpp,v $ * Revision 1.3 2008/04/28 16:38:45 rotmistr * Applied patch to build with gcc-4.3 * * Revision 1.2 2004/10/26 14:25:29 rotmistr * Added resize() * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/sts.hpp0000644001137700010620000000577711745334032013743 0ustar rotmistrcontig/* $Id: sts.hpp,v 1.2 2003/12/23 21:30:50 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. 
* Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_STS__HPP #define EPCR_STS__HPP #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class CSimpleSTS; class CSimpleSTS:public ISts { public: ~CSimpleSTS() throw () {} CSimpleSTS(const string& l, const string& r, int a, int b=0, ISts::EDirect s=ISts::ePlus, const string& n="*", const string& d="...") :m_name(n),m_desc(d),m_left(l),m_right(r), m_lo(a),m_hi(b>a?b:a),m_strand(s) {} virtual const char * GetPrimerData(int s) const { return s==eLeft?m_left.c_str():m_right.c_str(); } virtual unsigned GetPrimerLength(int s) const { return s==eLeft?m_left.length():m_right.length(); } virtual EDirect GetDirection() const { return m_strand; } virtual unsigned GetSizeLo() const { return m_lo; } virtual unsigned GetSizeHi() const { return m_hi; } virtual CStrRef GetName() const { return CStrRef(m_name.c_str(),m_name.length()); } virtual CStrRef GetDescription() const { return CStrRef(m_desc.c_str(),m_desc.length()); } protected: string m_name; string m_desc; string m_left; string m_right; int m_lo, m_hi; EDirect m_strand; }; END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: sts.hpp,v $ * Revision 1.2 2003/12/23 21:30:50 rotmistr * - gaps/mismatches reporting * - lo/hi fixup * - reverse sts in re-PCR_main 
* * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/sts_i.hpp0000644001137700010620000000501111745334032014230 0ustar rotmistrcontig/* $Id: sts_i.hpp,v 1.2 2004/10/26 17:16:34 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_STS_I__HPP #define EPCR_STS_I__HPP #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class ISts; class ISts { public: enum ESide { eLeft=0, eRight=1 }; enum EDirect { ePlus='+', eMinus='-' }; CStrRef GetPrimer(ESide s) const { return CStrRef(GetPrimerData(s),GetPrimerLength(s)); } virtual const char * GetPrimerData(int) const =0; virtual unsigned GetPrimerLength(int) const =0; virtual EDirect GetDirection() const =0; virtual unsigned GetSizeLo() const =0; virtual unsigned GetSizeHi() const =0; virtual CStrRef GetName() const =0; virtual CStrRef GetDescription() const =0; virtual int GetOverhangChars(int) const { return 0; } virtual ~ISts() throw () {} }; END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: sts_i.hpp,v $ * Revision 1.2 2004/10/26 17:16:34 rotmistr * Added 5'-end masking for primers * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/stsfilter.cpp0000644001137700010620000001223011745334032015122 0ustar rotmistrcontig/* $Id: stsfilter.cpp,v 1.4 2004/06/03 23:37:22 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
* * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #define DBG fprintf(stdout,__FILE__"[%d]\n",__LINE__); #define SHOWI(a) fprintf(stdout,__FILE__"[%d] "#a"=%d\n",__LINE__,a); #define SHOWS(a) \ fprintf(stdout,__FILE__"[%d] "#a"=%.*s\n",__LINE__,a.length(),a.data()); bool CPcrMachinePostprocess::OrderByPos2Pos1(const SOutput& o1, const SOutput& o2) { return (o1.pos2b.gaps()) return -1; if(a.gaps()b.mism()) return -1; if(a.mism()= min_len && la <= max_len) { if(lb >= min_len && lb <= max_len) { return lb-la; } else return 1; } else { if(lb >= min_len && lb <= max_len) { return -1; } else { int da=(la < min_len)?min_len-la:la-max_len; int db=(lb < min_len)?min_len-lb:lb-max_len; return db-da; } } } bool CPcrMachinePostprocess::Overlap(int l1, int l2, const SOutput& a, const SOutput& b) { return (abs(a.pos2-b.pos2)second.size()==0) continue; int l1=i->first->GetPrimerLength(0); int l2=i->first->GetPrimerLength(1); int lo=i->first->GetSizeLo(); int hi=i->first->GetSizeHi(); TStsHits& hits=i->second; while(hits.size()) { TStsHits todo; TStsHits cluster; cluster.push_back(hits.back()); SOutput best=cluster.front(); hits.pop_back(); for(TStsHits_CI j=hits.begin(); j!=hits.end(); ++j) { bool overlaps=false; for(TStsHits_CI h=cluster.begin(); h!=cluster.end(); ++h) 
if(Overlap(l1,l2,*j,*h)) { overlaps=true; break; } if(overlaps) { if(Compare(*j,best,lo,hi)>=0) best=*j; cluster.push_back(*j); } else { todo.push_back(*j); } } SScore sc(best.length(),best.mism_l, best.mism_r, best.gaps_l, best.gaps_r); m_Callback->CbkMatch(i->first, best.pos1, best.pos2, &sc); hits=todo; } } m_OutQueues.clear(); } void CPcrMachinePostprocess::CbkMatch(const ISts * sts, unsigned pos1, unsigned pos2, const SScore* score) { SOutput o(pos1,pos2, score->mism_l,score->mism_r, score->gaps_l,score->gaps_r); TStsHits& hits=m_OutQueues[sts]; if(hits.size() && hits.back()==o) return; hits.push_back(o); } /* * $Log: stsfilter.cpp,v $ * Revision 1.4 2004/06/03 23:37:22 rotmistr * New aligner added. * * Revision 1.3 2004/03/25 19:36:52 rotmistr * API: separate left and right primers mism/gaps in forward API * * Revision 1.2 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.1 2004/01/28 23:27:02 rotmistr * "Best of overlapping" hit selection postprocessor added. * */ e-PCR-2.3.12/stsmatch.cpp0000644001137700010620000000743311745334032014742 0ustar rotmistrcontig/* $Id: stsmatch.cpp,v 1.5 2004/06/03 23:37:22 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
* * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); //////////////////////////////////////////////////////////////////////// CStsFileCallbackDefault::CStsFileCallbackDefault() { Start(); } bool CStsFileCallbackDefault::Start() { m_String=""; m_Length=0; m_Line=0; memset(m_Bad,0,sizeof(m_Bad)); return true; } bool CStsFileCallbackDefault::NextLine(const char * s, int l) { m_String=s; m_Length=l; ++m_Line; return true; } const char * errorMessage(IStsFileCallback::EError e) { switch(e){ case IStsFileCallback::eErrOK:return "OK"; case IStsFileCallback::eErrShortPrimer:return "short primer"; case IStsFileCallback::eErrAmbiquosPrimer:return "ambiquities in primer"; case IStsFileCallback::eErrBadLine:return "bad line"; default: return "unknown error"; } } bool CStsFileCallbackDefault::Error(EError err) { const char * msg="OK"; switch(err) { case eErrTOTAL: throw logic_error("err can't be eErrTOTAL here"); case eErrOK: return true; case eErrBadLine: m_Bad[0]++; msg="bad line"; break; case eErrShortPrimer: m_Bad[1]++; msg="short primer"; break; case eErrAmbiquosPrimer: m_Bad[2]++; msg="ambiquity in primer"; break; case eErrSystem: throw runtime_error(string("STS file error: ")+strerror(errno)); } return true; } bool 
CStsFileCallbackDefault::Done() { if(m_Bad[0]) fprintf(stderr,"WARNING: %d STSs have incomplete description line\n", m_Bad[0]); if(m_Bad[1]) fprintf(stderr,"WARNING: %d STSs have primer shorter than W\n", m_Bad[1]); if(m_Bad[2]) fprintf(stderr, "WARNING: %d STSs have ambiguities within W of 3\' end\n", m_Bad[2]); return true; } //////////////////////////////////////////////////////////////////////// /* * $Log: stsmatch.cpp,v $ * Revision 1.5 2004/06/03 23:37:22 rotmistr * New aligner added. * * Revision 1.4 2004/05/27 20:35:48 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. * * Revision 1.3 2004/04/01 05:57:53 rotmistr * Compilable with borland C++ * * Revision 1.2 2004/03/29 21:25:40 rotmistr * Dist files are prepared * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/stsmatch.hpp0000644001137700010620000000466311745334032014751 0ustar rotmistrcontig/* $Id: stsmatch.hpp,v 1.3 2004/05/27 20:35:48 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. 
* Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_STSMATCH__HPP #define EPCR_STSMATCH__HPP #include #include #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class CPcrMachineCompat; class CStsFileCallbackDefault:public IStsFileCallback { public: CStsFileCallbackDefault(); virtual ~CStsFileCallbackDefault() throw () {} virtual bool NextLine(const char * , int); virtual bool Error(EError); virtual bool Start(); virtual bool Done(); protected: const char * m_String; unsigned m_Length; int m_Line; int m_Bad[3]; }; END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: stsmatch.hpp,v $ * Revision 1.3 2004/05/27 20:35:48 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. * * Revision 1.2 2004/03/29 21:25:40 rotmistr * Dist files are prepared * * Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/stsmatch_i.cpp0000644001137700010620000002404611745334032015251 0ustar rotmistrcontig/* $Id: stsmatch_i.cpp,v 1.13 2005/06/14 16:46:44 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. 
The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); #define DBG fprintf(stderr,__FILE__":%d\n",__LINE__) #define INT(i) fprintf(stderr,__FILE__":%d - %s = %d\n",__LINE__,#i,i) #define STR(s,i) fprintf(stderr,__FILE__":%d - %s = %.*s\n",__LINE__,#s,i,s) bool CStsHash::sm_OneTimeRun(false); void CStsHash::Reset() { if(m_Table) { // NB: It DOES NOT delete STSes themselve!!! 
for(unsigned i=0; iclear(); } for(TStsList::iterator i=m_All.begin(); i!=m_All.end(); ++i) { delete *i; } m_All.clear(); } void CStsHash::SetHash(const CHashSet& hs) { Reset(); m_Hash=hs; m_Table=new THashTable[m_Hash.GetWordCount()]; for(size_t i=0; iGetPrimer(ISts::eLeft); if(primer.length()GetWordCount()==0) throw logic_error("Word count should be greater then zero"); if(m_StsHash->m_Table==0) throw logic_error("Sts table should be initialized"); if(seq_len==0) seq_len=strlen(seq_data); m_Callback->CbkSequence(label); m_Callback->CbkSequenceData(seq_data,seq_len); CHashSet& hash=m_HashL; if(m_Progress) m_Progress->PgsSequenceStart(label,seq_data,seq_len, hash.GetWordSize()); if(seq_lenCbkWarning("too short sequence"); m_Callback->CbkSequenceEnd(); if(m_Progress) m_Progress->PgsSequenceEnd(); return; } if(hash.GetWordCount()==1) { CStsHash::THashTable tab=m_StsHash->m_Table[0]; for(hash.Begin(seq_data); !hash.End(); hash.Next()) { if(m_Progress) m_Progress->PgsSequenceAt(hash.GetPosition()); if(!hash.Good(0)) continue; if(tab[hash[0]].size()) Match(0, tab[hash[0]], seq_data, seq_len); } } else { CStsHash::TData tab=m_StsHash->m_Table; for(hash.Begin(seq_data); !hash.End(); hash.Next()) { if(m_Progress) m_Progress->PgsSequenceAt(hash.GetPosition()); for(unsigned word=0; wordPgsSequenceEnd(); m_Callback->CbkSequenceEnd(); } void CPcrMachine::Match(unsigned word, const TStsList& lst, const char * seq_data, int seq_len) { int seq_pos = m_HashL.GetPosition(); const char * seq_cur = seq_data+seq_pos;//+pcr_p1_len; int cur_len = seq_len - seq_pos; for(TStsList::const_iterator ists=lst.begin(); ists!=lst.end(); ++ists) { const ISts * sts=*ists; int pcr_p1_len = sts->GetPrimerLength(ISts::eLeft); if(m_HashL.GetPosition() < (unsigned)pcr_p1_len) continue; const char * lprimer = sts->GetPrimerData(ISts::eLeft); bool match=m_AlignL->Reverse(seq_data,seq_cur,lprimer,pcr_p1_len); if(match) { int pcr_p2_len = sts->GetPrimerLength(ISts::eRight); int pcr_len = pcr_p1_len + 
pcr_p2_len; const char * p = seq_cur + sts->GetSizeLo() - pcr_len - m_Margin; const char * P = seq_cur + sts->GetSizeHi() - pcr_len + m_Margin; if( p < seq_cur - pcr_p1_len ) p = seq_cur - pcr_p1_len; if( p < seq_data ) p = seq_data; if( P > seq_cur + cur_len - pcr_p2_len ) P = seq_cur + cur_len - pcr_p2_len; if( p > P ) continue; const char * rprimer = sts->GetPrimerData(ISts::eRight); #ifndef USE_HASH_FOR_RIGHT_PRIMER #define USE_HASH_FOR_RIGHT_PRIMER 1 #endif int ovhg1 = sts->GetOverhangChars(ISts::eLeft); int ovhg2 = sts->GetOverhangChars(ISts::eRight); int ovhgall = ovhg1+ovhg2; #if !USE_HASH_FOR_RIGHT_PRIMER for(; p<=P; --P) { if(m_AlignR->Forward(P,seq_data+seq_len,rprimer,pcr_p2_len)) { IPcrMachineCallback::SScore score( P-p+pcr_len+ovhgall, m_AlignL->GetMismatches(), m_AlignR->GetMismatches(), m_AlignL->GetGaps(), m_AlignR->GetGaps()); m_Callback->CbkMatch( sts, seq_pos-pcr_p1_len-ovhg1, P-seq_data+pcr_p2_len+ovhg2, &score); } } #else CHashSet& rhash=m_HashR; CHashSet& mhash=m_HashS; rhash.Begin(sts->GetPrimerData(ISts::eRight)); for (mhash.Begin(p); p<=P; p++, mhash.Next()) { for(unsigned wd=0; wdForward(p,seq_data+seq_len, rprimer,pcr_p2_len)) { IPcrMachineCallback::SScore score( P-p+pcr_len+ovhgall, m_AlignL->GetMismatches(), m_AlignR->GetMismatches(), m_AlignL->GetGaps(), m_AlignR->GetGaps()); m_Callback->CbkMatch(sts, seq_pos-pcr_p1_len-ovhg1, p-seq_data+pcr_p2_len+ovhg2, &score); break; // no other reports for this positions } } } #endif } } } /* * $Log: stsmatch_i.cpp,v $ * Revision 1.13 2005/06/14 16:46:44 rotmistr * Changed report format for floppy tails * * Revision 1.12 2004/10/26 17:16:35 rotmistr * Added 5'-end masking for primers * * Revision 1.11 2004/06/03 23:37:22 rotmistr * New aligner added. * * Revision 1.10 2004/05/27 21:12:46 rotmistr * Some warnings fixed. * * Revision 1.9 2004/05/27 20:35:49 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. 
* * Revision 1.8 2004/03/31 05:04:00 rotmistr * Search range fix * * Revision 1.7 2004/03/25 19:36:52 rotmistr * API: separate left and right primers mism/gaps in forward API * * Revision 1.6 2004/03/07 06:35:59 rotmistr * Many bugfixes and optimisations -- cgi is to go to production * * Revision 1.5 2004/02/18 05:43:22 rotmistr * Bug fix with search range * * Revision 1.4 2004/02/11 04:34:57 rotmistr * Optimised lookup speed and memory usage * Fixed bug with end of sequence in stsmatch * Changing CGI look * * Revision 1.3 2004/01/28 23:27:02 rotmistr * "Best of overlapping" hit selection postprocessor added. * * Revision 1.2 2004/01/08 23:22:41 rotmistr * Fixed init error in faread, * Adjusted output to standard, * Added output format style and output file to parameters. * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/stsmatch_i.hpp0000644001137700010620000002140011745334032015245 0ustar rotmistrcontig/* $Id: stsmatch_i.hpp,v 1.10 2005/06/14 16:46:44 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. 
* Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_STSMATCH_I__HPP #define EPCR_STSMATCH_I__HPP #include #include #include #include #include #include #include #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class CStsHash; class CPcrMachine; class IPcrMachineCallback; class IPcrProgressCallback; // This class implements STS hash table, but lacks table loading methods class CStsHash { public: typedef vector TStsList; typedef TStsList * THashTable; typedef THashTable* TData; CStsHash():m_Table(0) {} void SetHash(const CHashSet& hs); const CHashSet& GetHash() const { return m_Hash; } virtual void Reset(); // deletes everything, if sm_OneTimeRun is not set virtual void Clear(); // clears hash lists and deletes data virtual ~CStsHash() throw () { if(sm_OneTimeRun) Reset(); } unsigned GetWordSize() const { return m_Hash.GetWordSize(); } unsigned GetWordCount() const { return m_Hash.GetWordCount(); } static void SetOneTimeRun(bool v) { sm_OneTimeRun=v; } protected: virtual bool AddStsEntry(ISts * sts); protected: static bool sm_OneTimeRun; friend class CPcrMachine; TData m_Table; CHashSet m_Hash; TStsList m_All; }; // This class implements STS match algorithm class CPcrMachine { public: typedef CStsHash::TStsList TStsList; typedef CStsHash::THashTable THashTable; typedef CStsHash::TData TData; virtual void SetMargin(unsigned margin) { m_Margin=margin; } unsigned GetMargin() const { return m_Margin; } virtual void SetAligner(IAlign * alignl, IAlign * alignr) { m_AlignL=alignl; m_AlignR=alignr; } virtual void SetProgressCallback(IPcrProgressCallback * callback) { m_Progress=callback; } virtual 
void SetCallback(IPcrMachineCallback * callback) { m_Callback=callback; } virtual void SetStsHash(const CStsHash* stshash) { m_StsHash=stshash; if(m_StsHash) { m_HashL=m_HashR=m_HashS=stshash->m_Hash; } } virtual void ProcessSequence(const char * label, const char * seqdata, unsigned seqlen); CPcrMachine():m_AlignL(0),m_AlignR(0),m_Callback(0),m_StsHash(0),m_Progress(0) {} virtual ~CPcrMachine() throw () {} protected: virtual void Match(unsigned word, const TStsList& lst, const char * seq_data, int seq_len); protected: unsigned m_Margin; IAlign * m_AlignL, * m_AlignR; CHashSet m_HashL, m_HashR, m_HashS; IPcrMachineCallback * m_Callback; const CStsHash * m_StsHash; IPcrProgressCallback * m_Progress; }; // Callback that is used in CPcrMachine::ProcessSequence class IPcrMachineCallback { public: struct SScore { // int n_mism; // int n_gaps; int actlen; char mism_l, mism_r, gaps_l, gaps_r; SScore(int a=0, char m_l=0, char m_r=0, char g_l=0, char g_r=0): actlen(a), mism_l(m_l),mism_r(m_r), gaps_l(g_l),gaps_r(g_r){} }; virtual void CbkStart() {}; virtual void CbkEnd() {}; virtual void CbkSequenceEnd() {}; virtual void CbkSequence(const char * label) =0; virtual void CbkSequenceData(const char * data, unsigned size) {}; virtual void CbkMatch(const ISts * sts, unsigned pos1, unsigned pos2, const SScore *score) =0; virtual void CbkWarning(const char * message) =0; virtual ~IPcrMachineCallback() {} }; // Separate to optimize CPcrMachine::ProcessSequence scanning loop // (callback only if it is not null) class IPcrProgressCallback { public: virtual ~IPcrProgressCallback() {} virtual void PgsSequenceStart(const char * label, const char * data, unsigned length, unsigned wsize) = 0; virtual void PgsSequenceEnd() = 0; virtual void PgsSequenceAt(unsigned pos) = 0; }; // Accumulates hits, removes redundant hits as well as suboptimal hits class CPcrMachinePostprocess:public IPcrMachineCallback { public: ~CPcrMachinePostprocess() {} CPcrMachinePostprocess(IPcrMachineCallback * 
out):m_Callback(out) {} virtual void CbkStart() { m_Callback->CbkStart(); } virtual void CbkEnd() { m_Callback->CbkEnd(); } virtual void CbkSequenceEnd() { Flush(); m_Callback->CbkSequenceEnd(); } virtual void CbkSequence(const char * label) { Flush(); m_Callback->CbkSequence(label); } virtual void CbkMatch(const ISts * sts, unsigned pos1, unsigned pos2, const SScore *score); virtual void CbkWarning(const char * message) { m_Callback->CbkWarning(message); } virtual void CbkSequenceData(const char * data, unsigned size) { m_Callback->CbkSequenceData(data,size); }; void Flush(); protected: IPcrMachineCallback * m_Callback; struct SOutput { int pos1, pos2; char mism_l, mism_r, gaps_l, gaps_r; int length() const { return pos2-pos1; } int mism() const { return mism_l+mism_r; } int gaps() const { return gaps_l+gaps_r; } bool operator == (const SOutput& o) const { return pos2==o.pos2 && pos1==o.pos1 && mism_l==o.mism_l && gaps_l==o.gaps_l && mism_r==o.mism_r && gaps_r==o.gaps_r; } SOutput(int p1, int p2, char m_l, char m_r, char g_l, char g_r): pos1(p1),pos2(p2),mism_l(m_l),mism_r(m_r),gaps_l(g_l),gaps_r(g_r){} SOutput(){} }; static bool OrderByPos2Pos1(const SOutput&, const SOutput&); static int Compare(const SOutput&, const SOutput&, int, int); static bool Overlap(int, int, const SOutput&, const SOutput&); typedef vector TStsHits; typedef map TAllHits; typedef TStsHits::iterator TStsHits_I; typedef TAllHits::iterator TAllHits_I; typedef TStsHits::const_iterator TStsHits_CI; typedef TAllHits::const_iterator TAllHits_CI; TAllHits m_OutQueues; }; END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: stsmatch_i.hpp,v $ * Revision 1.10 2005/06/14 16:46:44 rotmistr * Changed report format for floppy tails * * Revision 1.9 2004/06/07 16:24:57 rotmistr * Bug fixes to previos version. * * Revision 1.8 2004/06/03 23:37:22 rotmistr * New aligner added. * * Revision 1.7 2004/03/26 17:02:13 rotmistr * Compat-options are now allowed everywhere, and multiple fasta files can be used. 
* * Revision 1.6 2004/03/25 19:36:52 rotmistr * API: separate left and right primers mism/gaps in forward API * * Revision 1.5 2004/03/07 06:35:59 rotmistr * Many bugfixes and optimisations -- cgi is to go to production * * Revision 1.4 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.3 2004/01/28 23:27:02 rotmistr * "Best of overlapping" hit selection postprocessor added. * * Revision 1.2 2004/01/08 23:22:41 rotmistr * Fixed init error in faread, * Adjusted output to standard, * Added output format style and output file to parameters. * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/stsmatch_m.cpp0000644001137700010620000002227711745334032015261 0ustar rotmistrcontig/* $Id: stsmatch_m.cpp,v 1.16 2008/06/18 14:45:33 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. 
* Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #include #include #include #include #include #include #include #include #include #include USING_NCBI_SCOPE; USING_SCOPE(EPCR_SCOPE); CMmFileSts::CMmFileSts(const char * ref, const CStrRef& p1, const CStrRef& p2, unsigned lo, unsigned hi, unsigned flags, unsigned char ovhg1, unsigned char ovhg2): m_SizeLo(lo), m_SizeHi(hi), m_Reference(ref), m_Flags(flags) { m_Primer[0]=p1.data(); m_Length[0]=p1.length(); m_Primer[1]=p2.data(); m_Length[1]=p2.length(); m_OvhgChars[0]=ovhg1; m_OvhgChars[1]=ovhg2; } void CMmFileSts::ParseRange(const CStrRef& range, int& lo, int& hi, int defLo, int defHi) { char * c=const_cast(range.data()); if(isdigit(*c)) { lo=strtol(c,&c,10); hi=(*c=='-')?strtol(c+1,0,10):lo; } else { lo=defLo; //ePCR_DEFAULT_size_lo; hi=defHi; //ePCR_DEFAULT_size_hi; } } int CMmFileSts::Parse(const char * ref, CStrRef * dest, int maxf) { int i=0; while(i ref && isspace(pp[-1]) ) --pp; dest[i++]=CStrRef(ref,pp-ref); } if(p==0 || *p!='\t') return i; ref=p+1; } return i; } CStrRef CMmFileSts::GetName() const { CStrRef fld("?"); Parse(m_Reference,&fld,1); return fld; } CStrRef CMmFileSts::GetDescription() const { CStrRef fld[5]; if(Parse(m_Reference,fld,5)>=5) { const char * c=fld[4].data(); const char * cc=strpbrk(c,"\n\r"); return CStrRef(c,cc?cc-c:strlen(c)); } else return ""; } //////////////////////////////////////////////////////////////////////// void CStsFileHash::AttachFile(const string& fname) { int fd=open(fname.c_str(),O_RDONLY); if(fd==-1) throw runtime_error("opening "+fname+": "+strerror(errno)); struct stat st; if(fstat(fd,&st)) { close(fd); 
throw runtime_error("getting status "+fname+": "+strerror(errno)); } m_MemorySize=st.st_size; #ifdef USE_WIN m_MemoryBase=new char[m_MemorySize+1]; read(fd,m_MemoryBase,m_MemorySize); #else m_MemoryBase=(char*)mmap(0,m_MemorySize,PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_NORESERVE,fd,0); #endif close(fd); if(m_MemoryBase==0 || m_MemoryBase==caddr_t(-1)) throw runtime_error("memory mapping "+fname+": "+strerror(errno)); if(strchr("\r\n",m_MemoryBase[m_MemorySize-1])==0) { DetachFile(); throw runtime_error("format error "+fname+": need newline before EOF"); } } void CStsFileHash::DetachFile() { #ifdef USE_WIN delete[] m_MemoryBase; #else munmap(const_cast(m_MemoryBase),m_MemorySize); #endif m_MemoryBase=0; m_MemorySize=0; } void CStsFileHash::ReadStsFile(const string& fname, IStsFileCallback * cbk) { // TODO:: dsjkghdgifdjgd not complete reset!!! Clear(); AttachFile(fname); #ifndef USE_WIN madvise(const_cast(m_MemoryBase),m_MemorySize, MADV_SEQUENTIAL|MADV_WILLNEED); #endif if(cbk==0 || cbk->Start()) { const char * pos=m_MemoryBase; const char * end=m_MemoryBase+m_MemorySize; for(int to_parse=m_MemorySize; to_parse>1; to_parse=end-pos) { const char * nl=strpbrk(pos,"\n\r"); if(!ParseLine(cbk,pos,nl-pos)) break; if(nl[0]!=nl[1] && strchr("\r\n",nl[1])) pos=nl+2; else pos=nl+1; } if(cbk) cbk->Done(); } #ifndef USE_WIN madvise(const_cast(m_MemoryBase),m_MemorySize, MADV_RANDOM|MADV_WILLNEED); #endif } // lowercase characters are masked. 
inline bool IsMasked(char c) { return islower(c); } static int Clip5PrimeLowercase(CStrRef& fld) { int len=fld.length(); int oldlen=len; const char * c=fld.data(); while( len>0 && IsMasked(*c) ) --len, ++c; fld=CStrRef(c,len); return oldlen-len; } bool CStsFileHash::ParseLine(IStsFileCallback * cbk, const char * pos, unsigned len) { if(cbk && !cbk->NextLine(pos,len)) return false; if(*pos!='#') { CStrRef fld[5]; int cnt=CMmFileSts::Parse(pos,fld,4); if(cnt>3) { int oh1=0, oh2=0; if(AllowOverhang()) { oh1=Clip5PrimeLowercase(fld[1]); oh2=Clip5PrimeLowercase(fld[2]); } if(fld[1].length()Error(IStsFileCallback::eErrShortPrimer)); } const char * rev1=FlipSequence(fld[1].data(),fld[1].length()); const char * rev2=FlipSequence(fld[2].data(),fld[2].length()); int lo,hi; CMmFileSts::ParseRange(fld[3],lo,hi, m_DefaultSizeLo,m_DefaultSizeHi); //cerr << "\e[31m" << __PRETTY_FUNCTION__ << "\e[032m: def-lo = " << m_DefaultSizeLo << ", def-hi = " << m_DefaultSizeHi << ", lo = " << lo << ", hi = " << hi << "\e[0m\n"; const char * fwd1=(UnmaskPrimers()? UCaseSequence(fld[1].data(),fld[1].length()): fld[1].data()); const char * fwd2=(UnmaskPrimers()? 
UCaseSequence(fld[2].data(),fld[2].length()): fld[2].data()); int flags=(UnmaskPrimers()?CMmFileSts::fAllocLeft:0)| CMmFileSts::fAllocRight; CMmFileSts * fsts = new CMmFileSts( pos, CStrRef(fwd1,fld[1].length()), CStrRef(rev2,fld[2].length()), lo,hi,flags,oh1,oh2); CMmFileSts * rsts = new CMmFileSts( pos, CStrRef(fwd2,fld[2].length()), CStrRef(rev1,fld[1].length()), lo,hi,flags|CMmFileSts::fReverse,oh2,oh1); bool fok=AddStsEntry(fsts); bool rok=AddStsEntry(rsts); if(!sm_OneTimeRun) { if(!fok) delete fsts; if(!rok) delete rsts; } if(!fok && !rok) return (cbk&&cbk->Error(IStsFileCallback::eErrAmbiquosPrimer)); } else if(cbk && !cbk->Error(IStsFileCallback::eErrBadLine)) return false; } return true; } /* * $Log: stsmatch_m.cpp,v $ * Revision 1.16 2008/06/18 14:45:33 rotmistr * Fixed problem with -d x-X parameter being reset if -w or some others are used after it. * * Revision 1.15 2007/07/11 20:49:30 rotmistr * Made 64bit-compatible * * Revision 1.14 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.13 2004/10/26 17:16:35 rotmistr * Added 5'-end masking for primers * * Revision 1.12 2004/09/03 21:28:50 rotmistr * Fixes to compile with Borland C++ 5.5 * * Revision 1.11 2004/06/03 23:37:23 rotmistr * New aligner added. * * Revision 1.10 2004/05/27 20:35:49 rotmistr * Version 2.1.0 with appropriate changes (see Changes) is ready for tests. * * Revision 1.9 2004/04/01 17:23:20 rotmistr * *** empty log message *** * * Revision 1.8 2004/04/01 16:37:42 rotmistr * Cleaned after adding windows capabilities * * Revision 1.7 2004/04/01 05:57:53 rotmistr * Compilable with borland C++ * * Revision 1.6 2004/03/30 21:06:53 rotmistr * Fixes for setting default STS size range. 
* * Revision 1.5 2004/03/30 19:08:03 rotmistr * default STS size is tunnable now * * Revision 1.4 2004/03/23 22:35:26 rotmistr * Fixed processing of -mid flag in cmdline * Fixed destructor for fasta reader * Removed cgi * * Revision 1.3 2004/03/07 06:36:00 rotmistr * Many bugfixes and optimisations -- cgi is to go to production * * Revision 1.2 2004/02/04 21:23:22 rotmistr * - gcc-3.3.2 compatible * - better postfiltering for reverse-e-PCR for discontiguos words * - cgi added, that supports: * -- contig to chromosome mapping * -- simple mapviewer links * -- unists links * -- discontiguos words * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/stsmatch_m.hpp0000644001137700010620000001373211745334032015262 0ustar rotmistrcontig/* $Id: stsmatch_m.hpp,v 1.7 2008/06/18 14:45:33 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. 
* * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_STSMATCH_M__HPP #define EPCR_STSMATCH_M__HPP #include #include #include BEGIN_NCBI_SCOPE BEGIN_SCOPE(EPCR_SCOPE) class CMmFileSts; class CStsFileHash; class IStsFileCallback; class CMmFileSts:public ISts { public: enum EFlags { fNone = 0, fReverse = 0x01, fAllocLeft = 0x10, fAllocRight = 0x20 }; virtual const char * GetPrimerData(int s) const { return m_Primer[s]; } virtual unsigned GetPrimerLength(int s) const { return m_Length[s]; } virtual EDirect GetDirection() const { return m_Flags&fReverse?eMinus:ePlus; } virtual unsigned GetSizeLo() const { return m_SizeLo; } virtual unsigned GetSizeHi() const { return m_SizeHi; } virtual CStrRef GetName() const; virtual CStrRef GetDescription() const; virtual int GetOverhangChars(int s) const { return m_OvhgChars[s]; } virtual ~CMmFileSts() throw () { if(m_Flags&fAllocLeft) delete[] m_Primer[0]; if(m_Flags&fAllocRight) delete[] m_Primer[1]; } CMmFileSts(const char * ref, const CStrRef& p1, const CStrRef& p2, unsigned lo, unsigned hi, unsigned flags, unsigned char ovhg1 = 0, unsigned char ovhg2 = 0); static int Parse(const char * ref, CStrRef * dest, int maxf=5); static void ParseRange(const CStrRef& range, int& lo, int& hi, int defLo=ePCR_DEFAULT_size_lo, int defHi=ePCR_DEFAULT_size_hi); bool Valid() const { return m_Reference!=0; } protected: const char * m_Primer[2]; char m_Length[2]; unsigned m_SizeLo; unsigned m_SizeHi; const char * m_Reference; char m_Flags; char m_OvhgChars[2]; }; // This class implements loading of STS hash table from mmapped file class CStsFileHash:public CStsHash { public: enum EFlags { fAllowOverhang = 1, fUnmaskPrimers = 2, fNONE = 0 }; virtual void Reset() { CStsHash::Reset(); DetachFile(); 
//SetDefaultSize(ePCR_DEFAULT_size_lo,ePCR_DEFAULT_size_hi); } virtual ~CStsFileHash() throw () {} CStsFileHash():m_MemoryBase(0),m_MemorySize(0), m_DefaultSizeLo(ePCR_DEFAULT_size_lo), m_DefaultSizeHi(ePCR_DEFAULT_size_hi),m_Flags(0) {} unsigned GetDefaultSizeLo() const { return m_DefaultSizeLo; } unsigned GetDefaultSizeHi() const { return m_DefaultSizeHi; } void SetDefaultSize(unsigned lo, unsigned hi) { //cerr << "\e[31m" << __PRETTY_FUNCTION__ << "\e[32m: lo = " << lo << ", hi = " << hi << "\e[0m\n"; m_DefaultSizeLo=lo; m_DefaultSizeHi=hi; } virtual void ReadStsFile(const string& fname, IStsFileCallback* cbk = 0); bool AllowOverhang() const { return m_Flags & fAllowOverhang; } bool UnmaskPrimers() const { return m_Flags & fUnmaskPrimers; } void SetFlags(int flags, bool on=true) { if(on) m_Flags|=flags; else m_Flags&=~flags; } protected: virtual void AttachFile(const string& fname); virtual void DetachFile(); bool ParseLine(IStsFileCallback * cbk, const char * pos, unsigned len); protected: char * m_MemoryBase; unsigned m_MemorySize; unsigned m_DefaultSizeLo; unsigned m_DefaultSizeHi; unsigned m_Flags; }; // Callback for warnings and errors class IStsFileCallback { public: enum EError { eErrOK, eErrBadLine, eErrShortPrimer, eErrAmbiquosPrimer, eErrSystem, eErrTOTAL }; virtual ~IStsFileCallback() throw () {} virtual bool NextLine(const char * line, int sz) =0; virtual bool Error(EError) =0; virtual bool Start() =0; virtual bool Done() =0; }; END_SCOPE(EPCR_SCOPE) END_NCBI_SCOPE #endif /* * $Log: stsmatch_m.hpp,v $ * Revision 1.7 2008/06/18 14:45:33 rotmistr * Fixed problem with -d x-X parameter being reset if -w or some others are used after it. * * Revision 1.6 2004/10/26 17:16:36 rotmistr * Added 5'-end masking for primers * * Revision 1.5 2004/04/01 05:57:53 rotmistr * Compilable with borland C++ * * Revision 1.4 2004/03/30 21:06:53 rotmistr * Fixes for setting default STS size range. 
* * Revision 1.3 2004/03/30 19:08:03 rotmistr * default STS size is tunnable now * * Revision 1.2 2004/01/28 23:27:02 rotmistr * "Best of overlapping" hit selection postprocessor added. * * Revision 1.1.1.1 2003/12/23 18:17:27 rotmistr * Package that includes e-PCR, reverse e-PCR, and sequence data preparation * program for reverse e-PCR looks ready * */ e-PCR-2.3.12/mswin.h0000644001137700010620000000550411745334032013713 0ustar rotmistrcontig/* $Id: mswin.h,v 1.6 2007/07/05 16:05:58 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_MSWIN__HPP #define EPCR_MSWIN__HPP #include #include #include #define NO_POPEN 1 #define USE_WIN 1 #define O_LARGEFILE 0 typedef char * caddr_t; typedef long int32_t; typedef __int64 off64_t; typedef __int64 huge; typedef unsigned __int64 uhuge; #define fopen64 fopen #define snprintf _snprintf inline off64_t lseek64(int fd, off64_t off, int dir) { long lo=off; long hi=off>>32; switch(dir) { case SEEK_SET: dir=FILE_BEGIN; break; case SEEK_CUR: dir=FILE_CURRENT; break; case SEEK_END: dir=FILE_END; break; } HANDLE h=(HANDLE)_get_osfhandle(fd); lo=SetFilePointer(h, lo, &hi, dir); if(lo==INVALID_SET_FILE_POINTER && GetLastError()!=NO_ERROR) return -1; return lo+((off64_t(hi)<<32)&0xffffffff); } inline int fseeko64(FILE* f, off64_t off, int dir) { fflush(f); off64_t rc = lseek64(fileno(f), off, dir); return rc == (off64_t)-1 ? -1 : 0; } inline off64_t ftello64(FILE* f) { fflush(f); return lseek64(fileno(f), 0, SEEK_CUR); } #ifdef __cplusplus extern "C" { #endif extern int optind; extern int optopt; extern int opterr; extern const char* optarg; int getopt(int argc, char ** argv, const char* optstring); #ifdef __cplusplus } #endif #endif /* * $Log: mswin.h,v $ * Revision 1.6 2007/07/05 16:05:58 rotmistr * Made things compileable by MS Visual C++ 8.0 * * Revision 1.5 2004/09/03 19:10:21 rotmistr * Public domain notice added. * */ e-PCR-2.3.12/native64.h0000644001137700010620000000340011745334032014207 0ustar rotmistrcontig/* $Id: native64.h,v 1.3 2004/09/03 19:10:21 rotmistr Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * ========================================================================= * * Author: Kirill Rotmistrovsky * * ========================================================================= */ #ifndef EPCR_NATIVE64__HPP #define EPCR_NATIVE64__HPP #include #define O_LARGEFILE 0 typedef off_t off64_t; #define fopen64 fopen #define fseeko64 fseeko #define ftello64 ftello #define lseek64 lseek #define mmap64 mmap #endif /* * $Log: native64.h,v $ * Revision 1.3 2004/09/03 19:10:21 rotmistr * Public domain notice added. * */ e-PCR-2.3.12/Makefile0000644001137700010620000000545111745334032014046 0ustar rotmistrcontig## $Id: Makefile,v 1.7 2007/07/05 16:06:04 rotmistr Exp $ ######################################################################## ## ## PUBLIC DOMAIN NOTICE ## National Center for Biotechnology Information ## ## This software/database is a "United States Government Work" under the ## terms of the United States Copyright Act. It was written as part of ## the author's official duties as a United States Government employee and ## thus cannot be copyrighted. This software/database is freely available ## to the public for use. 
The National Library of Medicine and the U.S. ## Government have not placed any restriction on its use or reproduction. ## ## Although all reasonable efforts have been taken to ensure the accuracy ## and reliability of the software and data, the NLM and the U.S. ## Government do not and cannot warrant the performance or results that ## may be obtained by using this software or data. The NLM and the U.S. ## Government disclaim all warranties, express or implied, including ## warranties of performance, merchantability or fitness for any particular ## purpose. ## ## Please cite the author in any work or product based on this material. ## ######################################################################## LIBS = epcr BINS = cmd_epcr cmd_famap cmd_fahash cmd_rpcr srcdir = . all links dirs clean dist clean-all install install-lib dist-clean depend: for i in $(LIBS:%=lib%) $(BINS) ; do \ $(MAKE) -ef $(srcdir)/stand/Makefile.$$i $@ ; \ done # ######################################################################## ## $Log: Makefile,v $ ## Revision 1.7 2007/07/05 16:06:04 rotmistr ## Made things compileable by MS Visual C++ 8.0 ## ## Revision 1.6 2004/05/27 20:36:03 rotmistr ## Version 2.1.0 with appropriate changes (see Changes) is ready for tests. 
## ## Revision 1.5 2004/04/06 16:44:57 rotmistr ## *** empty log message *** ## ## Revision 1.4 2004/03/29 03:16:47 rotmistr ## *** empty log message *** ## ## Revision 1.3 2004/02/04 21:23:46 rotmistr ## - gcc-3.3.2 compatible ## - better postfiltering for reverse-e-PCR for discontiguos words ## - cgi added, that supports: ## -- contig to chromosome mapping ## -- simple mapviewer links ## -- unists links ## -- discontiguos words ## ## Revision 1.2 2003/12/23 21:30:57 rotmistr ## - gaps/mismatches reporting ## - lo/hi fixup ## - reverse sts in re-PCR_main ## ## Revision 1.1.1.1 2003/12/23 18:17:28 rotmistr ## Package that includes e-PCR, reverse e-PCR, and sequence data preparation ## program for reverse e-PCR looks ready ## ## Revision 1.3 2003/11/20 05:56:02 rotmistr ## Loading looks working ## ## Revision 1.2 2003/11/20 02:12:28 rotmistr ## Fixed id, log tags and copyright notice ## ########################################################################