pax_global_header00006660000000000000000000000064116773776620014540gustar00rootroot0000000000000052 comment=3c9fb01843d797c64be578ecbcd1be683cfe245a filo-master/000077500000000000000000000000001167737766200133265ustar00rootroot00000000000000filo-master/LICENSE000066400000000000000000000020571167737766200143370ustar00rootroot00000000000000Copyright (C) 2009,2010,2011 by Aaron Quinlan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
filo-master/Makefile000066400000000000000000000020671167737766200147730ustar00rootroot00000000000000# ========================== # BEDTools Makefile # (c) 2009 Aaron Quinlan # ========================== # define our object and binary directories export OBJ_DIR = obj export BIN_DIR = bin export SRC_DIR = src export CXX = g++ export CXXFLAGS = -Wall -O2 export LIBS = -lz SUBDIRS = $(SRC_DIR)/groupBy \ $(SRC_DIR)/shuffle \ $(SRC_DIR)/stats \ UTIL_SUBDIRS = $(SRC_DIR)/common/tabFile \ $(SRC_DIR)/common/gzstream \ $(SRC_DIR)/common/fileType \ all: [ -d $(OBJ_DIR) ] || mkdir -p $(OBJ_DIR) [ -d $(BIN_DIR) ] || mkdir -p $(BIN_DIR) @echo "=========================================================" @echo "Building filo:" @echo "=========================================================" @for dir in $(UTIL_SUBDIRS); do \ echo "- Building in $$dir"; \ $(MAKE) --no-print-directory -C $$dir; \ echo ""; \ done @for dir in $(SUBDIRS); do \ echo "- Building in $$dir"; \ $(MAKE) --no-print-directory -C $$dir; \ echo ""; \ done .PHONY: all clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* .PHONY: clean filo-master/README.rst000066400000000000000000000125411167737766200150200ustar00rootroot00000000000000:Project: filo :Version: 1.1.0 :Authors: - Aaron Quinlan, University of Virginia - Assaf Gordon, Cold Spring Harbor Laboratories :Contact: arq5x@virginia.edu ========================================= filo - Useful FILe and stream Operations ========================================= The following tools are available as part of the **filo** package. More to come... .. contents:: groupBy ------- **groupBy** is a useful tool that mimics the "groupBy" clause in database systems. Given a file or stream that is sorted by the appropriate "grouping columns", **groupBy** will compute summary statistics on another column in the file or stream. This will work with output from all BEDTools as well as any other tab-delimited file or stream. 
You specify a list of columns that should be "grouped" with the -g parameter (e.g., -g 2,3,4 will group on the second through fourth columns). You then specify column(s) that should be summarized or "operated upon" for each group with the -c parameter (e.g., -c 2 or -c 2,3 or -c 2,2,2,5). Finally, you specify what operations should be applied to the list of columns in -c. Here is the current list of the available operations. 1. *sum* - numeric only 2. *count* - numeric or text 3. *min* - numeric only 4. *max* - numeric only 5. *mean* - numeric only 6. *stdev* - numeric only 7. *median* - numeric only 8. *mode* - numeric or text 9. *antimode* - numeric or text 10. *freqasc* - print a comma separated list of values observed and the number of times they were observed. Reported in **ascending** order of frequency. 11. *freqdesc* - print a comma separated list of values observed and the number of times they were observed. Reported in **descending** order of frequency. 12. *collapse* - print a comma separated list of each value in the grouped column (numeric or text). 13. *concat* - concatenate each value in the grouped column into a single string. And here are some usage examples. I hope you find this utility to be of use in your work. I have found it to be a huge time saver. :: $ cat ex1.out chr1 10 20 A chr1 15 25 B.1 1000 chr1 10 20 A chr1 25 35 B.2 10000 $ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum chr1 10 20 A 11000 $ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max chr1 10 20 A 11000 10000 $ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean chr1 10 20 A B.1,B.2, 5500 $ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean chr1 10 20 A B.1,B.2, 5500 shuffle ------- **shuffle** will randomize the order of lines in a file. In other words, if you have a sorted file, **shuffle** will undo the sort. 
:: $ cat test 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 $ shuffle test 8 9 0 1 4 5 7 8 9 10 5 6 3 4 2 3 1 2 6 7 10 11 $ cat test | shuffle 4 5 6 7 0 1 9 10 3 4 7 8 5 6 10 11 8 9 2 3 1 2 stats ----- **stats** is a small utility for computing descriptive statistics on a given column of a tab-delimited file or stream. By default, it will assume you want to gather stats on the first column in your file/stream and compute *all* of the following statistics: 1. total number of lines 2. the sum of all the values in the column 3. the arithmetic mean (i.e., the "average") of the values in the column 4. the geometric mean (if possible) 5. the median 6. the mode 7. the anti-mode (i.e., the least frequent value) 8. the minimum 9. the maximum 10. the variance 11. the standard deviation. Here are some examples of stats in action. :: $ cat test 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 ####################################################################### # Default is to compute statistics on the first (tab-delimited) column. ####################################################################### $ stats test Total lines: 11 Sum of lines: 55 Ari. Mean: 5 Geo. Mean: undef (zero found in data) Median: 5 Mode: 0 (N=1) Anti-Mode: 0 (N=1) Minimum: 0 Maximum: 10 Variance: 10 StdDev: 3.16227766016838 ####################################################################### # Let's work with the second (1-based) column. ####################################################################### $ stats test -c 2 Total lines: 11 Sum of lines: 66 Ari. Mean: 6 Geo. Mean: 4.9092387795844 Median: 6 Mode: 1 (N=1) Anti-Mode: 1 (N=1) Minimum: 1 Maximum: 11 Variance: 10 StdDev: 3.16227766016838 ####################################################################### # Let's just get the mean of the second (1-based) column. ####################################################################### $ stats test -c 2 -mu Total lines: 11 Sum of lines: 66 Ari. 
Mean: 6 ####################################################################### # It works on stdin as well. ####################################################################### $ stats -c 2 -max < test Total lines: 11 Sum of lines: 66 Maximum: 11 $ cat test | stats -c 2 -med Total lines: 11 Sum of lines: 66 Median: 6 ####################################################################### # You get the idea. Other options are available with -h ####################################################################### filo-master/src/000077500000000000000000000000001167737766200141155ustar00rootroot00000000000000filo-master/src/common/000077500000000000000000000000001167737766200154055ustar00rootroot00000000000000filo-master/src/common/fileType/000077500000000000000000000000001167737766200171665ustar00rootroot00000000000000filo-master/src/common/fileType/Makefile000066400000000000000000000013031167737766200206230ustar00rootroot00000000000000OBJ_DIR = ../../../obj/ BIN_DIR = ../../../bin/ UTILITIES_DIR = ../../utils/ # ------------------- # define our includes # ------------------- INCLUDES = # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= fileType.cpp OBJECTS= $(SOURCES:.cpp=.o) _EXT_OBJECTS= EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) $(BUILT_OBJECTS): $(SOURCES) @echo " * compiling" $(*F).cpp @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(INCLUDES) clean: @echo "Cleaning up." 
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/* .PHONY: cleanfilo-master/src/common/fileType/fileType.cpp000066400000000000000000000033041167737766200214530ustar00rootroot00000000000000/***************************************************************************** fileType.cpp (c) 2009 - Aaron Quinlan Hall Laboratory Department of Biochemistry and Molecular Genetics University of Virginia aaronquinlan@gmail.com Licensed under the GNU General Public License 2.0 license. ******************************************************************************/ #include "fileType.h" /* Returns true if 'filename' names a regular file (not a pipe/device), as reported by stat() and S_ISREG. This implies that the file can be opened/closed/seek'd multiple times without losing information. If stat() fails, an error message including strerror(errno) is written to cerr and the program terminates via exit(1). */ bool isRegularFile(const string& filename) { struct stat buf ; int i; i = stat(filename.c_str(), &buf); if (i!=0) { cerr << "Error: can't determine file type of '" << filename << "': " << strerror(errno) << endl; exit(1); } if (S_ISREG(buf.st_mode)) return true; return false; } /* Returns true if the leading bytes of the file match a GZIP header: ID1 == 0x1f, ID2 == 0x8b and compression method CM == 8. Returns false if the file cannot be opened or the header bytes cannot be read. Should only be run on regular files. */ bool isGzipFile(const string& filename) { //see http://www.gzip.org/zlib/rfc-gzip.html#file-format struct { unsigned char id1; unsigned char id2; unsigned char cm; } gzip_header; ifstream f(filename.c_str(), ios::in|ios::binary); if (!f) return false; if (!f.read((char*)&gzip_header, sizeof(gzip_header))) return false; if ( gzip_header.id1 == 0x1f && gzip_header.id2 == 0x8b && gzip_header.cm == 8 ) return true; return false; } filo-master/src/common/fileType/fileType.h000066400000000000000000000020641167737766200211220ustar00rootroot00000000000000/***************************************************************************** fileType.h (c) 2009 - Aaron Quinlan Hall Laboratory Department of Biochemistry and Molecular Genetics University of Virginia aaronquinlan@gmail.com Licensed under the GNU General Public License 2.0 license. 
******************************************************************************/ #ifndef FILETYPE_H #define FILETYPE_H #include #include #include #include #include #include #include #include #include #include using namespace std; /***************************************************************************** Convenience functions to detect whether a given file is "regular" and/or "gzipped". Kindly contributed by Assaf Gordon. ******************************************************************************/ string string_error(int errnum); bool isRegularFile(const string& filename); bool isGzipFile(const string& filename); #endif /* FILETYPE_H */ filo-master/src/common/gzstream/000077500000000000000000000000001167737766200172415ustar00rootroot00000000000000filo-master/src/common/gzstream/COPYING.LIB000066400000000000000000000634761167737766200207210ustar00rootroot00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. 
You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. 
Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. 
In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. 
A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. 
b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. 
In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". 
Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. 
You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. 
e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. 
However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. 
If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. 
If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. , 1 April 1990 Ty Coon, President of Vice That's all there is to it! 
filo-master/src/common/gzstream/Makefile000066400000000000000000000037531167737766200207110ustar00rootroot00000000000000# ============================================================================ # gzstream, C++ iostream classes wrapping the zlib compression library. # Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # ============================================================================ # # File : Makefile # Revision : $Revision: 1.3 $ # Revision_date : $Date: 2001/10/04 15:09:28 $ # Author(s) : Deepak Bandyopadhyay, Lutz Kettner # # ============================================================================ # ---------------------------------------------------------------------------- # adapt these settings to your need: # add '-DGZSTREAM_NAMESPACE=name' to CPPFLAGS to place the classes # in its own namespace. Note, this macro needs to be set while creating # the library as well while compiling applications based on it. # As an alternative, gzstream.C and gzstream.h can be edited. # ---------------------------------------------------------------------------- CXX = g++ CXXFLAGS = -I. -O2 -Wall LDFLAGS = -L. 
-lgzstream -lz OBJ_DIR = ../../../obj/ BIN_DIR = ../../../bin/ ${OBJ_DIR}/gzstream.o : gzstream.C gzstream.h ${CXX} ${CXXFLAGS} -c -o ${OBJ_DIR}/gzstream.o gzstream.C clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* .PHONY: clean filo-master/src/common/gzstream/README000066400000000000000000000003731167737766200201240ustar00rootroot00000000000000 gzstream C++ iostream classes wrapping the zlib compression library. =========================================================================== See index.html for documentation and installation instructions. filo-master/src/common/gzstream/gzstream.C000066400000000000000000000117451167737766200212110ustar00rootroot00000000000000// ============================================================================ // gzstream, C++ iostream classes wrapping the zlib compression library. // Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // ============================================================================ // // File : gzstream.C // Revision : $Revision: 1.7 $ // Revision_date : $Date: 2003/01/08 14:41:27 $ // Author(s) : Deepak Bandyopadhyay, Lutz Kettner // // Standard streambuf implementation following Nicolai Josuttis, "The // Standard C++ Library". 
// ============================================================================ #include #include #include // for memcpy #ifdef GZSTREAM_NAMESPACE namespace GZSTREAM_NAMESPACE { #endif // ---------------------------------------------------------------------------- // Internal classes to implement gzstream. See header file for user classes. // ---------------------------------------------------------------------------- // -------------------------------------- // class gzstreambuf: // -------------------------------------- gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { if ( is_open()) return (gzstreambuf*)0; mode = open_mode; // no append nor read/write mode if ((mode & std::ios::ate) || (mode & std::ios::app) || ((mode & std::ios::in) && (mode & std::ios::out))) return (gzstreambuf*)0; char fmode[10]; char* fmodeptr = fmode; if ( mode & std::ios::in) *fmodeptr++ = 'r'; else if ( mode & std::ios::out) *fmodeptr++ = 'w'; *fmodeptr++ = 'b'; *fmodeptr = '\0'; file = gzopen( name, fmode); if (file == 0) return (gzstreambuf*)0; opened = 1; return this; } gzstreambuf * gzstreambuf::close() { if ( is_open()) { sync(); opened = 0; if ( gzclose( file) == Z_OK) return this; } return (gzstreambuf*)0; } int gzstreambuf::underflow() { // used for input buffer only if ( gptr() && ( gptr() < egptr())) return * reinterpret_cast( gptr()); if ( ! (mode & std::ios::in) || ! 
opened) return EOF; // Josuttis' implementation of inbuf int n_putback = gptr() - eback(); if ( n_putback > 4) n_putback = 4; memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); int num = gzread( file, buffer+4, bufferSize-4); if (num <= 0) // ERROR or EOF return EOF; // reset buffer pointers setg( buffer + (4 - n_putback), // beginning of putback area buffer + 4, // read position buffer + 4 + num); // end of buffer // return next character return * reinterpret_cast( gptr()); } int gzstreambuf::flush_buffer() { // Separate the writing of the buffer from overflow() and // sync() operation. int w = pptr() - pbase(); if ( gzwrite( file, pbase(), w) != w) return EOF; pbump( -w); return w; } int gzstreambuf::overflow( int c) { // used for output buffer only if ( ! ( mode & std::ios::out) || ! opened) return EOF; if (c != EOF) { *pptr() = c; pbump(1); } if ( flush_buffer() == EOF) return EOF; return c; } int gzstreambuf::sync() { // Changed to use flush_buffer() instead of overflow( EOF) // which caused improper behavior with std::endl and flush(), // bug reported by Vincent Ricard. if ( pptr() && pptr() > pbase()) { if ( flush_buffer() == EOF) return -1; } return 0; } // -------------------------------------- // class gzstreambase: // -------------------------------------- gzstreambase::gzstreambase( const char* name, int mode) { init( &buf); open( name, mode); } gzstreambase::~gzstreambase() { buf.close(); } void gzstreambase::open( const char* name, int open_mode) { if ( ! buf.open( name, open_mode)) clear( rdstate() | std::ios::badbit); } void gzstreambase::close() { if ( buf.is_open()) if ( ! 
buf.close()) clear( rdstate() | std::ios::badbit); } #ifdef GZSTREAM_NAMESPACE } // namespace GZSTREAM_NAMESPACE #endif // ============================================================================ // EOF // filo-master/src/common/gzstream/gzstream.h000066400000000000000000000107731167737766200212560ustar00rootroot00000000000000// ============================================================================ // gzstream, C++ iostream classes wrapping the zlib compression library. // Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // ============================================================================ // // File : gzstream.h // Revision : $Revision: 1.5 $ // Revision_date : $Date: 2002/04/26 23:30:15 $ // Author(s) : Deepak Bandyopadhyay, Lutz Kettner // // Standard streambuf implementation following Nicolai Josuttis, "The // Standard C++ Library". 
// ============================================================================ #ifndef GZSTREAM_H #define GZSTREAM_H 1 // standard C++ with new header file names and std:: namespace #include #include #include #ifdef GZSTREAM_NAMESPACE namespace GZSTREAM_NAMESPACE { #endif // ---------------------------------------------------------------------------- // Internal classes to implement gzstream. See below for user classes. // ---------------------------------------------------------------------------- class gzstreambuf : public std::streambuf { private: static const int bufferSize = 47+256; // size of data buff // totals 512 bytes under g++ for igzstream at the end. gzFile file; // file handle for compressed file char buffer[bufferSize]; // data buffer char opened; // open/close state of stream int mode; // I/O mode int flush_buffer(); public: gzstreambuf() : opened(0) { setp( buffer, buffer + (bufferSize-1)); setg( buffer + 4, // beginning of putback area buffer + 4, // read position buffer + 4); // end position // ASSERT: both input & output capabilities will not be used together } int is_open() { return opened; } gzstreambuf* open( const char* name, int open_mode); gzstreambuf* close(); ~gzstreambuf() { close(); } virtual int overflow( int c = EOF); virtual int underflow(); virtual int sync(); }; class gzstreambase : virtual public std::ios { protected: gzstreambuf buf; public: gzstreambase() { init(&buf); } gzstreambase( const char* name, int open_mode); ~gzstreambase(); void open( const char* name, int open_mode); void close(); gzstreambuf* rdbuf() { return &buf; } }; // ---------------------------------------------------------------------------- // User classes. Use igzstream and ogzstream analogously to ifstream and // ofstream respectively. They read and write files based on the gz* // function interface of the zlib. Files are compatible with gzip compression. 
// ---------------------------------------------------------------------------- class igzstream : public gzstreambase, public std::istream { public: igzstream() : std::istream( &buf) {} igzstream( const char* name, int open_mode = std::ios::in) : gzstreambase( name, open_mode), std::istream( &buf) {} gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } void open( const char* name, int open_mode = std::ios::in) { gzstreambase::open( name, open_mode); } }; class ogzstream : public gzstreambase, public std::ostream { public: ogzstream() : std::ostream( &buf) {} ogzstream( const char* name, int mode = std::ios::out) : gzstreambase( name, mode), std::ostream( &buf) {} gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } void open( const char* name, int open_mode = std::ios::out) { gzstreambase::open( name, open_mode); } }; #ifdef GZSTREAM_NAMESPACE } // namespace GZSTREAM_NAMESPACE #endif #endif // GZSTREAM_H // ============================================================================ // EOF // filo-master/src/common/gzstream/gzstream.o000066400000000000000000000264001167737766200212570ustar00rootroot00000000000000  a@` __text__TEXT6 @g__mod_init_func__DATA8 x  __StaticInit__TEXT@ L __textcoal_nt__TEXT  __gcc_except_tab__TEXTl T__const__DATA (__cstring__TEXT (__bss__DATA`__eh_frame__TEXT XHpW h__constructor__TEXT`__destructor__TEXT`(Y"H P'6#UHH=UHH]LeHIHw H_()H@9t HID$(H$Ld$ÐUHSHxt'wtt HG(@0HG(u H[ÐUHSHHwtHP0ƃwH{@tHH[UHSHHt#HHuHHHxw H[ÐUHSH(HHHH0Hu1wu_xuTuOЃtEt ErHEHEtEwHEb@HuHHC@Ht ƃwHHHMH3 tH([ÐUHSHHHHuHHHxw H[UHH]LeLmH HHwHt}H;wsw}wtb+COHc)HcLl HH)LLcLH{@+L¸~,LkLcHcIHCA$ xtH]LeLmUHHG(HtH;G v ƒtUHAUATSHHIAHHHPHFHHHHGHGHGHG HG(HG0HG8H@HHCƃHCPHC0HC(H~HC8HCTHCHCHC HsHHHxDLHaILcHHCL$IHHI$I|$8LHHI$I|$8LH[A\A]UHH]LeLmLuH HIALLIDŽ$AƄ$AƄ$IDŽ$IDŽ$IDŽ$IDŽ$HHH@HHHHCHCHCHC HC(HC0HC8H{@HHCƃHCPHC0HC(H~HC8HCTHCHCHC HsLDLHpILkHHCLIHHIEI}8HHIEI}8HHI$LLH$Ld$Ll$Lt$UHH]LeLmH HHHHPHFHLgLQIHHCLIHHI$I|$8uHHI$I|$8ZHHCLIHHI$I|$8'HHI$I|$8H]LeLmLLIzUHH]LeLmH 
IHHH@HH_HQIHID$HIHHHH{8HHHH{8wHID$HIHHHH{8EHHHH{8I$HHI$H]LeLmI$HHI$LLIzUHH]LeLmH IHHH@HH_HNIHID$HIHHHH{8IHHHH{80HID$HBIHHHH{8I$HHI$LHHHH{8I$HHI$LH]LeLmUHu,u$H=HH=UHUHH]LeHIHH$HHHI$I|$8HHHI$I|$8H$Ld$UHH]LeHIHH$HHHI$I|$8HHHI$I|$8LH$Ld$$C$C'&4+3HO407HOA6=OV(46=OV<xx@12gzstreambase11gzstreambufzPLRx 47 4 4 <K  4I 4> 4? 4 46 < 4( <l  <t  <8  < < <- <0 $ N- E- M=A-L=6-E-M=A-L=-2A-{L=rA-bL=V-I2=-*3 3'-6-E-M=E-M=A-L=A-vL=j-]2TA-DL=8A-(L=-2-33)-6-A-L=A-yL=m-a2XA-FL==A-+L=-2-6-E-M=A-L=A-sL=g-[2K-=F-2@-L=33JD-6-A-L=6-A-L=-2-F-Y2R@-L=-uV-`X-G--R-S=U-8S=G--T--6W- C-F-1O-, P=B-N-A-L=6-A-L=-2]A-KL=D6-<A-*L=-2 H/JI0K;>8:?=<97xph-X3P3H4@58. ').1(^1  (^ ' \( *^ *^) \* ,^,^+ \, q$^q`$^`#\ \\$ 1&^1 &^ % \& ^^ \ ^^ \ h^hd \d (^($ \$  ^ \ ^ \ "^!| \|" H^HD \D ^  \  ^ \ ^ \ ` ^`\ \\ ( ^($ \$ QMw @ < `L U l g  y    C     (   ` ;  O. &  hs Hz^S x (8 C  * [  y `V\  { E    G    \(Bhz/p61[ N(I.Aq__ZNSt8ios_base4InitC1Ev___dso_handle___cxa_atexit__ZNSt8ios_base4InitD1Ev__ZN11gzstreambuf12flush_bufferEv_gzwrite__ZN11gzstreambuf8overflowEi__ZN11gzstreambuf5closeEv_gzclose__ZN12gzstreambase5closeEv__ZNSt9basic_iosIcSt11char_traitsIcEE5clearESt12_Ios_Iostate__ZN11gzstreambuf4openEPKci___stack_chk_guard_gzopen___stack_chk_fail__ZN12gzstreambase4openEPKci__ZN11gzstreambuf9underflowEv_memcpy_gzread__ZN11gzstreambuf4syncEv__ZN11gzstreambufD1Ev__ZTV11gzstreambuf__ZTVSt15basic_streambufIcSt11char_traitsIcEE__ZNSt6localeD1Ev__Unwind_Resume__ZN11gzstreambufD0Ev__ZdlPv__ZN12gzstreambaseC2EPKci__ZNSt6localeC1Ev__ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E__ZN12gzstreambaseC1EPKci__ZNSt8ios_baseC2Ev__ZTV12gzstreambase__ZTVSt9basic_iosIcSt11char_traitsIcEE__ZNSt8ios_baseD2Ev__ZN12gzstreambaseD2Ev__ZTv0_n24_N12gzstreambaseD1Ev__ZN12gzstreambaseD1Ev__ZTv0_n24_N12gzstreambaseD0Ev__ZN12gzstreambaseD0Ev__ZTI12gzstreambase__ZTT12gzstreambase__ZTI11gzstreambuf__ZNSt15basic_streambufIcSt11char_traitsIcEE5imbueERKSt6locale__ZNSt15basic_streambufIcSt11char_traitsIcEE6setbufEPcl__ZNSt15basic_strea
mbufIcSt11char_traitsIcEE7seekoffExSt12_Ios_SeekdirSt13_Ios_Openmode__ZNSt15basic_streambufIcSt11char_traitsIcEE7seekposESt4fposI11__mbstate_tESt13_Ios_Openmode__ZNSt15basic_streambufIcSt11char_traitsIcEE9showmanycEv__ZNSt15basic_streambufIcSt11char_traitsIcEE6xsgetnEPcl__ZNSt15basic_streambufIcSt11char_traitsIcEE5uflowEv__ZNSt15basic_streambufIcSt11char_traitsIcEE9pbackfailEi__ZNSt15basic_streambufIcSt11char_traitsIcEE6xsputnEPKcl__ZTS12gzstreambase__ZTVN10__cxxabiv121__vmi_class_type_infoE__ZTISt9basic_iosIcSt11char_traitsIcEE__ZTS11gzstreambuf__ZTVN10__cxxabiv120__si_class_type_infoE__ZTISt15basic_streambufIcSt11char_traitsIcEE___gxx_personality_v0__ZN11gzstreambuf12flush_bufferEv.eh__ZN11gzstreambuf8overflowEi.eh__ZN11gzstreambuf5closeEv.eh__ZN12gzstreambase5closeEv.eh__ZN11gzstreambuf4openEPKci.eh__ZN12gzstreambase4openEPKci.eh__ZN11gzstreambuf9underflowEv.eh__ZN11gzstreambuf4syncEv.eh__ZN11gzstreambufD1Ev.eh__ZN11gzstreambufD0Ev.eh__ZN12gzstreambaseC2EPKci.eh__ZN12gzstreambaseC1EPKci.eh__ZN12gzstreambaseD2Ev.eh__ZN12gzstreambaseD1Ev.eh__ZN12gzstreambaseD0Ev.eh__GLOBAL__I__ZN11gzstreambuf4openEPKci__Z41__static_initialization_and_destruction_0ii__ZStL8__ioinit___tcf_0GCC_except_table0GCC_except_table1GCC_except_table2GCC_except_table3GCC_except_table4GCC_except_table5GCC_except_table6EH_frame1__Z41__static_initialization_and_destruction_0ii.eh__GLOBAL__I__ZN11gzstreambuf4openEPKci.eh___tcf_0.ehfilo-master/src/common/gzstream/test_gunzip.o000066400000000000000000000254041167737766200220010ustar00rootroot00000000000000p  __text__TEXT __mod_init_func__DATA x __StaticInit__TEXT L__textcoal_nt__TEXT<  __gcc_except_tab__TEXT 8t__cstring__TEXT__bss__DATA__const_coal__DATA0 __const_coal__TEXTX  __eh_frame__TEXTh h__constructor__TEXT__destructor__TEXTO# P"-UHH=UHH]LeLmLuL}H` IHHHU1҃t?L&H5HHLHHH5, H LLHHHIDŽ$AƄ$AƄ$IDŽ$IDŽ$IDŽ$IDŽ$H HAH HPHAHHHH(HDž0HDž8HDž@HDžHHDžPHDžXH{@HHH(ƅHCPHPHHH~HXHCTH0H8H@HsH HHxnIHHH(H('IH HHHGH@H HHHGH@HLhLH HIMHLx IEL<HDžHsHHxHHPH 
HPhHH@@HIFHIvH9IH H5HHHI$LLt ERROR: Opening file `' failed. ERROR: Closing file `ERROR: Reading file `ERROR: Writing file `@@h@xhhhh9igzstreamzPLRx 47 4 4 < < <   M- N= - "- - -s -b 6-S D=" 2- C= 6- D= 2- C= -- /-t A=T B=C 6-4 D= 2- C= 6- D= 2- C={ --o /-T A=4 B= 9- :=  9- :=8-9-:= 9- t;-i;-d:=] C8-!/-%-*-9-;- ;-;-:=b6-SD=2- C=2-C=--/-A=8-8-_.-O7-D0- B=>=oD=h5->;-9 /;-$;-:=6-D=)-==7-]&-Q=D=5-;- ;-;-:= t%-c;-^ T;-I;-D:==#"-6- D=)-=$-=7-U=I2-8C=%2-C=#-?=7-|?=u1-C==D=5-z;-uk;-`9-X:=Q7N= 4-F-1J-, K=3-I-6-D=w"-o6-^D=J)-"==MM6-D="-6-D=z)-R='=M M=<@hG`HX=@+8,0=EF<'(<x`XP0( a!^a P!^P L\L! !^!^ \  ^^\ ^\ `^`\\\ (^($\$ LMv'   $;Q\ r  # h- a u 4& W pjV A 0`  XG }I  O< im   &W@wW6|,@iKm)na(+o__ZNSt8ios_base4InitC1Ev___dso_handle___cxa_atexit__ZNSt8ios_base4InitD1Ev__ZTv0_n24_N9igzstreamD1Ev__ZN9igzstreamD1Ev__ZThn392_N9igzstreamD1Ev__ZTV9igzstream__ZTT9igzstream__ZN12gzstreambaseD2Ev__ZTVSt9basic_iosIcSt11char_traitsIcEE__ZNSt8ios_baseD2Ev__Unwind_Resume__ZTv0_n24_N9igzstreamD0Ev__ZN9igzstreamD0Ev__ZThn392_N9igzstreamD0Ev__ZdlPv_main___stack_chk_guard__ZSt4cerr__ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l__ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc__ZNSt8ios_baseC2Ev__ZTVSt15basic_streambufIcSt11char_traitsIcEE__ZNSt6localeC1Ev__ZTV11gzstreambuf__ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E__ZN11gzstreambuf5closeEv__ZNSt6localeD1Ev__ZN12gzstreambase4openEPKci__ZN12gzstreambase5closeEv__ZN12gzstreambaseC2EPKci__ZTTSt14basic_ofstreamIcSt11char_traitsIcEE__ZTVSt14basic_ofstreamIcSt11char_traitsIcEE__ZNSt13basic_filebufIcSt11char_traitsIcEEC1Ev__ZNSt13basic_filebufIcSt11char_traitsIcEE4openEPKcSt13_Ios_Openmode__ZNSt9basic_iosIcSt11char_traitsIcEE5clearESt12_Ios_Iostate__ZTVSt13basic_filebufIcSt11char_traitsIcEE__ZNSt13basic_filebufIcSt11char_traitsIcEE5closeEv__ZNSt12__basic_fileIcED1Ev__ZNSi3getERc___stack_chk_fail__ZTC9igzstream0_12gzstreambase__ZTC9igzstream392_Si__ZTI9igzstream__ZTI12gzstreambase__ZN12gzstream
baseD1Ev__ZN12gzstreambaseD0Ev__ZTv0_n24_N12gzstreambaseD1Ev__ZTv0_n24_N12gzstreambaseD0Ev__ZTISi__ZNSiD1Ev__ZNSiD0Ev__ZTv0_n24_NSiD1Ev__ZTv0_n24_NSiD0Ev__ZTVN10__cxxabiv121__vmi_class_type_infoE__ZTS9igzstream___gxx_personality_v0__ZN9igzstreamD1Ev.eh__ZN9igzstreamD0Ev.eh_main.eh__GLOBAL__I_main__Z41__static_initialization_and_destruction_0ii__ZStL8__ioinit___tcf_0GCC_except_table0GCC_except_table1LC0LC1LC2LC3LC4LC5LC6GCC_except_table2EH_frame1__Z41__static_initialization_and_destruction_0ii.eh__GLOBAL__I_main.eh___tcf_0.ehfilo-master/src/common/gzstream/test_gzip.o000066400000000000000000000254441167737766200214420ustar00rootroot00000000000000p  __text__TEXT __mod_init_func__DATA  __StaticInit__TEXT L__textcoal_nt__TEXTd  __gcc_except_tab__TEXT 8__cstring__TEXT/__bss__DATA__const_coal__DATAP __const_coal__TEXTx  __eh_frame__TEXT h__constructor__TEXT__destructor__TEXTO# P"-UHH=UHH]LeLmLuL}H` IHHHU1҃t?L&H5HHLHHH5X H0LLHHHIDŽ$AƄ$AƄ$IDŽ$IDŽ$IDŽ$IDŽ$H HAH0HPHAHHHH8HDž@HDžHHDžPHDžXHDž`HDžhH{@HHH8ƅHCPH`HXH~HhHCTH@HHHPHsH0HHxnIHHH8H8'IH0HHHGH@H0HHHGH@HLhLH0HIMHLx IEL<HsHHxHHPH0HPhHH@@HIFHIvH9IH0H5HHHI$LLt ERROR: Opening file `' failed. ERROR: Closing file `ERROR: Reading file `ERROR: Writing file `@@h@xpppp9ogzstreamzPLRx 47 4 4 < < <   M- N= - "- - - - 6- D=C 2-2 C=" 6- D= 2- C= -- /- A=j B=Y 6-J D= 2- C= 6- D= 2- C={ --o /-T A=4 B= 9- :=  9- :=8-9-:= 9- t;-i;-d:=] C%-78-/-*-9-;- ;-;-:=b6-SD=2-C=2-C=--/-A=8-v8-T.-D7-90-B=>=YD=R5-(;-# ;-;- :=6-D=)-==7-R&-F=D=5-;- ;-;-:= i%-X;-S I;->;-9:=2"-6-D=)-=$-=7-U=I2-8C=%2-C=#-?=7-|?=u1-C==D=5-z;-uk;-`9-X:=Q7N= 4-F-1J-, K=3-I-6-qD=b"-Z6-ID=5)-==MM6-D="-6-D=o)-K='=M M=<@hG`HX=@+8,0=EF<'(<x`XP0( a!^a P!^P L\L! 
!^!^ \  ^^\ ^\ `^`\\\ (^($\$ LMvO   /7Ndo  3# - a u 4D W j~ A P@ xG6 }q ) Od im   &W@wW6|,@iKm)na(+o__ZNSt8ios_base4InitC1Ev___dso_handle___cxa_atexit__ZNSt8ios_base4InitD1Ev__ZTv0_n24_N9ogzstreamD1Ev__ZN9ogzstreamD1Ev__ZThn392_N9ogzstreamD1Ev__ZTV9ogzstream__ZTT9ogzstream__ZN12gzstreambaseD2Ev__ZTVSt9basic_iosIcSt11char_traitsIcEE__ZNSt8ios_baseD2Ev__Unwind_Resume__ZTv0_n24_N9ogzstreamD0Ev__ZN9ogzstreamD0Ev__ZThn392_N9ogzstreamD0Ev__ZdlPv_main___stack_chk_guard__ZSt4cerr__ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l__ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc__ZNSt8ios_baseC2Ev__ZTVSt15basic_streambufIcSt11char_traitsIcEE__ZNSt6localeC1Ev__ZTV11gzstreambuf__ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E__ZN11gzstreambuf5closeEv__ZNSt6localeD1Ev__ZN12gzstreambase4openEPKci__ZN12gzstreambase5closeEv__ZN12gzstreambaseC2EPKci__ZTTSt14basic_ifstreamIcSt11char_traitsIcEE__ZTVSt14basic_ifstreamIcSt11char_traitsIcEE__ZNSt13basic_filebufIcSt11char_traitsIcEEC1Ev__ZNSt13basic_filebufIcSt11char_traitsIcEE4openEPKcSt13_Ios_Openmode__ZNSt9basic_iosIcSt11char_traitsIcEE5clearESt12_Ios_Iostate__ZTVSt13basic_filebufIcSt11char_traitsIcEE__ZNSt13basic_filebufIcSt11char_traitsIcEE5closeEv__ZNSt12__basic_fileIcED1Ev__ZNSi3getERc___stack_chk_fail__ZTC9ogzstream0_12gzstreambase__ZTC9ogzstream392_So__ZTI9ogzstream__ZTI12gzstreambase__ZN12gzstreambaseD1Ev__ZN12gzstreambaseD0Ev__ZTv0_n24_N12gzstreambaseD1Ev__ZTv0_n24_N12gzstreambaseD0Ev__ZTISo__ZNSoD1Ev__ZNSoD0Ev__ZTv0_n24_NSoD1Ev__ZTv0_n24_NSoD0Ev__ZTVN10__cxxabiv121__vmi_class_type_infoE__ZTS9ogzstream___gxx_personality_v0__ZN9ogzstreamD1Ev.eh__ZN9ogzstreamD0Ev.eh_main.eh__GLOBAL__I_main__Z41__static_initialization_and_destruction_0ii__ZStL8__ioinit___tcf_0GCC_except_table0GCC_except_table1LC0LC1LC2LC3LC4LC5LC6GCC_except_table2EH_frame1__Z41__static_initialization_and_destruction_0ii.eh__GLOBAL__I_main.eh___tcf_0.ehfilo-master/src/common/lineFileUtilities/000077500000000000
000000000000001167737766200210305ustar00rootroot00000000000000filo-master/src/common/lineFileUtilities/lineFileUtilities.h000066400000000000000000000032401167737766200246230ustar00rootroot00000000000000#ifndef LINEFILEUTILITIES_H #define LINEFILEUTILITIES_H #include #include #include #include using namespace std; // templated function to convert objects to strings template inline std::string ToString(const T & value) { std::stringstream ss; ss << value; return ss.str(); } inline void Tokenize(const string &str, vector &tokens, const string &delimiter = "\t") { // Skip delimiters at beginning. string::size_type lastPos = str.find_first_not_of(delimiter, 0); // Find first "non-delimiter". string::size_type pos = str.find_first_of(delimiter, lastPos); while (string::npos != pos || string::npos != lastPos) { // Found a token, add it to the vector. tokens.push_back(str.substr(lastPos, pos - lastPos)); // Skip delimiters. Note the "not_of" lastPos = str.find_first_not_of(delimiter, pos); // Find next "non-delimiter" pos = str.find_first_of(delimiter, lastPos); } } inline void Tokenize(const string &str, vector &tokens, const string &delimiter = "\t") { // Skip delimiters at beginning. string::size_type lastPos = str.find_first_not_of(delimiter, 0); // Find first "non-delimiter". string::size_type pos = str.find_first_of(delimiter, lastPos); while (string::npos != pos || string::npos != lastPos) { // Found a token, add it to the vector. tokens.push_back(atoi(str.substr(lastPos, pos - lastPos).c_str())); // Skip delimiters. 
Note the "not_of" lastPos = str.find_first_not_of(delimiter, pos); // Find next "non-delimiter" pos = str.find_first_of(delimiter, lastPos); } } #endif /* LINEFILEUTILITIES_H */ filo-master/src/common/tabFile/000077500000000000000000000000001167737766200167535ustar00rootroot00000000000000filo-master/src/common/tabFile/Makefile000066400000000000000000000015151167737766200204150ustar00rootroot00000000000000OBJ_DIR = ../../../obj/ BIN_DIR = ../../../bin/ UTILITIES_DIR = ../../common/ # ------------------- # define our includes # ------------------- INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= tabFile.cpp OBJECTS= $(SOURCES:.cpp=.o) _EXT_OBJECTS=lineFileUtilities.o gzstream.o fileType.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) $(BUILT_OBJECTS): $(SOURCES) @echo " * compiling" $(*F).cpp @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C -W $(INCLUDES) clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* .PHONY: cleanfilo-master/src/common/tabFile/tabFile.cpp000066400000000000000000000045721167737766200210350ustar00rootroot00000000000000/***************************************************************************** tabFile.cpp (c) 2009 - Aaron Quinlan Hall Laboratory Department of Biochemistry and Molecular Genetics University of Virginia aaronquinlan@gmail.com Licensed under the GNU General Public License 2.0 license. 
******************************************************************************/ #include "lineFileUtilities.h" #include "tabFile.h" /******************************************* Class methods *******************************************/ // Constructor TabFile::TabFile(const string &tabFile) : _tabFile(tabFile) {} // Destructor TabFile::~TabFile(void) { } void TabFile::Open(void) { if (_tabFile == "stdin") { _tabStream = &cin; } else { size_t foundPos; foundPos = _tabFile.find_last_of(".gz"); // is this a GZIPPED TAB file? if (foundPos == _tabFile.size() - 1) { igzstream tabs(_tabFile.c_str(), ios::in); if ( !tabs ) { cerr << "Error: The requested file (" << _tabFile << ") could not be opened. Exiting!" << endl; exit (1); } else { // if so, close it (this was just a test) tabs.close(); // now set a pointer to the stream so that we // can read the file later on. _tabStream = new igzstream(_tabFile.c_str(), ios::in); } } // not GZIPPED. else { ifstream tabs(_tabFile.c_str(), ios::in); // can we open the file? if ( !tabs ) { cerr << "Error: The requested file (" << _tabFile << ") could not be opened. Exiting!" << endl; exit (1); } else { // if so, close it (this was just a test) tabs.close(); // now set a pointer to the stream so that we // can read the file later on. _tabStream = new ifstream(_tabFile.c_str(), ios::in); } } } } // Close the TAB file void TabFile::Close(void) { if (_tabFile != "stdin") delete _tabStream; } TabLineStatus TabFile::GetNextTabLine(TAB_FIELDS &tabFields, int &lineNum) { // make sure there are still lines to process. // if so, tokenize, return the TAB_FIELDS. if (_tabStream->good() == true) { string tabLine; tabFields.reserve(20); // parse the tabStream pointer getline(*_tabStream, tabLine); lineNum++; // split into a string vector. 
// Status of one line read from a TAB file.
enum TabLineStatus {
    TAB_INVALID = -1,   // nothing could be read (GetNextTabLine's default result)
    TAB_HEADER  = 0,    // first field starts with '#'
    TAB_BLANK   = 1,    // the line produced no fields
    TAB_VALID   = 2     // ordinary data line
};

// The tokenized fields of a single line.
typedef std::vector<std::string> TAB_FIELDS;

//************************************************
// TabFile Class methods and elements
//************************************************
class TabFile {

public:

    // Constructor: remembers the file name ("stdin" selects standard input).
    TabFile(const std::string &tabFile);

    // Destructor
    ~TabFile(void);

    // Open a TAB file for reading (creates an istream pointer)
    void Open(void);

    // Close an opened TAB file.
    void Close(void);

    // Get the next TAB entry in an opened TAB file.
    // Fills `tab` with the line's fields and updates `lineNum`.
    TabLineStatus GetNextTabLine(TAB_FIELDS &tab, int &lineNum);

private:

    // data
    std::istream *_tabStream;   // stream being read
    std::string   _tabFile;     // name given to the constructor

    // Classify an already tokenized line.
    //   - no fields                    -> TAB_BLANK
    //   - first field starts with '#'  -> TAB_HEADER (lineNum is decremented,
    //                                     so header lines are not counted)
    //   - anything else                -> TAB_VALID
    // Every path returns from one of these three cases; the old trailing
    // "return TAB_INVALID" could never be reached and has been removed.
    inline TabLineStatus parseTabLine(const std::vector<std::string> &lineVector, int &lineNum) {
        if (lineVector.empty())
            return TAB_BLANK;

        // An empty first field cannot be a comment marker.
        const std::string &firstField = lineVector[0];
        if (firstField.empty() || firstField[0] != '#')
            return TAB_VALID;

        lineNum--;
        return TAB_HEADER;
    }
};
// Comparators for (occurrence count, value) pairs, used to order the
// frequency lists produced by the "freqdesc" / "freqasc" operations.
//
// std::sort does not preserve the relative order of equivalent elements, so
// comparing the count alone left the order of equally frequent values
// unspecified.  Ties are therefore broken on the value (ascending) to make
// the output deterministic.

// Highest count first; equal counts ordered by ascending value.
struct ValueGreaterThan {
    bool operator()(const std::pair<int, std::string> &lhs,
                    const std::pair<int, std::string> &rhs) const {
        if (lhs.first != rhs.first)
            return lhs.first > rhs.first;
        return lhs.second < rhs.second;
    }
};

// Lowest count first; equal counts ordered by ascending value.
struct ValueLessThan {
    bool operator()(const std::pair<int, std::string> &lhs,
                    const std::pair<int, std::string> &rhs) const {
        if (lhs.first != rhs.first)
            return lhs.first < rhs.first;
        return lhs.second < rhs.second;
    }
};
const vector &ops, const bool PrintFullInputLine, const bool InputHaveHeaderLine); void ReportSummary(const vector &group, const vector > &data, const vector &ops); void addValue (const vector &fromList, vector &toList, int index, int lineNum, const bool ignoreCase); float ToFloat (string element); double ToDouble(const string &element); void TabPrintPost (string element); void TabPrintPre (string element); void CommaPrint (string element); int main(int argc, char* argv[]) { // input files string inFile = "stdin"; string groupColumnsString = "1,2,3"; string opsColumnString; string opsString; // our configuration variables bool showHelp = false; bool haveOpColumns = false; bool haveOps = true; bool printOriginalLine = false; bool printHeaderLine = false; bool InputHaveHeaderLine = false; bool ignoreCase = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); if((PARAMETER_CHECK("-h", 2, parameterLength)) || (PARAMETER_CHECK("--help", 5, parameterLength))) { showHelp = true; } } if(showHelp) ShowHelp(); // do some parsing (all of these parameters require 2 strings) for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); if(PARAMETER_CHECK("-i", 2, parameterLength)) { if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { cerr << endl << "*****ERROR: -i parameter requires a value." << endl << endl; ShowHelp(); break; } else { inFile = argv[i + 1]; i++; } } else if (PARAMETER_CHECK("-grp", 4, parameterLength) || PARAMETER_CHECK("-g", 2, parameterLength)) { if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { cerr << endl << "*****ERROR: -grp parameter requires a value." 
<< endl << endl; ShowHelp(); break; } else { groupColumnsString = argv[i + 1]; i++; } } else if(PARAMETER_CHECK("-opCols", 7, parameterLength) || PARAMETER_CHECK("-c", 2, parameterLength)) { if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl; ShowHelp(); break; } else { haveOpColumns = true; opsColumnString = argv[i + 1]; i++; } } else if(PARAMETER_CHECK("-ops", 4, parameterLength) || PARAMETER_CHECK("-o", 2, parameterLength)) { if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { cerr << endl << "*****ERROR: -ops parameter requires a value." << endl << endl; ShowHelp(); break; } else { haveOps = true; opsString = argv[i + 1]; i++; } } else if(PARAMETER_CHECK("-full", 5, parameterLength)) { printOriginalLine = true; } else if(PARAMETER_CHECK("-outheader", 10, parameterLength)) { printHeaderLine = true; } else if(PARAMETER_CHECK("-inheader", 9, parameterLength)) { InputHaveHeaderLine = true; } else if(PARAMETER_CHECK("-header", 7, parameterLength)) { InputHaveHeaderLine = true; printHeaderLine = true; } else if(PARAMETER_CHECK("-ignorecase", 11, parameterLength)) { ignoreCase = true; } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; } } if (!haveOpColumns) { cerr << endl << "*****" << endl << "*****ERROR: Need -opCols." << endl << "*****" << endl; showHelp = true; } // split the opsString into discrete operations and make sure they are all valid. 
vector ops; opsString.erase(remove_if(opsString.begin(),opsString.end(),::isspace),opsString.end()); Tokenize(opsString, ops, ","); for( size_t i = 0; i < ops.size(); i++ ) { if ((ops[i] != "sum") && (ops[i] != "max") && (ops[i] != "min") && (ops[i] != "mean") && (ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") && (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "distinct") && (ops[i] != "concat") && (ops[i] != "freqdesc") && (ops[i] != "freqasc")) { cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "\" *****" << endl; showHelp = true; } } if (!showHelp) { // Split the column string sent by the user into discrete column numbers // A comma separated string is expected. vector groupColumnsInt; Tokenize(groupColumnsString, groupColumnsInt, ","); vector opColumnsInt; Tokenize(opsColumnString, opColumnsInt, ","); // sanity check the group columns for(size_t i = 0; i < groupColumnsInt.size(); ++i) { int groupColumnInt = groupColumnsInt[i]; if (groupColumnInt < 1) { cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl; ShowHelp(); } } // sanity check the op columns for(size_t i = 0; i < opColumnsInt.size(); ++i) { int opColumnInt = opColumnsInt[i]; if (opColumnInt < 1) { cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl; ShowHelp(); } } // sanity check that there are equal number of opColumns and ops if (ops.size() != opColumnsInt.size()) { cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. 
" << endl << "*****" << endl; ShowHelp(); } GroupBy(inFile, groupColumnsInt, opColumnsInt, ops, printOriginalLine, printHeaderLine, InputHaveHeaderLine, ignoreCase); } else { ShowHelp(); } } void ShowHelp(void) { cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; cerr << "Authors: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; cerr << " Assaf Gordon" << endl; cerr << "Summary: Summarizes a dataset column based upon" << endl; cerr << "\t common column groupings. Akin to the SQL \"group by\" command." << endl << endl; cerr << "Usage:\t " << PROGRAM_NAME << " -i [FILE] -g [group_column(s)] -c [op_column(s)] -o [ops] " << endl; cerr << "\t " << "cat [FILE] | " << PROGRAM_NAME << " -g [group_column(s)] -c [op_column(s)] -o [ops] " << endl << endl; cerr << "Options: " << endl; cerr << "\t-i\t\t" << "Input file. Assumes \"stdin\" if omitted." << endl << endl; cerr << "\t-g -grp\t\t" << "Specify the columns (1-based) for the grouping." << endl; cerr << "\t\t\tThe columns must be comma separated." << endl; cerr << "\t\t\t- Default: 1,2,3" << endl << endl; cerr << "\t-c -opCols\t" << "Specify the column (1-based) that should be summarized." << endl; cerr << "\t\t\t- Required." << endl << endl; cerr << "\t-o -ops\t\t" << "Specify the operation that should be applied to opCol." << endl; cerr << "\t\t\tValid operations:" << endl; cerr << "\t\t\t sum, count, min, max," << endl; cerr << "\t\t\t mean, median, mode, antimode," << endl; cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl; cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl; cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl; cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl; cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl; cerr << "\t\t\t freqasc (i.e., print asc. 
list of values:freq)" << endl; cerr << "\t\t\t- Default: sum" << endl << endl; cerr << "\t-full\t\t" << "Print all columns from input file." << endl; cerr << "\t\t\tDefault: print only grouped columns." << endl << endl; cerr << "\t-inheader\t" << "Input file has a header line - the first line will be ignored." << endl << endl ; cerr << "\t-outheader\t" << "Print header line in the output, detailing the column names. " << endl; cerr << "\t\t\tIf the input file has headers (-inheader), the output file" << endl; cerr << "\t\t\twill use the input's column names." << endl; cerr << "\t\t\tIf the input file has no headers, the output file" << endl; cerr << "\t\t\twill use \"col_1\", \"col_2\", etc. as the column names." << endl << endl; cerr << "\t-header\t\t" << "same as '-inheader -outheader'" << endl << endl; cerr << "\t-ignorecase\t" << "Group values regardless of upper/lower case." << endl << endl; cerr << "Examples: " << endl; cerr << "\t$ cat ex1.out" << endl; cerr << "\tchr1 10 20 A chr1 15 25 B.1 1000 ATAT" << endl; cerr << "\tchr1 10 20 A chr1 25 35 B.2 10000 CGCG" << endl << endl; cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum" << endl; cerr << "\tchr1 10 20 A 11000" << endl << endl; cerr << "\t$ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max" << endl; cerr << "\tchr1 10 20 A 11000 10000" << endl << endl; cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl; cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl; cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl; cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl; cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat" << endl; cerr << "\tchr1 10 20 A ATATCGCG" << endl << endl; cerr << "Notes: " << endl; cerr << "\t(1) The input file/stream should be sorted/grouped by the -grp. columns" << endl; cerr << "\t(2) If -i is unspecified, input is assumed to come from stdin." 
<< endl << endl; // end the program here exit(1); } void GroupBy (const string &inFile, const vector &groupColumns, const vector &opColumns, const vector &ops, const bool printOriginalLine, const bool printHeaderLine, const bool InputHaveHeaderLine, const bool ignoreCase) { // current line number int lineNum = 0; // string representing current line string inLine; // vector of strings holding the tokenized current line vector inFields; vector inFieldsFirstLineInGroup; inFields.reserve(20); // keys for the current and previous group vector prevGroup(0); vector currGroup(0); // vector (one per column) of vector (one per value/column) of the opColumn values for the current group vector< vector > values; for( size_t i = 0; i < opColumns.size(); i++ ) { values.push_back( vector() ); } bool first_line = true; // check the status of the current line TabLineStatus tabLineStatus; // open a new tab file, loop through it line by line // and summarize the data for a given group when the group // fields change TabFile *_tab = new TabFile(inFile); _tab->Open(); while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) { if (tabLineStatus == TAB_VALID) { if (first_line) { first_line = false; if (printHeaderLine) PrintHeaderLine(inFields, groupColumns, opColumns, ops, printOriginalLine, InputHaveHeaderLine); if (InputHaveHeaderLine) { inFields.clear(); continue; // no need to process this line - it's the header } } if (inFieldsFirstLineInGroup.empty()) //first line in file? 
- save it inFieldsFirstLineInGroup = inFields; // build the group vector for the current line currGroup.clear(); vector::const_iterator gIt = groupColumns.begin(); vector::const_iterator gEnd = groupColumns.end(); for (; gIt != gEnd; ++gIt) addValue(inFields, currGroup, (*gIt-1), lineNum, ignoreCase); // there has been a group change if ((currGroup != prevGroup) && (prevGroup.size() > 0)) { // Summarize this group ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:prevGroup, values, ops); // reset and add the first value for the next group. values.clear(); for( size_t i = 0; i < opColumns.size(); i++ ) { values.push_back( vector() ); addValue(inFields, values[i], opColumns[i]-1, lineNum, ignoreCase); } inFieldsFirstLineInGroup = inFields; } // we're still dealing with the same group else { for( size_t i = 0; i < opColumns.size(); i++ ) addValue(inFields, values[i], opColumns[i]-1, lineNum, ignoreCase); } // reset for the next line prevGroup = currGroup; } inFields.clear(); } // report the last group ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:currGroup, values, ops); _tab->Close(); } void ReportSummary(const vector &group, const vector > &data, const vector &ops) { vector result; for( size_t i = 0; i < data.size(); i++ ) { string op = ops[i]; std::stringstream buffer; vector dataF; // are we doing a numeric conversion? if so, convert the strings to doubles. 
if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") || (op == "median") || (op == "stdev") || (op == "sstdev")) { transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble); } if (op == "sum") { // sum them up double total = accumulate(dataF.begin(), dataF.end(), 0.0); buffer << setprecision (PRECISION) << total; result.push_back(buffer.str()); } else if (op == "collapse") { string collapse; for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter if (j>0) collapse.append(","); collapse.append(data[i][j]); } result.push_back(collapse); } else if (op == "distinct") { string distinct; // get the current column's data vector col_data = data[i]; // remove duplicate entries from the vector // http://stackoverflow.com/questions/1041620/most-efficient-way-to-erase-duplicates-and-sort-a-c-vector sort( col_data.begin(), col_data.end() ); col_data.erase( unique( col_data.begin(), col_data.end() ), col_data.end() ); for( size_t j = 0; j < col_data.size(); j++ ) {//Ugly, but cannot use back_inserter if (j>0) distinct.append(","); distinct.append(col_data[j]); } result.push_back(distinct); } else if (op == "concat") { string concat; for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter concat.append(data[i][j]); } result.push_back(concat); } else if (op == "min") { buffer << setprecision (PRECISION) << *min_element( dataF.begin(), dataF.end() ); result.push_back(buffer.str()); } else if (op == "max") { buffer << setprecision (PRECISION) << *max_element( dataF.begin(), dataF.end() ); result.push_back(buffer.str()); } else if (op == "mean") { double total = accumulate(dataF.begin(), dataF.end(), 0.0); double mean = total / dataF.size(); buffer << setprecision (PRECISION) << mean; result.push_back(buffer.str()); } else if (op == "median") { double median = 0.0; sort(dataF.begin(), dataF.end()); int totalLines = dataF.size(); if ((totalLines % 2) > 0) { long mid; mid = totalLines / 2; median = 
dataF[mid]; } else { long midLow, midHigh; midLow = (totalLines / 2) - 1; midHigh = (totalLines / 2); median = (dataF[midLow] + dataF[midHigh]) / 2.0; } buffer << setprecision (PRECISION) << median; result.push_back(buffer.str()); } else if (op == "count") { buffer << setprecision (PRECISION) << data[i].size(); result.push_back(buffer.str()); } else if ((op == "mode") || (op == "antimode") || (op == "freqdesc") || (op == "freqasc")) { // compute the frequency of each unique value map freqs; vector::const_iterator dIt = data[i].begin(); vector::const_iterator dEnd = data[i].end(); for (; dIt != dEnd; ++dIt) { freqs[*dIt]++; } // grab the mode and the anti mode string mode, antiMode; int count = 0; int minCount = INT_MAX; for(map::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) { if (iter->second > count) { mode = iter->first; count = iter->second; } if (iter->second < minCount) { antiMode = iter->first; minCount = iter->second; } } // report if (op == "mode") { buffer << setprecision (PRECISION) << mode; result.push_back(buffer.str()); } else if (op == "antimode") { buffer << setprecision (PRECISION) << antiMode; result.push_back(buffer.str()); } else if (op == "freqdesc" || op == "freqasc") { // pair for the num times a values was // observed (1) and the value itself (2) pair freqPair; vector< pair > freqList; // create a list of pairs of all the observed values (second) // and their occurences (first) map::const_iterator mapIter = freqs.begin(); map::const_iterator mapEnd = freqs.end(); for(; mapIter != mapEnd; ++mapIter) freqList.push_back( make_pair(mapIter->second, mapIter->first) ); // sort the list of pairs in the requested order by the frequency // this will make the value that was observed least/most bubble to the top if (op == "freqdesc") sort(freqList.begin(), freqList.end(), ValueGreaterThan()); else if (op == "freqasc") sort(freqList.begin(), freqList.end(), ValueLessThan()); // record all of the values and their frequencies. 
vector< pair >::const_iterator iter = freqList.begin(); vector< pair >::const_iterator iterEnd = freqList.end(); for (; iter != iterEnd; ++iter) buffer << iter->second << ":" << iter->first << ","; result.push_back(buffer.str()); } } else if (op == "stdev" || op == "sstdev") { // get the mean double total = accumulate(dataF.begin(), dataF.end(), 0.0); double mean = total / dataF.size(); // get the variance double totalVariance = 0.0; vector::const_iterator dIt = dataF.begin(); vector::const_iterator dEnd = dataF.end(); for (; dIt != dEnd; ++dIt) { totalVariance += pow((*dIt - mean),2); } double variance = 0.0; if (op == "stdev") { variance = totalVariance / dataF.size(); } else if (op == "sstdev" && dataF.size() > 1) { variance = totalVariance / (dataF.size() - 1); } double stddev = sqrt(variance); // report buffer << setprecision (PRECISION) << stddev; result.push_back(buffer.str()); } } for_each(group.begin(), group.end(), TabPrintPost); cout << *result.begin(); for_each(++result.begin(), result.end(), TabPrintPre); cout << endl; //Gets rid of extraneous tab } void addValue (const vector &fromList, vector &toList, int index, int lineNum, const bool ignoreCase) { try { string s(fromList.at(index)); if(ignoreCase) transform(s.begin(),s.end(),s.begin(),::tolower); toList.push_back(s); } catch(std::out_of_range& e) { cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line " << lineNum << ". Exiting." 
<< endl << "*****" << endl; exit(1); } } float ToFloat (string element) { return atof(element.c_str()); } void TabPrintPost (string element) { cout << element << "\t"; } void TabPrintPre (string element) { cout << "\t" << element; } void CommaPrint (string element) { cout << element << ","; } double ToDouble(const string &element) { std::istringstream i(element); double x; if (!(i >> x)) { cerr << "Error: Could not properly convert string to numeric (\"" + element + "\")" << endl; exit(1); } return x; } inline string ColumnHeaderName(const vector &inFields, const size_t FieldIndex, bool InputHaveHeaderLine) { stringstream s; if (InputHaveHeaderLine) s << inFields[FieldIndex-1]; else s << "col_" << (FieldIndex); return s.str(); } void PrintHeaderLine(const vector &inFields, const vector &groupColumns, const vector &opColumns, const vector &ops, const bool PrintFullInputLine, const bool InputHaveHeaderLine) { vector header; //Header fields of input file if (PrintFullInputLine) { //All input columns for (size_t i=0;i #include #include #include #include #include #include // for getpid() #include "version.h" using namespace std; // define our program name #define PROGRAM_NAME "shuffle" // define our parameter checking macro #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen)) == 0) && (actualLen == paramLen) // function declarations void ShowHelp(void); int main(int argc, char* argv[]) { // our configuration variables bool showHelp = false; string inFile; bool isFile = false; bool useSeed = false; int seed = 0; for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); if(PARAMETER_CHECK("-h", 2, parameterLength) || PARAMETER_CHECK("--help", 5, parameterLength)) { showHelp = true; } } if(showHelp) ShowHelp(); // do some parsing (all of these parameters require 1 string---the are booleans) for(int i = 1; i < argc; i++) { string parameter = argv[i]; int parameterLength = (int)strlen(argv[i]); 
if(PARAMETER_CHECK("-s", 2, parameterLength)) { useSeed = true; seed = atoi(argv[i+1]); i++; showHelp = true; } else if((parameterLength > 0) && (parameter[0] != '-')) { isFile = true; inFile = parameter; } else { cout << "ERROR: Unrecognized parameter: " << argv[i] << endl; showHelp = true; } } //############################################### // Main processing. //############################################### string line; long totalLines = 0; vector linesVector; linesVector.reserve((int) 1E6); // allocate 1 mill lines of input. // 0. Are we dealing with a stream or a proper file? Default to a stream. istream *in = &cin; if (isFile) { ifstream *inF = new ifstream(inFile.c_str(), ios::in); // ensure that the file can be opened if ( !inF ) { cerr << "Error: The requested input file (" << inFile << ") could not be opened. Exiting!" << endl; exit (1); } else { in = inF; } } // 1. Read and store all the lines of the file. while (getline(*in,line)) { // skip if just whitespace. if(line.empty()) continue; // increment the count of lines processed totalLines++; // add the line to the vector and map linesVector.push_back(line); } // 2. Shuffle the input. // if no seed supplied, use time and the current priocess id if (!useSeed) srand((unsigned)time(0)+(unsigned)getpid()); else srand(seed); // now shuffle the input: random_shuffle (linesVector.begin(), linesVector.end()); // 3. 
Write the shuffled data to stdout for(vector::const_iterator iter = linesVector.begin(); iter != linesVector.end(); ++iter) { printf("%s\n", (*iter).c_str()); //cout << *iter << endl; } // exit nicely return 0; } // show the help void ShowHelp(void) { cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; cerr << "Summary: shuffles/randomizes a file or pipe" << endl << endl; cerr << "Usage:\t" << PROGRAM_NAME << " [OPTIONS] [FILE]" << endl; cerr << "\t" << PROGRAM_NAME << " [OPTIONS] < [FILE]" << endl; cerr << "\t" << "cat [FILE] | " << PROGRAM_NAME << " [OPTIONS]" << endl << endl; cerr << "Options: " << endl; cerr << "\t-s\t" << "Custom seed for randomizing the output." << endl; cerr << "\t\tUses time and process id by default." << endl << endl; cout << "\t-h\t" << "Shows this help text" << endl << endl; // end the program here exit(1); } filo-master/src/stats/000077500000000000000000000000001167737766200152535ustar00rootroot00000000000000filo-master/src/stats/Makefile000066400000000000000000000022611167737766200167140ustar00rootroot00000000000000UTILITIES_DIR = ../common/ OBJ_DIR = ../../obj/ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- INCLUDES = -I$(UTILITIES_DIR)/tabFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= stats.cpp OBJECTS= $(SOURCES:.cpp=.o) _EXT_OBJECTS=tabFile.o gzstream.o fileType.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= stats all: $(PROGRAM) .PHONY: all $(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) @echo " * linking $(PROGRAM)" @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) $(BUILT_OBJECTS): $(SOURCES) @echo " * compiling" $(*F).cpp 
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/tabFile/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* .PHONY: cleanfilo-master/src/stats/stats.cpp000077500000000000000000000262051167737766200171250ustar00rootroot00000000000000/***************************************************************************** stats.cpp (c) 2009, 2010, 2011 - Aaron Quinlan Center for Public Health Genomics University of Virginia aaronquinlan@gmail.com Licenced under the MIT license. ******************************************************************************/ #include #include #include #include #include #include #include #include #include // out_of_range exception #include #include #include "version.h" #include "lineFileUtilities.h" #include "tabFile.h" using namespace std; // define our program name #define PROGRAM_NAME "stats" // define our parameter checking macro #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen)) == 0) && (actualLen == paramLen) #define LOOKS_LIKE_A_PARAM(string) (strlen(string)>0 && string[0]=='-') // function declarations void ShowHelp(void); inline void addValue (const vector &fromList, double &to, int index, int lineNum); bool isLineNumeric(string); int main(int argc, char* argv[]) { // our configuration variables bool showHelp = false; bool isFile = false; string inFile = "stdin"; int column = 0; bool doAll = true; bool doMean = false; bool doGeoMean = false; bool doMode = false; bool doMedian = false; bool doMin = false; bool doMax = false; bool doVariance = false; bool doStdDev = false; //############################################### // Parse command line. 
//############################################### for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); if(PARAMETER_CHECK("-h", 2, parameterLength) || PARAMETER_CHECK("--help", 5, parameterLength)) { showHelp = true; } } if(showHelp) ShowHelp(); // do some parsing (all of these parameters require 1 string---the are booleans) for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); string parameter = argv[i]; if(PARAMETER_CHECK("-all", 4, parameterLength)) { doAll = true; } else if(PARAMETER_CHECK("-c", 2, parameterLength)) { if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { cerr << endl << "*****ERROR: -c parameter requires a value." << endl << endl; ShowHelp(); break; } else { column = atoi(argv[i + 1]) - 1; // user is requesting 1-based columns i++; } } else if(PARAMETER_CHECK("-mu", 3, parameterLength)) { doMean = true; doAll = false; } else if(PARAMETER_CHECK("-gm", 3, parameterLength)) { doGeoMean = true; doAll = false; } else if(PARAMETER_CHECK("-med", 4, parameterLength)) { doMedian = true; doAll = false; } else if(PARAMETER_CHECK("-mod", 4, parameterLength)) { doMode = true; doAll = false; } else if(PARAMETER_CHECK("-var", 4, parameterLength)) { doVariance = true; doAll = false; } else if(PARAMETER_CHECK("-std", 4, parameterLength)) { doStdDev = true; doAll = false; } else if(PARAMETER_CHECK("-min", 4, parameterLength)) { doMin = true; doAll = false; } else if(PARAMETER_CHECK("-max", 4, parameterLength)) { doMax = true; doAll = false; } else if((parameterLength > 0) && (parameter[0] != '-')) { isFile = true; inFile = parameter; } else { cout << "ERROR: Unrecognized parameter: " << argv[i] << endl; showHelp = true; exit(1); } } //############################################### // Main processing. 
//############################################### string line; double sumOfLines = 0.0; double natLogSumOfLines = 0.0; double mean = 0.0; double geoMean = 0.0; double median = 0.0; double mode = 0.0; double min, max; double variance; double stddev; long totalLines = 0; bool zeroFound = false; // vector of strings holding the tokenized current line vector inFields; inFields.reserve(20); double value; vector linesVector; map linesMap; // 0. Are we dealing with a stream or a proper file? Default to a stream. istream *in = &cin; if (isFile) { ifstream *inF = new ifstream(inFile.c_str(), ios::in); // ensure that the file can be opened if ( !inF ) { cerr << "Error: The requested input file (" << inFile << ") could not be opened. Exiting!" << endl; exit (1); } else { in = inF; } } // check the status of the current line TabLineStatus tabLineStatus; // open a new tab file, loop through it line by line // and summarize the data for a given group when the group // fields change int lineNum = 0; TabFile *_tab = new TabFile(inFile); _tab->Open(); while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) { if (tabLineStatus == TAB_VALID) { // grab the current value from the requested column addValue(inFields, value, column, lineNum); //value = atof(inFields[column].c_str()); // increment the count of lines processed totalLines++; // add to the total sum of the lines sumOfLines += value; // add to the sum of natural logs. used for geometric mean natLogSumOfLines += log(value); // flag if we found a zero. This prevents a geometric mean from being calculated. if (value == 0) zeroFound = true; // add the line to the vector and map linesVector.push_back(value); linesMap[value]++; } inFields.clear(); } _tab->Close(); // Report the results. 
cout << "Total lines:\t\t" << setprecision (15) << totalLines << endl; cout << "Sum of lines:\t\t" << setprecision (15) << sumOfLines << endl; if (doMean || doAll) { // compute the mean mean = sumOfLines / totalLines; cout << "Ari. Mean:\t\t" << setprecision (15) << mean << endl; } if (doGeoMean || doAll) { // compute the mean if (!zeroFound) { geoMean = exp(natLogSumOfLines / totalLines); cout << "Geo. Mean:\t\t" << setprecision (15) << geoMean << endl; } else { cout << "Geo. Mean:\t\t" << "undef (zero found in data)"<< endl; } } if (doMedian || doAll) { //sort the vector of values in ascending order. We will then pick the "middle element" sort(linesVector.begin(), linesVector.end()); if ((totalLines % 2) > 0) { long mid; mid = totalLines / 2; median = linesVector[mid]; } else { long midLow; long midHigh; midLow = (totalLines / 2) - 1; midHigh = (totalLines / 2); median = (linesVector[midLow] + linesVector[midHigh]) / 2.0; } cout << "Median:\t\t\t" << setprecision (15) << median << endl; } if (doMode || doAll) { // get the mode int count = 0; int minCount = INT_MAX; double antiMode = 0.0; for(map::const_iterator iter = linesMap.begin(); iter != linesMap.end(); ++iter) { if (iter->second > count) { mode = iter->first; count = iter->second; } if (iter->second < minCount) { antiMode = iter->first; minCount = iter->second; } } cout << "Mode:\t\t\t" << setprecision (15) << mode << " (N=" << count << ")" << endl; cout << "Anti-Mode:\t\t" << setprecision (15) << antiMode << " (N=" << minCount << ")" << endl; } if (doMin || doAll) { // get the min value min = *min_element(linesVector.begin(), linesVector.end()); cout << "Minimum:\t\t" << setprecision (15) << min << endl; } if (doMax || doAll) { // get the min value max = *max_element(linesVector.begin(), linesVector.end()); cout << "Maximum:\t\t" << setprecision (15) << max << endl; } if (doVariance || doStdDev || doAll) { double totalVariance = 0.0; for(vector::const_iterator iter = linesVector.begin(); iter != 
linesVector.end(); ++iter) { totalVariance += pow((*iter - mean),2); } variance = totalVariance / totalLines; cout << "Variance:\t\t" << setprecision (15) << variance << endl; } if (doStdDev || doAll) { stddev = sqrt(variance); cout << "StdDev:\t\t\t" << setprecision (15) << stddev << endl; } return 0; } // function to show a help menu void ShowHelp(void) { cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; cerr << "Summary: computes statistics on a _single column_ " << endl; cerr << "\tfrom a file or stream." << endl << endl; cerr << "Usage:\t" << PROGRAM_NAME << " [OPTIONS] [FILE]" << endl; cerr << "\t" << PROGRAM_NAME << " [OPTIONS] < [FILE]" << endl; cerr << "\t" << "cat [FILE] | " << PROGRAM_NAME << " [OPTIONS]" << endl << endl; cout << "Options:" << endl; cout << " -all Return all metrics below (default)." << endl; cout << " -mu Return the arithmetic mean (opt.)" << endl; cout << " -med Return the median (opt.)" << endl; cout << " -mod Return the mode (opt.)" << endl; cout << " -min Return the minimum value (opt.)" << endl; cout << " -max Return the maximum value (opt.)" << endl; cout << " -var Return the variance (opt.)" << endl; cout << " -std Return the standard deviation (opt.)" << endl << endl; cout << "Help:" << endl; cout << " -h Shows this help text" << endl; // end the program here exit(1); } inline void addValue (const vector &fromList, double &to, int index, int lineNum) { try { to = atof(fromList.at(index).c_str()); } catch(std::out_of_range& e) { cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line " << lineNum << ". Exiting." 
<< endl << "*****" << endl; exit(1); } } // test to see if a string is numeric inline bool isLineNumeric(string l) { // check to make sure there are no alphanumeric characters in the data for(size_t i = 0; i < strlen(l.c_str()); i++) { if (isalpha(l[i])) { return false; } } return true; }