externalsortinginjava-externalsortinginjava-0.2.3/.gitignore:

/target/

externalsortinginjava-externalsortinginjava-0.2.3/.travis.yml:

language: java
jdk:
  - oraclejdk8
install: true
branches:
  only:
    - master
script: mvn clean test jacoco:report
after_success:
  - mvn coveralls:report

externalsortinginjava-externalsortinginjava-0.2.3/LICENSE.txt:

This code is in the public domain. You can take it, modify it, and use it in your commercial projects without attribution. We encourage you, however, to acknowledge this package whenever possible and to contribute your bug fixes and reports.

externalsortinginjava-externalsortinginjava-0.2.3/README.md:

Externalsortinginjava
==========================================================
[![Build Status](https://travis-ci.org/lemire/externalsortinginjava.png)](https://travis-ci.org/lemire/externalsortinginjava)
[![][maven img]][maven]
[![][license img]][license]
[![docs-badge][]][docs]
[![Coverage Status](https://coveralls.io/repos/github/lemire/externalsortinginjava/badge.svg?branch=master)](https://coveralls.io/github/lemire/externalsortinginjava?branch=master)

External-memory sorting in Java: useful for sorting very large files with multiple cores and an external-memory algorithm.

The 0.1 versions of the library are compatible with Java 6 and above; versions 0.2 and above require at least Java 8.

Code sample
------------

```java
import com.google.code.externalsorting.ExternalSort;
//... inputfile: input file name
//... 
outputfile: output file name // next command sorts the lines from inputfile to outputfile ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(new File(inputfile)), new File(outputfile)); // you can also provide a custom string comparator, see API ``` API Documentation ----------------- http://www.javadoc.io/doc/com.google.code.externalsortinginjava/externalsortinginjava/ Maven dependency ----------------- You can download the jar files from the Maven central repository: http://repo1.maven.org/maven2/com/google/code/externalsortinginjava/externalsortinginjava/ You can also specify the dependency in the Maven "pom.xml" file: ```xml com.google.code.externalsortinginjava externalsortinginjava [0.1.9,) ``` How to build ----------------- - get the java jdk - Install Maven 2 - mvn install - builds jar (requires signing) - mvn test - runs tests [maven img]:https://maven-badges.herokuapp.com/maven-central/com.googlecode.javaewah/JavaEWAH/badge.svg [maven]:http://search.maven.org/#search%7Cga%7C1%7Cexternalsortinginjava [license]:LICENSE.txt [license img]:https://img.shields.io/badge/License-Apache%202-blue.svg [docs-badge]:https://img.shields.io/badge/API-docs-blue.svg?style=flat-square [docs]:http://www.javadoc.io/doc/com.google.code.externalsortinginjava/externalsortinginjava/ externalsortinginjava-externalsortinginjava-0.2.3/pom.xml000077500000000000000000000203021307071754200240770ustar00rootroot00000000000000 4.0.0 com.google.code.externalsortinginjava externalsortinginjava jar 0.2.3 externalsortinginjava http://github.com/lemire/externalsortinginjava/ Sometimes, you want to sort large file without first loading them into memory. The solution is to use External Sorting. You divide the files into small blocks, sort each block in RAM, and then merge the result. Many database engines and the Unix sort command support external sorting. But what if you want to avoid a database? Or what if you want to sort in a non-lexicographic order? Or maybe you just want a simple external sorting example? When we could not find such a simple program, we wrote one. UTF-8 1.8 1.8 1.8 GitHub Issue Tracking https://github.com/lemire/externalsortinginjava/issues org.sonatype.oss oss-parent 5 Public Domain http://creativecommons.org/licenses/publicdomain repo This code is in the public domain. You can take it, modify it, and use it in your commercial projects without attribution. We encourage you, however, to acknowledge this package whenever possible and to contribute your bug fixes and reports. 
junit junit 4.12 com.github.jbellis jamm 0.3.1 junit junit test com.github.jbellis jamm test maven-dependency-plugin copy-dependencies ${project.build.directory}/lib org.jacoco jacoco-maven-plugin 0.7.8 prepare-agent prepare-agent org.eluder.coveralls coveralls-maven-plugin 3.2.1 org.apache.maven.plugins maven-compiler-plugin 3.5.1 ${java.target.version} ${java.target.version} org.apache.maven.plugins maven-surefire-plugin 2.19.1 **/*Spec.* **/*Test.* **/*Benchmark.java -javaagent:${project.build.directory}/lib/jamm-0.3.1.jar org.apache.maven.plugins maven-jar-plugin 2.6 true com.google.code.externalsorting.ExternalSort maven-release-plugin 2.5.3 deploy org.apache.felix maven-bundle-plugin 2.3.7 true com.googlecode.javaewah.* * org.apache.maven.plugins maven-gpg-plugin 1.6 sign-artifacts verify sign org.apache.maven.plugins maven-javadoc-plugin 2.10.4 attach-javadocs jar org.apache.maven.plugins maven-source-plugin 3.0.1 attach-sources jar scm:git:git@github.com:lemire/externalsortinginjava.git scm:git:git@github.com:lemire/externalsortinginjava.git scm:git:git@github.com:lemire/externalsortinginjava.git externalsortinginjava-0.2.3 externalsortinginjava-externalsortinginjava-0.2.3/src/000077500000000000000000000000001307071754200233515ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/main/000077500000000000000000000000001307071754200242755ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/main/java/000077500000000000000000000000001307071754200252165ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/main/java/com/000077500000000000000000000000001307071754200257745ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/main/java/com/google/000077500000000000000000000000001307071754200272505ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/main/java/com/google/code/000077500000000000000000000000001307071754200301625ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/main/java/com/google/code/externalsorting/000077500000000000000000000000001307071754200334125ustar00rootroot00000000000000ExternalSort.java000066400000000000000000001225471307071754200366430ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/main/java/com/google/code/externalsortingpackage com.google.code.externalsorting; // filename: ExternalSort.java import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.util.stream.Collectors; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.Comparator; import java.util.List; import java.util.PriorityQueue; import java.util.zip.Deflater; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; /** * Goal: offer a generic external-memory sorting program in Java. * * It must be : - hackable (easy to adapt) - scalable to large files - sensibly * efficient. * * This software is in the public domain. 
* * Usage: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt * * You can change the default maximal number of temporary files with the -t * flag: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt * -t 3 * * For very large files, you might want to use an appropriate flag to allocate * more memory to the Java VM: java -Xms2G * com/google/code/externalsorting/ExternalSort somefile.txt out.txt * * By (in alphabetical order) Philippe Beaudoin, Eleftherios Chetzakis, Jon * Elsas, Christan Grant, Daniel Haran, Daniel Lemire, Sugumaran Harikrishnan, * Amit Jain, Thomas Mueller, Jerry Yang, First published: April 2010 originally posted at * http://lemire.me/blog/archives/2010/04/01/external-memory-sorting-in-java/ */ public class ExternalSort { private static void displayUsage() { System.out .println("java com.google.externalsorting.ExternalSort inputfile outputfile"); System.out.println("Flags are:"); System.out.println("-v or --verbose: verbose output"); System.out.println("-d or --distinct: prune duplicate lines"); System.out .println("-t or --maxtmpfiles (followed by an integer): specify an upper bound on the number of temporary files"); System.out .println("-c or --charset (followed by a charset code): specify the character set to use (for sorting)"); System.out .println("-z or --gzip: use compression for the temporary files"); System.out .println("-H or --header (followed by an integer): ignore the first few lines"); System.out .println("-s or --store (following by a path): where to store the temporary files"); System.out.println("-h or --help: display this message"); } /** * This method calls the garbage collector and then returns the free * memory. This avoids problems with applications where the GC hasn't * reclaimed memory and reports no available memory. * * @return available memory */ public static long estimateAvailableMemory() { System.gc(); // http://stackoverflow.com/questions/12807797/java-get-available-memory Runtime r = Runtime.getRuntime(); long allocatedMemory = r.totalMemory() - r.freeMemory(); long presFreeMemory = r.maxMemory() - allocatedMemory; return presFreeMemory; } /** * we divide the file into small blocks. If the blocks are too small, we * shall create too many temporary files. If they are too big, we shall * be using too much memory. * * @param sizeoffile how much data (in bytes) can we expect * @param maxtmpfiles how many temporary files can we create (e.g., 1024) * @param maxMemory Maximum memory to use (in bytes) * @return the estimate */ public static long estimateBestSizeOfBlocks(final long sizeoffile, final int maxtmpfiles, final long maxMemory) { // we don't want to open up much more than maxtmpfiles temporary // files, better run // out of memory first. long blocksize = sizeoffile / maxtmpfiles + (sizeoffile % maxtmpfiles == 0 ? 0 : 1); // on the other hand, we don't want to create many temporary // files // for naught. If blocksize is smaller than half the free // memory, grow it. 
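		// For example: with a 2 GiB input, up to 1024 temporary files and 1 GiB of
		// available memory, the division above yields 2 MiB blocks; since 2 MiB is
		// well below half the available memory (512 MiB), the block size is grown
		// to 512 MiB, so only about four temporary files are actually created.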
if (blocksize < maxMemory / 2) { blocksize = maxMemory / 2; } return blocksize; } /** * @param args command line argument * @throws IOException generic IO exception */ public static void main(final String[] args) throws IOException { boolean verbose = false; boolean distinct = false; int maxtmpfiles = DEFAULTMAXTEMPFILES; Charset cs = Charset.defaultCharset(); String inputfile = null, outputfile = null; File tempFileStore = null; boolean usegzip = false; boolean parallel = true; int headersize = 0; for (int param = 0; param < args.length; ++param) { if (args[param].equals("-v") || args[param].equals("--verbose")) { verbose = true; } else if ((args[param].equals("-h") || args[param] .equals("--help"))) { displayUsage(); return; } else if ((args[param].equals("-d") || args[param] .equals("--distinct"))) { distinct = true; } else if ((args[param].equals("-t") || args[param] .equals("--maxtmpfiles")) && args.length > param + 1) { param++; maxtmpfiles = Integer.parseInt(args[param]); if (maxtmpfiles < 0) { System.err .println("maxtmpfiles should be positive"); } } else if ((args[param].equals("-c") || args[param] .equals("--charset")) && args.length > param + 1) { param++; cs = Charset.forName(args[param]); } else if ((args[param].equals("-z") || args[param] .equals("--gzip"))) { usegzip = true; } else if ((args[param].equals("-H") || args[param] .equals("--header")) && args.length > param + 1) { param++; headersize = Integer.parseInt(args[param]); if (headersize < 0) { System.err .println("headersize should be positive"); } } else if ((args[param].equals("-s") || args[param] .equals("--store")) && args.length > param + 1) { param++; tempFileStore = new File(args[param]); } else { if (inputfile == null) { inputfile = args[param]; } else if (outputfile == null) { outputfile = args[param]; } else { System.out.println("Unparsed: " + args[param]); } } } if (outputfile == null) { System.out .println("please provide input and output file names"); displayUsage(); return; } Comparator comparator = defaultcomparator; List l = sortInBatch(new File(inputfile), comparator, maxtmpfiles, cs, tempFileStore, distinct, headersize, usegzip, parallel); if (verbose) { System.out .println("created " + l.size() + " tmp files"); } mergeSortedFiles(l, new File(outputfile), comparator, cs, distinct, false, usegzip); } /** * This merges several BinaryFileBuffer to an output writer. * * @param fbw A buffer where we write the data. * @param cmp A comparator object that tells us how to sort the * lines. * @param distinct Pass true if duplicate lines should be * discarded. * @param buffers * Where the data should be read. * @return The number of lines sorted. 
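	 * <p>Internally, the buffers are kept in a {@link PriorityQueue} ordered by
	 * each buffer's current head line: every iteration writes the globally
	 * smallest remaining line and re-inserts the buffer it came from unless that
	 * buffer is exhausted, i.e. a classic k-way merge.</p>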
* @throws IOException generic IO exception * */ public static int mergeSortedFiles(BufferedWriter fbw, final Comparator cmp, boolean distinct, List buffers) throws IOException { PriorityQueue pq = new PriorityQueue<>( 11, new Comparator() { @Override public int compare(BinaryFileBuffer i, BinaryFileBuffer j) { return cmp.compare(i.peek(), j.peek()); } }); for (BinaryFileBuffer bfb : buffers) { if (!bfb.empty()) { pq.add(bfb); } } int rowcounter = 0; try { if (!distinct) { while (pq.size() > 0) { BinaryFileBuffer bfb = pq.poll(); String r = bfb.pop(); fbw.write(r); fbw.newLine(); ++rowcounter; if (bfb.empty()) { bfb.fbr.close(); } else { pq.add(bfb); // add it back } } } else { String lastLine = null; if(pq.size() > 0) { BinaryFileBuffer bfb = pq.poll(); lastLine = bfb.pop(); fbw.write(lastLine); fbw.newLine(); ++rowcounter; if (bfb.empty()) { bfb.fbr.close(); } else { pq.add(bfb); // add it back } } while (pq.size() > 0) { BinaryFileBuffer bfb = pq.poll(); String r = bfb.pop(); // Skip duplicate lines if (cmp.compare(r, lastLine) != 0) { fbw.write(r); fbw.newLine(); lastLine = r; } ++rowcounter; if (bfb.empty()) { bfb.fbr.close(); } else { pq.add(bfb); // add it back } } } } finally { fbw.close(); for (BinaryFileBuffer bfb : pq) { bfb.close(); } } return rowcounter; } /** * This merges a bunch of temporary flat files * * @param files The {@link List} of sorted {@link File}s to be merged. * @param outputfile The output {@link File} to merge the results to. * @return The number of lines sorted. * @throws IOException generic IO exception */ public static int mergeSortedFiles(List files, File outputfile) throws IOException { return mergeSortedFiles(files, outputfile, defaultcomparator, Charset.defaultCharset()); } /** * This merges a bunch of temporary flat files * * @param files The {@link List} of sorted {@link File}s to be merged. * @param outputfile The output {@link File} to merge the results to. * @param cmp The {@link Comparator} to use to compare * {@link String}s. * @return The number of lines sorted. * @throws IOException generic IO exception */ public static int mergeSortedFiles(List files, File outputfile, final Comparator cmp) throws IOException { return mergeSortedFiles(files, outputfile, cmp, Charset.defaultCharset()); } /** * This merges a bunch of temporary flat files * * @param files The {@link List} of sorted {@link File}s to be merged. * @param outputfile The output {@link File} to merge the results to. * @param cmp The {@link Comparator} to use to compare * {@link String}s. * @param distinct Pass true if duplicate lines should be * discarded. * @return The number of lines sorted. * @throws IOException generic IO exception */ public static int mergeSortedFiles(List files, File outputfile, final Comparator cmp, boolean distinct) throws IOException { return mergeSortedFiles(files, outputfile, cmp, Charset.defaultCharset(), distinct); } /** * This merges a bunch of temporary flat files * * @param files The {@link List} of sorted {@link File}s to be merged. * @param outputfile The output {@link File} to merge the results to. * @param cmp The {@link Comparator} to use to compare * {@link String}s. * @param cs The {@link Charset} to be used for the byte to * character conversion. * @return The number of lines sorted. 
* @throws IOException generic IO exception */ public static int mergeSortedFiles(List files, File outputfile, final Comparator cmp, Charset cs) throws IOException { return mergeSortedFiles(files, outputfile, cmp, cs, false); } /** * This merges a bunch of temporary flat files * * @param files The {@link List} of sorted {@link File}s to be merged. * @param distinct Pass true if duplicate lines should be * discarded. * @param outputfile The output {@link File} to merge the results to. * @param cmp The {@link Comparator} to use to compare * {@link String}s. * @param cs The {@link Charset} to be used for the byte to * character conversion. * @return The number of lines sorted. * @throws IOException generic IO exception * @since v0.1.2 */ public static int mergeSortedFiles(List files, File outputfile, final Comparator cmp, Charset cs, boolean distinct) throws IOException { return mergeSortedFiles(files, outputfile, cmp, cs, distinct, false, false); } /** * This merges a bunch of temporary flat files * * @param files The {@link List} of sorted {@link File}s to be merged. * @param distinct Pass true if duplicate lines should be * discarded. * @param outputfile The output {@link File} to merge the results to. * @param cmp The {@link Comparator} to use to compare * {@link String}s. * @param cs The {@link Charset} to be used for the byte to * character conversion. * @param append Pass true if result should append to * {@link File} instead of overwrite. Default to be false * for overloading methods. * @param usegzip assumes we used gzip compression for temporary files * @return The number of lines sorted. * @throws IOException generic IO exception * @since v0.1.4 */ public static int mergeSortedFiles(List files, File outputfile, final Comparator cmp, Charset cs, boolean distinct, boolean append, boolean usegzip) throws IOException { ArrayList bfbs = new ArrayList<>(); for (File f : files) { final int BUFFERSIZE = 2048; InputStream in = new FileInputStream(f); BufferedReader br; if (usegzip) { br = new BufferedReader( new InputStreamReader( new GZIPInputStream(in, BUFFERSIZE), cs)); } else { br = new BufferedReader(new InputStreamReader( in, cs)); } BinaryFileBuffer bfb = new BinaryFileBuffer(br); bfbs.add(bfb); } BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(outputfile, append), cs)); int rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs); for (File f : files) { f.delete(); } return rowcounter; } /** * This sorts a file (input) to an output file (output) using default * parameters * * @param input source file * * @param output output file * @throws IOException generic IO exception */ public static void sort(final File input, final File output) throws IOException { ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(input), output); } /** * Sort a list and save it to a temporary file * * @return the file containing the sorted data * @param tmplist data to be sorted * @param cmp string comparator * @param cs charset to use for output (can use * Charset.defaultCharset()) * @param tmpdirectory location of the temporary files (set to null for * default location) * @throws IOException generic IO exception */ public static File sortAndSave(List tmplist, Comparator cmp, Charset cs, File tmpdirectory) throws IOException { return sortAndSave(tmplist, cmp, cs, tmpdirectory, false, false, true); } /** * Sort a list and save it to a temporary file * * @return the file containing the sorted data * @param tmplist data to be sorted * @param cmp string comparator * 
@param cs charset to use for output (can use * Charset.defaultCharset()) * @param tmpdirectory location of the temporary files (set to null for * default location) * @param distinct Pass true if duplicate lines should be * discarded. * @param usegzip set to true if you are using gzip compression for the * temporary files * @param parallel set to true when sorting in parallel * @throws IOException generic IO exception */ public static File sortAndSave(List tmplist, Comparator cmp, Charset cs, File tmpdirectory, boolean distinct, boolean usegzip, boolean parallel) throws IOException { if (parallel) { tmplist = tmplist.parallelStream().sorted(cmp).collect(Collectors.toCollection(ArrayList::new)); } else { Collections.sort(tmplist, cmp); } File newtmpfile = File.createTempFile("sortInBatch", "flatfile", tmpdirectory); newtmpfile.deleteOnExit(); OutputStream out = new FileOutputStream(newtmpfile); int ZIPBUFFERSIZE = 2048; if (usegzip) { out = new GZIPOutputStream(out, ZIPBUFFERSIZE) { { this.def.setLevel(Deflater.BEST_SPEED); } }; } try (BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter( out, cs))) { if (!distinct) { for (String r : tmplist) { fbw.write(r); fbw.newLine(); } } else { String lastLine = null; Iterator i = tmplist.iterator(); if(i.hasNext()) { lastLine = i.next(); fbw.write(lastLine); fbw.newLine(); } while (i.hasNext()) { String r = i.next(); // Skip duplicate lines if (cmp.compare(r, lastLine) != 0) { fbw.write(r); fbw.newLine(); lastLine = r; } } } } return newtmpfile; } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. * * @param fbr data source * @param datalength estimated data volume (in bytes) * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(final BufferedReader fbr, final long datalength) throws IOException { return sortInBatch(fbr, datalength, defaultcomparator, DEFAULTMAXTEMPFILES, estimateAvailableMemory(), Charset.defaultCharset(), null, false, 0, false, true); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. * * @param fbr data source * @param datalength estimated data volume (in bytes) * @param cmp string comparator * @param distinct Pass true if duplicate lines should be * discarded. * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(final BufferedReader fbr, final long datalength, final Comparator cmp, final boolean distinct) throws IOException { return sortInBatch(fbr, datalength, cmp, DEFAULTMAXTEMPFILES, estimateAvailableMemory(), Charset.defaultCharset(), null, distinct, 0, false, true); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. * * @param fbr data source * @param datalength estimated data volume (in bytes) * @param cmp string comparator * @param maxtmpfiles maximal number of temporary files * @param maxMemory maximum amount of memory to use (in bytes) * @param cs character set to use (can use * Charset.defaultCharset()) * @param tmpdirectory location of the temporary files (set to null for * default location) * @param distinct Pass true if duplicate lines should be * discarded. 
* @param numHeader number of lines to preclude before sorting starts * @param usegzip use gzip compression for the temporary files * @param parallel sort in parallel * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(final BufferedReader fbr, final long datalength, final Comparator cmp, final int maxtmpfiles, long maxMemory, final Charset cs, final File tmpdirectory, final boolean distinct, final int numHeader, final boolean usegzip, final boolean parallel) throws IOException { List files = new ArrayList<>(); long blocksize = estimateBestSizeOfBlocks(datalength, maxtmpfiles, maxMemory);// in // bytes try { List tmplist = new ArrayList<>(); String line = ""; try { int counter = 0; while (line != null) { long currentblocksize = 0;// in bytes while ((currentblocksize < blocksize) && ((line = fbr.readLine()) != null)) { // as long as you have enough // memory if (counter < numHeader) { counter++; continue; } tmplist.add(line); currentblocksize += StringSizeEstimator .estimatedSizeOf(line); } files.add(sortAndSave(tmplist, cmp, cs, tmpdirectory, distinct, usegzip, parallel)); tmplist.clear(); } } catch (EOFException oef) { if (tmplist.size() > 0) { files.add(sortAndSave(tmplist, cmp, cs, tmpdirectory, distinct, usegzip, parallel)); tmplist.clear(); } } } finally { fbr.close(); } return files; } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. * * @param file some flat file * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file) throws IOException { return sortInBatch(file, defaultcomparator); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. * * @param file some flat file * @param cmp string comparator * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file, Comparator cmp) throws IOException { return sortInBatch(file, cmp, false); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. * * @param file some flat file * @param cmp string comparator * @param distinct Pass true if duplicate lines should be * discarded. * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file, Comparator cmp, boolean distinct) throws IOException { return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES, Charset.defaultCharset(), null, distinct); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. You can specify a bound on the number of temporary * files that will be created. * * @param file some flat file * @param cmp string comparator * @param tmpdirectory location of the temporary files (set to null for * default location) * @param distinct Pass true if duplicate lines should be * discarded. 
* @param numHeader number of lines to preclude before sorting starts * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file, Comparator cmp, File tmpdirectory, boolean distinct, int numHeader) throws IOException { return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES, Charset.defaultCharset(), tmpdirectory, distinct, numHeader); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. You can specify a bound on the number of temporary * files that will be created. * * @param file some flat file * @param cmp string comparator * @param maxtmpfiles maximal number of temporary files * @param cs character set to use (can use * Charset.defaultCharset()) * @param tmpdirectory location of the temporary files (set to null for * default location) * @param distinct Pass true if duplicate lines should be * discarded. * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file, Comparator cmp, int maxtmpfiles, Charset cs, File tmpdirectory, boolean distinct) throws IOException { return sortInBatch(file, cmp, maxtmpfiles, cs, tmpdirectory, distinct, 0); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. You can specify a bound on the number of temporary * files that will be created. * * @param file some flat file * @param cmp string comparator * @param cs character set to use (can use * Charset.defaultCharset()) * @param tmpdirectory location of the temporary files (set to null for * default location) * @param distinct Pass true if duplicate lines should be * discarded. * @param numHeader number of lines to preclude before sorting starts * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file, Comparator cmp, Charset cs, File tmpdirectory, boolean distinct, int numHeader) throws IOException { BufferedReader fbr = new BufferedReader(new InputStreamReader( new FileInputStream(file), cs)); return sortInBatch(fbr, file.length(), cmp, DEFAULTMAXTEMPFILES, estimateAvailableMemory(), cs, tmpdirectory, distinct, numHeader, false, true); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. You can specify a bound on the number of temporary * files that will be created. * * @param file some flat file * @param cmp string comparator * @param maxtmpfiles maximal number of temporary files * @param cs character set to use (can use * Charset.defaultCharset()) * @param tmpdirectory location of the temporary files (set to null for * default location) * @param distinct Pass true if duplicate lines should be * discarded. 
* @param numHeader number of lines to preclude before sorting starts * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file, Comparator cmp, int maxtmpfiles, Charset cs, File tmpdirectory, boolean distinct, int numHeader) throws IOException { BufferedReader fbr = new BufferedReader(new InputStreamReader( new FileInputStream(file), cs)); return sortInBatch(fbr, file.length(), cmp, maxtmpfiles, estimateAvailableMemory(), cs, tmpdirectory, distinct, numHeader, false, true); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. You can specify a bound on the number of temporary * files that will be created. * * @param file some flat file * @param cmp string comparator * @param maxtmpfiles maximal number of temporary files * @param cs character set to use (can use * Charset.defaultCharset()) * @param tmpdirectory location of the temporary files (set to null for * default location) * @param distinct Pass true if duplicate lines should be * discarded. * @param numHeader number of lines to preclude before sorting starts * @param usegzip use gzip compression for the temporary files * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file, Comparator cmp, int maxtmpfiles, Charset cs, File tmpdirectory, boolean distinct, int numHeader, boolean usegzip) throws IOException { BufferedReader fbr = new BufferedReader(new InputStreamReader( new FileInputStream(file), cs)); return sortInBatch(fbr, file.length(), cmp, maxtmpfiles, estimateAvailableMemory(), cs, tmpdirectory, distinct, numHeader, usegzip, true); } /** * This will simply load the file by blocks of lines, then sort them * in-memory, and write the result to temporary files that have to be * merged later. You can specify a bound on the number of temporary * files that will be created. * * @param file some flat file * @param cmp string comparator * @param maxtmpfiles maximal number of temporary files * @param cs character set to use (can use * Charset.defaultCharset()) * @param tmpdirectory location of the temporary files (set to null for * default location) * @param distinct Pass true if duplicate lines should be * discarded. * @param numHeader number of lines to preclude before sorting starts * @param usegzip use gzip compression for the temporary files * @param parallel whether to sort in parallel * @return a list of temporary flat files * @throws IOException generic IO exception */ public static List sortInBatch(File file, Comparator cmp, int maxtmpfiles, Charset cs, File tmpdirectory, boolean distinct, int numHeader, boolean usegzip, boolean parallel) throws IOException { BufferedReader fbr = new BufferedReader(new InputStreamReader( new FileInputStream(file), cs)); return sortInBatch(fbr, file.length(), cmp, maxtmpfiles, estimateAvailableMemory(), cs, tmpdirectory, distinct, numHeader, usegzip, parallel); } /** * default comparator between strings. */ public static Comparator defaultcomparator = new Comparator() { @Override public int compare(String r1, String r2) { return r1.compareTo(r2); } }; /** * Default maximal number of temporary files allowed. */ public static final int DEFAULTMAXTEMPFILES = 1024; } /** * This is essentially a thin wrapper on top of a BufferedReader... which keeps * the last line in memory. 
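 * <p>Callers {@code peek()} at the current line without consuming it,
 * {@code pop()} to consume it and advance to the next line, and use
 * {@code empty()} to detect the end of the underlying reader.</p>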
* */ final class BinaryFileBuffer { public BinaryFileBuffer(BufferedReader r) throws IOException { this.fbr = r; reload(); } public void close() throws IOException { this.fbr.close(); } public boolean empty() { return this.cache == null; } public String peek() { return this.cache; } public String pop() throws IOException { String answer = peek().toString();// make a copy reload(); return answer; } private void reload() throws IOException { this.cache = this.fbr.readLine(); } public BufferedReader fbr; private String cache; } StringSizeEstimator.java000066400000000000000000000047371307071754200402020ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/main/java/com/google/code/externalsorting/** * */ package com.google.code.externalsorting; /** * Simple class used to estimate memory usage. * * @author Eleftherios Chetzakis * */ public final class StringSizeEstimator { private static int OBJ_HEADER; private static int ARR_HEADER; private static int INT_FIELDS = 12; private static int OBJ_REF; private static int OBJ_OVERHEAD; private static boolean IS_64_BIT_JVM; /** * Private constructor to prevent instantiation. */ private StringSizeEstimator() { } /** * Class initializations. */ static { // By default we assume 64 bit JVM // (defensive approach since we will get // larger estimations in case we are not sure) IS_64_BIT_JVM = true; // check the system property "sun.arch.data.model" // not very safe, as it might not work for all JVM implementations // nevertheless the worst thing that might happen is that the JVM is 32bit // but we assume its 64bit, so we will be counting a few extra bytes per string object // no harm done here since this is just an approximation. String arch = System.getProperty("sun.arch.data.model"); if (arch != null) { if (arch.contains("32")) { // If exists and is 32 bit then we assume a 32bit JVM IS_64_BIT_JVM = false; } } // The sizes below are a bit rough as we don't take into account // advanced JVM options such as compressed oops // however if our calculation is not accurate it'll be a bit over // so there is no danger of an out of memory error because of this. OBJ_HEADER = IS_64_BIT_JVM ? 16 : 8; ARR_HEADER = IS_64_BIT_JVM ? 24 : 12; OBJ_REF = IS_64_BIT_JVM ? 8 : 4; OBJ_OVERHEAD = OBJ_HEADER + INT_FIELDS + OBJ_REF + ARR_HEADER; } /** * Estimates the size of a {@link String} object in bytes. * * This function was designed with the following goals in mind (in order of importance) : * * First goal is speed: this function is called repeatedly and it should * execute in not much more than a nanosecond. * * Second goal is to never underestimate (as it would lead to memory shortage and a crash). * * Third goal is to never overestimate too much (say within a factor of two), as it would * mean that we are leaving much of the RAM underutilized. * * @param s The string to estimate memory footprint. * @return The estimated size in bytes. 
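 * <p>For example, on a 64-bit JVM the overhead constant works out to
 * OBJ_HEADER (16) + INT_FIELDS (12) + OBJ_REF (8) + ARR_HEADER (24) = 60 bytes,
 * so a 10-character string is estimated at 2 * 10 + 60 = 80 bytes.</p>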
*/ public static long estimatedSizeOf(String s) { return (s.length() * 2) + OBJ_OVERHEAD; } } externalsortinginjava-externalsortinginjava-0.2.3/src/test/000077500000000000000000000000001307071754200243305ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/test/java/000077500000000000000000000000001307071754200252515ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/test/java/com/000077500000000000000000000000001307071754200260275ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/test/java/com/google/000077500000000000000000000000001307071754200273035ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/test/java/com/google/code/000077500000000000000000000000001307071754200302155ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/test/java/com/google/code/externalsorting/000077500000000000000000000000001307071754200334455ustar00rootroot00000000000000ExternalSortTest.java000066400000000000000000000344411307071754200375310ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/test/java/com/google/code/externalsortingpackage com.google.code.externalsorting; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.List; import java.util.Scanner; import org.junit.After; import org.junit.Before; import org.junit.Test; import org.github.jamm.*; /** * Unit test for simple App. 
*/ @SuppressWarnings({"static-method","javadoc"}) public class ExternalSortTest { private static final String TEST_FILE1_TXT = "test-file-1.txt"; private static final String TEST_FILE2_TXT = "test-file-2.txt"; private static final String TEST_FILE1_CSV = "test-file-1.csv"; private static final String[] EXPECTED_SORT_RESULTS = { "a", "b", "b", "e", "f", "i", "m", "o", "u", "u", "x", "y", "z" }; private static final String[] EXPECTED_MERGE_RESULTS = {"a", "a", "b", "c", "c", "d", "e", "e", "f", "g", "g","h", "i", "j", "k"}; private static final String[] EXPECTED_MERGE_DISTINCT_RESULTS = {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}; private static final String[] EXPECTED_HEADER_RESULTS = {"HEADER, HEADER", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}; private static final String[] EXPECTED_DISTINCT_RESULTS = { "a", "b", "e", "f", "i", "m", "o", "u", "x", "y", "z" }; private static final String[] SAMPLE = { "f", "m", "b", "e", "i", "o", "u", "x", "a", "y", "z", "b", "u" }; private File file1; private File file2; private File csvFile; private List fileList; /** * @throws Exception */ @Before public void setUp() throws Exception { this.fileList = new ArrayList(3); this.file1 = new File(this.getClass().getClassLoader() .getResource(TEST_FILE1_TXT).toURI()); this.file2 = new File(this.getClass().getClassLoader() .getResource(TEST_FILE2_TXT).toURI()); this.csvFile = new File(this.getClass().getClassLoader() .getResource(TEST_FILE1_CSV).toURI()); File tmpFile1 = new File(this.file1.getPath().toString()+".tmp"); File tmpFile2 = new File(this.file2.getPath().toString()+".tmp"); copyFile(this.file1, tmpFile1); copyFile(this.file2, tmpFile2); this.fileList.add(tmpFile1); this.fileList.add(tmpFile2); } /** * @throws Exception */ @After public void tearDown() throws Exception { this.file1 = null; this.file2 = null; this.csvFile = null; for(File f:this.fileList) { f.delete(); } this.fileList.clear(); this.fileList = null; } private static void copyFile(File sourceFile, File destFile) throws IOException { if (!destFile.exists()) { destFile.createNewFile(); } try (FileInputStream fis = new FileInputStream(sourceFile); FileChannel source = fis.getChannel(); FileOutputStream fos = new FileOutputStream(destFile); FileChannel destination = fos.getChannel()) { destination.transferFrom(source, 0, source.size()); } } public static int estimateTotalSize(String[] mystrings) { int total = 0; for (String s : mystrings) { total += StringSizeEstimator.estimatedSizeOf(s); } return total; } public static void oneRoundOfStringSizeEstimation() { // could use JMH for better results but this should do final int N = 1024; String [] mystrings = new String[1024]; for(int k = 0; k < N ; ++k ) { mystrings[k] = Integer.toString(k); } final int repeat = 1000; long bef, aft, diff; long bestdiff = Long.MAX_VALUE; int bogus = 0; for(int t = 0 ; t < repeat; ++t ) { bef = System.nanoTime(); bogus += estimateTotalSize(mystrings); aft = System.nanoTime(); diff = aft - bef; if(diff < bestdiff) bestdiff = diff; } System.out.println("#ignore = "+bogus); System.out.println("[performance] String size estimator uses "+bestdiff * 1.0 / N + " ns per string"); } /** * This checks that the estimation is reasonably accurate. 
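	 * It compares the estimate against jamm's MemoryMeter: the estimate must
	 * never be below the measured deep size, and must stay within a factor of
	 * two of it.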
*/ @Test public void stringSizeEstimatorQuality() { MemoryMeter meter = new MemoryMeter().ignoreKnownSingletons().ignoreOuterClassReference().ignoreNonStrongReferences(); for(int k = 0; k < 100; ++k) { String s = new String(); while(s.length() < k) s += "-"; long myestimate = StringSizeEstimator.estimatedSizeOf(s); long jammestimate = meter.measureDeep(s); System.out.println("String of size "+k+" estimates are us: "+myestimate+ " bytes jamm: "+jammestimate+" bytes"); assertTrue(jammestimate <= myestimate); assertTrue(2 * jammestimate > myestimate); } System.out.println("All our string memory usage estimation are within a factor of two of jamm's and never lower."); } @Test public void stringSizeEstimator() { for(int k = 0; k < 10; ++k) { oneRoundOfStringSizeEstimation(); } } @Test public void displayTest() throws Exception { ExternalSort.main(new String[]{}); // check that it does not crash } @Test public void mainTest() throws Exception { ExternalSort.main(new String[]{"-h"}); // check that it does not crash ExternalSort.main(new String[]{""});// check that it does not crash ExternalSort.main(new String[]{"-v"}); // check that it does not crash File f1 = File.createTempFile("tmp", "unit"); File f2 = File.createTempFile("tmp", "unit"); f1.deleteOnExit(); f2.deleteOnExit(); writeStringToFile(f1, "oh"); ExternalSort.main(new String[]{"-v","-d","-t","5000","-c","ascii","-z","-H","1","-s",".",f1.toString(),f2.toString()}); } @Test public void testEmptyFiles() throws Exception { File f1 = File.createTempFile("tmp", "unit"); File f2 = File.createTempFile("tmp", "unit"); f1.deleteOnExit(); f2.deleteOnExit(); ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(f1),f2); if (f2.length() != 0) throw new RuntimeException("empty files should end up emtpy"); } @Test public void testMergeSortedFiles() throws Exception { String line; Comparator cmp = new Comparator() { @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }; File out = File.createTempFile("test_results", ".tmp", null); out.deleteOnExit(); ExternalSort.mergeSortedFiles(this.fileList, out, cmp, Charset.defaultCharset(), false); List result = new ArrayList<>(); try (BufferedReader bf = new BufferedReader(new FileReader(out))) { while ((line = bf.readLine()) != null) { result.add(line); } } assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_MERGE_RESULTS, result.toArray()); } @Test public void testMergeSortedFiles_Distinct() throws Exception { String line; Comparator cmp = new Comparator() { @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }; File out = File.createTempFile("test_results", ".tmp", null); out.deleteOnExit(); ExternalSort.mergeSortedFiles(this.fileList, out, cmp, Charset.defaultCharset(), true); List result = new ArrayList<>(); try (BufferedReader bf = new BufferedReader(new FileReader(out))) { while ((line = bf.readLine()) != null) { result.add(line); } } assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_MERGE_DISTINCT_RESULTS, result.toArray()); } @Test public void testMergeSortedFiles_Append() throws Exception { String line; Comparator cmp = new Comparator() { @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }; File out = File.createTempFile("test_results", ".tmp", null); out.deleteOnExit(); writeStringToFile(out, "HEADER, HEADER\n"); ExternalSort.mergeSortedFiles(this.fileList, out, cmp, Charset.defaultCharset(), true, true, false); List result = new ArrayList<>(); try (BufferedReader bf = new BufferedReader(new 
FileReader(out))) { while ((line = bf.readLine()) != null) { result.add(line); } } assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_HEADER_RESULTS, result.toArray()); } @Test public void testSortAndSave() throws Exception { File f; String line; List sample = Arrays.asList(SAMPLE); Comparator cmp = new Comparator() { @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }; f = ExternalSort.sortAndSave(sample, cmp, Charset.defaultCharset(), null, false, false, true); assertNotNull(f); assertTrue(f.exists()); assertTrue(f.length() > 0); List result = new ArrayList<>(); try (BufferedReader bf = new BufferedReader(new FileReader(f))) { while ((line = bf.readLine()) != null) { result.add(line); } } assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_SORT_RESULTS, result.toArray()); } @Test public void testSortAndSave_Distinct() throws Exception { File f; String line; BufferedReader bf; List sample = Arrays.asList(SAMPLE); Comparator cmp = new Comparator() { @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }; f = ExternalSort.sortAndSave(sample, cmp, Charset.defaultCharset(), null, true, false, true); assertNotNull(f); assertTrue(f.exists()); assertTrue(f.length() > 0); bf = new BufferedReader(new FileReader(f)); List result = new ArrayList<>(); while ((line = bf.readLine()) != null) { result.add(line); } bf.close(); assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_DISTINCT_RESULTS, result.toArray()); } @Test public void testSortInBatch() throws Exception { Comparator cmp = new Comparator() { @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }; List listOfFiles = ExternalSort.sortInBatch(this.csvFile, cmp, ExternalSort.DEFAULTMAXTEMPFILES, Charset.defaultCharset(), null, false, 1, false, true); assertEquals(1, listOfFiles.size()); ArrayList result = readLines(listOfFiles.get(0)); assertArrayEquals(Arrays.toString(result.toArray()),EXPECTED_MERGE_DISTINCT_RESULTS, result.toArray()); } /** * Sample case to sort csv file. * @throws Exception * */ @Test public void testCSVSorting() throws Exception { testCSVSortingWithParams(false); testCSVSortingWithParams(true); } /** * Sample case to sort csv file. * @param usegzip use compression for temporary files * @throws Exception * */ public void testCSVSortingWithParams(boolean usegzip) throws Exception { File out = File.createTempFile("test_results", ".tmp", null); out.deleteOnExit(); Comparator cmp = new Comparator() { @Override public int compare(String o1, String o2) { return o1.compareTo(o2); } }; String head; try ( // read header FileReader fr = new FileReader(this.csvFile)) { try (Scanner scan = new Scanner(fr)) { head = scan.nextLine(); } } // write to the file writeStringToFile(out, head+"\n"); // omit the first line, which is the header.. 
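		// Note: the numHeader argument of 1 below makes sortInBatch skip the CSV
		// header line while sorting, and passing append = true to mergeSortedFiles
		// afterwards preserves the header that was just written to the output file.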
List listOfFiles = ExternalSort.sortInBatch(this.csvFile, cmp, ExternalSort.DEFAULTMAXTEMPFILES, Charset.defaultCharset(), null, false, 1, usegzip, true); // now merge with append ExternalSort.mergeSortedFiles(listOfFiles, out, cmp, Charset.defaultCharset(), false, true, usegzip); ArrayList result = readLines(out); assertEquals(12, result.size()); assertArrayEquals(Arrays.toString(result.toArray()),EXPECTED_HEADER_RESULTS, result.toArray()); } public static ArrayList readLines(File f) throws IOException { ArrayList answer; try (BufferedReader r = new BufferedReader(new FileReader(f))) { answer = new ArrayList<>(); String line; while ((line = r.readLine()) != null) { answer.add(line); } } return answer; } public static void writeStringToFile(File f, String s) throws IOException { try (FileOutputStream out = new FileOutputStream(f)) { out.write(s.getBytes()); } } } externalsortinginjava-externalsortinginjava-0.2.3/src/test/resources/000077500000000000000000000000001307071754200263425ustar00rootroot00000000000000externalsortinginjava-externalsortinginjava-0.2.3/src/test/resources/test-file-1.csv000066400000000000000000000000451307071754200311100ustar00rootroot00000000000000HEADER, HEADER a b k c d i j e h f g externalsortinginjava-externalsortinginjava-0.2.3/src/test/resources/test-file-1.txt000066400000000000000000000000171307071754200311330ustar00rootroot00000000000000a b c d e f g hexternalsortinginjava-externalsortinginjava-0.2.3/src/test/resources/test-file-2.txt000066400000000000000000000000151307071754200311320ustar00rootroot00000000000000a c e g i j k
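For reference, the README notes that a custom string comparator can be supplied. The sketch below is illustrative only and is not part of the repository: the file names are hypothetical, but the sortInBatch and mergeSortedFiles overloads and the DEFAULTMAXTEMPFILES constant are the ones declared in ExternalSort above. It sorts a file case-insensitively while gzip-compressing the temporary files.

```java
import com.google.code.externalsorting.ExternalSort;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.List;

public class CaseInsensitiveSortExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical input/output paths, for illustration only.
        File input = new File("somefile.txt");
        File output = new File("out.txt");

        // Sort lines ignoring case instead of the default lexicographic order.
        Comparator<String> cmp = String.CASE_INSENSITIVE_ORDER;

        // Split the input into sorted, gzip-compressed temporary files...
        List<File> batches = ExternalSort.sortInBatch(input, cmp,
                ExternalSort.DEFAULTMAXTEMPFILES, StandardCharsets.UTF_8,
                null,   // default temporary-file directory
                false,  // keep duplicate lines
                0,      // no header lines to skip
                true);  // gzip the temporary files

        // ...then merge them into the final output file.
        ExternalSort.mergeSortedFiles(batches, output, cmp,
                StandardCharsets.UTF_8,
                false,  // keep duplicate lines
                false,  // overwrite rather than append
                true);  // temporary files were gzip-compressed
    }
}
```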