pax_global_header 0000666 0000000 0000000 00000000064 13313734070 0014513 g ustar 00root root 0000000 0000000 52 comment=38133cffc1bdd6702e3e8541147a55ebe77b418c
externalsortinginjava-externalsortinginjava-0.2.5/ 0000775 0000000 0000000 00000000000 13313734070 0022557 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/.gitignore 0000664 0000000 0000000 00000000010 13313734070 0024536 0 ustar 00root root 0000000 0000000 /target/ externalsortinginjava-externalsortinginjava-0.2.5/.travis.yml 0000664 0000000 0000000 00000000262 13313734070 0024670 0 ustar 00root root 0000000 0000000 language: java
jdk:
- oraclejdk9
- oraclejdk8
install: true
branches:
only:
- master
script: mvn clean test jacoco:report
after_success:
- mvn coveralls:report
externalsortinginjava-externalsortinginjava-0.2.5/LICENSE.txt 0000664 0000000 0000000 00000000362 13313734070 0024403 0 ustar 00root root 0000000 0000000 This code is in the public domain. You can take it, modify it, and use it in your commercial projects without attribution. We encourage you, however, to acknowledge this package whenever possible and to contribute your bug fixes and reports. externalsortinginjava-externalsortinginjava-0.2.5/README.md 0000664 0000000 0000000 00000004541 13313734070 0024042 0 ustar 00root root 0000000 0000000 Externalsortinginjava
==========================================================
[](https://travis-ci.org/lemire/externalsortinginjava)
[![][maven img]][maven]
[![][license img]][license]
[![docs-badge][]][docs]
[](https://coveralls.io/github/lemire/externalsortinginjava?branch=master)
External-Memory Sorting in Java: useful to sort very large files using multiple cores and an external-memory algorithm.
Versions 0.1.x of the library are compatible with Java 6 and above. Versions 0.2 and above
require at least Java 8.
This code is used in [Apache Jackrabbit Oak](https://github.com/apache/jackrabbit-oak).
Code sample
------------
```java
import com.google.code.externalsorting.ExternalSort;
//... inputfile: input file name
//... outputfile: output file name
// next command sorts the lines from inputfile to outputfile
ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(new File(inputfile)), new File(outputfile));
// you can also provide a custom string comparator, see API
```
API Documentation
-----------------
http://www.javadoc.io/doc/com.google.code.externalsortinginjava/externalsortinginjava/
Maven dependency
-----------------
You can download the jar files from the Maven central repository:
http://repo1.maven.org/maven2/com/google/code/externalsortinginjava/externalsortinginjava/
You can also specify the dependency in the Maven "pom.xml" file:
```xml
<dependency>
    <groupId>com.google.code.externalsortinginjava</groupId>
    <artifactId>externalsortinginjava</artifactId>
    <version>[0.1.9,)</version>
</dependency>
```
How to build
-----------------
- get the java jdk
- Install Maven 2
- mvn install - builds jar (requires signing)
- mvn test - runs tests
[maven img]:https://maven-badges.herokuapp.com/maven-central/com.google.code.externalsortinginjava/externalsortinginjava/badge.svg
[maven]:http://search.maven.org/#search%7Cga%7C1%7Cexternalsortinginjava
[license]:LICENSE.txt
[license img]:https://img.shields.io/badge/License-Apache%202-blue.svg
[docs-badge]:https://img.shields.io/badge/API-docs-blue.svg?style=flat-square
[docs]:http://www.javadoc.io/doc/com.google.code.externalsortinginjava/externalsortinginjava/
externalsortinginjava-externalsortinginjava-0.2.5/pom.xml 0000775 0000000 0000000 00000020302 13313734070 0024074 0 ustar 00root root 0000000 0000000 4.0.0com.google.code.externalsortinginjavaexternalsortinginjavajar0.2.5externalsortinginjavahttp://github.com/lemire/externalsortinginjava/Sometimes, you want to sort large file without first loading them into memory. The solution is to use External Sorting. You divide the files into small blocks, sort each block in RAM, and then merge the result.
Many database engines and the Unix sort command support external sorting. But what if you want to avoid a database? Or what if you want to sort in a non-lexicographic order? Or maybe you just want a simple external sorting example?
When we could not find such a simple program, we wrote one. UTF-81.81.81.8GitHub Issue Trackinghttps://github.com/lemire/externalsortinginjava/issuesorg.sonatype.ossoss-parent5Public Domainhttp://creativecommons.org/licenses/publicdomainrepoThis code is in the public domain. You can take it, modify it, and use it in your commercial projects without attribution. We encourage you, however, to acknowledge this package whenever possible and to contribute your bug fixes and reports. junitjunit4.12com.github.jbellisjamm0.3.1junitjunittestcom.github.jbellisjammtestmaven-dependency-plugincopy-dependencies${project.build.directory}/liborg.jacocojacoco-maven-plugin0.7.8prepare-agentprepare-agentorg.eluder.coverallscoveralls-maven-plugin3.2.1org.apache.maven.pluginsmaven-compiler-plugin3.5.1${java.target.version}${java.target.version}org.apache.maven.pluginsmaven-surefire-plugin2.19.1**/*Spec.***/*Test.***/*Benchmark.java-javaagent:${project.build.directory}/lib/jamm-0.3.1.jarorg.apache.maven.pluginsmaven-jar-plugin2.6truecom.google.code.externalsorting.ExternalSortmaven-release-plugin2.5.3deployorg.apache.felixmaven-bundle-plugin2.3.7truecom.googlecode.javaewah.**org.apache.maven.pluginsmaven-gpg-plugin1.6sign-artifactsverifysignorg.apache.maven.pluginsmaven-javadoc-plugin2.10.4attach-javadocsjarorg.apache.maven.pluginsmaven-source-plugin3.0.1attach-sourcesjarscm:git:git@github.com:lemire/externalsortinginjava.gitscm:git:git@github.com:lemire/externalsortinginjava.gitscm:git:git@github.com:lemire/externalsortinginjava.gitexternalsortinginjava-0.2.5
externalsortinginjava-externalsortinginjava-0.2.5/src/ 0000775 0000000 0000000 00000000000 13313734070 0023346 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/main/ 0000775 0000000 0000000 00000000000 13313734070 0024272 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/main/java/ 0000775 0000000 0000000 00000000000 13313734070 0025213 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/main/java/com/ 0000775 0000000 0000000 00000000000 13313734070 0025771 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/main/java/com/google/ 0000775 0000000 0000000 00000000000 13313734070 0027245 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/main/java/com/google/code/ 0000775 0000000 0000000 00000000000 13313734070 0030157 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/main/java/com/google/code/externalsorting/ 0000775 0000000 0000000 00000000000 13313734070 0033407 5 ustar 00root root 0000000 0000000 ExternalSort.java 0000664 0000000 0000000 00000124005 13313734070 0036627 0 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/main/java/com/google/code/externalsorting package com.google.code.externalsorting;
// filename: ExternalSort.java
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.stream.Collectors;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.zip.Deflater;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/**
* Goal: offer a generic external-memory sorting program in Java.
*
* It must be : - hackable (easy to adapt) - scalable to large files - sensibly
* efficient.
*
* This software is in the public domain.
*
* Usage: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt
*
* You can change the default maximal number of temporary files with the -t
* flag: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt
* -t 3
*
* For very large files, you might want to use an appropriate flag to allocate
* more memory to the Java VM: java -Xms2G
* com/google/code/externalsorting/ExternalSort somefile.txt out.txt
*
* By (in alphabetical order) Philippe Beaudoin, Eleftherios Chetzakis, Jon
* Elsas, Christan Grant, Daniel Haran, Daniel Lemire, Sugumaran Harikrishnan,
* Amit Jain, Thomas Mueller, Jerry Yang, First published: April 2010 originally posted at
* http://lemire.me/blog/archives/2010/04/01/external-memory-sorting-in-java/
*/
public class ExternalSort {
/**
 * Prints the command-line synopsis and the list of supported flags to
 * standard output.
 */
private static void displayUsage() {
    // The class lives in package com.google.code.externalsorting; the
    // synopsis must show that exact name so the command can be copied as is.
    System.out
            .println("java com.google.code.externalsorting.ExternalSort inputfile outputfile");
    System.out.println("Flags are:");
    System.out.println("-v or --verbose: verbose output");
    System.out.println("-d or --distinct: prune duplicate lines");
    System.out
            .println("-t or --maxtmpfiles (followed by an integer): specify an upper bound on the number of temporary files");
    System.out
            .println("-c or --charset (followed by a charset code): specify the character set to use (for sorting)");
    System.out
            .println("-z or --gzip: use compression for the temporary files");
    System.out
            .println("-H or --header (followed by an integer): ignore the first few lines");
    System.out
            .println("-s or --store (following by a path): where to store the temporary files");
    System.out.println("-h or --help: display this message");
}
/**
 * Estimates how many bytes the JVM can still allocate. A garbage collection
 * is requested first so that memory which is merely waiting to be reclaimed
 * is less likely to be counted as in use.
 *
 * @return the maximum heap size minus the memory currently in use
 */
public static long estimateAvailableMemory() {
    System.gc();
    // http://stackoverflow.com/questions/12807797/java-get-available-memory
    final Runtime runtime = Runtime.getRuntime();
    final long inUse = runtime.totalMemory() - runtime.freeMemory();
    return runtime.maxMemory() - inUse;
}
/**
 * Chooses the size (in bytes) of the blocks the input is cut into. Blocks
 * that are too small produce too many temporary files; blocks that are too
 * large use too much memory.
 *
 * @param sizeoffile how much data (in bytes) can we expect
 * @param maxtmpfiles how many temporary files can we create (e.g., 1024)
 * @param maxMemory Maximum memory to use (in bytes)
 * @return the estimate
 */
public static long estimateBestSizeOfBlocks(final long sizeoffile,
        final int maxtmpfiles, final long maxMemory) {
    // Spread the data over at most maxtmpfiles files, rounding up: we would
    // rather run out of memory than open many more files than requested.
    long perFile = sizeoffile / maxtmpfiles;
    if (sizeoffile % maxtmpfiles != 0) {
        perFile++;
    }
    // There is no point in creating many tiny files either: never go below
    // half of the memory budget.
    final long halfMemory = maxMemory / 2;
    return perFile < halfMemory ? halfMemory : perFile;
}
/**
 * Command-line entry point: parses the flags, sorts the input file in
 * batches and merges the batches into the output file.
 *
 * @param args command line argument
 * @throws IOException generic IO exception
 */
public static void main(final String[] args) throws IOException {
    boolean verbose = false;
    boolean distinct = false;
    int maxtmpfiles = DEFAULTMAXTEMPFILES;
    Charset cs = Charset.defaultCharset();
    String inputfile = null, outputfile = null;
    File tempFileStore = null;
    boolean usegzip = false;
    final boolean parallel = true;
    int headersize = 0;
    for (int param = 0; param < args.length; ++param) {
        final String arg = args[param];
        // Flags that take a value are only recognised when a value follows;
        // otherwise the token falls through and is treated as a file name.
        final boolean hasValue = args.length > param + 1;
        if (arg.equals("-v") || arg.equals("--verbose")) {
            verbose = true;
        } else if (arg.equals("-h") || arg.equals("--help")) {
            displayUsage();
            return;
        } else if (arg.equals("-d") || arg.equals("--distinct")) {
            distinct = true;
        } else if ((arg.equals("-t") || arg.equals("--maxtmpfiles"))
                && hasValue) {
            param++;
            maxtmpfiles = Integer.parseInt(args[param]);
            if (maxtmpfiles <= 0) {
                // Zero would cause a division by zero when the block size is
                // computed, and a negative bound is meaningless: stop here.
                System.err
                        .println("maxtmpfiles should be positive");
                return;
            }
        } else if ((arg.equals("-c") || arg.equals("--charset"))
                && hasValue) {
            param++;
            cs = Charset.forName(args[param]);
        } else if (arg.equals("-z") || arg.equals("--gzip")) {
            usegzip = true;
        } else if ((arg.equals("-H") || arg.equals("--header"))
                && hasValue) {
            param++;
            headersize = Integer.parseInt(args[param]);
            if (headersize < 0) {
                System.err
                        .println("headersize should be positive");
            }
        } else if ((arg.equals("-s") || arg.equals("--store"))
                && hasValue) {
            param++;
            tempFileStore = new File(args[param]);
        } else {
            // Positional arguments: first the input file, then the output.
            if (inputfile == null) {
                inputfile = arg;
            } else if (outputfile == null) {
                outputfile = arg;
            } else {
                System.out.println("Unparsed: "
                        + arg);
            }
        }
    }
    if (outputfile == null) {
        System.out
                .println("please provide input and output file names");
        displayUsage();
        return;
    }
    Comparator<String> comparator = defaultcomparator;
    List<File> l = sortInBatch(new File(inputfile), comparator,
            maxtmpfiles, cs, tempFileStore, distinct, headersize,
            usegzip, parallel);
    if (verbose) {
        System.out
                .println("created " + l.size() + " tmp files");
    }
    mergeSortedFiles(l, new File(outputfile), comparator, cs,
            distinct, false, usegzip);
}
/**
 * This merges several BinaryFileBuffer to an output writer. The writer and
 * every buffer in {@code buffers} are closed before this method returns,
 * whether it completes normally or throws.
 *
 * @param fbw A buffer where we write the data.
 * @param cmp A comparator object that tells us how to sort the
 *                lines.
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param buffers
 *                Where the data should be read.
 * @return The number of lines read from the buffers (in distinct mode this
 *         includes the duplicates that were not written).
 * @throws IOException generic IO exception
 *
 */
public static long mergeSortedFiles(BufferedWriter fbw,
        final Comparator<String> cmp, boolean distinct,
        List<BinaryFileBuffer> buffers) throws IOException {
    // Buffers are ordered by their current head line, so the queue always
    // yields the buffer holding the globally smallest remaining line.
    PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<>(11,
            (BinaryFileBuffer i, BinaryFileBuffer j) -> cmp.compare(
                    i.peek(), j.peek()));
    for (BinaryFileBuffer bfb : buffers) {
        if (!bfb.empty()) {
            pq.add(bfb);
        }
    }
    long rowcounter = 0;
    try {
        String lastLine = null;
        boolean wroteAny = false;
        while (!pq.isEmpty()) {
            BinaryFileBuffer bfb = pq.poll();
            String r = bfb.pop();
            // In distinct mode the first line is always written; later
            // lines are written only when they differ from the last one.
            if (!distinct || !wroteAny || cmp.compare(r, lastLine) != 0) {
                fbw.write(r);
                fbw.newLine();
                lastLine = r;
                wroteAny = true;
            }
            ++rowcounter;
            if (bfb.empty()) {
                // Release the file handle as soon as the source is drained.
                bfb.fbr.close();
            } else {
                pq.add(bfb); // add it back
            }
        }
    } finally {
        try {
            fbw.close();
        } finally {
            // Close every buffer, not only those still queued: buffers that
            // were empty from the start and the one being processed when an
            // exception occurred are not in the queue.
            // NOTE(review): relies on BinaryFileBuffer.close() being safe to
            // call after its reader was already closed - confirm.
            for (BinaryFileBuffer bfb : buffers) {
                bfb.close();
            }
        }
    }
    return rowcounter;
}
/**
 * This merges a bunch of temporary flat files using the default comparator
 * and the platform default charset.
 *
 * @param files The {@link List} of sorted {@link File}s to be merged.
 * @param outputfile The output {@link File} to merge the results to.
 * @return The number of lines sorted.
 * @throws IOException generic IO exception
 */
public static long mergeSortedFiles(List<File> files, File outputfile)
        throws IOException {
    return mergeSortedFiles(files, outputfile, defaultcomparator,
            Charset.defaultCharset());
}
/**
 * This merges a bunch of temporary flat files using the platform default
 * charset.
 *
 * @param files The {@link List} of sorted {@link File}s to be merged.
 * @param outputfile The output {@link File} to merge the results to.
 * @param cmp The {@link Comparator} to use to compare
 *                {@link String}s.
 * @return The number of lines sorted.
 * @throws IOException generic IO exception
 */
public static long mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp) throws IOException {
    return mergeSortedFiles(files, outputfile, cmp,
            Charset.defaultCharset());
}
/**
 * This merges a bunch of temporary flat files using the platform default
 * charset, optionally dropping duplicate lines.
 *
 * @param files The {@link List} of sorted {@link File}s to be merged.
 * @param outputfile The output {@link File} to merge the results to.
 * @param cmp The {@link Comparator} to use to compare
 *                {@link String}s.
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @return The number of lines sorted.
 * @throws IOException generic IO exception
 */
public static long mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp, boolean distinct)
        throws IOException {
    return mergeSortedFiles(files, outputfile, cmp,
            Charset.defaultCharset(), distinct);
}
/**
 * This merges a bunch of temporary flat files, keeping duplicate lines.
 *
 * @param files The {@link List} of sorted {@link File}s to be merged.
 * @param outputfile The output {@link File} to merge the results to.
 * @param cmp The {@link Comparator} to use to compare
 *                {@link String}s.
 * @param cs The {@link Charset} to be used for the byte to
 *                character conversion.
 * @return The number of lines sorted.
 * @throws IOException generic IO exception
 */
public static long mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp, Charset cs) throws IOException {
    return mergeSortedFiles(files, outputfile, cmp, cs, false);
}
/**
 * This merges a bunch of temporary flat files. The output file is
 * overwritten and the inputs are read as uncompressed text.
 *
 * @param files The {@link List} of sorted {@link File}s to be merged.
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param outputfile The output {@link File} to merge the results to.
 * @param cmp The {@link Comparator} to use to compare
 *                {@link String}s.
 * @param cs The {@link Charset} to be used for the byte to
 *                character conversion.
 * @return The number of lines sorted.
 * @throws IOException generic IO exception
 * @since v0.1.2
 */
public static long mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp, Charset cs, boolean distinct)
        throws IOException {
    return mergeSortedFiles(files, outputfile, cmp, cs, distinct,
            false, false);
}
/**
 * This merges a bunch of temporary flat files. After a successful merge the
 * input files are deleted.
 *
 * @param files The {@link List} of sorted {@link File}s to be merged.
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param outputfile The output {@link File} to merge the results to.
 * @param cmp The {@link Comparator} to use to compare
 *                {@link String}s.
 * @param cs The {@link Charset} to be used for the byte to
 *                character conversion.
 * @param append Pass true if result should append to
 *                {@link File} instead of overwrite. Default to be false
 *                for overloading methods.
 * @param usegzip assumes we used gzip compression for temporary files
 * @return The number of lines sorted.
 * @throws IOException generic IO exception
 * @since v0.1.4
 */
public static long mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp, Charset cs, boolean distinct,
        boolean append, boolean usegzip) throws IOException {
    final int gzipBufferSize = 2048;
    List<BinaryFileBuffer> bfbs = new ArrayList<>();
    // Raw streams opened so far, so they can be released if setting up the
    // merge fails part-way through (missing file, bad gzip header, ...).
    List<InputStream> opened = new ArrayList<>();
    boolean handedOff = false;
    long rowcounter;
    try {
        for (File f : files) {
            InputStream in = new FileInputStream(f);
            opened.add(in);
            InputStream source = in;
            if (usegzip) {
                source = new GZIPInputStream(in, gzipBufferSize);
            }
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(source, cs));
            bfbs.add(new BinaryFileBuffer(br));
        }
        BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(outputfile, append), cs));
        // From here on the merge routine closes the writer and the buffers.
        handedOff = true;
        rowcounter = mergeSortedFiles(fbw, cmp, distinct, bfbs);
    } finally {
        if (!handedOff) {
            for (InputStream in : opened) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Best effort: the original failure is already
                    // propagating and must not be masked.
                }
            }
        }
    }
    for (File f : files) {
        f.delete();
    }
    return rowcounter;
}
/**
 * Sorts the lines of one file into another file with default settings: the
 * input is first split into sorted batches, which are then merged.
 *
 * @param input source file
 *
 * @param output output file
 * @throws IOException generic IO exception
 */
public static void sort(final File input, final File output)
        throws IOException {
    final List<File> batches = ExternalSort.sortInBatch(input);
    ExternalSort.mergeSortedFiles(batches, output);
}
/**
 * This sorts a file (input) to an output file (output) using customized comparator
 *
 * @param input source file
 *
 * @param output output file
 *
 * @param cmp The {@link Comparator} to use to compare
 *                {@link String}s.
 * @throws IOException generic IO exception
 */
public static void sort(final File input, final File output, final Comparator<String> cmp)
        throws IOException {
    // The batches must be sorted with the same comparator that drives the
    // merge; otherwise the merge receives inputs that are not ordered
    // according to cmp and its output is not sorted.
    ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(input, cmp),
            output, cmp);
}
/**
 * Sort a list and save it to a temporary file. Duplicates are kept, the
 * file is not compressed and the sort runs in parallel.
 *
 * @return the file containing the sorted data
 * @param tmplist data to be sorted
 * @param cmp string comparator
 * @param cs charset to use for output (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory location of the temporary files (set to null for
 *                default location)
 * @throws IOException generic IO exception
 */
public static File sortAndSave(List<String> tmplist,
        Comparator<String> cmp, Charset cs, File tmpdirectory)
        throws IOException {
    return sortAndSave(tmplist, cmp, cs, tmpdirectory, false, false, true);
}
/**
 * Sort a list and save it to a temporary file, one element per line. The
 * file is registered for deletion when the JVM exits.
 *
 * @return the file containing the sorted data
 * @param tmplist data to be sorted; sorted in place when {@code parallel}
 *                is false, left untouched otherwise
 * @param cmp string comparator
 * @param cs charset to use for output (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory location of the temporary files (set to null for
 *                default location)
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param usegzip set to true if you are using gzip compression for the
 *                temporary files
 * @param parallel set to true when sorting in parallel
 * @throws IOException generic IO exception
 */
public static File sortAndSave(List<String> tmplist,
        Comparator<String> cmp, Charset cs, File tmpdirectory,
        boolean distinct, boolean usegzip, boolean parallel) throws IOException {
    if (parallel) {
        tmplist = tmplist.parallelStream().sorted(cmp)
                .collect(Collectors.toCollection(ArrayList::new));
    } else {
        Collections.sort(tmplist, cmp);
    }
    File newtmpfile = File.createTempFile("sortInBatch",
            "flatfile", tmpdirectory);
    newtmpfile.deleteOnExit();
    final int zipBufferSize = 2048;
    // The raw file stream is managed on its own so that it is released even
    // if wrapping it (e.g. writing the gzip header) fails.
    try (OutputStream fileOut = new FileOutputStream(newtmpfile)) {
        OutputStream out = fileOut;
        if (usegzip) {
            out = new GZIPOutputStream(fileOut, zipBufferSize) {
                {
                    // Temporary files are short-lived: favour speed.
                    this.def.setLevel(Deflater.BEST_SPEED);
                }
            };
        }
        try (BufferedWriter fbw = new BufferedWriter(
                new OutputStreamWriter(out, cs))) {
            String lastLine = null;
            boolean wroteAny = false;
            for (String r : tmplist) {
                // In distinct mode, skip a line that compares equal to the
                // line written just before it.
                if (!distinct || !wroteAny
                        || cmp.compare(r, lastLine) != 0) {
                    fbw.write(r);
                    fbw.newLine();
                    lastLine = r;
                    wroteAny = true;
                }
            }
        }
    }
    return newtmpfile;
}
/**
 * This will simply load the data by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. Uses the default comparator, temp-file bound and charset,
 * keeps duplicates, skips no header lines, writes uncompressed files and
 * sorts in parallel.
 *
 * @param fbr data source
 * @param datalength estimated data volume (in bytes)
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(final BufferedReader fbr,
        final long datalength) throws IOException {
    return sortInBatch(fbr, datalength, defaultcomparator,
            DEFAULTMAXTEMPFILES, estimateAvailableMemory(),
            Charset.defaultCharset(), null, false, 0, false, true);
}
/**
 * This will simply load the data by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. Uses the default temp-file bound and charset, skips no
 * header lines, writes uncompressed files and sorts in parallel.
 *
 * @param fbr data source
 * @param datalength estimated data volume (in bytes)
 * @param cmp string comparator
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(final BufferedReader fbr,
        final long datalength, final Comparator<String> cmp,
        final boolean distinct) throws IOException {
    return sortInBatch(fbr, datalength, cmp, DEFAULTMAXTEMPFILES,
            estimateAvailableMemory(), Charset.defaultCharset(),
            null, distinct, 0, false, true);
}
/**
 * This will simply load the data by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. The reader is always closed before this method returns.
 *
 * @param fbr data source
 * @param datalength estimated data volume (in bytes)
 * @param cmp string comparator
 * @param maxtmpfiles maximal number of temporary files
 * @param maxMemory maximum amount of memory to use (in bytes)
 * @param cs character set to use (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory location of the temporary files (set to null for
 *                default location)
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param numHeader number of leading lines to skip; they are not written
 *                to any temporary file
 * @param usegzip use gzip compression for the temporary files
 * @param parallel sort in parallel
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(final BufferedReader fbr,
        final long datalength, final Comparator<String> cmp,
        final int maxtmpfiles, long maxMemory, final Charset cs,
        final File tmpdirectory, final boolean distinct,
        final int numHeader, final boolean usegzip, final boolean parallel)
        throws IOException {
    List<File> files = new ArrayList<>();
    try {
        // Computed inside the try so that the reader is still closed if the
        // estimate fails (for example maxtmpfiles == 0).
        final long blocksize = estimateBestSizeOfBlocks(datalength,
                maxtmpfiles, maxMemory); // in bytes
        List<String> tmplist = new ArrayList<>();
        String line = "";
        try {
            int counter = 0;
            while (line != null) {
                long currentblocksize = 0; // in bytes
                // Fill one block: stop when it is large enough or when
                // the input is exhausted.
                while ((currentblocksize < blocksize)
                        && ((line = fbr.readLine()) != null)) {
                    if (counter < numHeader) {
                        counter++;
                        continue;
                    }
                    tmplist.add(line);
                    currentblocksize += StringSizeEstimator
                            .estimatedSizeOf(line);
                }
                // A batch file is written even when the block is empty.
                files.add(sortAndSave(tmplist, cmp, cs,
                        tmpdirectory, distinct, usegzip, parallel));
                tmplist.clear();
            }
        } catch (EOFException oef) {
            // The source ended abruptly: keep what was read so far.
            if (tmplist.size() > 0) {
                files.add(sortAndSave(tmplist, cmp, cs,
                        tmpdirectory, distinct, usegzip, parallel));
                tmplist.clear();
            }
        }
    } finally {
        fbr.close();
    }
    return files;
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. Lines are ordered with the default comparator.
 *
 * @param file some flat file
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(File file) throws IOException {
    return sortInBatch(file, defaultcomparator);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. Duplicate lines are kept.
 *
 * @param file some flat file
 * @param cmp string comparator
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp)
        throws IOException {
    return sortInBatch(file, cmp, false);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. Uses the default temp-file bound, the platform default
 * charset and the default temporary directory.
 *
 * @param file some flat file
 * @param cmp string comparator
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        boolean distinct) throws IOException {
    return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES,
            Charset.defaultCharset(), null, distinct);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. Uses the default temp-file bound and the platform default
 * charset.
 *
 * @param file some flat file
 * @param cmp string comparator
 * @param tmpdirectory location of the temporary files (set to null for
 *                default location)
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param numHeader number of lines to preclude before sorting starts
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        File tmpdirectory,
        boolean distinct, int numHeader)
        throws IOException {
    return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES,
            Charset.defaultCharset(), tmpdirectory, distinct,
            numHeader);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. You can specify a bound on the number of temporary
 * files that will be created. No header lines are skipped.
 *
 * @param file some flat file
 * @param cmp string comparator
 * @param maxtmpfiles maximal number of temporary files
 * @param cs character set to use (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory location of the temporary files (set to null for
 *                default location)
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        int maxtmpfiles, Charset cs, File tmpdirectory, boolean distinct)
        throws IOException {
    return sortInBatch(file, cmp, maxtmpfiles, cs, tmpdirectory,
            distinct, 0);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. Uses the default temp-file bound, writes uncompressed
 * temporary files and sorts in parallel.
 *
 * @param file some flat file
 * @param cmp string comparator
 * @param cs character set to use (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory location of the temporary files (set to null for
 *                default location)
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param numHeader number of lines to preclude before sorting starts
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        Charset cs, File tmpdirectory,
        boolean distinct, int numHeader)
        throws IOException {
    // Delegate to the most general File-based overload instead of
    // repeating the reader set-up here.
    return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES, cs, tmpdirectory,
            distinct, numHeader, false, true);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. You can specify a bound on the number of temporary
 * files that will be created. Temporary files are uncompressed and the
 * sort runs in parallel.
 *
 * @param file some flat file
 * @param cmp string comparator
 * @param maxtmpfiles maximal number of temporary files
 * @param cs character set to use (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory location of the temporary files (set to null for
 *                default location)
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param numHeader number of lines to preclude before sorting starts
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        int maxtmpfiles, Charset cs, File tmpdirectory,
        boolean distinct, int numHeader)
        throws IOException {
    // Delegate to the most general File-based overload instead of
    // repeating the reader set-up here.
    return sortInBatch(file, cmp, maxtmpfiles, cs, tmpdirectory,
            distinct, numHeader, false, true);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. You can specify a bound on the number of temporary
 * files that will be created. The sort runs in parallel.
 *
 * @param file some flat file
 * @param cmp string comparator
 * @param maxtmpfiles maximal number of temporary files
 * @param cs character set to use (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory location of the temporary files (set to null for
 *                default location)
 * @param distinct Pass true if duplicate lines should be
 *                discarded.
 * @param numHeader number of lines to preclude before sorting starts
 * @param usegzip use gzip compression for the temporary files
 * @return a list of temporary flat files
 * @throws IOException generic IO exception
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        int maxtmpfiles, Charset cs, File tmpdirectory,
        boolean distinct, int numHeader, boolean usegzip)
        throws IOException {
    // Delegate to the most general File-based overload instead of
    // repeating the reader set-up here.
    return sortInBatch(file, cmp, maxtmpfiles, cs, tmpdirectory,
            distinct, numHeader, usegzip, true);
}
/**
* This will simply load the file by blocks of lines, then sort them
* in-memory, and write the result to temporary files that have to be
* merged later. You can specify a bound on the number of temporary
* files that will be created.
*
* @param file some flat file
* @param cmp string comparator
* @param maxtmpfiles maximal number of temporary files
* @param cs character set to use (can use
* Charset.defaultCharset())
* @param tmpdirectory location of the temporary files (set to null for
* default location)
* @param distinct Pass true if duplicate lines should be
* discarded.
* @param numHeader number of leading (header) lines to skip before sorting starts
* @param usegzip use gzip compression for the temporary files
* @param parallel whether to sort in parallel
* @return a list of temporary flat files
* @throws IOException generic IO exception
*/
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        int maxtmpfiles, Charset cs, File tmpdirectory,
        boolean distinct, int numHeader, boolean usegzip, boolean parallel)
        throws IOException {
    // The reader-based overload is handed the file length as the data size
    // and the currently estimated free memory as its memory budget.
    // try-with-resources guarantees the file handle is released on every
    // path; BufferedReader.close() is idempotent, so this stays correct
    // even if the reader-based overload already closes the reader itself.
    try (BufferedReader fbr = new BufferedReader(new InputStreamReader(
            new FileInputStream(file), cs))) {
        return sortInBatch(fbr, file.length(), cmp, maxtmpfiles,
                estimateAvailableMemory(), cs, tmpdirectory, distinct,
                numHeader, usegzip, parallel);
    }
}
/**
* default comparator between strings.
*/
public static Comparator<String> defaultcomparator = new Comparator<String>() {
    // Natural (String.compareTo) ordering; neither argument may be null.
    @Override
    public int compare(String r1, String r2) {
        return r1.compareTo(r2);
    }
};
/**
 * Default maximal number of temporary files allowed.
 */
public static final int DEFAULTMAXTEMPFILES = 1024;
}
/**
* This is essentially a thin wrapper on top of a BufferedReader... which keeps
* the last line in memory.
*
*/
final class BinaryFileBuffer {
    /** Reader the lines are pulled from. */
    public BufferedReader fbr;
    /** Look-ahead line; null once the reader has no more lines. */
    private String cache;

    /**
     * Wraps the reader and immediately reads its first line into the
     * look-ahead slot.
     *
     * @param r reader to pull lines from
     * @throws IOException if the first line cannot be read
     */
    public BinaryFileBuffer(BufferedReader r) throws IOException {
        this.fbr = r;
        reload();
    }

    /** @return true when there is no look-ahead line left */
    public boolean empty() {
        return this.cache == null;
    }

    /** @return the look-ahead line without consuming it (null when empty) */
    public String peek() {
        return this.cache;
    }

    /**
     * Returns the look-ahead line and advances to the next one.
     *
     * @return the line that was at the head of the buffer
     * @throws IOException if the next line cannot be read
     * @throws NullPointerException if the buffer is already empty
     */
    public String pop() throws IOException {
        final String head = this.cache.toString();
        reload();
        return head;
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if closing fails
     */
    public void close() throws IOException {
        this.fbr.close();
    }

    /** Replaces the look-ahead line with the next line of the reader. */
    private void reload() throws IOException {
        this.cache = this.fbr.readLine();
    }
}
StringSizeEstimator.java 0000664 0000000 0000000 00000004737 13313734070 0040177 0 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/main/java/com/google/code/externalsorting /**
*
*/
package com.google.code.externalsorting;
/**
* Simple class used to estimate memory usage.
*
* @author Eleftherios Chetzakis
*
*/
public final class StringSizeEstimator {
    private static final int OBJ_HEADER;
    private static final int ARR_HEADER;
    private static final int INT_FIELDS = 12;
    private static final int OBJ_REF;
    private static final int OBJ_OVERHEAD;
    private static final boolean IS_64_BIT_JVM;

    /**
     * Private constructor to prevent instantiation.
     */
    private StringSizeEstimator() {
    }

    /**
     * Class initializations.
     */
    static {
        // By default we assume a 64 bit JVM (defensive approach: when we
        // are not sure we prefer the larger estimate). Only an explicit
        // "32" in the system property "sun.arch.data.model" switches to the
        // 32 bit sizes. The property is not available on every JVM
        // implementation; the worst case is that a 32 bit JVM is treated
        // as 64 bit and a few extra bytes are counted per string, which is
        // harmless for an approximation.
        String arch = System.getProperty("sun.arch.data.model");
        IS_64_BIT_JVM = arch == null || !arch.contains("32");
        // The sizes below are a bit rough as we don't take into account
        // advanced JVM options such as compressed oops; if the calculation
        // is not accurate it is meant to err on the high side.
        OBJ_HEADER = IS_64_BIT_JVM ? 16 : 8;
        ARR_HEADER = IS_64_BIT_JVM ? 24 : 12;
        OBJ_REF = IS_64_BIT_JVM ? 8 : 4;
        OBJ_OVERHEAD = OBJ_HEADER + INT_FIELDS + OBJ_REF + ARR_HEADER;
    }

    /**
     * Estimates the size of a {@link String} object in bytes.
     *
     * This function was designed with the following goals in mind (in order of importance) :
     *
     * First goal is speed: this function is called repeatedly and it should
     * execute in not much more than a nanosecond.
     *
     * Second goal is to never underestimate (as it would lead to memory shortage and a crash).
     *
     * Third goal is to never overestimate too much (say within a factor of two), as it would
     * mean that we are leaving much of the RAM underutilized.
     *
     * @param s The string to estimate memory footprint (must not be null).
     * @return The estimated size in bytes: two bytes per character plus a
     *         fixed per-object overhead.
     */
    public static long estimatedSizeOf(String s) {
        // Widen before multiplying: in int arithmetic length() * 2 wraps
        // to a negative value for strings longer than 2^30 characters.
        return ((long) s.length() * 2) + OBJ_OVERHEAD;
    }
}
externalsortinginjava-externalsortinginjava-0.2.5/src/test/ 0000775 0000000 0000000 00000000000 13313734070 0024325 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/test/java/ 0000775 0000000 0000000 00000000000 13313734070 0025246 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/test/java/com/ 0000775 0000000 0000000 00000000000 13313734070 0026024 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/test/java/com/google/ 0000775 0000000 0000000 00000000000 13313734070 0027300 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/test/java/com/google/code/ 0000775 0000000 0000000 00000000000 13313734070 0030212 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/test/java/com/google/code/externalsorting/ 0000775 0000000 0000000 00000000000 13313734070 0033442 5 ustar 00root root 0000000 0000000 ExternalSortTest.java 0000664 0000000 0000000 00000040535 13313734070 0037527 0 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/test/java/com/google/code/externalsorting package com.google.code.externalsorting;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Scanner;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.github.jamm.*;
/**
 * Unit tests for {@link ExternalSort} and {@link StringSizeEstimator}.
 */
@SuppressWarnings({"static-method","javadoc"})
public class ExternalSortTest {
// Names of the classpath resources used as test input.
private static final String TEST_FILE1_TXT = "test-file-1.txt";
private static final String TEST_FILE2_TXT = "test-file-2.txt";
private static final String TEST_FILE1_CSV = "test-file-1.csv";
// SAMPLE sorted, duplicates kept.
private static final String[] EXPECTED_SORT_RESULTS = { "a", "b", "b", "e", "f",
        "i", "m", "o", "u", "u", "x", "y", "z"
};
// Lines of the two text resources merged, duplicates kept.
private static final String[] EXPECTED_MERGE_RESULTS = {"a", "a", "b", "c", "c", "d", "e", "e", "f", "g", "g","h", "i", "j", "k"};
// Lines of the two text resources merged, duplicates removed.
private static final String[] EXPECTED_MERGE_DISTINCT_RESULTS = {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"};
// Same as above, preceded by the header line of the CSV resource.
private static final String[] EXPECTED_HEADER_RESULTS = {"HEADER, HEADER", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"};
// SAMPLE sorted, duplicates removed.
private static final String[] EXPECTED_DISTINCT_RESULTS = { "a", "b", "e",
        "f", "i", "m", "o", "u", "x", "y", "z"
};
// Unsorted input containing two duplicated values ("b" and "u").
private static final String[] SAMPLE = { "f", "m", "b", "e", "i", "o", "u",
        "x", "a", "y", "z", "b", "u"
};
private File file1;
private File file2;
private File csvFile;
// Files created for a single test; deleted again in tearDown().
// Typed so that tearDown() can iterate over it as File elements.
private List<File> fileList;
/**
 * Resolves the three test resources from the classpath and creates ".tmp"
 * copies of the two text files. Only the copies are registered in
 * {@code fileList} (presumably because merging consumes its input files;
 * confirm against ExternalSort.mergeSortedFiles).
 *
 * @throws Exception if a resource cannot be resolved or copied
 */
@Before
public void setUp() throws Exception {
    final ClassLoader loader = this.getClass().getClassLoader();
    this.fileList = new ArrayList(3);
    this.file1 = new File(loader.getResource(TEST_FILE1_TXT).toURI());
    this.file2 = new File(loader.getResource(TEST_FILE2_TXT).toURI());
    this.csvFile = new File(loader.getResource(TEST_FILE1_CSV).toURI());

    final File copyOfFile1 = new File(this.file1.getPath() + ".tmp");
    final File copyOfFile2 = new File(this.file2.getPath() + ".tmp");
    copyFile(this.file1, copyOfFile1);
    copyFile(this.file2, copyOfFile2);
    this.fileList.add(copyOfFile1);
    this.fileList.add(copyOfFile2);
}
/**
 * Deletes (best effort) every file registered in {@code fileList} and
 * resets all fixture fields.
 *
 * @throws Exception declared for symmetry with setUp
 */
@After
public void tearDown() throws Exception {
    for (File leftover : this.fileList) {
        // the result is deliberately ignored: the file may already be gone
        leftover.delete();
    }
    this.fileList.clear();
    this.fileList = null;
    this.csvFile = null;
    this.file2 = null;
    this.file1 = null;
}
/**
 * Copies the content of {@code sourceFile} into {@code destFile},
 * creating the destination if needed and replacing any previous content.
 *
 * @param sourceFile file to read
 * @param destFile file to (over)write
 * @throws IOException if either file cannot be opened or the copy fails
 */
private static void copyFile(File sourceFile, File destFile) throws IOException {
    // FileOutputStream creates the destination when it is missing and
    // truncates it otherwise, so no explicit createNewFile() is needed.
    try (FileInputStream fis = new FileInputStream(sourceFile);
            FileChannel source = fis.getChannel();
            FileOutputStream fos = new FileOutputStream(destFile);
            FileChannel destination = fos.getChannel()) {
        final long size = source.size();
        long copied = 0;
        // transferFrom may move fewer bytes than requested, so keep going
        // until the whole source has been copied.
        while (copied < size) {
            final long moved = destination.transferFrom(source, copied, size - copied);
            if (moved <= 0) {
                break; // source ended early (e.g. truncated concurrently)
            }
            copied += moved;
        }
    }
}
/**
 * Sums the estimated sizes of all given strings.
 *
 * @param mystrings strings to measure
 * @return the sum of the estimates, narrowed to an int
 */
public static int estimateTotalSize(String[] mystrings) {
    long sum = 0L;
    for (int i = 0; i < mystrings.length; ++i) {
        sum += StringSizeEstimator.estimatedSizeOf(mystrings[i]);
    }
    // narrowing once at the end gives the same value as narrowing after
    // every addition
    return (int) sum;
}
/**
 * Times the size estimator over 1024 short strings, repeats the
 * measurement 1000 times and prints the best time per string.
 */
public static void oneRoundOfStringSizeEstimation() {
    // could use JMH for better results but this should do
    final int N = 1024;
    final int repeat = 1000;
    String[] inputs = new String[1024];
    for (int k = 0; k < N; ++k) {
        inputs[k] = Integer.toString(k);
    }
    long fastest = Long.MAX_VALUE;
    int sink = 0; // accumulated and printed so the measured call has a visible result
    for (int round = 0; round < repeat; ++round) {
        final long start = System.nanoTime();
        sink += estimateTotalSize(inputs);
        final long elapsed = System.nanoTime() - start;
        fastest = Math.min(fastest, elapsed);
    }
    System.out.println("#ignore = " + sink);
    System.out.println("[performance] String size estimator uses " + fastest * 1.0 / N + " ns per string");
}
/**
 * This checks that the estimation is reasonably accurate: for strings of
 * length 0 to 99, our estimate must be at least the size measured by jamm
 * and strictly less than twice that size.
 */
@Test
public void stringSizeEstimatorQuality() {
MemoryMeter meter = new MemoryMeter().ignoreKnownSingletons().ignoreOuterClassReference().ignoreNonStrongReferences();
for(int k = 0; k < 100; ++k) {
// build a string of exactly k characters
String s = new String();
while(s.length() < k) s += "-";
long myestimate = StringSizeEstimator.estimatedSizeOf(s);
long jammestimate = meter.measureDeep(s);
System.out.println("String of size "+k+" estimates are us: "+myestimate+ " bytes jamm: "+jammestimate+" bytes");
// never underestimate ...
assertTrue(jammestimate <= myestimate);
// ... and stay within a factor of two
assertTrue(2 * jammestimate > myestimate);
}
System.out.println("All our string memory usage estimation are within a factor of two of jamm's and never lower.");
}
/**
 * Runs the size-estimator timing round ten times.
 */
@Test
public void stringSizeEstimator() {
    int remainingRounds = 10;
    while (remainingRounds-- > 0) {
        oneRoundOfStringSizeEstimation();
    }
}
/**
 * Invokes the command line entry point without any argument.
 */
@Test
public void displayTest() throws Exception {
    final String[] noArguments = new String[0];
    ExternalSort.main(noArguments); // check that it does not crash
}
/**
 * Invokes the command line entry point with several argument lists,
 * ending with a full invocation on a small temporary input file.
 */
@Test
public void mainTest() throws Exception {
    // each of these only checks that main does not crash
    ExternalSort.main(new String[] { "-h" });
    ExternalSort.main(new String[] { "" });
    ExternalSort.main(new String[] { "-v" });

    final File input = File.createTempFile("tmp", "unit");
    final File output = File.createTempFile("tmp", "unit");
    input.deleteOnExit();
    output.deleteOnExit();
    writeStringToFile(input, "oh");

    final String[] fullCommandLine = { "-v", "-d", "-t", "5000", "-c", "ascii",
            "-z", "-H", "1", "-s", ".", input.toString(), output.toString() };
    ExternalSort.main(fullCommandLine);
}
/**
 * Sorting and merging an empty input file must produce an empty output
 * file.
 */
@Test
public void testEmptyFiles() throws Exception {
    File f1 = File.createTempFile("tmp", "unit");
    File f2 = File.createTempFile("tmp", "unit");
    f1.deleteOnExit();
    f2.deleteOnExit();
    ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(f1), f2);
    // a JUnit assertion reports a test failure rather than an error
    assertEquals("empty files should end up empty", 0L, f2.length());
}
/**
 * Merges the two pre-sorted fixture files, keeping duplicates, and checks
 * the merged output line by line.
 */
@Test
public void testMergeSortedFiles() throws Exception {
    Comparator<String> cmp = Comparator.naturalOrder();
    File out = File.createTempFile("test_results", ".tmp", null);
    out.deleteOnExit();
    // last argument false: duplicates are kept
    ExternalSort.mergeSortedFiles(this.fileList, out, cmp,
            Charset.defaultCharset(), false);
    List<String> result = readLines(out);
    assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_MERGE_RESULTS,
            result.toArray());
}
/**
 * Merges the two pre-sorted fixture files with duplicate removal and
 * checks the merged output line by line.
 */
@Test
public void testMergeSortedFiles_Distinct() throws Exception {
    Comparator<String> cmp = Comparator.naturalOrder();
    File out = File.createTempFile("test_results", ".tmp", null);
    out.deleteOnExit();
    // last argument true: duplicate lines are discarded
    ExternalSort.mergeSortedFiles(this.fileList, out, cmp,
            Charset.defaultCharset(), true);
    List<String> result = readLines(out);
    assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_MERGE_DISTINCT_RESULTS,
            result.toArray());
}
/**
 * Writes a header line to the output file first, then merges the fixture
 * files into it and checks that the header is still the first line of the
 * result.
 */
@Test
public void testMergeSortedFiles_Append() throws Exception {
    Comparator<String> cmp = Comparator.naturalOrder();
    File out = File.createTempFile("test_results", ".tmp", null);
    out.deleteOnExit();
    writeStringToFile(out, "HEADER, HEADER\n");
    ExternalSort.mergeSortedFiles(this.fileList, out, cmp, Charset.defaultCharset(), true, true, false);
    List<String> result = readLines(out);
    assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_HEADER_RESULTS, result.toArray());
}
/**
 * Sorts the in-memory sample (duplicates kept), saves it to a file and
 * checks the file content line by line.
 */
@Test
public void testSortAndSave() throws Exception {
    List<String> sample = Arrays.asList(SAMPLE);
    Comparator<String> cmp = Comparator.naturalOrder();
    File f = ExternalSort.sortAndSave(sample, cmp, Charset.defaultCharset(),
            null, false, false, true);
    assertNotNull(f);
    assertTrue(f.exists());
    assertTrue(f.length() > 0);
    List<String> result = readLines(f);
    assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_SORT_RESULTS,
            result.toArray());
}
/**
 * Sorts the in-memory sample with duplicate removal, saves it to a file
 * and checks the file content line by line.
 */
@Test
public void testSortAndSave_Distinct() throws Exception {
    List<String> sample = Arrays.asList(SAMPLE);
    Comparator<String> cmp = Comparator.naturalOrder();
    File f = ExternalSort.sortAndSave(sample, cmp, Charset.defaultCharset(),
            null, true, false, true);
    assertNotNull(f);
    assertTrue(f.exists());
    assertTrue(f.length() > 0);
    // readLines closes its reader even when reading fails
    List<String> result = readLines(f);
    assertArrayEquals(Arrays.toString(result.toArray()),
            EXPECTED_DISTINCT_RESULTS, result.toArray());
}
/**
 * Sorts the CSV fixture in batches while skipping its single header line;
 * the input is expected to fit in one temporary file whose content is the
 * sorted data lines.
 */
@Test
public void testSortInBatch() throws Exception {
    Comparator<String> cmp = Comparator.naturalOrder();
    List<File> listOfFiles = ExternalSort.sortInBatch(this.csvFile, cmp, ExternalSort.DEFAULTMAXTEMPFILES, Charset.defaultCharset(), null, false, 1, false, true);
    assertEquals(1, listOfFiles.size());
    List<String> result = readLines(listOfFiles.get(0));
    assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_MERGE_DISTINCT_RESULTS, result.toArray());
}
/**
 * Sample case to sort a csv file: runs the scenario first without and
 * then with gzip compression of the temporary files.
 *
 * @throws Exception if the scenario fails
 */
@Test
public void testCSVSorting() throws Exception {
    for (boolean gzipTempFiles : new boolean[] { false, true }) {
        testCSVSortingWithParams(gzipTempFiles);
    }
}
/**
 * Sample case to sort a csv file: the header line is copied to the output
 * first, the remaining lines are sorted in batches and the sorted batches
 * are then merged into the output after the header.
 *
 * @param usegzip use compression for temporary files
 * @throws Exception if reading, sorting or merging fails
 */
public void testCSVSortingWithParams(boolean usegzip) throws Exception {
    File out = File.createTempFile("test_results", ".tmp", null);
    out.deleteOnExit();
    Comparator<String> cmp = Comparator.naturalOrder();

    String head;
    // read header
    try (FileReader fr = new FileReader(this.csvFile);
            Scanner scan = new Scanner(fr)) {
        head = scan.nextLine();
    }
    // write to the file
    writeStringToFile(out, head + "\n");
    // omit the first line, which is the header..
    List<File> listOfFiles = ExternalSort.sortInBatch(this.csvFile, cmp, ExternalSort.DEFAULTMAXTEMPFILES, Charset.defaultCharset(), null, false, 1, usegzip, true);
    // now merge with append
    ExternalSort.mergeSortedFiles(listOfFiles, out, cmp, Charset.defaultCharset(), false, true, usegzip);
    List<String> result = readLines(out);
    assertEquals(12, result.size());
    assertArrayEquals(Arrays.toString(result.toArray()), EXPECTED_HEADER_RESULTS, result.toArray());
}
/**
 * Reads a whole text file into memory using the platform default charset.
 *
 * @param f file to read
 * @return the lines of the file, in file order, without line terminators
 * @throws IOException if the file cannot be opened or read
 */
public static ArrayList<String> readLines(File f) throws IOException {
    ArrayList<String> answer = new ArrayList<>();
    try (BufferedReader r = new BufferedReader(new FileReader(f))) {
        String line;
        while ((line = r.readLine()) != null) {
            answer.add(line);
        }
    }
    return answer;
}
/**
 * Replaces the content of a file with the given string, encoded with the
 * platform default charset. The file is created when it does not exist.
 *
 * @param f file to (over)write
 * @param s new content of the file
 * @throws IOException if the file cannot be written
 */
public static void writeStringToFile(File f, String s) throws IOException {
    final byte[] encoded = s.getBytes();
    // without explicit options Files.write creates or truncates the file
    Files.write(f.toPath(), encoded);
}
/**
 * Sort a text file with more lines than {@link Integer#MAX_VALUE} and
 * check the number of lines reported by the merge.
 *
 * @throws IOException if the test file cannot be created, sorted or merged
 */
@Ignore("This test takes too long to execute")
@Test
public void sortVeryLargeFile() throws IOException {
    final Path veryLargeFile = getTestFile();
    Path outputFile = null;
    try {
        outputFile = Files.createTempFile("Merged-File", ".tmp");
        final long sortedLines = ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(veryLargeFile.toFile()), outputFile.toFile());
        final long expectedLines = 2148L * 1000000L;
        assertEquals(expectedLines, sortedLines);
    } finally {
        // These files are several gigabytes large: do not leave them behind.
        // Best-effort deletes so a cleanup problem cannot mask a test failure.
        veryLargeFile.toFile().delete();
        if (outputFile != null) {
            outputFile.toFile().delete();
        }
    }
}
/**
 * Generate a test file with 2148 million lines, each containing the
 * single letter "A". The file is written as 2148 blocks of one million
 * lines.
 *
 * @return path of the generated temporary file
 * @throws IOException if the file cannot be created or written
 */
private Path getTestFile() throws IOException {
    System.out.println("Temp File Creation: Started");
    final Path path = Files.createTempFile("IntegrationTestFile", ".txt");
    final int saneLimit = 1000000;
    // one block: saneLimit lines joined by '\n', encoded once and reused
    final byte[] content = IntStream.range(0, saneLimit)
            .mapToObj(i -> "A")
            .collect(Collectors.joining("\n"))
            .getBytes(StandardCharsets.UTF_8);
    final byte[] newLine = "\n".getBytes(StandardCharsets.UTF_8);
    Files.write(path, content, StandardOpenOption.TRUNCATE_EXISTING);
    // A plain loop lets an IOException propagate unchanged, cause included.
    for (int block = 1; block < 2148; ++block) {
        Files.write(path, newLine, StandardOpenOption.APPEND);
        Files.write(path, content, StandardOpenOption.APPEND);
    }
    System.out.println("Temp File Creation: Finished");
    return path;
}
}
externalsortinginjava-externalsortinginjava-0.2.5/src/test/resources/ 0000775 0000000 0000000 00000000000 13313734070 0026337 5 ustar 00root root 0000000 0000000 externalsortinginjava-externalsortinginjava-0.2.5/src/test/resources/test-file-1.csv 0000664 0000000 0000000 00000000045 13313734070 0031105 0 ustar 00root root 0000000 0000000 HEADER, HEADER
a
b
k
c
d
i
j
e
h
f
g
externalsortinginjava-externalsortinginjava-0.2.5/src/test/resources/test-file-1.txt 0000664 0000000 0000000 00000000017 13313734070 0031130 0 ustar 00root root 0000000 0000000 a
b
c
d
e
f
g
h externalsortinginjava-externalsortinginjava-0.2.5/src/test/resources/test-file-2.txt 0000664 0000000 0000000 00000000015 13313734070 0031127 0 ustar 00root root 0000000 0000000 a
c
e
g
i
j
k