Commit 644553fa authored by John Zhang

luindex: new enwiki workload

- add `--linedoc` and `--dirwalk` options to select one-document-per-line
  format or a directory walk;
- add the uncompressed Wikipedia archive used by the Lucene benchmark;
- modify the benchmark harness;
- the Wikipedia data is currently referenced relative to scratch;
  this needs to change once the external-data feature is in place;
- download and prepare the data at build time.
parent 7784c35e
@@ -43,17 +43,37 @@
</target>
<target name="data">
<target name="data" depends="lucene-enwiki-bench-data">
<mkdir dir="${bm-data}/luindex"/>
<mkdir dir="${bm-data}/luindex/william"/>
<untar src="${bm-downloads}/shakespeare.tgz" dest="${bm-data}/luindex/william" compression="gzip"/>
<mkdir dir="${bm-data}/luindex/kjv"/>
<unzip src="${bm-downloads}/kjv.zip" dest="${bm-data}/luindex/kjv"/>
<mkdir dir="${bm-data}/luindex/shakespeare"/>
<untar src="${bm-downloads}/shakespeare.tgz" dest="${bm-data}/luindex/shakespeare" compression="gzip"/>
<zip destfile="${bm-dat}/luindex.zip">
<fileset dir="${bm-data}/" includes="luindex/kjv/**/*"/>
<fileset dir="${bm-data}/" includes="luindex/william/**/*"/>
<fileset dir="${bm-data}/" includes="luindex/shakespeare/**/*"/>
</zip>
<delete dir="${bm-data}/luindex"/>
</target>
<target name="lucene-enwiki-bench-data-check">
<available property="lucene-enwiki-bench-data-exists" file="${bm-data}/luindex/enwiki/enwiki.txt"/>
<condition property="lucene-enwiki-bench-data-valid-md5">
<and>
<isset property="lucene-enwiki-bench-data-exists"/>
<checksum file="${bm-data}/luindex/enwiki/enwiki.txt" fileext=".MD5" algorithm="MD5"/>
</and>
</condition>
</target>
<target name="lucene-enwiki-bench-data" depends="lucene-enwiki-bench-data-check" unless="lucene-enwiki-bench-data-valid-md5">
<mkdir dir="${lucene-build-dir}/benchmark/work"/>
<exec executable="ant" dir="${lucene-build-dir}/benchmark">
<arg value="enwiki"/>
</exec>
<mkdir dir="${bm-data}/luindex/enwiki"/>
<move file="${lucene-build-dir}/benchmark/work/enwiki.txt" todir="${bm-data}/luindex/enwiki"/>
</target>
</project>
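
The `lucene-enwiki-bench-data-check` target above gates regeneration on an MD5 sidecar file. As a rough illustration of what Ant's `<checksum>` condition is doing here, the following is a minimal Java sketch, assuming the `.MD5` sidecar holds a bare hex digest (the format Ant's checksum task writes); the class and method names are illustrative, not part of the commit.

```java
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class EnwikiDataCheck {
    /** True when the data file exists and its MD5 matches the sidecar
     *  checksum file (e.g. enwiki.txt.MD5). */
    static boolean isValid(Path data) throws IOException, NoSuchAlgorithmException {
        Path sidecar = data.resolveSibling(data.getFileName() + ".MD5");
        if (!Files.isReadable(data) || !Files.isReadable(sidecar)) {
            return false;
        }
        MessageDigest md5 = MessageDigest.getInstance("MD5");
        // Stream the (potentially multi-gigabyte) file through the digest.
        try (InputStream in = new DigestInputStream(Files.newInputStream(data), md5)) {
            byte[] buf = new byte[1 << 16];
            while (in.read(buf) != -1) {
                // reading drives the digest; nothing else to do
            }
        }
        StringBuilder hex = new StringBuilder();
        for (byte b : md5.digest()) {
            hex.append(String.format("%02x", b));
        }
        return hex.toString().equalsIgnoreCase(Files.readString(sidecar).trim());
    }
}
```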
@@ -10,6 +10,7 @@ package org.dacapo.harness;
import java.io.File;
import java.lang.reflect.Constructor;
import java.util.Arrays;
import org.dacapo.harness.Benchmark;
import org.dacapo.harness.DacapoException;
@@ -22,11 +23,11 @@ import org.dacapo.parser.Config;
public class Luindex extends Benchmark {
private final Object benchmark;
private final Class<?> clazz;
public Luindex(Config config, File scratch) throws Exception {
super(config, scratch);
Class<?> clazz = Class.forName("org.dacapo.luindex.Index", true, loader);
this.method = clazz.getMethod("main", File.class, String[].class);
this.clazz = Class.forName("org.dacapo.luindex.Index", true, loader);
Constructor<?> cons = clazz.getConstructor(File.class);
useBenchmarkClassLoader();
try {
@@ -63,7 +64,11 @@ public class Luindex extends Benchmark {
throw new DacapoException("Cannot write to index directory");
}
method.invoke(benchmark, INDEX_DIR, args);
if (args[0].equals("--dirwalk"))
this.method = this.clazz.getMethod("indexDir", File.class, String[].class);
else if (args[0].equals("--linedoc"))
this.method = this.clazz.getMethod("indexLineDoc", File.class, String[].class);
method.invoke(benchmark, INDEX_DIR, Arrays.copyOfRange(args, 1, args.length));
}
public void postIteration(String size) {
......
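
The harness change above picks the indexing entry point by reflection, based on the first benchmark argument. A condensed, self-contained sketch of that dispatch (the wrapper class and the error branch are illustrative additions, not part of the commit):

```java
import java.io.File;
import java.lang.reflect.Method;
import java.util.Arrays;

public class DispatchSketch {
    /** Resolve the entry point named by the first argument and invoke it
     *  with the remaining arguments, as the harness does per iteration. */
    static void run(Object benchmark, File indexDir, String[] args) throws Exception {
        Class<?> clazz = benchmark.getClass();
        Method method;
        if (args[0].equals("--dirwalk")) {
            method = clazz.getMethod("indexDir", File.class, String[].class);
        } else if (args[0].equals("--linedoc")) {
            method = clazz.getMethod("indexLineDoc", File.class, String[].class);
        } else {
            throw new IllegalArgumentException("expected --dirwalk or --linedoc");
        }
        // The flag itself is stripped before the call.
        method.invoke(benchmark, indexDir, Arrays.copyOfRange(args, 1, args.length));
    }
}
```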
@@ -3,19 +3,27 @@ benchmark luindex
thread-model single
jars "dacapo-luindex.jar", "lucene-core-7.1.0-SNAPSHOT.jar", "lucene-demo-7.1.0-SNAPSHOT.jar";
size small args "${SCRATCH}/luindex/william/poetry"
output stdout digest 0xebb11b9e1c56c6ef5620b890cca24ec773301388,
size small args "--dirwalk", "${SCRATCH}/luindex/kjv"
output stdout digest 0xe5beba33a55380ac9af7d25ee257d219a0ebb385,
stderr digest 0xda39a3ee5e6b4b0d3255bfef95601890afd80709,
"index/segments_1" bytes 136;
size default args "${SCRATCH}/luindex/william","${SCRATCH}/luindex/kjv"
output stdout digest 0xc90792fce1594b4b9ea1b01d593aefe801e6e58b,
stderr digest 0xda39a3ee5e6b4b0d3255bfef95601890afd80709,
size default args "--dirwalk", "${SCRATCH}/luindex/shakespeare","${SCRATCH}/luindex/kjv"
output stdout digest 0x5b625bbe95e8386283c060727dedd942546de554,
stderr digest 0xda39a3ee5e6b4b0d3255bfef95601890afd80709,
"index/segments_1" bytes 136;
/* NOTE: enwiki.txt is not included in dacapo.jar.
 * When the mechanism for dealing with large datasets is in place,
 * this should be modified. */
size huge args "--linedoc", "${SCRATCH}/../bms/luindex/data/luindex/enwiki/enwiki.txt"
output stdout digest 0x3067fa0a00d9311e95cf963dab1dfea1fca4e993,
stderr digest 0xda39a3ee5e6b4b0d3255bfef95601890afd80709,
"index/segments_1" bytes 137;
description
short "A text indexing tool",
long "Indexes a set of documents, the works of Shakespeare and the King James Bible",
long "Indexes a set of documents, the King James Bible and Wikipedia",
copyright "Copyright (C) The Apache Software Foundation",
author "Lucene Project Management Committee",
license "Apache License, Version 2.0",
......
@@ -20,6 +20,8 @@
*/
package org.dacapo.luindex;
import java.io.BufferedReader;
/**
*
@@ -37,6 +39,7 @@ package org.dacapo.luindex;
*/
import java.io.File;
import java.io.FileInputStream;
import java.nio.file.Paths;
import java.io.FileReader;
import java.io.FileNotFoundException;
@@ -69,7 +72,7 @@ public class Index {
/**
* Index all text files under a directory.
*/
public void main(final File INDEX_DIR, final String[] args) throws IOException {
public void indexDir(final File INDEX_DIR, final String[] args) throws IOException {
IndexWriterConfig IWConfig = new IndexWriterConfig();
IWConfig.setOpenMode (IndexWriterConfig.OpenMode.CREATE);
IWConfig.setMergePolicy (new LogByteSizeMergePolicy());
@@ -87,6 +90,72 @@ public class Index {
}
writer.close();
}
/**
* Takes in a merged one-document-per-line text file produced by the Lucene
* Wikipedia benchmark, and indexes the documents it contains.
*/
public void indexLineDoc(final File INDEX_DIR, final String[] args) throws IOException {
IndexWriterConfig IWConfig = new IndexWriterConfig();
IWConfig.setOpenMode (IndexWriterConfig.OpenMode.CREATE);
IWConfig.setMergePolicy (new LogByteSizeMergePolicy());
IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get(INDEX_DIR.getCanonicalPath())), IWConfig);
File txtFile = new File(args[0]);
if (!txtFile.exists() || !txtFile.canRead()) {
System.out.println("Document directory '" + txtFile.getAbsolutePath() + "' does not exist or is not readable, please check the path");
throw new IOException("Cannot read from document directory");
}
BufferedReader reader = new BufferedReader(new FileReader(txtFile));
reader.readLine(); // skip header line
int nLines = (args.length > 1) ? Integer.parseInt(args[1]) : Integer.MAX_VALUE;
String line = reader.readLine();
int n = 0;
while (line != null && n < nLines) {
System.out.println("adding " + line.substring(0, line.indexOf(SEP)));
writer.addDocument(getLuceneDocFromLine(line));
line = reader.readLine();
n++;
}
System.out.println("Optimizing...");
writer.forceMerge(1);
writer.close();
reader.close();
}
private final char SEP = '\t';
Document getLuceneDocFromLine(String line) {
Document doc = new Document();
FieldType defaultFT = new FieldType();
defaultFT.setTokenized (false);
defaultFT.setStored (true);
defaultFT.setIndexOptions (IndexOptions.DOCS);
int spot = line.indexOf(SEP);
int spot2 = line.indexOf(SEP, 1 + spot);
int spot3 = line.indexOf(SEP, 1 + spot2);
if (spot3 == -1) {
spot3 = line.length();
}
// Add title as a field. Use a field that is
// indexed (i.e. searchable), but don't tokenize the field into words.
doc.add(new Field("title", line.substring(0, spot), defaultFT));
// Add date as a field. Indexed, but not tokenized.
doc.add(new Field("date", line.substring(1+spot, spot2), defaultFT));
// Add body as a field. Tokenized and indexed, but not stored.
FieldType bodyFT = new FieldType();
bodyFT.setTokenized(true);
bodyFT.setStored(false);
bodyFT.setIndexOptions(IndexOptions.DOCS);
doc.add(new Field("body", line.substring(1 + spot2, spot3), bodyFT));
return doc;
}
/**
* Index either a file or a directory tree.
......
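
For context, `indexLineDoc` and `getLuceneDocFromLine` assume the layout of Lucene's line-doc files: one header line, then one title<TAB>date<TAB>body record per line. A small demo of that parsing, with invented sample values:

```java
public class LineDocFormatDemo {
    public static void main(String[] args) {
        // One record from a line-doc file: title \t date \t body.
        String line = "Anarchism\t01-JAN-2017 00:00:00.000\tAnarchism is a political philosophy...";
        int spot = line.indexOf('\t');
        int spot2 = line.indexOf('\t', spot + 1);
        int spot3 = line.indexOf('\t', spot2 + 1);
        if (spot3 == -1) {
            spot3 = line.length(); // no trailing field after the body
        }
        System.out.println("title: " + line.substring(0, spot));
        System.out.println("date:  " + line.substring(spot + 1, spot2));
        System.out.println("body:  " + line.substring(spot2 + 1, spot3));
    }
}
```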
@@ -91,10 +91,14 @@
</target>
<!-- lucene -->
<property name="lucene-version" value="7.1.0"/>
<property name="lucene-version-snapshot" value="${lucene-version}-SNAPSHOT"/>
<property name="lucene-build-dir" value="${lib-base}/lucene/build/lucene-${lucene-version}"/>
<property name="lucene-jars" value="${lib-base}/lucene/dist/jar"/>
<property name="lucene-core-jar-name" value="lucene-core-7.1.0-SNAPSHOT.jar"/>
<property name="lucene-demos-jar-name" value="lucene-demo-7.1.0-SNAPSHOT.jar"/>
<property name="lucene-queryparser-jar-name" value="lucene-queryparser-7.1.0-SNAPSHOT.jar"/>
<property name="lucene-data" value="${lib-base}/lucene/dist/dat"/>
<property name="lucene-core-jar-name" value="lucene-core-${lucene-version-snapshot}.jar"/>
<property name="lucene-demos-jar-name" value="lucene-demo-${lucene-version-snapshot}.jar"/>
<property name="lucene-queryparser-jar-name" value="lucene-queryparser-${lucene-version-snapshot}.jar"/>
<target name="lucene">
<ant antfile="${lib-base}/lucene/build.xml"/>
</target>
......