ORC-397. Allow selective disabling of dictionary encoding.
Original patch was by Mithun Radhakrishnan.

Fixes apache#304

Signed-off-by: Owen O'Malley <omalley@apache.org>
omalley committed Sep 4, 2018
1 parent 72a6886 commit f47e02c
Showing 7 changed files with 117 additions and 551 deletions.
4 changes: 3 additions & 1 deletion java/core/src/java/org/apache/orc/OrcConf.java
@@ -155,7 +155,9 @@ public enum OrcConf {
"A boolean flag to determine if the comparision of field names in schema evolution is case sensitive .\n"),
WRITE_VARIABLE_LENGTH_BLOCKS("orc.write.variable.length.blocks", null, false,
"A boolean flag as to whether the ORC writer should write variable length\n"
+ "HDFS blocks.")
+ "HDFS blocks."),
DIRECT_ENCODING_COLUMNS("orc.column.encoding.direct", "orc.column.encoding.direct", "",
"Comma-separated list of columns for which dictionary encoding is to be skipped."),
;

private final String attribute;
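
For reference, a minimal sketch of setting the new key on a Hadoop Configuration before creating a writer (the diff also reads it from table properties); the column names "name" and "comment" are illustrative, not taken from this commit:

import org.apache.hadoop.conf.Configuration;

public class DirectEncodingConfigExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // The key matches the DIRECT_ENCODING_COLUMNS attribute added above; the
    // listed columns skip dictionary encoding and are written with DIRECT encoding.
    conf.set("orc.column.encoding.direct", "name,comment");
  }
}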
17 changes: 17 additions & 0 deletions java/core/src/java/org/apache/orc/OrcFile.java
@@ -407,6 +407,7 @@ public static class WriterOptions implements Cloneable {
private boolean overwrite;
private boolean writeVariableLengthBlocks;
private HadoopShims shims;
private String directEncodingColumns;

protected WriterOptions(Properties tableProperties, Configuration conf) {
configuration = conf;
@@ -449,6 +450,8 @@ protected WriterOptions(Properties tableProperties, Configuration conf) {
shims = HadoopShimsFactory.get();
writeVariableLengthBlocks =
OrcConf.WRITE_VARIABLE_LENGTH_BLOCKS.getBoolean(tableProperties,conf);
directEncodingColumns = OrcConf.DIRECT_ENCODING_COLUMNS.getString(
tableProperties, conf);
}

/**
@@ -687,6 +690,16 @@ public WriterOptions useUTCTimestamp(boolean value) {
return this;
}

/**
* Set the comma-separated list of columns that should be direct encoded.
* @param value the value to set
* @return this
*/
public WriterOptions directEncodingColumns(String value) {
directEncodingColumns = value;
return this;
}

public boolean getBlockPadding() {
return blockPaddingValue;
}
@@ -786,6 +799,10 @@ public HadoopShims getHadoopShims() {
public boolean getUseUTCTimestamp() {
return useUTCTimestamp;
}

public String getDirectEncodingColumns() {
return directEncodingColumns;
}
}
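
A sketch of the new builder option in use; the output path, schema, and column list below are illustrative assumptions, not part of the commit:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class DirectEncodingWriterExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    TypeDescription schema =
        TypeDescription.fromString("struct<name:string,age:int,comment:string>");
    // Request DIRECT encoding for the two string columns; other columns keep
    // the normal dictionary-threshold behavior.
    Writer writer = OrcFile.createWriter(new Path("/tmp/people.orc"),
        OrcFile.writerOptions(conf)
            .setSchema(schema)
            .directEncodingColumns("name,comment"));
    writer.close();
  }
}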

/**
11 changes: 11 additions & 0 deletions java/core/src/java/org/apache/orc/impl/WriterImpl.java
@@ -35,6 +35,7 @@
import org.apache.orc.CompressionCodec;
import org.apache.orc.CompressionKind;
import org.apache.orc.MemoryManager;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
@@ -113,6 +114,8 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback {
private final OrcFile.BloomFilterVersion bloomFilterVersion;
private final boolean writeTimeZone;
private final boolean useUTCTimeZone;
private final double dictionaryKeySizeThreshold;
private final boolean[] directEncodingColumns;

public WriterImpl(FileSystem fs,
Path path,
@@ -123,6 +126,10 @@ public WriterImpl(FileSystem fs,
this.schema = opts.getSchema();
this.writerVersion = opts.getWriterVersion();
bloomFilterVersion = opts.getBloomFilterVersion();
this.directEncodingColumns = OrcUtils.includeColumns(
opts.getDirectEncodingColumns(), opts.getSchema());
dictionaryKeySizeThreshold =
OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
if (callback != null) {
callbackContext = new OrcFile.WriterContext(){

@@ -410,6 +417,10 @@ public void writeBloomFilter(StreamName name,
public boolean getUseUTCTimestamp() {
return useUTCTimeZone;
}

public double getDictionaryKeySizeThreshold(int columnId) {
return directEncodingColumns[columnId] ? 0.0 : dictionaryKeySizeThreshold;
}
}
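
To show how the comma-separated list becomes the per-column flags consulted by getDictionaryKeySizeThreshold above, a small sketch built on OrcUtils.includeColumns (the schema and column names are hypothetical):

import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;

public class IncludeColumnsExample {
  public static void main(String[] args) {
    TypeDescription schema =
        TypeDescription.fromString("struct<name:string,age:int,comment:string>");
    // One entry per column id (id 0 is the root struct); ids covered by
    // "name" and "comment" are true, so their threshold is reported as 0.0.
    boolean[] direct = OrcUtils.includeColumns("name,comment", schema);
    for (int col = 0; col < direct.length; ++col) {
      System.out.println(col + " -> " + direct[col]);
    }
  }
}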


@@ -76,8 +76,7 @@ public abstract class StringBaseTreeWriter extends TreeWriterBase {
rowIndexValueCount.add(0L);
buildIndex = writer.buildIndex();
Configuration conf = writer.getConfiguration();
dictionaryKeySizeThreshold =
OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf);
dictionaryKeySizeThreshold = writer.getDictionaryKeySizeThreshold(columnId);
strideDictionaryCheck =
OrcConf.ROW_INDEX_STRIDE_DICTIONARY_CHECK.getBoolean(conf);
if (dictionaryKeySizeThreshold <= 0.0) {
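
A hypothetical sketch of the ratio test this threshold feeds, not the exact ORC logic: since the writer now reports 0.0 for columns named in orc.column.encoding.direct, the check can never pass and those columns fall back to direct encoding.

// Hypothetical sketch, not the actual StringBaseTreeWriter implementation.
public class DictionaryThresholdSketch {
  static boolean keepDictionary(long distinctKeys, long rowsWritten, double threshold) {
    // A threshold of 0.0 fails immediately, forcing DIRECT encoding.
    return threshold > 0.0
        && rowsWritten > 0
        && (double) distinctKeys / (double) rowsWritten <= threshold;
  }

  public static void main(String[] args) {
    System.out.println(keepDictionary(10, 1000, 0.8));  // true: low key ratio
    System.out.println(keepDictionary(10, 1000, 0.0));  // false: direct forced
  }
}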
@@ -103,4 +103,6 @@ void writeBloomFilter(StreamName name,
) throws IOException;

boolean getUseUTCTimestamp();

double getDictionaryKeySizeThreshold(int column);
}