Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding ChunkingProcessor #120949

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/120949.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 120949
summary: Add `ChunkingProcessor` ingest processor for splitting text fields into chunks
area: Machine Learning
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@
import org.elasticsearch.indices.SystemIndexDescriptor;
import org.elasticsearch.inference.InferenceServiceExtension;
import org.elasticsearch.inference.InferenceServiceRegistry;
import org.elasticsearch.ingest.Processor;
import org.elasticsearch.license.License;
import org.elasticsearch.license.LicensedFeature;
import org.elasticsearch.node.PluginComponentBinding;
import org.elasticsearch.plugins.ActionPlugin;
import org.elasticsearch.plugins.ExtensiblePlugin;
import org.elasticsearch.plugins.IngestPlugin;
import org.elasticsearch.plugins.MapperPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.plugins.SearchPlugin;
Expand Down Expand Up @@ -76,6 +78,7 @@
import org.elasticsearch.xpack.inference.external.http.sender.HttpRequestSender;
import org.elasticsearch.xpack.inference.external.http.sender.RequestExecutorServiceSettings;
import org.elasticsearch.xpack.inference.highlight.SemanticTextHighlighter;
import org.elasticsearch.xpack.inference.ingest.ChunkingProcessor;
import org.elasticsearch.xpack.inference.logging.ThrottlerManager;
import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper;
import org.elasticsearch.xpack.inference.mapper.SemanticInferenceMetadataFieldsMapper;
Expand Down Expand Up @@ -134,7 +137,14 @@
import static org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceFeature.DEPRECATED_ELASTIC_INFERENCE_SERVICE_FEATURE_FLAG;
import static org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceFeature.ELASTIC_INFERENCE_SERVICE_FEATURE_FLAG;

public class InferencePlugin extends Plugin implements ActionPlugin, ExtensiblePlugin, SystemIndexPlugin, MapperPlugin, SearchPlugin {
public class InferencePlugin extends Plugin
implements
ActionPlugin,
ExtensiblePlugin,
SystemIndexPlugin,
MapperPlugin,
SearchPlugin,
IngestPlugin {

/**
* When this setting is true the verification check that
Expand Down Expand Up @@ -463,6 +473,13 @@ public List<QueryRewriteInterceptor> getQueryRewriteInterceptors() {
);
}

/**
 * Registers the ingest processors contributed by this plugin.
 *
 * @param parameters ingest services offered to processor factories; not used by the chunking factory
 * @return a single-entry map from {@link ChunkingProcessor#TYPE} to a new {@link ChunkingProcessor.Factory}
 */
@Override
public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
    return Map.of(ChunkingProcessor.TYPE, new ChunkingProcessor.Factory());
}

@Override
public List<RetrieverSpec<?>> getRetrievers() {
return List.of(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.inference.ingest;

import org.elasticsearch.inference.ChunkingSettings;
import org.elasticsearch.ingest.AbstractProcessor;
import org.elasticsearch.ingest.ConfigurationUtils;
import org.elasticsearch.ingest.IngestDocument;
import org.elasticsearch.ingest.Processor;
import org.elasticsearch.xpack.inference.chunking.Chunker;
import org.elasticsearch.xpack.inference.chunking.ChunkerBuilder;
import org.elasticsearch.xpack.inference.chunking.ChunkingSettingsBuilder;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * Ingest processor that splits the string value of each configured input field into chunks
 * and stores the chunk texts, as a list of strings, in the paired output field.
 *
 * <p>The same {@link ChunkingSettings} are applied to every input/output pair of a processor instance.
 */
public class ChunkingProcessor extends AbstractProcessor {

    /** Type name under which this processor is registered and referenced in pipeline definitions. */
    public static final String TYPE = "chunking";

    private final List<Factory.InputConfig> inputConfigs;
    private final ChunkingSettings chunkingSettings;

    /**
     * Creates a chunking processor.
     *
     * @param tag              processor tag, passed to {@link AbstractProcessor}
     * @param description      processor description, passed to {@link AbstractProcessor}
     * @param inputConfigs     input/output field pairs to process, in order; must not be {@code null} or contain {@code null}
     * @param chunkingSettings settings selecting the chunking strategy and handed to the chunker on every call
     */
    public ChunkingProcessor(String tag, String description, List<Factory.InputConfig> inputConfigs, ChunkingSettings chunkingSettings) {
        super(tag, description);
        // Snapshot the list so later changes to the caller's list cannot alter this processor.
        this.inputConfigs = List.copyOf(inputConfigs);
        this.chunkingSettings = chunkingSettings;
    }

    @Override
    public String getType() {
        return TYPE;
    }

    /**
     * Chunks every configured input field of {@code document} and writes the resulting chunk texts
     * to the corresponding output field.
     *
     * @param document the document being ingested; modified in place
     * @return the same {@code document} instance
     * @throws IllegalArgumentException if a configured input field resolves to {@code null}
     */
    @Override
    public IngestDocument execute(IngestDocument document) {
        // The strategy is fixed for this processor, so build the chunker once per call rather than once per field.
        // It stays local to this call and is never shared between invocations.
        Chunker chunker = ChunkerBuilder.fromChunkingStrategy(chunkingSettings.getChunkingStrategy());

        for (var inputConfig : inputConfigs) {
            String text = document.getFieldValue(inputConfig.inputField(), String.class);
            if (text == null) {
                // Fail early with a message that names the offending field.
                throw new IllegalArgumentException("field [" + inputConfig.inputField() + "] is null, cannot be chunked");
            }
            List<Chunker.ChunkOffset> offsets = chunker.chunk(text, chunkingSettings);
            document.setFieldValue(inputConfig.outputField(), toChunkText(offsets, text));
        }

        return document;
    }

    /**
     * Materialises chunk offsets into the substrings of {@code text} they delimit.
     *
     * @param offsets chunk boundaries; each {@code start()}/{@code end()} pair is used as {@link String#substring(int, int)} arguments
     * @param text    the text the offsets refer to
     * @return the chunk texts, in the order of {@code offsets}
     */
    private List<String> toChunkText(List<Chunker.ChunkOffset> offsets, String text) {
        return offsets.stream().map(o -> text.substring(o.start(), o.end())).collect(Collectors.toList());
    }

    /**
     * Builds {@link ChunkingProcessor} instances from pipeline configuration.
     *
     * <p>Reads two properties from the processor config: {@code input_output}, a list of objects each
     * holding an {@code input_field} and an {@code output_field}, and {@code chunking_settings}, a map
     * converted with {@link ChunkingSettingsBuilder#fromMap}.
     */
    public static final class Factory implements Processor.Factory {
        @Override
        public Processor create(
            Map<String, Processor.Factory> processorFactories,
            String tag,
            String description,
            Map<String, Object> config
        ) {
            List<InputConfig> inputConfigs = parseInputConfigs(tag, ConfigurationUtils.readList(TYPE, tag, config, "input_output"));
            ChunkingSettings chunkingSettings = ChunkingSettingsBuilder.fromMap(
                ConfigurationUtils.readMap(TYPE, tag, config, "chunking_settings")
            );
            return new ChunkingProcessor(tag, description, inputConfigs, chunkingSettings);
        }

        /**
         * Converts the raw {@code input_output} entries into {@link InputConfig} records, preserving order.
         *
         * @param tag             processor tag, forwarded to {@link ConfigurationUtils} when reading properties
         * @param inputConfigMaps raw config entries, each expected to hold {@code input_field} and {@code output_field}
         * @return one {@link InputConfig} per entry
         */
        private List<InputConfig> parseInputConfigs(String tag, List<Map<String, Object>> inputConfigMaps) {
            List<InputConfig> inputConfigs = new ArrayList<>();
            for (var inputConfigMap : inputConfigMaps) {
                String inputField = ConfigurationUtils.readStringProperty(TYPE, tag, inputConfigMap, "input_field");
                String outputField = ConfigurationUtils.readStringProperty(TYPE, tag, inputConfigMap, "output_field");
                inputConfigs.add(new InputConfig(inputField, outputField));
            }
            return inputConfigs;
        }

        /**
         * One source/target field pair handled by the processor.
         *
         * @param inputField  path of the document field whose string value is chunked
         * @param outputField path of the document field that receives the list of chunk texts
         */
        public record InputConfig(String inputField, String outputField) {}
    }
}