You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@inlong.apache.org by "EMsnap (via GitHub)" <gi...@apache.org> on 2023/03/28 06:10:39 UTC

[GitHub] [inlong] EMsnap commented on a diff in pull request #7712: [INLONG-7581][Sort] Support multiple-sink migration for Elasticsearch

EMsnap commented on code in PR #7712:
URL: https://github.com/apache/inlong/pull/7712#discussion_r1150072352


##########
inlong-sort/sort-connectors/elasticsearch-base/src/main/java/org/apache/inlong/sort/elasticsearch/table/MultipleElasticsearchSinkFunctionBase.java:
##########
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.inlong.sort.elasticsearch.table;
+
+import org.apache.flink.api.common.functions.RuntimeContext;
+import org.apache.flink.api.common.serialization.SerializationSchema;
+import org.apache.flink.formats.common.TimestampFormat;
+import org.apache.flink.formats.json.JsonOptions.MapNullKeyMode;
+import org.apache.flink.runtime.state.FunctionInitializationContext;
+import org.apache.flink.runtime.state.FunctionSnapshotContext;
+import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.JsonNode;
+import org.apache.flink.table.api.TableSchema;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.formats.json.JsonRowDataSerializationSchema;
+import java.util.UUID;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+import org.apache.inlong.sort.base.dirty.DirtySinkHelper;
+import org.apache.inlong.sort.base.dirty.DirtyType;
+import org.apache.inlong.sort.base.format.DynamicSchemaFormatFactory;
+import org.apache.inlong.sort.base.format.JsonDynamicSchemaFormat;
+import org.apache.inlong.sort.base.metric.SinkMetricData;
+import org.apache.inlong.sort.base.sink.SchemaUpdateExceptionPolicy;
+import org.apache.inlong.sort.elasticsearch.ElasticsearchSinkFunction;
+import org.apache.inlong.sort.elasticsearch.RequestIndexer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nullable;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.function.Function;
+
+/**
+ * Sink function for converting upserts into Elasticsearch ActionRequests.
+ */
+public abstract class MultipleElasticsearchSinkFunctionBase<Request, ContentType>
+        implements
+            ElasticsearchSinkFunction<RowData, Request> {
+
+    private static final long serialVersionUID = 1L;
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(ElasticsearchSinkFunctionBase.class);
+
+    private final String docType;
+    private final ContentType contentType;
+    private final RequestFactory<Request, ContentType> requestFactory;
+    private final Function<RowData, String> createKey;
+    private final Function<RowData, String> createRouting;
+    private final DirtySinkHelper<Object> dirtySinkHelper;
+    private final String multipleFormat;
+    private final String indexPattern;
+    private final TableSchemaFactory tableSchemaFactory;
+    // initialized and reserved for a later feature.
+    private final SchemaUpdateExceptionPolicy schemaUpdateExceptionPolicy;
+    // open and store an index generator for each new index.
+    private Map<String, IndexGenerator> indexGeneratorMap;
+    // table level metrics
+    private SinkMetricData sinkMetricData;
+    private transient JsonDynamicSchemaFormat jsonDynamicSchemaFormat;
+    private transient SerializationSchema<RowData> serializationSchema;
+
+    public MultipleElasticsearchSinkFunctionBase(
+            @Nullable String docType, // this is deprecated in es 7+
+            SerializationSchema<RowData> serializationSchema,
+            ContentType contentType,
+            RequestFactory<Request, ContentType> requestFactory,
+            Function<RowData, String> createKey,
+            @Nullable Function<RowData, String> createRouting,
+            DirtySinkHelper<Object> dirtySinkHelper,
+            TableSchemaFactory tableSchemaFactory,
+            String multipleFormat,
+            String indexPattern,
+            SchemaUpdateExceptionPolicy schemaUpdateExceptionPolicy) {
+        this.docType = docType;
+        this.serializationSchema = Preconditions.checkNotNull(serializationSchema);
+        this.contentType = Preconditions.checkNotNull(contentType);
+        this.requestFactory = Preconditions.checkNotNull(requestFactory);
+        this.createKey = Preconditions.checkNotNull(createKey);
+        this.createRouting = createRouting;
+        this.dirtySinkHelper = dirtySinkHelper;
+        this.tableSchemaFactory = tableSchemaFactory;
+        this.multipleFormat = multipleFormat;
+        this.indexPattern = indexPattern;
+        this.schemaUpdateExceptionPolicy = schemaUpdateExceptionPolicy;
+    }
+
+    @Override
+    public void open(RuntimeContext ctx, SinkMetricData sinkMetricData) {
+        indexGeneratorMap = new HashMap<>();
+        this.sinkMetricData = sinkMetricData;
+    }
+
+    private void sendMetrics(byte[] document) {
+        if (sinkMetricData != null) {
+            sinkMetricData.invoke(1, document.length);
+        }
+    }
+
+    @Override
+    public void initializeState(FunctionInitializationContext context) {
+    }
+
+    @Override
+    public void snapshotState(FunctionSnapshotContext context) {
+    }
+
+    @Override
+    public void process(RowData element, RuntimeContext ctx, RequestIndexer<Request> indexer) {
+        JsonNode rootNode = null;
+        // parse rootnode
+        try {
+            jsonDynamicSchemaFormat =
+                    (JsonDynamicSchemaFormat) DynamicSchemaFormatFactory.getFormat(multipleFormat);
+            rootNode = jsonDynamicSchemaFormat.deserialize(element.getBinary(0));
+            // Ignore ddl change for now
+            boolean isDDL = jsonDynamicSchemaFormat.extractDDLFlag(rootNode);
+            if (isDDL) {
+                LOGGER.error("ddl change unsupported");
+                return;
+            }
+        } catch (Exception e) {
+            LOGGER.error(String.format("deserialize error, raw data: %s", new String(element.getBinary(0))), e);
+        }
+
+        RowType rowType = jsonDynamicSchemaFormat.extractSchema(rootNode);
+        RowData data = jsonDynamicSchemaFormat.extractRowData(rootNode, rowType).get(0);
+        // generate the serialization schema
+        serializationSchema = new JsonRowDataSerializationSchema(
+                rowType, TimestampFormat.ISO_8601, MapNullKeyMode.LITERAL, "null", true);
+
+        final byte[] document;
+        try {
+            // for multiple sink, need to update runtimeconverter to correct rowtype
+            // for now create custom schema
+            document = serializationSchema.serialize(data);
+        } catch (Exception e) {
+            LOGGER.error(String.format("Serialize error, raw data: %s", data), e);
+            dirtySinkHelper.invoke(data, DirtyType.SERIALIZE_ERROR, e);
+            if (sinkMetricData != null) {
+                sinkMetricData.invokeDirty(1, data.toString().getBytes(StandardCharsets.UTF_8).length);
+            }
+            return;
+        }
+        final String key;
+        try {
+            // use uuid3 as key
+            JsonNode physicalData = jsonDynamicSchemaFormat.getPhysicalData(rootNode);
+            // will this line have performance issues?

Review Comment:
   What does this comment stand for?



##########
inlong-sort/sort-connectors/elasticsearch-base/src/main/java/org/apache/inlong/sort/elasticsearch/table/MultipleElasticsearchSinkFunctionBase.java:
##########
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.inlong.sort.elasticsearch.table;
+
+import org.apache.flink.api.common.functions.RuntimeContext;
+import org.apache.flink.api.common.serialization.SerializationSchema;
+import org.apache.flink.formats.common.TimestampFormat;
+import org.apache.flink.formats.json.JsonOptions.MapNullKeyMode;
+import org.apache.flink.runtime.state.FunctionInitializationContext;
+import org.apache.flink.runtime.state.FunctionSnapshotContext;
+import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.JsonNode;
+import org.apache.flink.table.api.TableSchema;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.formats.json.JsonRowDataSerializationSchema;
+import java.util.UUID;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+import org.apache.inlong.sort.base.dirty.DirtySinkHelper;
+import org.apache.inlong.sort.base.dirty.DirtyType;
+import org.apache.inlong.sort.base.format.DynamicSchemaFormatFactory;
+import org.apache.inlong.sort.base.format.JsonDynamicSchemaFormat;
+import org.apache.inlong.sort.base.metric.SinkMetricData;
+import org.apache.inlong.sort.base.sink.SchemaUpdateExceptionPolicy;
+import org.apache.inlong.sort.elasticsearch.ElasticsearchSinkFunction;
+import org.apache.inlong.sort.elasticsearch.RequestIndexer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nullable;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.function.Function;
+
+/**
+ * Sink function for converting upserts into Elasticsearch ActionRequests.
+ */
+public abstract class MultipleElasticsearchSinkFunctionBase<Request, ContentType>
+        implements
+            ElasticsearchSinkFunction<RowData, Request> {
+
+    private static final long serialVersionUID = 1L;
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(ElasticsearchSinkFunctionBase.class);
+
+    private final String docType;
+    private final ContentType contentType;
+    private final RequestFactory<Request, ContentType> requestFactory;
+    private final Function<RowData, String> createKey;
+    private final Function<RowData, String> createRouting;
+    private final DirtySinkHelper<Object> dirtySinkHelper;
+    private final String multipleFormat;
+    private final String indexPattern;
+    private final TableSchemaFactory tableSchemaFactory;
+    // initialized and reserved for a later feature.
+    private final SchemaUpdateExceptionPolicy schemaUpdateExceptionPolicy;
+    // open and store an index generator for each new index.
+    private Map<String, IndexGenerator> indexGeneratorMap;
+    // table level metrics
+    private SinkMetricData sinkMetricData;
+    private transient JsonDynamicSchemaFormat jsonDynamicSchemaFormat;
+    private transient SerializationSchema<RowData> serializationSchema;
+
+    public MultipleElasticsearchSinkFunctionBase(
+            @Nullable String docType, // this is deprecated in es 7+
+            SerializationSchema<RowData> serializationSchema,
+            ContentType contentType,
+            RequestFactory<Request, ContentType> requestFactory,
+            Function<RowData, String> createKey,
+            @Nullable Function<RowData, String> createRouting,
+            DirtySinkHelper<Object> dirtySinkHelper,
+            TableSchemaFactory tableSchemaFactory,
+            String multipleFormat,
+            String indexPattern,
+            SchemaUpdateExceptionPolicy schemaUpdateExceptionPolicy) {
+        this.docType = docType;
+        this.serializationSchema = Preconditions.checkNotNull(serializationSchema);
+        this.contentType = Preconditions.checkNotNull(contentType);
+        this.requestFactory = Preconditions.checkNotNull(requestFactory);
+        this.createKey = Preconditions.checkNotNull(createKey);
+        this.createRouting = createRouting;
+        this.dirtySinkHelper = dirtySinkHelper;
+        this.tableSchemaFactory = tableSchemaFactory;
+        this.multipleFormat = multipleFormat;
+        this.indexPattern = indexPattern;
+        this.schemaUpdateExceptionPolicy = schemaUpdateExceptionPolicy;
+    }
+
+    @Override
+    public void open(RuntimeContext ctx, SinkMetricData sinkMetricData) {
+        indexGeneratorMap = new HashMap<>();
+        this.sinkMetricData = sinkMetricData;
+    }
+
+    private void sendMetrics(byte[] document) {
+        if (sinkMetricData != null) {
+            sinkMetricData.invoke(1, document.length);
+        }
+    }
+
+    @Override
+    public void initializeState(FunctionInitializationContext context) {
+    }
+
+    @Override
+    public void snapshotState(FunctionSnapshotContext context) {
+    }
+
+    @Override
+    public void process(RowData element, RuntimeContext ctx, RequestIndexer<Request> indexer) {
+        JsonNode rootNode = null;
+        // parse rootnode
+        try {
+            jsonDynamicSchemaFormat =
+                    (JsonDynamicSchemaFormat) DynamicSchemaFormatFactory.getFormat(multipleFormat);
+            rootNode = jsonDynamicSchemaFormat.deserialize(element.getBinary(0));
+            // Ignore ddl change for now
+            boolean isDDL = jsonDynamicSchemaFormat.extractDDLFlag(rootNode);
+            if (isDDL) {
+                LOGGER.error("ddl change unsupported");
+                return;
+            }
+        } catch (Exception e) {
+            LOGGER.error(String.format("deserialize error, raw data: %s", new String(element.getBinary(0))), e);
+        }
+
+        RowType rowType = jsonDynamicSchemaFormat.extractSchema(rootNode);
+        RowData data = jsonDynamicSchemaFormat.extractRowData(rootNode, rowType).get(0);
+        // generate the serialization schema
+        serializationSchema = new JsonRowDataSerializationSchema(
+                rowType, TimestampFormat.ISO_8601, MapNullKeyMode.LITERAL, "null", true);
+
+        final byte[] document;
+        try {
+            // for multiple sink, need to update runtimeconverter to correct rowtype
+            // for now create custom schema
+            document = serializationSchema.serialize(data);
+        } catch (Exception e) {
+            LOGGER.error(String.format("Serialize error, raw data: %s", data), e);
+            dirtySinkHelper.invoke(data, DirtyType.SERIALIZE_ERROR, e);
+            if (sinkMetricData != null) {
+                sinkMetricData.invokeDirty(1, data.toString().getBytes(StandardCharsets.UTF_8).length);
+            }
+            return;
+        }
+        final String key;
+        try {
+            // use uuid3 as key
+            JsonNode physicalData = jsonDynamicSchemaFormat.getPhysicalData(rootNode);
+            // will this line have performance issues?

Review Comment:
   What does this comment stand for?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@inlong.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org