You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2023/01/11 09:19:32 UTC

[GitHub] [doris] github-actions[bot] commented on a diff in pull request #15821: [Feature-WIP](inverted index) inverted index writer's implementation

github-actions[bot] commented on code in PR #15821:
URL: https://github.com/apache/doris/pull/15821#discussion_r1066750686


##########
be/src/olap/rowset/segment_v2/inverted_index_compound_directory.h:
##########
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>

Review Comment:
   warning: 'CLucene.h' file not found [clang-diagnostic-error]
   ```cpp
   #include <CLucene.h>
            ^
   ```
   



##########
be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp:
##########
@@ -0,0 +1,294 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index_compound_reader.h"
+
+#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
+
+#define BUFFER_LENGTH 16384
+#define CL_MAX_PATH 4096
+#define CL_MAX_DIR CL_MAX_PATH
+
+#define STRDUP_WtoA(x) CL_NS(util)::Misc::_wideToChar(x)
+#define STRDUP_TtoA STRDUP_WtoA
+
+using FileWriterPtr = std::unique_ptr<doris::io::FileWriter>;
+
+namespace doris {
+namespace segment_v2 {
+
+class DorisCompoundReader::ReaderFileEntry : LUCENE_BASE {
+public:
+    std::string file_name {};
+    int64_t offset;
+    int64_t length;
+    ReaderFileEntry() {
+        //file_name = nullptr;
+        offset = 0;
+        length = 0;
+    }
+    ~ReaderFileEntry() override = default;
+};
+
+/** Implementation of an IndexInput that reads from a portion of the
+ *  compound file.
+ */
+class CSIndexInput : public lucene::store::BufferedIndexInput {
+private:
+    CL_NS(store)::IndexInput* base;
+    int64_t fileOffset;
+    int64_t _length;
+
+protected:
+    void readInternal(uint8_t* /*b*/, const int32_t /*len*/) override;
+    void seekInternal(const int64_t /*pos*/) override {}
+
+public:
+    CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset, const int64_t length,
+                 const int32_t readBufferSize = CL_NS(store)::BufferedIndexInput::BUFFER_SIZE);
+    CSIndexInput(const CSIndexInput& clone);
+    ~CSIndexInput() override;
+    void close() override;
+    lucene::store::IndexInput* clone() const override;
+    int64_t length() const override { return _length; }
+    const char* getDirectoryType() const override { return DorisCompoundReader::getClassName(); }
+    const char* getObjectName() const override { return getClassName(); }
+    static const char* getClassName() { return "CSIndexInput"; }
+};
+
+CSIndexInput::CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset,
+                           const int64_t length, const int32_t _readBufferSize)
+        : BufferedIndexInput(_readBufferSize) {
+    this->base = base;
+    this->fileOffset = fileOffset;
+    this->_length = length;
+}
+
+void CSIndexInput::readInternal(uint8_t* b, const int32_t len) {
+    std::lock_guard<std::mutex> wlock(((DorisCompoundDirectory::FSIndexInput*)base)->_this_lock);
+
+    int64_t start = getFilePointer();
+    if (start + len > _length) {
+        _CLTHROWA(CL_ERR_IO, "read past EOF");
+    }
+    base->seek(fileOffset + start);
+    base->readBytes(b, len, false);
+}
+
+CSIndexInput::~CSIndexInput() = default;
+
+lucene::store::IndexInput* CSIndexInput::clone() const {
+    return _CLNEW CSIndexInput(*this);
+}
+
+CSIndexInput::CSIndexInput(const CSIndexInput& clone) : BufferedIndexInput(clone) {
+    this->base = clone.base;
+    this->fileOffset = clone.fileOffset;
+    this->_length = clone._length;
+}
+
+void CSIndexInput::close() {}
+
+DorisCompoundReader::DorisCompoundReader(lucene::store::Directory* d, const char* name,
+                                         int32_t read_buffer_size)
+        : readBufferSize(read_buffer_size),
+          dir(d),
+          ram_dir(new lucene::store::RAMDirectory()),
+          stream(nullptr),
+          entries(_CLNEW EntriesType(true, true)) {
+    file_name = strdup(name);
+    bool success = false;
+    try {
+        stream = dir->openInput(name, readBufferSize);
+
+        int32_t count = stream->readVInt();
+        ReaderFileEntry* entry = nullptr;
+        TCHAR tid[CL_MAX_PATH];
+        uint8_t buffer[BUFFER_LENGTH];
+        for (int32_t i = 0; i < count; i++) {
+            entry = _CLNEW ReaderFileEntry();
+            stream->readString(tid, CL_MAX_PATH);
+            char* aid = STRDUP_TtoA(tid);
+            entry->file_name = aid;
+            entry->offset = stream->readLong();
+            entry->length = stream->readLong();
+            entries->put(aid, entry);
+            // read header file data
+            if (entry->offset < 0) {
+                copyFile(entry->file_name.c_str(), entry->length, buffer, BUFFER_LENGTH);
+            }
+        }
+
+        success = true;
+    }
+    _CLFINALLY(if (!success && (stream != nullptr)) {
+        try {
+            stream->close();
+            _CLDELETE(stream)
+        } catch (CLuceneError& err) {
+            if (err.number() != CL_ERR_IO) {
+                throw err;
+            }
+        }
+    })
+}
+
+void DorisCompoundReader::copyFile(const char* file, int64_t file_length, uint8_t* buffer,
+                                   int64_t buffer_length) {
+    std::unique_ptr<lucene::store::IndexOutput> output(ram_dir->createOutput(file));
+    int64_t start_ptr = output->getFilePointer();
+    int64_t remainder = file_length;
+    int64_t chunk = buffer_length;
+
+    while (remainder > 0) {
+        int64_t len = std::min(std::min(chunk, file_length), remainder);
+        stream->readBytes(buffer, len);
+        output->writeBytes(buffer, len);
+        remainder -= len;
+    }
+    if (remainder != 0) {
+        TCHAR buf[CL_MAX_PATH + 100];
+        swprintf(buf, CL_MAX_PATH + 100,
+                 _T("Non-zero remainder length after copying")
+                 _T(": %d (id: %s, length: %d, buffer size: %d)"),
+                 (int)remainder, file_name.c_str(), (int)file_length, (int)chunk);
+        _CLTHROWT(CL_ERR_IO, buf);
+    }
+
+    int64_t end_ptr = output->getFilePointer();
+    int64_t diff = end_ptr - start_ptr;
+    if (diff != file_length) {
+        TCHAR buf[100];
+        swprintf(buf, 100,
+                 _T("Difference in the output file offsets %d ")
+                 _T("does not match the original file length %d"),
+                 (int)diff, (int)file_length);
+        _CLTHROWA(CL_ERR_IO, buf);
+    }
+    output->close();
+}
+
+DorisCompoundReader::~DorisCompoundReader() {

Review Comment:
   warning: use '= default' to define a trivial destructor [modernize-use-equals-default]
   ```cpp
   DorisCompoundReader::~DorisCompoundReader() {
                        ^
   ```
   



##########
be/src/olap/rowset/segment_v2/inverted_index_compound_reader.h:
##########
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>

Review Comment:
   warning: 'CLucene.h' file not found [clang-diagnostic-error]
   ```cpp
   #include <CLucene.h>
            ^
   ```
   



##########
be/src/olap/rowset/segment_v2/inverted_index_writer.cpp:
##########
@@ -0,0 +1,421 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index_writer.h"
+
+#include <CLucene.h>

Review Comment:
   warning: 'CLucene.h' file not found [clang-diagnostic-error]
   ```cpp
   #include <CLucene.h>
            ^
   ```
   



##########
be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp:
##########
@@ -0,0 +1,294 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index_compound_reader.h"
+
+#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
+
+#define BUFFER_LENGTH 16384
+#define CL_MAX_PATH 4096
+#define CL_MAX_DIR CL_MAX_PATH
+
+#define STRDUP_WtoA(x) CL_NS(util)::Misc::_wideToChar(x)
+#define STRDUP_TtoA STRDUP_WtoA
+
+using FileWriterPtr = std::unique_ptr<doris::io::FileWriter>;
+
+namespace doris {
+namespace segment_v2 {
+
+class DorisCompoundReader::ReaderFileEntry : LUCENE_BASE {
+public:
+    std::string file_name {};
+    int64_t offset;
+    int64_t length;
+    ReaderFileEntry() {
+        //file_name = nullptr;
+        offset = 0;
+        length = 0;
+    }
+    ~ReaderFileEntry() override = default;
+};
+
+/** Implementation of an IndexInput that reads from a portion of the
+ *  compound file.
+ */
+class CSIndexInput : public lucene::store::BufferedIndexInput {
+private:
+    CL_NS(store)::IndexInput* base;
+    int64_t fileOffset;
+    int64_t _length;
+
+protected:
+    void readInternal(uint8_t* /*b*/, const int32_t /*len*/) override;
+    void seekInternal(const int64_t /*pos*/) override {}
+
+public:
+    CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset, const int64_t length,
+                 const int32_t readBufferSize = CL_NS(store)::BufferedIndexInput::BUFFER_SIZE);
+    CSIndexInput(const CSIndexInput& clone);
+    ~CSIndexInput() override;
+    void close() override;
+    lucene::store::IndexInput* clone() const override;
+    int64_t length() const override { return _length; }
+    const char* getDirectoryType() const override { return DorisCompoundReader::getClassName(); }
+    const char* getObjectName() const override { return getClassName(); }
+    static const char* getClassName() { return "CSIndexInput"; }
+};
+
+CSIndexInput::CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset,
+                           const int64_t length, const int32_t _readBufferSize)
+        : BufferedIndexInput(_readBufferSize) {
+    this->base = base;
+    this->fileOffset = fileOffset;
+    this->_length = length;
+}
+
+void CSIndexInput::readInternal(uint8_t* b, const int32_t len) {

Review Comment:
   warning: method 'readInternal' can be made const [readability-make-member-function-const]
   
   be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp:56:
   ```diff
   -     void readInternal(uint8_t* /*b*/, const int32_t /*len*/) override;
   +     void readInternal(uint8_t* /*b*/, const int32_t /*len*/) const override;
   ```
   
   ```suggestion
   void CSIndexInput::readInternal(uint8_t* b, const int32_t len) const {
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org