// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Dictionary encoding for strings. There is only one dictionary block
// for all the data blocks within a cfile.
// layout for dictionary encoded block:
// Either header + embedded codeword block, which can be encoded with any
//        int blockbuilder, when mode_ = kCodeWordMode.
// Or     header + embedded BinaryPlainBlock, when mode_ = kPlainBinaryMode.
// Data blocks start with mode_ = kCodeWordMode. When the size of the dictionary
// block goes beyond options_->block_size, the subsequent data blocks switch
// to plain binary blocks automatically.

// You can embed any int block builder encoding formats, such as group-varint,
// bitshuffle. Currently, we use bitshuffle builder for codewords.
//
// To use another block builder/decoder, make sure that its BlockDecoder provides
// the interface CopyNextValuesToArray(size_t*, uint8_t*); it can then simply be
// substituted for the BShufBuilder/Decoder.
#pragma once

#include <sys/types.h>

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

#include <sparsehash/dense_hash_map>

#include "kudu/cfile/binary_plain_block.h"
#include "kudu/cfile/block_encodings.h"
#include "kudu/cfile/block_handle.h"
#include "kudu/common/rowid.h"
#include "kudu/gutil/casts.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/strings/stringpiece.h"
#include "kudu/util/faststring.h"
#include "kudu/util/memory/arena.h"
#include "kudu/util/slice.h"
#include "kudu/util/status.h"

// Forward declaration of the hash functor template that is used to
// instantiate the dictionary_ hash map of BinaryDictBlockBuilder in this
// header. It is declared at global scope, outside of the kudu namespace.
template <class X>
struct GoodFastHash;

namespace kudu {

// Forward declarations: within this header these types are only referenced
// through pointers in the BinaryDictBlockDecoder copy methods.
class ColumnDataView;
class ColumnMaterializationContext;
class SelectionVectorView;

namespace cfile {

// Forward declarations: within this header these types are only referenced
// through pointers by BinaryDictBlockBuilder (its constructor, its options_
// member and AppendExtraInfo()).
class CFileFooterPB;
class CFileWriter;
struct WriterOptions;

// Encoding mode stored in the header of a dictionary-encoded data block.
enum DictEncodingMode {
  // The header is followed by an embedded codeword block.
  kCodeWordMode = 1,
  // The header is followed by an embedded plain binary block.
  kPlainBinaryMode = 2,

  // Inclusive bounds of the valid enumerators; these are the values handed to
  // the MAKE_ENUM_LIMITS() invocation at the bottom of this file.
  DictEncodingMode_min = kCodeWordMode,
  DictEncodingMode_max = kPlainBinaryMode
};

class BinaryDictBlockBuilder final : public BlockBuilder {
 public:
  explicit BinaryDictBlockBuilder(const WriterOptions* options);

  bool IsBlockFull() const override;

  // Append the dictionary block for the current cfile to the end of the cfile and set the footer
  // accordingly.
  Status AppendExtraInfo(CFileWriter* c_writer, CFileFooterPB* footer) OVERRIDE;

  int Add(const uint8_t* vals, size_t count) OVERRIDE;

  void Finish(rowid_t ordinal_pos, std::vector<Slice>* slices) OVERRIDE;

  void Reset() OVERRIDE;

  size_t Count() const OVERRIDE;

  Status GetFirstKey(void* key) const OVERRIDE;

  Status GetLastKey(void* key) const OVERRIDE;

  static const size_t kMaxHeaderSize = sizeof(uint32_t) * 1;

 private:
  int AddCodeWords(const uint8_t* vals, size_t count);

  ATTRIBUTE_COLD
  bool AddToDict(Slice val, uint32_t* codeword);

  // Buffer used in Finish() for holding the encoded header.
  faststring header_buffer_;
  bool finished_;
  const WriterOptions* options_;

  std::unique_ptr<BlockBuilder> data_builder_;

  // dict_block_, dictionary_, dictionary_strings_arena_
  // is related to the dictionary block (one per cfile).
  // They should NOT be cleared in the Reset() method.
  BinaryPlainBlockBuilder dict_block_;

  google::dense_hash_map<StringPiece, uint32_t, GoodFastHash<StringPiece> > dictionary_;
  // Memory to hold the actual content for strings in the dictionary_.
  //
  // The size of it should be bigger than the size limit for dictionary block
  // (e.g option_->block_size).
  //
  // Currently, it can hold at most 64MB content.
  Arena dictionary_strings_arena_;

  DictEncodingMode mode_;

  // First key when mode_ = kCodeWordMode
  faststring first_key_;
};

// Forward declaration: BinaryDictBlockDecoder only holds a pointer to its
// parent CFileIterator.
class CFileIterator;

class BinaryDictBlockDecoder final : public BlockDecoder {
 public:
  explicit BinaryDictBlockDecoder(scoped_refptr<BlockHandle> block, CFileIterator* iter);

  virtual Status ParseHeader() OVERRIDE;
  virtual void SeekToPositionInBlock(uint pos) OVERRIDE;
  virtual Status SeekAtOrAfterValue(const void* value, bool* exact_match) OVERRIDE;
  Status CopyNextValues(size_t* n, ColumnDataView* dst) OVERRIDE;
  Status CopyNextAndEval(size_t* n,
                         ColumnMaterializationContext* ctx,
                         SelectionVectorView* sel,
                         ColumnDataView* dst) override;

  virtual bool HasNext() const OVERRIDE {
    return data_decoder_->HasNext();
  }

  virtual size_t Count() const OVERRIDE {
    return data_decoder_->Count();
  }

  virtual size_t GetCurrentIndex() const OVERRIDE {
    return data_decoder_->GetCurrentIndex();
  }

  virtual rowid_t GetFirstRowId() const OVERRIDE {
    return data_decoder_->GetFirstRowId();
  }

  static const size_t kMinHeaderSize = sizeof(uint32_t) * 1;

 private:
  Status CopyNextDecodeStrings(size_t* n, ColumnDataView* dst);

  scoped_refptr<BlockHandle> block_;
  Slice data_;
  bool parsed_;

  // Dictionary block decoder.
  BinaryPlainBlockDecoder* dict_decoder_;

  std::unique_ptr<BlockDecoder> data_decoder_;

  // Parent CFileIterator, each dictionary decoder in the same CFile will share
  // the same vocabulary, and thus, the same set of matching codewords.
  CFileIterator* parent_cfile_iter_;

  DictEncodingMode mode_;

  // buffer to hold the codewords, needed by CopyNextDecodeStrings()
  faststring codeword_buf_;

};

} // namespace cfile
} // namespace kudu

// Defined for tight_enum_test_cast<> -- has to be defined outside of any namespace.
//
// Declares DictEncodingMode_min and DictEncodingMode_max as the limits of
// kudu::cfile::DictEncodingMode.
// NOTE(review): presumably this lets a raw header value be range-checked before
// it is converted to the enum -- confirm against the macro definition.
MAKE_ENUM_LIMITS(kudu::cfile::DictEncodingMode,
                 kudu::cfile::DictEncodingMode_min,
                 kudu::cfile::DictEncodingMode_max);
