Repo created

Fr4nz D13trich 2025-11-22 13:58:55 +01:00
parent 4af19165ec
commit 68073add76
12458 changed files with 12350765 additions and 2 deletions

search/base/text_index/dictionary.hpp
@@ -0,0 +1,116 @@
#pragma once
#include "search/base/text_index/header.hpp"
#include "search/base/text_index/text_index.hpp"
#include "coding/write_to_sink.hpp"
#include "base/assert.hpp"
#include "base/checked_cast.hpp"
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>
namespace search_base
{
// The dictionary contains all tokens that are present
// in the text index.
class TextIndexDictionary
{
public:
bool GetTokenId(Token const & token, size_t & id) const
{
auto const it = std::lower_bound(m_tokens.cbegin(), m_tokens.cend(), token);
if (it == m_tokens.cend() || *it != token)
return false;
id = base::checked_cast<uint32_t>(std::distance(m_tokens.cbegin(), it));
return true;
}
void SetTokens(std::vector<Token> && tokens)
{
ASSERT(std::is_sorted(tokens.begin(), tokens.end()), ());
m_tokens = std::move(tokens);
}
std::vector<Token> const & GetTokens() const { return m_tokens; }
template <typename Sink>
void Serialize(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
{
header.m_numTokens = base::checked_cast<uint32_t>(m_tokens.size());
header.m_dictPositionsOffset = RelativePos(sink, startPos);
// A uint32_t for each offset and one more for the dummy entry at the end.
WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1));
header.m_dictWordsOffset = RelativePos(sink, startPos);
std::vector<uint32_t> offsets;
offsets.reserve(header.m_numTokens + 1);
for (auto const & token : m_tokens)
{
offsets.emplace_back(RelativePos(sink, startPos));
SerializeToken(sink, token);
}
offsets.emplace_back(RelativePos(sink, startPos));
{
uint64_t const savedPos = sink.Pos();
sink.Seek(startPos + header.m_dictPositionsOffset);
for (uint32_t const o : offsets)
WriteToSink(sink, o);
CHECK_EQUAL(sink.Pos(), startPos + header.m_dictWordsOffset, ());
sink.Seek(savedPos);
}
}
template <typename Source>
void Deserialize(Source & source, TextIndexHeader const & header)
{
auto const startPos = source.Pos();
std::vector<uint32_t> tokenOffsets(header.m_numTokens + 1);
for (uint32_t & offset : tokenOffsets)
offset = ReadPrimitiveFromSource<uint32_t>(source);
uint64_t const expectedSize = header.m_dictWordsOffset - header.m_dictPositionsOffset;
CHECK_EQUAL(source.Pos(), startPos + expectedSize, ());
m_tokens.resize(header.m_numTokens);
for (size_t i = 0; i < m_tokens.size(); ++i)
{
size_t const size = base::checked_cast<size_t>(tokenOffsets[i + 1] - tokenOffsets[i]);
DeserializeToken(source, m_tokens[i], size);
}
}
private:
template <typename Sink>
static void SerializeToken(Sink & sink, Token const & token)
{
CHECK(!token.empty(), ());
// todo(@m) Endianness.
sink.Write(token.data(), token.size() * sizeof(typename Token::value_type));
}
template <typename Source>
static void DeserializeToken(Source & source, Token & token, size_t size)
{
CHECK_GREATER(size, 0, ());
ASSERT_EQUAL(size % sizeof(typename Token::value_type), 0, ());
token.resize(size / sizeof(typename Token::value_type));
source.Read(&token[0], size);
}
template <typename Sink>
static uint32_t RelativePos(Sink & sink, uint64_t startPos)
{
return base::checked_cast<uint32_t>(sink.Pos() - startPos);
}
std::vector<Token> m_tokens;
};
} // namespace search_base
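
For orientation, a minimal usage sketch of the dictionary (a hypothetical snippet, not part of the commit; it assumes only the header above): tokens must be handed over pre-sorted, and GetTokenId is a binary search over them.

#include "search/base/text_index/dictionary.hpp"

#include <cassert>
#include <vector>

void DictionaryUsageSketch()
{
  search_base::TextIndexDictionary dict;
  // SetTokens expects a sorted vector; the ASSERT inside enforces this.
  dict.SetTokens({"apple", "banana", "cherry"});

  size_t id = 0;
  assert(dict.GetTokenId("banana", id) && id == 1);  // Found at index 1.
  assert(!dict.GetTokenId("durian", id));            // Absent tokens return false.
}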

search/base/text_index/header.cpp
@@ -0,0 +1,9 @@
#include "search/base/text_index/header.hpp"
using namespace std;
namespace search_base
{
// static
string const TextIndexHeader::kHeaderMagic = "mapsmetextidx";
} // namespace search_base

search/base/text_index/header.hpp
@@ -0,0 +1,57 @@
#pragma once
#include "search/base/text_index/text_index.hpp"
#include "coding/reader.hpp"
#include "coding/write_to_sink.hpp"
#include "base/assert.hpp"
#include <cstdint>
#include <string>
namespace search_base
{
struct TextIndexHeader
{
template <typename Sink>
void Serialize(Sink & sink) const
{
CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
sink.Write(kHeaderMagic.data(), kHeaderMagic.size());
WriteToSink(sink, static_cast<uint8_t>(m_version));
WriteToSink(sink, m_numTokens);
WriteToSink(sink, m_dictPositionsOffset);
WriteToSink(sink, m_dictWordsOffset);
WriteToSink(sink, m_postingsStartsOffset);
WriteToSink(sink, m_postingsListsOffset);
}
template <typename Source>
void Deserialize(Source & source)
{
CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
std::string headerMagic(kHeaderMagic.size(), ' ');
source.Read(&headerMagic[0], headerMagic.size());
CHECK_EQUAL(headerMagic, kHeaderMagic, ());
m_version = static_cast<TextIndexVersion>(ReadPrimitiveFromSource<uint8_t>(source));
CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
m_numTokens = ReadPrimitiveFromSource<uint32_t>(source);
m_dictPositionsOffset = ReadPrimitiveFromSource<uint32_t>(source);
m_dictWordsOffset = ReadPrimitiveFromSource<uint32_t>(source);
m_postingsStartsOffset = ReadPrimitiveFromSource<uint32_t>(source);
m_postingsListsOffset = ReadPrimitiveFromSource<uint32_t>(source);
}
static std::string const kHeaderMagic;
TextIndexVersion m_version = TextIndexVersion::Latest;
uint32_t m_numTokens = 0;
uint32_t m_dictPositionsOffset = 0;
uint32_t m_dictWordsOffset = 0;
uint32_t m_postingsStartsOffset = 0;
uint32_t m_postingsListsOffset = 0;
};
} // namespace search_base
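
A hedged round-trip sketch for the header (illustrative only; MemWriter and MemReader are assumed to come from coding/writer.hpp and coding/reader.hpp, as used elsewhere in this repo). It also spells out the fixed on-disk size implied by Serialize: 13 magic bytes, 1 version byte, and five uint32_t offsets.

#include "search/base/text_index/header.hpp"

#include "coding/reader.hpp"
#include "coding/writer.hpp"

#include <cstdint>
#include <vector>

void HeaderRoundTripSketch()
{
  search_base::TextIndexHeader header;
  header.m_numTokens = 42;

  std::vector<uint8_t> buf;
  MemWriter<std::vector<uint8_t>> sink(buf);
  header.Serialize(sink);
  CHECK_EQUAL(buf.size(), 13 + 1 + 5 * sizeof(uint32_t), ());  // 34 bytes.

  search_base::TextIndexHeader copy;
  MemReader reader(buf.data(), buf.size());
  ReaderSource<MemReader> source(reader);
  copy.Deserialize(source);
  CHECK_EQUAL(copy.m_numTokens, 42, ());
}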

search/base/text_index/mem.cpp
@@ -0,0 +1,34 @@
#include "search/base/text_index/mem.hpp"
#include "base/stl_helpers.hpp"
using namespace std;
namespace search_base
{
void MemTextIndex::AddPosting(Token const & token, Posting const & posting)
{
m_postingsByToken[token].emplace_back(posting);
}
void MemTextIndex::SortPostings()
{
for (auto & entry : m_postingsByToken)
{
// A posting may occur several times in a document,
// so we remove duplicates for the docid index.
// If the count is needed for ranking, it may be
// stored separately.
base::SortUnique(entry.second);
}
}
void MemTextIndex::BuildDictionary()
{
vector<Token> tokens;
tokens.reserve(m_postingsByToken.size());
for (auto const & entry : m_postingsByToken)
tokens.emplace_back(entry.first);
m_dictionary.SetTokens(std::move(tokens));
}
} // namespace search_base

search/base/text_index/mem.hpp
@@ -0,0 +1,167 @@
#pragma once
#include "search/base/text_index/dictionary.hpp"
#include "search/base/text_index/header.hpp"
#include "search/base/text_index/postings.hpp"
#include "search/base/text_index/text_index.hpp"
#include "search/base/text_index/utils.hpp"
#include "coding/reader.hpp"
#include "coding/varint.hpp"
#include "base/assert.hpp"
#include "base/string_utils.hpp"
#include <algorithm>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace search_base
{
class MemTextIndex
{
public:
MemTextIndex() = default;
void AddPosting(Token const & token, Posting const & posting);
// Executes |fn| on every posting associated with |token|.
// The order of postings is not specified.
template <typename Fn>
void ForEachPosting(Token const & token, Fn && fn) const
{
auto const it = m_postingsByToken.find(token);
if (it == m_postingsByToken.end())
return;
for (auto const p : it->second)
fn(p);
}
template <typename Fn>
void ForEachPosting(strings::UniString const & token, Fn && fn) const
{
ForEachPosting(strings::ToUtf8(token), std::forward<Fn>(fn));
}
template <typename Sink>
void Serialize(Sink & sink)
{
SortPostings();
BuildDictionary();
TextIndexHeader header;
uint64_t const startPos = sink.Pos();
// Will be filled in later.
header.Serialize(sink);
SerializeDictionary(sink, header, startPos);
SerializePostingsLists(sink, header, startPos);
uint64_t const finishPos = sink.Pos();
sink.Seek(startPos);
header.Serialize(sink);
sink.Seek(finishPos);
}
template <typename Source>
void Deserialize(Source & source)
{
uint64_t startPos = source.Pos();
TextIndexHeader header;
header.Deserialize(source);
DeserializeDictionary(source, header, startPos);
DeserializePostingsLists(source, header, startPos);
}
private:
class MemPostingsFetcher : public PostingsFetcher
{
public:
explicit MemPostingsFetcher(std::map<Token, std::vector<Posting>> const & postingsByToken)
: m_postingsByToken(postingsByToken)
, m_it(m_postingsByToken.begin())
{}
// PostingsFetcher overrides:
bool IsValid() const override { return m_it != m_postingsByToken.end(); }
void Advance() override
{
if (m_it != m_postingsByToken.end())
++m_it;
}
void ForEachPosting(Fn const & fn) const override
{
CHECK(IsValid(), ());
for (uint32_t p : m_it->second)
fn(p);
}
private:
std::map<Token, std::vector<Posting>> const & m_postingsByToken;
// Iterator to the current token that will be processed when ForEachPosting is called.
std::map<Token, std::vector<Posting>>::const_iterator m_it;
};
void SortPostings();
void BuildDictionary();
template <typename Sink>
void SerializeDictionary(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
{
m_dictionary.Serialize(sink, header, startPos);
}
template <typename Source>
void DeserializeDictionary(Source & source, TextIndexHeader const & header, uint64_t startPos)
{
CHECK_EQUAL(source.Pos(), startPos + header.m_dictPositionsOffset, ());
m_dictionary.Deserialize(source, header);
}
template <typename Sink>
void SerializePostingsLists(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
{
MemPostingsFetcher fetcher(m_postingsByToken);
WritePostings(sink, startPos, header, fetcher);
}
template <typename Source>
void DeserializePostingsLists(Source & source, TextIndexHeader const & header, uint64_t startPos)
{
CHECK_EQUAL(source.Pos(), startPos + header.m_postingsStartsOffset, ());
std::vector<uint32_t> postingsStarts(header.m_numTokens + 1);
for (uint32_t & start : postingsStarts)
start = ReadPrimitiveFromSource<uint32_t>(source);
auto const & tokens = m_dictionary.GetTokens();
CHECK_EQUAL(source.Pos(), startPos + header.m_postingsListsOffset, ());
m_postingsByToken.clear();
for (size_t i = 0; i < header.m_numTokens; ++i)
{
std::vector<uint32_t> postings;
uint32_t last = 0;
while (source.Pos() < startPos + postingsStarts[i + 1])
{
last += ReadVarUint<uint32_t>(source);
postings.emplace_back(last);
}
CHECK_EQUAL(source.Pos(), startPos + postingsStarts[i + 1], ());
m_postingsByToken.emplace(tokens[i], std::move(postings));
}
}
std::map<Token, std::vector<Posting>> m_postingsByToken;
TextIndexDictionary m_dictionary;
};
} // namespace search_base
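
An end-to-end sketch of the in-memory index (hypothetical usage, not part of the commit; MemWriter/MemReader from coding/ are assumed, as above): postings are added in any order, Serialize sorts and deduplicates them via SortPostings, and Deserialize restores the same map.

#include "search/base/text_index/mem.hpp"

#include "coding/reader.hpp"
#include "coding/writer.hpp"

#include <cstdint>
#include <string>
#include <vector>

void MemTextIndexSketch()
{
  search_base::MemTextIndex index;
  index.AddPosting("riverside", 100 /* docid */);
  index.AddPosting("riverside", 100);  // Duplicate; dropped by SortPostings.
  index.AddPosting("riverside", 7);

  std::vector<uint8_t> buf;
  MemWriter<std::vector<uint8_t>> sink(buf);
  index.Serialize(sink);

  search_base::MemTextIndex copy;
  MemReader reader(buf.data(), buf.size());
  ReaderSource<MemReader> source(reader);
  copy.Deserialize(source);

  std::vector<uint32_t> postings;
  copy.ForEachPosting(std::string("riverside"), [&](uint32_t p) { postings.push_back(p); });
  // postings == {7, 100}: sorted and unique.
}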

search/base/text_index/merger.cpp
@@ -0,0 +1,126 @@
#include "search/base/text_index/merger.hpp"
#include "search/base/text_index/dictionary.hpp"
#include "search/base/text_index/header.hpp"
#include "search/base/text_index/postings.hpp"
#include "coding/file_writer.hpp"
#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"
#include "base/assert.hpp"
#include "base/logging.hpp"
#include "base/stl_helpers.hpp"
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <utility>
#include <vector>
using namespace std;
namespace
{
using namespace search_base;
class MergedPostingsListFetcher : public PostingsFetcher
{
public:
MergedPostingsListFetcher(TextIndexDictionary const & dict, TextIndexReader const & index1,
TextIndexReader const & index2)
: m_dict(dict)
, m_index1(index1)
, m_index2(index2)
{
ReadPostings();
}
// PostingsFetcher overrides:
bool IsValid() const override
{
auto const & tokens = m_dict.GetTokens();
CHECK_LESS_OR_EQUAL(m_tokenId, tokens.size(), ());
return m_tokenId < tokens.size();
}
void Advance() override
{
auto const & tokens = m_dict.GetTokens();
CHECK_LESS_OR_EQUAL(m_tokenId, tokens.size(), ());
if (m_tokenId == tokens.size())
return;
++m_tokenId;
ReadPostings();
}
void ForEachPosting(Fn const & fn) const override
{
CHECK(IsValid(), ());
for (uint32_t p : m_postings)
fn(p);
}
private:
// Reads postings for the current token.
void ReadPostings()
{
m_postings.clear();
if (!IsValid())
return;
auto const & tokens = m_dict.GetTokens();
m_index1.ForEachPosting(tokens[m_tokenId], base::MakeBackInsertFunctor(m_postings));
m_index2.ForEachPosting(tokens[m_tokenId], base::MakeBackInsertFunctor(m_postings));
base::SortUnique(m_postings);
}
TextIndexDictionary const & m_dict;
TextIndexReader const & m_index1;
TextIndexReader const & m_index2;
// Index of the next token from |m_dict| to be processed.
size_t m_tokenId = 0;
vector<uint32_t> m_postings;
};
TextIndexDictionary MergeDictionaries(TextIndexDictionary const & dict1, TextIndexDictionary const & dict2)
{
vector<Token> commonTokens;
auto const & ts1 = dict1.GetTokens();
auto const & ts2 = dict2.GetTokens();
merge(ts1.begin(), ts1.end(), ts2.begin(), ts2.end(), back_inserter(commonTokens));
ASSERT(is_sorted(commonTokens.begin(), commonTokens.end()), ());
commonTokens.erase(unique(commonTokens.begin(), commonTokens.end()), commonTokens.end());
TextIndexDictionary dict;
dict.SetTokens(std::move(commonTokens));
return dict;
}
} // namespace
namespace search_base
{
// static
void TextIndexMerger::Merge(TextIndexReader const & index1, TextIndexReader const & index2, FileWriter & sink)
{
TextIndexDictionary const dict = MergeDictionaries(index1.GetDictionary(), index2.GetDictionary());
TextIndexHeader header;
uint64_t const startPos = sink.Pos();
// Will be filled in later.
header.Serialize(sink);
dict.Serialize(sink, header, startPos);
MergedPostingsListFetcher fetcher(dict, index1, index2);
WritePostings(sink, startPos, header, fetcher);
// Fill in the header.
uint64_t const finishPos = sink.Pos();
sink.Seek(startPos);
header.Serialize(sink);
sink.Seek(finishPos);
}
} // namespace search_base

search/base/text_index/merger.hpp
@@ -0,0 +1,26 @@
#pragma once
#include "search/base/text_index/reader.hpp"
class FileWriter;
namespace search_base
{
// Merges two on-disk text indexes and writes the result to a new one
// (see the usage sketch below).
class TextIndexMerger
{
public:
// The merging process is as follows.
// 1. Dictionaries from both indexes are read into memory, merged
// and written to disk.
// 2. One uint32_t per entry is reserved in memory to calculate the
// offsets of the postings lists.
// 3. One token at a time, all postings for the token are read from
// both indexes into memory, unified and written to disk.
// 4. The offsets are written to disk.
//
// Note that the dictionary and offsets are kept in memory during the whole
// merging process.
static void Merge(TextIndexReader const & index1, TextIndexReader const & index2, FileWriter & sink);
};
} // namespace search_base
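
A hypothetical driver for the merger (the file names here are placeholders, not part of the commit):

#include "search/base/text_index/merger.hpp"
#include "search/base/text_index/reader.hpp"

#include "coding/file_reader.hpp"
#include "coding/file_writer.hpp"

void MergeSketch()
{
  search_base::TextIndexReader const index1{FileReader("index1.dat")};
  search_base::TextIndexReader const index2{FileReader("index2.dat")};

  FileWriter sink("merged.dat");
  search_base::TextIndexMerger::Merge(index1, index2, sink);
}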

search/base/text_index/postings.hpp
@@ -0,0 +1,88 @@
#pragma once
#include "search/base/text_index/header.hpp"
#include "search/base/text_index/text_index.hpp"
#include "search/base/text_index/utils.hpp"
#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"
#include <cstdint>
#include <functional>
#include <vector>
namespace search_base
{
struct TextIndexHeader;
// A helper class that fetches the postings lists for
// one token at a time. It is assumed that the tokens
// are enumerated in the lexicographic order.
class PostingsFetcher
{
public:
using Fn = std::function<void(uint32_t)>;
virtual ~PostingsFetcher() = default;
// Returns true when there are tokens left in the fetcher and false otherwise.
virtual bool IsValid() const = 0;
// Advances fetcher to the next token.
virtual void Advance() = 0;
// Calls |fn| for every posting of the current token. Initially,
// the current token is the first one; each call to Advance
// moves to the next token, until the underlying source of
// tokens is exhausted and the fetcher is no longer valid.
virtual void ForEachPosting(Fn const & fn) const = 0;
};
// Fetches the postings list one by one from |fetcher| and writes them
// to |sink|, updating the fields in |header| that correspond to the
// postings list.
// |startPos| marks the start of the entire text index and is needed to compute
// the offsets that are stored in |header|.
template <typename Sink>
void WritePostings(Sink & sink, uint64_t startPos, TextIndexHeader & header, PostingsFetcher & fetcher)
{
header.m_postingsStartsOffset = RelativePos(sink, startPos);
// A uint32_t for each offset and one more for the dummy entry at the end.
WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1));
header.m_postingsListsOffset = RelativePos(sink, startPos);
std::vector<uint32_t> postingsStarts;
postingsStarts.reserve(header.m_numTokens);
{
uint32_t last = 0;
// todo(@m) s/uint32_t/Posting/ ?
auto writePostings = [&](uint32_t p)
{
CHECK(last == 0 || last < p, (last, p));
uint32_t const delta = p - last;
WriteVarUint(sink, delta);
last = p;
};
while (fetcher.IsValid())
{
postingsStarts.emplace_back(RelativePos(sink, startPos));
last = 0;
fetcher.ForEachPosting(writePostings);
fetcher.Advance();
}
}
// One more for convenience.
postingsStarts.emplace_back(RelativePos(sink, startPos));
{
uint64_t const savedPos = sink.Pos();
sink.Seek(startPos + header.m_postingsStartsOffset);
for (uint32_t const s : postingsStarts)
WriteToSink(sink, s);
CHECK_EQUAL(sink.Pos(), startPos + header.m_postingsListsOffset, ());
sink.Seek(savedPos);
}
}
} // namespace search_base
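
To make the encoding concrete: postings are sorted, each value is stored as a delta from its predecessor, and deltas are written as varints (7 payload bits per byte, high bit set on all but the last byte). For the postings {3, 7, 300} the deltas are {3, 4, 293}, which encode to the bytes 0x03, 0x04, 0xA5 0x02 (293 = 0x25 + 2 * 128). A minimal sketch of the same write loop against a plain byte buffer (MemWriter is an assumption, as above):

#include "coding/varint.hpp"
#include "coding/writer.hpp"

#include <cstdint>
#include <vector>

std::vector<uint8_t> EncodePostingsSketch(std::vector<uint32_t> const & sortedPostings)
{
  std::vector<uint8_t> buf;
  MemWriter<std::vector<uint8_t>> sink(buf);
  uint32_t last = 0;
  for (uint32_t const p : sortedPostings)
  {
    WriteVarUint(sink, p - last);  // Delta to the previous posting.
    last = p;
  }
  return buf;  // For {3, 7, 300}: {0x03, 0x04, 0xA5, 0x02}.
}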

search/base/text_index/reader.hpp
@@ -0,0 +1,78 @@
#pragma once
#include "search/base/text_index/dictionary.hpp"
#include "search/base/text_index/text_index.hpp"
#include "coding/file_reader.hpp"
#include "coding/reader.hpp"
#include "coding/varint.hpp"
#include "base/assert.hpp"
#include "base/string_utils.hpp"
#include <cstdint>
#include <string>
#include <utility>
#include <vector>
namespace search_base
{
// A reader class for on-demand reading of postings lists from disk.
class TextIndexReader
{
public:
explicit TextIndexReader(FileReader const & fileReader) : m_fileReader(fileReader)
{
ReaderSource<FileReader> headerSource(m_fileReader);
TextIndexHeader header;
header.Deserialize(headerSource);
uint64_t const dictStart = header.m_dictPositionsOffset;
uint64_t const dictEnd = header.m_postingsStartsOffset;
ReaderSource<FileReader> dictSource(m_fileReader.SubReader(dictStart, dictEnd - dictStart));
m_dictionary.Deserialize(dictSource, header);
uint64_t const postStart = header.m_postingsStartsOffset;
uint64_t const postEnd = header.m_postingsListsOffset;
ReaderSource<FileReader> postingsSource(m_fileReader.SubReader(postStart, postEnd - postStart));
m_postingsStarts.resize(header.m_numTokens + 1);
for (uint32_t & start : m_postingsStarts)
start = ReadPrimitiveFromSource<uint32_t>(postingsSource);
}
// Executes |fn| on every posting associated with |token|.
// The order of postings is not specified.
template <typename Fn>
void ForEachPosting(Token const & token, Fn && fn) const
{
size_t tokenId = 0;
if (!m_dictionary.GetTokenId(token, tokenId))
return;
CHECK_LESS(tokenId + 1, m_postingsStarts.size(), ());
ReaderSource<FileReader> source(
m_fileReader.SubReader(m_postingsStarts[tokenId], m_postingsStarts[tokenId + 1] - m_postingsStarts[tokenId]));
uint32_t last = 0;
while (source.Size() > 0)
{
last += ReadVarUint<uint32_t>(source);
fn(last);
}
}
template <typename Fn>
void ForEachPosting(strings::UniString const & token, Fn && fn) const
{
ForEachPosting(strings::ToUtf8(token), std::forward<Fn>(fn));
}
TextIndexDictionary const & GetDictionary() const { return m_dictionary; }
private:
FileReader m_fileReader;
TextIndexDictionary m_dictionary;
std::vector<uint32_t> m_postingsStarts;
};
} // namespace search_base
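
A minimal read-side sketch (the index path is a placeholder): open an on-disk index and collect the docids for one token. Only the header, the dictionary and the postings starts are read eagerly; the postings list itself is fetched on demand.

#include "search/base/text_index/reader.hpp"

#include "coding/file_reader.hpp"

#include <cstdint>
#include <string>
#include <vector>

std::vector<uint32_t> ReadPostingsSketch(std::string const & indexPath, std::string const & token)
{
  FileReader fileReader(indexPath);
  search_base::TextIndexReader reader(fileReader);

  std::vector<uint32_t> postings;
  reader.ForEachPosting(token, [&](uint32_t p) { postings.push_back(p); });
  return postings;  // Sorted, unique docids.
}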

search/base/text_index/text_index.cpp
@@ -0,0 +1,20 @@
#include "search/base/text_index/text_index.hpp"
#include "base/assert.hpp"
#include "base/string_utils.hpp"
using namespace std;
namespace search_base
{
string DebugPrint(TextIndexVersion const & version)
{
switch (version)
{
case TextIndexVersion::V0: return "V0";
}
// Cast to a wider type so the version prints as a number, not a char.
string ret = "Unknown TextIndexHeader version: " + strings::to_string(static_cast<uint32_t>(version));
ASSERT(false, (ret));
return ret;
}
} // namespace search_base

search/base/text_index/text_index.hpp
@@ -0,0 +1,42 @@
#pragma once
#include <cstdint>
#include <string>
// This file contains the structures needed to store an
// updatable text index on disk.
//
// The index maps tokens of string type (typically std::string or
// strings::UniString) to postings lists, i.e. to lists of entities
// called postings that encode the locations of the strings in the collection
// of text documents being indexed. An example of a posting
// is a document id (docid). Another example is a pair of a document id and
// a position within the corresponding document.
//
// The updates are performed by rebuilding the index, either as a result
// of merging several indexes together, or as a result of clearing outdated
// entries from an old index.
//
// For version 0, the postings lists are docid arrays, i.e. arrays of unsigned
// 32-bit integers stored in increasing order.
// The structure of the index is:
// [header: version and offsets]
// [array containing the starting positions of tokens]
// [tokens, written without separators in the lexicographical order]
// [array containing the offsets for the postings lists]
// [postings lists, stored as delta-encoded varints]
//
// All offsets are measured relative to the start of the index.
namespace search_base
{
using Token = std::string;
using Posting = uint32_t;
enum class TextIndexVersion : uint8_t
{
V0 = 0,
Latest = V0
};
std::string DebugPrint(TextIndexVersion const & version);
} // namespace search_base
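
To make the layout above concrete, here is a hand-worked byte map of a tiny V0 index (derived from the Serialize code in this commit; the numbers are illustrative): two tokens, "a" with postings {1, 5} and "bc" with postings {2}. All offsets are relative to the start of the index.

//   [ 0, 34)  header: magic "mapsmetextidx", version byte, five uint32_t offsets
//   [34, 46)  token starts: {46, 47, 49}, incl. the dummy entry at the end
//   [46, 49)  token data: "abc" ("a" and "bc", concatenated, no separators)
//   [49, 61)  postings starts: {61, 63, 64}, incl. the dummy entry
//   [61, 64)  postings lists: varint deltas 0x01 0x04 for {1, 5}; 0x02 for {2}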

search/base/text_index/utils.hpp
@@ -0,0 +1,14 @@
#pragma once
#include "base/checked_cast.hpp"
#include <cstdint>
namespace search_base
{
template <typename Sink>
uint32_t RelativePos(Sink & sink, uint64_t startPos)
{
return base::checked_cast<uint32_t>(sink.Pos() - startPos);
}
} // namespace search_base