Repo created

parent 4af19165ec
commit 68073add76

12458 changed files with 12350765 additions and 2 deletions

libs/search/base/text_index/dictionary.hpp (new file, 116 lines)

#pragma once

#include "search/base/text_index/header.hpp"
#include "search/base/text_index/text_index.hpp"

#include "coding/write_to_sink.hpp"

#include "base/assert.hpp"
#include "base/checked_cast.hpp"

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

namespace search_base
{
// The dictionary contains all tokens that are present
// in the text index.
class TextIndexDictionary
{
public:
  bool GetTokenId(Token const & token, size_t & id) const
  {
    auto const it = std::lower_bound(m_tokens.cbegin(), m_tokens.cend(), token);
    if (it == m_tokens.cend() || *it != token)
      return false;
    id = base::checked_cast<uint32_t>(std::distance(m_tokens.cbegin(), it));
    return true;
  }

  void SetTokens(std::vector<Token> && tokens)
  {
    ASSERT(std::is_sorted(tokens.begin(), tokens.end()), ());
    m_tokens = std::move(tokens);
  }

  std::vector<Token> const & GetTokens() const { return m_tokens; }

  template <typename Sink>
  void Serialize(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
  {
    header.m_numTokens = base::checked_cast<uint32_t>(m_tokens.size());

    header.m_dictPositionsOffset = RelativePos(sink, startPos);
    // A uint32_t for each 32-bit offset and a uint32_t for the dummy entry at the end.
    WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1));
    header.m_dictWordsOffset = RelativePos(sink, startPos);

    std::vector<uint32_t> offsets;
    offsets.reserve(header.m_numTokens + 1);
    for (auto const & token : m_tokens)
    {
      offsets.emplace_back(RelativePos(sink, startPos));
      SerializeToken(sink, token);
    }
    offsets.emplace_back(RelativePos(sink, startPos));

    {
      uint64_t const savedPos = sink.Pos();
      sink.Seek(startPos + header.m_dictPositionsOffset);

      for (uint32_t const o : offsets)
        WriteToSink(sink, o);

      CHECK_EQUAL(sink.Pos(), startPos + header.m_dictWordsOffset, ());
      sink.Seek(savedPos);
    }
  }

  template <typename Source>
  void Deserialize(Source & source, TextIndexHeader const & header)
  {
    auto const startPos = source.Pos();

    std::vector<uint32_t> tokenOffsets(header.m_numTokens + 1);
    for (uint32_t & offset : tokenOffsets)
      offset = ReadPrimitiveFromSource<uint32_t>(source);

    uint64_t const expectedSize = header.m_dictWordsOffset - header.m_dictPositionsOffset;
    CHECK_EQUAL(source.Pos(), startPos + expectedSize, ());
    m_tokens.resize(header.m_numTokens);
    for (size_t i = 0; i < m_tokens.size(); ++i)
    {
      size_t const size = base::checked_cast<size_t>(tokenOffsets[i + 1] - tokenOffsets[i]);
      DeserializeToken(source, m_tokens[i], size);
    }
  }

private:
  template <typename Sink>
  static void SerializeToken(Sink & sink, Token const & token)
  {
    CHECK(!token.empty(), ());
    // todo(@m) Endianness.
    sink.Write(token.data(), token.size() * sizeof(typename Token::value_type));
  }

  template <typename Source>
  static void DeserializeToken(Source & source, Token & token, size_t size)
  {
    CHECK_GREATER(size, 0, ());
    ASSERT_EQUAL(size % sizeof(typename Token::value_type), 0, ());
    token.resize(size / sizeof(typename Token::value_type));
    source.Read(&token[0], size);
  }

  template <typename Sink>
  static uint32_t RelativePos(Sink & sink, uint64_t startPos)
  {
    return base::checked_cast<uint32_t>(sink.Pos() - startPos);
  }

  std::vector<Token> m_tokens;
};
}  // namespace search_base
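
A usage sketch (the token values are hypothetical, not from this commit): because m_tokens is kept sorted, GetTokenId is a binary search that returns the token's position in the sorted list.

// Hypothetical usage of the dictionary lookup.
search_base::TextIndexDictionary dict;
dict.SetTokens({"bakery", "cafe", "park"});  // SetTokens expects sorted input

size_t id = 0;
if (dict.GetTokenId("cafe", id))
{
  // id == 1: the position of "cafe" in the sorted token list.
}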

libs/search/base/text_index/header.cpp (new file, 9 lines)

#include "search/base/text_index/header.hpp"

using namespace std;

namespace search_base
{
// static
string const TextIndexHeader::kHeaderMagic = "mapsmetextidx";
}  // namespace search_base

libs/search/base/text_index/header.hpp (new file, 57 lines)

#pragma once

#include "search/base/text_index/text_index.hpp"

#include "coding/reader.hpp"
#include "coding/write_to_sink.hpp"

#include "base/assert.hpp"

#include <cstdint>
#include <string>

namespace search_base
{
struct TextIndexHeader
{
  template <typename Sink>
  void Serialize(Sink & sink) const
  {
    CHECK_EQUAL(m_version, TextIndexVersion::V0, ());

    sink.Write(kHeaderMagic.data(), kHeaderMagic.size());
    WriteToSink(sink, static_cast<uint8_t>(m_version));
    WriteToSink(sink, m_numTokens);
    WriteToSink(sink, m_dictPositionsOffset);
    WriteToSink(sink, m_dictWordsOffset);
    WriteToSink(sink, m_postingsStartsOffset);
    WriteToSink(sink, m_postingsListsOffset);
  }

  template <typename Source>
  void Deserialize(Source & source)
  {
    CHECK_EQUAL(m_version, TextIndexVersion::V0, ());

    std::string headerMagic(kHeaderMagic.size(), ' ');
    source.Read(&headerMagic[0], headerMagic.size());
    CHECK_EQUAL(headerMagic, kHeaderMagic, ());
    m_version = static_cast<TextIndexVersion>(ReadPrimitiveFromSource<uint8_t>(source));
    CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
    m_numTokens = ReadPrimitiveFromSource<uint32_t>(source);
    m_dictPositionsOffset = ReadPrimitiveFromSource<uint32_t>(source);
    m_dictWordsOffset = ReadPrimitiveFromSource<uint32_t>(source);
    m_postingsStartsOffset = ReadPrimitiveFromSource<uint32_t>(source);
    m_postingsListsOffset = ReadPrimitiveFromSource<uint32_t>(source);
  }

  static std::string const kHeaderMagic;
  TextIndexVersion m_version = TextIndexVersion::Latest;
  uint32_t m_numTokens = 0;
  uint32_t m_dictPositionsOffset = 0;
  uint32_t m_dictWordsOffset = 0;
  uint32_t m_postingsStartsOffset = 0;
  uint32_t m_postingsListsOffset = 0;
};
}  // namespace search_base
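
Since every field is fixed-width, the serialized V0 header has a constant size. That is what lets Serialize run twice over the same bytes: once with zeroed offsets as a placeholder and once more after the offsets are known. A minimal sketch of the arithmetic (the constant names are hypothetical, not part of this commit):

// Hypothetical constants illustrating the fixed header size:
// 13 bytes of magic + 1 version byte + 5 uint32_t offset/count fields.
constexpr size_t kMagicSize = 13;  // strlen("mapsmetextidx")
constexpr size_t kHeaderSize = kMagicSize + sizeof(uint8_t) + 5 * sizeof(uint32_t);
static_assert(kHeaderSize == 34, "V0 header occupies 34 bytes");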

libs/search/base/text_index/mem.cpp (new file, 34 lines)

#include "search/base/text_index/mem.hpp"

#include "base/stl_helpers.hpp"

using namespace std;

namespace search_base
{
void MemTextIndex::AddPosting(Token const & token, Posting const & posting)
{
  m_postingsByToken[token].emplace_back(posting);
}

void MemTextIndex::SortPostings()
{
  for (auto & entry : m_postingsByToken)
  {
    // A posting may occur several times in a document,
    // so we remove duplicates for the docid index.
    // If the count is needed for ranking, it may be stored separately.
    base::SortUnique(entry.second);
  }
}

void MemTextIndex::BuildDictionary()
{
  vector<Token> tokens;
  tokens.reserve(m_postingsByToken.size());
  for (auto const & entry : m_postingsByToken)
    tokens.emplace_back(entry.first);
  m_dictionary.SetTokens(std::move(tokens));
}
}  // namespace search_base
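
An illustration of what SortPostings guarantees, with hypothetical docids (SortPostings is private, but Serialize calls it before writing anything):

// A token that occurs three times in document 7 and once in document 2
// ends up with the sorted, deduplicated postings list {2, 7}.
search_base::MemTextIndex index;
index.AddPosting("cafe", 7);
index.AddPosting("cafe", 2);
index.AddPosting("cafe", 7);
index.AddPosting("cafe", 7);
// After index.Serialize(sink), ForEachPosting("cafe", ...) yields 2 and 7
// exactly once each.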

libs/search/base/text_index/mem.hpp (new file, 167 lines)

#pragma once

#include "search/base/text_index/dictionary.hpp"
#include "search/base/text_index/header.hpp"
#include "search/base/text_index/postings.hpp"
#include "search/base/text_index/text_index.hpp"
#include "search/base/text_index/utils.hpp"

#include "coding/reader.hpp"
#include "coding/varint.hpp"

#include "base/assert.hpp"
#include "base/string_utils.hpp"

#include <algorithm>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace search_base
{
class MemTextIndex
{
public:
  MemTextIndex() = default;

  void AddPosting(Token const & token, Posting const & posting);

  // Executes |fn| on every posting associated with |token|.
  // The order of postings is not specified.
  template <typename Fn>
  void ForEachPosting(Token const & token, Fn && fn) const
  {
    auto const it = m_postingsByToken.find(token);
    if (it == m_postingsByToken.end())
      return;
    for (auto const p : it->second)
      fn(p);
  }

  template <typename Fn>
  void ForEachPosting(strings::UniString const & token, Fn && fn) const
  {
    ForEachPosting(strings::ToUtf8(token), std::forward<Fn>(fn));
  }

  template <typename Sink>
  void Serialize(Sink & sink)
  {
    SortPostings();
    BuildDictionary();

    TextIndexHeader header;

    uint64_t const startPos = sink.Pos();
    // Will be filled in later.
    header.Serialize(sink);

    SerializeDictionary(sink, header, startPos);
    SerializePostingsLists(sink, header, startPos);

    uint64_t const finishPos = sink.Pos();
    sink.Seek(startPos);
    header.Serialize(sink);
    sink.Seek(finishPos);
  }

  template <typename Source>
  void Deserialize(Source & source)
  {
    uint64_t const startPos = source.Pos();

    TextIndexHeader header;
    header.Deserialize(source);

    DeserializeDictionary(source, header, startPos);
    DeserializePostingsLists(source, header, startPos);
  }

private:
  class MemPostingsFetcher : public PostingsFetcher
  {
  public:
    explicit MemPostingsFetcher(std::map<Token, std::vector<Posting>> const & postingsByToken)
      : m_postingsByToken(postingsByToken)
      , m_it(m_postingsByToken.begin())
    {}

    // PostingsFetcher overrides:
    bool IsValid() const override { return m_it != m_postingsByToken.end(); }

    void Advance() override
    {
      if (m_it != m_postingsByToken.end())
        ++m_it;
    }

    void ForEachPosting(Fn const & fn) const override
    {
      CHECK(IsValid(), ());
      for (uint32_t p : m_it->second)
        fn(p);
    }

  private:
    std::map<Token, std::vector<Posting>> const & m_postingsByToken;
    // Iterator to the current token that will be processed when ForEachPosting is called.
    std::map<Token, std::vector<Posting>>::const_iterator m_it;
  };

  void SortPostings();

  void BuildDictionary();

  template <typename Sink>
  void SerializeDictionary(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
  {
    m_dictionary.Serialize(sink, header, startPos);
  }

  template <typename Source>
  void DeserializeDictionary(Source & source, TextIndexHeader const & header, uint64_t startPos)
  {
    CHECK_EQUAL(source.Pos(), startPos + header.m_dictPositionsOffset, ());
    m_dictionary.Deserialize(source, header);
  }

  template <typename Sink>
  void SerializePostingsLists(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
  {
    MemPostingsFetcher fetcher(m_postingsByToken);
    WritePostings(sink, startPos, header, fetcher);
  }

  template <typename Source>
  void DeserializePostingsLists(Source & source, TextIndexHeader const & header, uint64_t startPos)
  {
    CHECK_EQUAL(source.Pos(), startPos + header.m_postingsStartsOffset, ());
    std::vector<uint32_t> postingsStarts(header.m_numTokens + 1);
    for (uint32_t & start : postingsStarts)
      start = ReadPrimitiveFromSource<uint32_t>(source);

    auto const & tokens = m_dictionary.GetTokens();
    CHECK_EQUAL(source.Pos(), startPos + header.m_postingsListsOffset, ());
    m_postingsByToken.clear();
    for (size_t i = 0; i < header.m_numTokens; ++i)
    {
      std::vector<uint32_t> postings;
      uint32_t last = 0;
      while (source.Pos() < startPos + postingsStarts[i + 1])
      {
        last += ReadVarUint<uint32_t>(source);
        postings.emplace_back(last);
      }
      CHECK_EQUAL(source.Pos(), startPos + postingsStarts[i + 1], ());

      m_postingsByToken.emplace(tokens[i], postings);
    }
  }

  std::map<Token, std::vector<Posting>> m_postingsByToken;
  TextIndexDictionary m_dictionary;
};
}  // namespace search_base
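
A round-trip sketch, assuming the MemWriter/MemReader helpers from coding/ (an assumption: they are not part of this commit) provide the Pos()/Seek()/Read() interfaces that this commit relies on elsewhere:

// Hypothetical round trip through an in-memory buffer.
std::vector<uint8_t> buf;

search_base::MemTextIndex index;
index.AddPosting("cafe", 1);
index.AddPosting("park", 2);

{
  MemWriter<std::vector<uint8_t>> writer(buf);
  index.Serialize(writer);  // sorts postings, builds the dictionary, writes all sections
}

search_base::MemTextIndex restored;
MemReader reader(buf.data(), buf.size());
ReaderSource<MemReader> source(reader);
restored.Deserialize(source);

restored.ForEachPosting("park", [](uint32_t docid)
{
  // docid == 2
});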

libs/search/base/text_index/merger.cpp (new file, 126 lines)

#include "search/base/text_index/merger.hpp"

#include "search/base/text_index/dictionary.hpp"
#include "search/base/text_index/header.hpp"
#include "search/base/text_index/postings.hpp"

#include "coding/file_writer.hpp"
#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"

#include "base/assert.hpp"
#include "base/logging.hpp"
#include "base/stl_helpers.hpp"

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <utility>
#include <vector>

using namespace std;

namespace
{
using namespace search_base;

class MergedPostingsListFetcher : public PostingsFetcher
{
public:
  MergedPostingsListFetcher(TextIndexDictionary const & dict, TextIndexReader const & index1,
                            TextIndexReader const & index2)
    : m_dict(dict)
    , m_index1(index1)
    , m_index2(index2)
  {
    ReadPostings();
  }

  // PostingsFetcher overrides:
  bool IsValid() const override
  {
    auto const & tokens = m_dict.GetTokens();
    CHECK_LESS_OR_EQUAL(m_tokenId, tokens.size(), ());
    return m_tokenId < tokens.size();
  }

  void Advance() override
  {
    auto const & tokens = m_dict.GetTokens();
    CHECK_LESS_OR_EQUAL(m_tokenId, tokens.size(), ());
    if (m_tokenId == tokens.size())
      return;

    ++m_tokenId;
    ReadPostings();
  }

  void ForEachPosting(Fn const & fn) const override
  {
    CHECK(IsValid(), ());
    for (uint32_t p : m_postings)
      fn(p);
  }

private:
  // Reads postings for the current token.
  void ReadPostings()
  {
    m_postings.clear();
    if (!IsValid())
      return;

    auto const & tokens = m_dict.GetTokens();
    m_index1.ForEachPosting(tokens[m_tokenId], base::MakeBackInsertFunctor(m_postings));
    m_index2.ForEachPosting(tokens[m_tokenId], base::MakeBackInsertFunctor(m_postings));
    base::SortUnique(m_postings);
  }

  TextIndexDictionary const & m_dict;
  TextIndexReader const & m_index1;
  TextIndexReader const & m_index2;
  // Index of the next token from |m_dict| to be processed.
  size_t m_tokenId = 0;
  vector<uint32_t> m_postings;
};

TextIndexDictionary MergeDictionaries(TextIndexDictionary const & dict1, TextIndexDictionary const & dict2)
{
  vector<Token> commonTokens;
  auto const & ts1 = dict1.GetTokens();
  auto const & ts2 = dict2.GetTokens();
  merge(ts1.begin(), ts1.end(), ts2.begin(), ts2.end(), back_inserter(commonTokens));
  ASSERT(is_sorted(commonTokens.begin(), commonTokens.end()), ());
  commonTokens.erase(unique(commonTokens.begin(), commonTokens.end()), commonTokens.end());

  TextIndexDictionary dict;
  dict.SetTokens(std::move(commonTokens));
  return dict;
}
}  // namespace

namespace search_base
{
// static
void TextIndexMerger::Merge(TextIndexReader const & index1, TextIndexReader const & index2, FileWriter & sink)
{
  TextIndexDictionary const dict = MergeDictionaries(index1.GetDictionary(), index2.GetDictionary());

  TextIndexHeader header;

  uint64_t const startPos = sink.Pos();
  // Will be filled in later.
  header.Serialize(sink);

  dict.Serialize(sink, header, startPos);

  MergedPostingsListFetcher fetcher(dict, index1, index2);
  WritePostings(sink, startPos, header, fetcher);

  // Fill in the header.
  uint64_t const finishPos = sink.Pos();
  sink.Seek(startPos);
  header.Serialize(sink);
  sink.Seek(finishPos);
}
}  // namespace search_base
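
MergeDictionaries leans on std::merge over two sorted token lists followed by std::unique. A self-contained illustration of that step (token values are hypothetical):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <string>
#include <vector>

int main()
{
  std::vector<std::string> ts1 = {"bar", "cafe"};
  std::vector<std::string> ts2 = {"bar", "park"};

  // std::merge keeps the output sorted but repeats tokens common to both inputs...
  std::vector<std::string> merged;
  std::merge(ts1.begin(), ts1.end(), ts2.begin(), ts2.end(), std::back_inserter(merged));
  assert((merged == std::vector<std::string>{"bar", "bar", "cafe", "park"}));

  // ...so the duplicates are erased afterwards, as in MergeDictionaries.
  merged.erase(std::unique(merged.begin(), merged.end()), merged.end());
  assert((merged == std::vector<std::string>{"bar", "cafe", "park"}));
}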

libs/search/base/text_index/merger.hpp (new file, 26 lines)

#pragma once

#include "search/base/text_index/reader.hpp"

class FileWriter;

namespace search_base
{
// Merges two on-disk text indexes and writes the result to a new one.
class TextIndexMerger
{
public:
  // The merging process is as follows.
  // 1. Dictionaries from both indexes are read into memory, merged
  //    and written to disk.
  // 2. One uint32_t per entry is reserved in memory to calculate the
  //    offsets of the postings lists.
  // 3. One token at a time, all postings for the token are read from
  //    both indexes into memory, unified and written to disk.
  // 4. The offsets are written to disk.
  //
  // Note that the dictionary and offsets are kept in memory during the whole
  // merging process.
  static void Merge(TextIndexReader const & index1, TextIndexReader const & index2, FileWriter & sink);
};
}  // namespace search_base
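
A usage sketch (the file paths are hypothetical): merging two previously serialized indexes into a third file.

// Open two existing on-disk indexes and merge them.
FileReader r1("index1.bin");
FileReader r2("index2.bin");
search_base::TextIndexReader index1(r1);
search_base::TextIndexReader index2(r2);

FileWriter sink("merged.bin");
search_base::TextIndexMerger::Merge(index1, index2, sink);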

libs/search/base/text_index/postings.hpp (new file, 88 lines)

#pragma once

#include "search/base/text_index/header.hpp"
#include "search/base/text_index/text_index.hpp"
#include "search/base/text_index/utils.hpp"

#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"

#include <cstdint>
#include <functional>
#include <vector>

namespace search_base
{
struct TextIndexHeader;

// A helper class that fetches the postings lists for
// one token at a time. It is assumed that the tokens
// are enumerated in lexicographic order.
class PostingsFetcher
{
public:
  using Fn = std::function<void(uint32_t)>;

  virtual ~PostingsFetcher() = default;

  // Returns true when there are tokens left in the fetcher and false otherwise.
  virtual bool IsValid() const = 0;

  // Advances the fetcher to the next token.
  virtual void Advance() = 0;

  // Calls |fn| for every posting of the current token. Initially the
  // current token is the first one; calls to Advance move on to the
  // next token until the underlying source of tokens is exhausted
  // and the fetcher is no longer valid.
  virtual void ForEachPosting(Fn const & fn) const = 0;
};

// Fetches the postings lists one by one from |fetcher| and writes them
// to |sink|, updating the fields in |header| that correspond to the
// postings lists.
// |startPos| marks the start of the entire text index and is needed to compute
// the offsets that are stored in |header|.
template <typename Sink>
void WritePostings(Sink & sink, uint64_t startPos, TextIndexHeader & header, PostingsFetcher & fetcher)
{
  header.m_postingsStartsOffset = RelativePos(sink, startPos);
  // A uint32_t for each 32-bit offset and a uint32_t for the dummy entry at the end.
  WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1));

  header.m_postingsListsOffset = RelativePos(sink, startPos);

  std::vector<uint32_t> postingsStarts;
  postingsStarts.reserve(header.m_numTokens + 1);
  {
    uint32_t last = 0;
    // todo(@m) s/uint32_t/Posting/ ?
    auto writePostings = [&](uint32_t p)
    {
      CHECK(last == 0 || last < p, (last, p));
      uint32_t const delta = p - last;
      WriteVarUint(sink, delta);
      last = p;
    };
    while (fetcher.IsValid())
    {
      postingsStarts.emplace_back(RelativePos(sink, startPos));
      last = 0;
      fetcher.ForEachPosting(writePostings);
      fetcher.Advance();
    }
  }
  // One more for convenience.
  postingsStarts.emplace_back(RelativePos(sink, startPos));

  {
    uint64_t const savedPos = sink.Pos();
    sink.Seek(startPos + header.m_postingsStartsOffset);
    for (uint32_t const s : postingsStarts)
      WriteToSink(sink, s);

    CHECK_EQUAL(sink.Pos(), startPos + header.m_postingsListsOffset, ());
    sink.Seek(savedPos);
  }
}
}  // namespace search_base
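
The writePostings lambda stores each list as differences between consecutive postings, encoded as varints; decoding is the running sum seen in TextIndexReader::ForEachPosting and MemTextIndex::DeserializePostingsLists. A worked example with a hypothetical postings list:

// The sorted postings list {3, 7, 9, 200} is written as the deltas
// {3, 4, 2, 191}, each encoded with WriteVarUint. Decoding reverses this:
uint32_t last = 0;
for (uint32_t delta : {3u, 4u, 2u, 191u})
{
  last += delta;
  // last takes the values 3, 7, 9, 200 in turn.
}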

libs/search/base/text_index/reader.hpp (new file, 78 lines)

#pragma once

#include "search/base/text_index/dictionary.hpp"
#include "search/base/text_index/text_index.hpp"

#include "coding/file_reader.hpp"
#include "coding/reader.hpp"
#include "coding/varint.hpp"

#include "base/assert.hpp"
#include "base/string_utils.hpp"

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

namespace search_base
{
// A reader class for on-demand reading of postings lists from disk.
class TextIndexReader
{
public:
  explicit TextIndexReader(FileReader const & fileReader) : m_fileReader(fileReader)
  {
    ReaderSource<FileReader> headerSource(m_fileReader);
    TextIndexHeader header;
    header.Deserialize(headerSource);

    uint64_t const dictStart = header.m_dictPositionsOffset;
    uint64_t const dictEnd = header.m_postingsStartsOffset;
    ReaderSource<FileReader> dictSource(m_fileReader.SubReader(dictStart, dictEnd - dictStart));
    m_dictionary.Deserialize(dictSource, header);

    uint64_t const postStart = header.m_postingsStartsOffset;
    uint64_t const postEnd = header.m_postingsListsOffset;
    ReaderSource<FileReader> postingsSource(m_fileReader.SubReader(postStart, postEnd - postStart));
    m_postingsStarts.resize(header.m_numTokens + 1);
    for (uint32_t & start : m_postingsStarts)
      start = ReadPrimitiveFromSource<uint32_t>(postingsSource);
  }

  // Executes |fn| on every posting associated with |token|.
  // The order of postings is not specified.
  template <typename Fn>
  void ForEachPosting(Token const & token, Fn && fn) const
  {
    size_t tokenId = 0;
    if (!m_dictionary.GetTokenId(token, tokenId))
      return;
    CHECK_LESS(tokenId + 1, m_postingsStarts.size(), ());

    ReaderSource<FileReader> source(
        m_fileReader.SubReader(m_postingsStarts[tokenId], m_postingsStarts[tokenId + 1] - m_postingsStarts[tokenId]));

    uint32_t last = 0;
    while (source.Size() > 0)
    {
      last += ReadVarUint<uint32_t>(source);
      fn(last);
    }
  }

  template <typename Fn>
  void ForEachPosting(strings::UniString const & token, Fn && fn) const
  {
    auto utf8s = strings::ToUtf8(token);
    ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
  }

  TextIndexDictionary const & GetDictionary() const { return m_dictionary; }

private:
  FileReader m_fileReader;
  TextIndexDictionary m_dictionary;
  std::vector<uint32_t> m_postingsStarts;
};
}  // namespace search_base
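
A usage sketch (the file name and token are hypothetical): only the header, dictionary and postings-starts array are read eagerly; each postings list is streamed from its SubReader when requested.

// Stream the postings of one token from disk without loading the whole index.
FileReader fileReader("text_index.bin");
search_base::TextIndexReader index(fileReader);

index.ForEachPosting(std::string("cafe"), [](uint32_t docid)
{
  // Called once per document containing "cafe".
});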

libs/search/base/text_index/text_index.cpp (new file, 20 lines)

#include "search/base/text_index/text_index.hpp"

#include "base/assert.hpp"
#include "base/string_utils.hpp"

using namespace std;

namespace search_base
{
string DebugPrint(TextIndexVersion const & version)
{
  switch (version)
  {
  case TextIndexVersion::V0: return "V0";
  }
  string ret = "Unknown TextIndexHeader version: " + strings::to_string(static_cast<uint8_t>(version));
  ASSERT(false, (ret));
  return ret;
}
}  // namespace search_base

libs/search/base/text_index/text_index.hpp (new file, 42 lines)

#pragma once

#include <cstdint>
#include <string>

// This file contains the structures needed to store an
// updatable text index on disk.
//
// The index maps tokens of string type (typically std::string or
// strings::UniString) to postings lists, i.e. to lists of entities
// called postings that encode the locations of the strings in the
// collection of text documents that is being indexed. An example of
// a posting is a document id (docid). Another example is a pair of a
// document id and a position within the corresponding document.
//
// Updates are performed by rebuilding the index, either as a result
// of merging several indexes together, or as a result of clearing outdated
// entries from an old index.
//
// For version 0, the postings lists are docid arrays, i.e. arrays of unsigned
// 32-bit integers stored in increasing order.
// The structure of the index is:
//   [header: version and offsets]
//   [array containing the starting positions of tokens]
//   [tokens, written without separators in lexicographical order]
//   [array containing the offsets of the postings lists]
//   [postings lists, stored as delta-encoded varints]
//
// All offsets are measured relative to the start of the index.
namespace search_base
{
using Token = std::string;
using Posting = uint32_t;

enum class TextIndexVersion : uint8_t
{
  V0 = 0,
  Latest = V0
};

std::string DebugPrint(TextIndexVersion const & version);
}  // namespace search_base
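
To make the layout concrete, here is how a hypothetical two-token index would sit on disk (tokens "ab" and "c", with docid postings {1, 5} and {2}; values are illustrative only):

  header           magic "mapsmetextidx", version 0, numTokens = 2, four offsets
  token starts     3 x uint32_t: offset of "ab", offset of "c", dummy end offset
  token characters "abc" (the tokens concatenated without separators)
  postings starts  3 x uint32_t: one per token plus a dummy end offset
  postings lists   varint deltas 1, 4 for {1, 5} and 2 for {2}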

libs/search/base/text_index/utils.hpp (new file, 14 lines)

#pragma once

#include "base/checked_cast.hpp"

#include <cstdint>

namespace search_base
{
template <typename Sink>
uint32_t RelativePos(Sink & sink, uint64_t startPos)
{
  return base::checked_cast<uint32_t>(sink.Pos() - startPos);
}
}  // namespace search_base