Repo created
This commit is contained in:
parent
4af19165ec
commit
68073add76
12458 changed files with 12350765 additions and 2 deletions
20
libs/search/bookmarks/data.cpp
Normal file
20
libs/search/bookmarks/data.cpp
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
#include "search/bookmarks/data.hpp"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace bookmarks
|
||||
{
|
||||
string DebugPrint(Data const & data)
|
||||
{
|
||||
ostringstream os;
|
||||
os << "Data [";
|
||||
os << "names: " << ::DebugPrint(data.GetNames()) << ", ";
|
||||
os << "description: " << data.GetDescription() << "]";
|
||||
return os.str();
|
||||
}
|
||||
} // namespace bookmarks
|
||||
} // namespace search
|
||||
90
libs/search/bookmarks/data.hpp
Normal file
90
libs/search/bookmarks/data.hpp
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
#pragma once
|
||||
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "kml/types.hpp"
|
||||
|
||||
#include "coding/string_utf8_multilang.hpp"
|
||||
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace bookmarks
|
||||
{
|
||||
// TODO (@m, @y): add more features for a bookmark here, e.g. address, center.
|
||||
class Data
|
||||
{
|
||||
public:
|
||||
Data() = default;
|
||||
|
||||
Data(kml::BookmarkData const & bookmarkData, std::string const & locale)
|
||||
: m_names(ExtractIndexableNames(bookmarkData, locale))
|
||||
, m_description(kml::GetDefaultStr(bookmarkData.m_description))
|
||||
{}
|
||||
|
||||
template <typename Fn>
|
||||
void ForEachNameToken(Fn && fn) const
|
||||
{
|
||||
auto withDefaultLang = [&](strings::UniString const & token)
|
||||
{
|
||||
// Note that the Default Language here is not the same as in the kml library.
|
||||
// Bookmark search by locale is not supported so every name is stored
|
||||
// in the default branch of the search trie.
|
||||
fn(StringUtf8Multilang::kDefaultCode, token);
|
||||
};
|
||||
|
||||
for (auto const & name : m_names)
|
||||
ForEachNormalizedToken(name, withDefaultLang);
|
||||
}
|
||||
|
||||
template <typename Fn>
|
||||
void ForEachDescriptionToken(Fn && fn) const
|
||||
{
|
||||
auto withDefaultLang = [&](strings::UniString const & token) { fn(StringUtf8Multilang::kDefaultCode, token); };
|
||||
|
||||
ForEachNormalizedToken(m_description, withDefaultLang);
|
||||
}
|
||||
|
||||
std::vector<std::string> const & GetNames() const { return m_names; }
|
||||
std::string const & GetDescription() const { return m_description; }
|
||||
|
||||
private:
|
||||
std::vector<std::string> ExtractIndexableNames(kml::BookmarkData const & bookmarkData, std::string const & locale)
|
||||
{
|
||||
std::vector<std::string> names;
|
||||
|
||||
// Same as GetPreferredBookmarkName from the map library. Duplicated here to avoid dependency.
|
||||
names.emplace_back(kml::GetPreferredBookmarkName(bookmarkData, locale));
|
||||
names.emplace_back(kml::GetPreferredBookmarkStr(bookmarkData.m_name, locale));
|
||||
|
||||
// todo(@m) Platform's API does not allow to use |locale| here.
|
||||
names.emplace_back(kml::GetLocalizedFeatureType(bookmarkData.m_featureTypes));
|
||||
|
||||
// Normalization is postponed. It is unlikely but we may still need original strings later.
|
||||
// Trimming seems harmless, though.
|
||||
for (auto & s : names)
|
||||
strings::Trim(s);
|
||||
|
||||
base::SortUnique(names);
|
||||
base::EraseIf(names, [](std::string const & s) { return s.empty(); });
|
||||
return names;
|
||||
}
|
||||
|
||||
// Names and custom names in all the locales that we are interested in.
|
||||
// The locale set is fixed at startup and the relevant names are provided
|
||||
// by the kml library. In case the user switches the device locale while
|
||||
// running the app, the UI will adapt; however the search will not, and the
|
||||
// bookmarks will not be reindexed. We consider this situation to be improbable
|
||||
// enough to justify not storing redundant names here.
|
||||
std::vector<std::string> m_names;
|
||||
std::string m_description;
|
||||
};
|
||||
|
||||
std::string DebugPrint(Data const & data);
|
||||
} // namespace bookmarks
|
||||
} // namespace search
|
||||
281
libs/search/bookmarks/processor.cpp
Normal file
281
libs/search/bookmarks/processor.cpp
Normal file
|
|
@ -0,0 +1,281 @@
|
|||
#include "search/bookmarks/processor.hpp"
|
||||
|
||||
#include "search/emitter.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
#include "base/checked_cast.hpp"
|
||||
#include "base/dfa_helpers.hpp"
|
||||
#include "base/levenshtein_dfa.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace bookmarks
|
||||
{
|
||||
namespace
|
||||
{
|
||||
struct DocVecWrapper
|
||||
{
|
||||
explicit DocVecWrapper(DocVec const & dv) : m_dv(dv) {}
|
||||
|
||||
template <typename Fn>
|
||||
void ForEachToken(Fn && fn) const
|
||||
{
|
||||
for (size_t i = 0; i < m_dv.GetNumTokens(); ++i)
|
||||
fn(StringUtf8Multilang::kDefaultCode, m_dv.GetToken(i));
|
||||
}
|
||||
|
||||
DocVec const & m_dv;
|
||||
};
|
||||
|
||||
// Relevance of a document with respect to a query.
//
// The ordering is inverted on purpose: an info with a HIGHER cosine
// similarity compares as "less", so an ascending sort puts the most
// similar documents first.
struct RankingInfo
{
  bool operator<(RankingInfo const & rhs) const { return rhs.m_cosineSimilarity < m_cosineSimilarity; }

  bool operator>(RankingInfo const & rhs) const { return m_cosineSimilarity < rhs.m_cosineSimilarity; }

  // Equality is defined through the ordering (neither side is "less"),
  // not through operator== on doubles.
  bool operator==(RankingInfo const & rhs) const
  {
    bool const thisFirst = rhs.m_cosineSimilarity < m_cosineSimilarity;
    bool const rhsFirst = m_cosineSimilarity < rhs.m_cosineSimilarity;
    return !thisFirst && !rhsFirst;
  }

  bool operator!=(RankingInfo const & rhs) const
  {
    return (rhs.m_cosineSimilarity < m_cosineSimilarity) || (m_cosineSimilarity < rhs.m_cosineSimilarity);
  }

  double m_cosineSimilarity = 0.0;
};
|
||||
|
||||
struct IdInfoPair
|
||||
{
|
||||
IdInfoPair(Id const & id, RankingInfo const & info) : m_id(id), m_info(info) {}
|
||||
|
||||
bool operator<(IdInfoPair const & rhs) const
|
||||
{
|
||||
if (m_info != rhs.m_info)
|
||||
return m_info < rhs.m_info;
|
||||
return m_id < rhs.m_id;
|
||||
}
|
||||
|
||||
Id m_id;
|
||||
RankingInfo m_info;
|
||||
};
|
||||
|
||||
// Stores into |info| the similarity between the query vector |qv| and the
// document vector |dv|, as computed by QueryVec::Similarity using |idfs|.
void FillRankingInfo(QueryVec & qv, IdfMap & idfs, DocVec const & dv, RankingInfo & info)
{
  auto const similarity = qv.Similarity(idfs, dv);
  info.m_cosineSimilarity = similarity;
}
|
||||
} // namespace
|
||||
|
||||
// Keeps references to |emitter| and |cancellable|; both must outlive the processor.
Processor::Processor(Emitter & emitter, base::Cancellable const & cancellable)
  : m_emitter(emitter), m_cancellable(cancellable)
{
}
|
||||
|
||||
void Processor::Reset()
|
||||
{
|
||||
m_index = {};
|
||||
m_docs.clear();
|
||||
m_indexDescriptions = false;
|
||||
m_indexableGroups.clear();
|
||||
m_idToGroup.clear();
|
||||
m_bookmarksInGroup.clear();
|
||||
}
|
||||
|
||||
void Processor::EnableIndexingOfDescriptions(bool enable)
|
||||
{
|
||||
m_indexDescriptions = enable;
|
||||
}
|
||||
|
||||
// Marks the group |groupId| as indexable (|enable| is true) or not indexable
// (|enable| is false). If the state of the group actually changes, every
// bookmark currently attached to the group is added to or erased from the
// index accordingly.
void Processor::EnableIndexingOfBookmarkGroup(GroupId const & groupId, bool enable)
{
  // insert() reports whether the element was newly inserted and erase()
  // returns the number of removed elements, so either tells us whether the
  // indexable state of the group has changed.
  bool const stateChanged =
      enable ? m_indexableGroups.insert(groupId).second : m_indexableGroups.erase(groupId) > 0;
  if (!stateChanged)
    return;

  // Use find() rather than operator[]: the latter would insert an empty set
  // for a group without bookmarks, whereas DetachFromGroup takes care to
  // remove groups that became empty.
  auto const groupIt = m_bookmarksInGroup.find(groupId);
  if (groupIt == m_bookmarksInGroup.end())
    return;

  for (auto const & id : groupIt->second)
  {
    if (enable)
      AddToIndex(id);
    else
      EraseFromIndex(id);
  }
}
|
||||
|
||||
// Builds a document vector from the tokens of |doc| and stores it under |id|.
// Name tokens are always used; description tokens only when indexing of
// descriptions is enabled. The search index itself is not touched here.
void Processor::Add(Id const & id, Doc const & doc)
{
  ASSERT_EQUAL(m_docs.count(id), 0, ());

  DocVec::Builder builder;
  auto addToken = [&builder](int8_t /* lang */, strings::UniString const & token) { builder.Add(token); };

  doc.ForEachNameToken(addToken);
  if (m_indexDescriptions)
    doc.ForEachDescriptionToken(addToken);

  DocVec const docVec(builder);
  m_docs[id] = docVec;
}
|
||||
|
||||
// Puts the tokens of the already added document |id| into the search index.
void Processor::AddToIndex(Id const & id)
{
  ASSERT_EQUAL(m_docs.count(id), 1, ());

  auto const & docVec = m_docs[id];
  DocVecWrapper const tokens(docVec);
  m_index.Add(id, tokens);
}
|
||||
|
||||
// Replaces the document stored under |id| with |doc|. If the bookmark is
// attached to a group, it is detached for the duration of the replacement
// and re-attached to the same group afterwards.
void Processor::Update(Id const & id, Doc const & doc)
{
  GroupId previousGroup = kInvalidGroupId;

  auto const attachedIt = m_idToGroup.find(id);
  bool const isAttached = attachedIt != m_idToGroup.end();
  if (isAttached)
  {
    // Take the group id by value: DetachFromGroup erases the map entry
    // that |attachedIt| points to.
    previousGroup = attachedIt->second;
    DetachFromGroup(id, previousGroup);
  }

  Erase(id);
  Add(id, doc);

  if (previousGroup == kInvalidGroupId)
    return;

  AttachToGroup(id, previousGroup);
}
|
||||
|
||||
// Forgets the document stored under |id|. The bookmark is expected to be
// known to the processor and to be detached from its group already.
void Processor::Erase(Id const & id)
{
  ASSERT_EQUAL(m_docs.count(id), 1, ());

  ASSERT(m_idToGroup.count(id) == 0, ("A bookmark must be detached from all groups before being deleted."));

  m_docs.erase(id);
}
|
||||
|
||||
// Removes the tokens of the document |id| from the search index.
// The document itself stays in the processor.
void Processor::EraseFromIndex(Id const & id)
{
  ASSERT_EQUAL(m_docs.count(id), 1, ());

  m_index.Erase(id, DocVecWrapper(m_docs[id]));
}
|
||||
|
||||
// Attaches the bookmark |id| to |group| and indexes it if the group is
// indexable.
//
// A bookmark belongs to at most one group. Attaching an already attached
// bookmark is reported with a warning; to keep the bookkeeping consistent
// the bookmark is then detached from its previous group first, and
// re-attaching to the same group is a no-op.
void Processor::AttachToGroup(Id const & id, GroupId const & group)
{
  auto const it = m_idToGroup.find(id);
  if (it != m_idToGroup.end())
  {
    LOG(LWARNING, ("Tried to attach bookmark", id, "to group", group, "but it already belongs to group", it->second));

    // A copy to avoid use-after-free: DetachFromGroup erases the entry |it| points to.
    GroupId const previousGroup = it->second;
    if (previousGroup == group)
      return;

    // Without this the bookmark would stay in the member set of its previous
    // group (and possibly in the index) while m_idToGroup points to |group|.
    DetachFromGroup(id, previousGroup);
  }

  m_idToGroup[id] = group;
  m_bookmarksInGroup[group].insert(id);
  if (m_indexableGroups.count(group) > 0)
    AddToIndex(id);
}
|
||||
|
||||
// Detaches the bookmark |id| from |group|, erasing it from the index if the
// group is indexable and dropping the group's member set once it becomes
// empty. Logs a warning and does nothing if the bookmark is not attached to
// any group or is attached to a different one.
void Processor::DetachFromGroup(Id const & id, GroupId const & group)
{
  auto const it = m_idToGroup.find(id);
  if (it == m_idToGroup.end())
  {
    LOG(LWARNING, ("Tried to detach bookmark", id, "from group", group, "but it does not belong to any group"));
    return;
  }

  if (it->second != group)
  {
    LOG(LWARNING, ("Tried to detach bookmark", id, "from group", group, "but it only belongs to group", it->second));
    return;
  }

  // |group| may refer to the value stored in m_idToGroup (e.g. when a caller
  // passes it->second), which is destroyed by the erase below. Work with a
  // copy from here on to avoid use-after-free.
  GroupId const groupId = group;

  m_idToGroup.erase(it);
  m_bookmarksInGroup[groupId].erase(id);

  if (m_indexableGroups.count(groupId) > 0)
    EraseFromIndex(id);

  auto const groupIt = m_bookmarksInGroup.find(groupId);
  CHECK(groupIt != m_bookmarksInGroup.end(), (groupId, m_bookmarksInGroup));
  if (groupIt->second.size() == 0)
    m_bookmarksInGroup.erase(groupIt);
}
|
||||
|
||||
// Runs a bookmark search for |params| and passes the results to the emitter.
//
// 1. Every query token is matched against the index with a Levenshtein DFA
//    (wrapped into a prefix DFA for a prefix token); the ids of all matched
//    bookmarks are collected.
// 2. Matched bookmarks are optionally filtered by group and ranked by the
//    similarity between the query vector and the document vector.
// 3. At most |params.m_maxNumResults| best results are emitted.
//
// Cancellation is checked before each token, before each candidate and
// before sorting.
void Processor::Search(Params const & params) const
{
  std::set<Id> matchedIds;
  auto collectId = [&matchedIds](Id const & id, bool /* exactMatch */) { matchedIds.insert(id); };

  for (size_t tokenIdx = 0; tokenIdx < params.GetNumTokens(); ++tokenIdx)
  {
    BailIfCancelled();

    auto const & token = params.GetToken(tokenIdx);
    if (!params.IsPrefixToken(tokenIdx))
    {
      Retrieve<strings::LevenshteinDFA>(token, collectId);
      continue;
    }

    Retrieve<strings::PrefixDFAModifier<strings::LevenshteinDFA>>(token, collectId);
  }

  IdfMap idfs(*this, 1.0 /* unknownIdf */);
  auto queryVec = GetQueryVec(idfs, params);

  bool const restrictToGroup = params.m_groupId != kInvalidGroupId;

  std::vector<IdInfoPair> ranked;
  for (auto const & id : matchedIds)
  {
    BailIfCancelled();

    if (restrictToGroup)
    {
      auto const groupIt = m_idToGroup.find(id);
      bool const inRequestedGroup = groupIt != m_idToGroup.end() && groupIt->second == params.m_groupId;
      if (!inRequestedGroup)
        continue;
    }

    auto const docIt = m_docs.find(id);
    CHECK(docIt != m_docs.end(), ("Can't find retrieved doc:", id));

    RankingInfo info;
    FillRankingInfo(queryVec, idfs, docIt->second, info);

    ranked.emplace_back(id, info);
  }

  BailIfCancelled();
  std::sort(ranked.begin(), ranked.end());

  size_t const numToEmit = std::min<size_t>(ranked.size(), params.m_maxNumResults);
  for (size_t i = 0; i < numToEmit; ++i)
    m_emitter.AddBookmarkResult(bookmarks::Result(ranked[i].m_id));
}
|
||||
|
||||
void Processor::Finish(bool cancelled)
|
||||
{
|
||||
m_emitter.Finish(cancelled);
|
||||
}
|
||||
|
||||
// IdfMap::Delegate override. Returns the number of documents the index
// reports for |token| in the default language, converted to uint64_t.
uint64_t Processor::GetNumDocs(strings::UniString const & token, bool isPrefix) const
{
  auto const lang = StringUtf8Multilang::kDefaultCode;
  return base::asserted_cast<uint64_t>(m_index.GetNumDocs(lang, token, isPrefix));
}
|
||||
|
||||
// Builds a query vector from the original forms of the tokens in |params|:
// a prefix token is set as the prefix of the query, every other token is
// added as a full token.
QueryVec Processor::GetQueryVec(IdfMap & idfs, QueryParams const & params) const
{
  QueryVec::Builder builder;

  size_t tokenIdx = 0;
  while (tokenIdx < params.GetNumTokens())
  {
    auto const & original = params.GetToken(tokenIdx).GetOriginal();
    bool const isPrefix = params.IsPrefixToken(tokenIdx);
    if (!isPrefix)
      builder.AddFull(original);
    else
      builder.SetPrefix(original);
    ++tokenIdx;
  }

  return {idfs, builder};
}
|
||||
} // namespace bookmarks
|
||||
} // namespace search
|
||||
105
libs/search/bookmarks/processor.hpp
Normal file
105
libs/search/bookmarks/processor.hpp
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
#pragma once
|
||||
|
||||
#include "search/base/mem_search_index.hpp"
|
||||
#include "search/bookmarks/types.hpp"
|
||||
#include "search/cancel_exception.hpp"
|
||||
#include "search/doc_vec.hpp"
|
||||
#include "search/feature_offset_match.hpp"
|
||||
#include "search/idf_map.hpp"
|
||||
#include "search/query_params.hpp"
|
||||
#include "search/search_params.hpp"
|
||||
#include "search/utils.hpp"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace base
|
||||
{
|
||||
class Cancellable;
|
||||
}
|
||||
|
||||
namespace search
|
||||
{
|
||||
class Emitter;
|
||||
|
||||
namespace bookmarks
|
||||
{
|
||||
class Processor : public IdfMap::Delegate
|
||||
{
|
||||
public:
|
||||
using Index = search_base::MemSearchIndex<Id>;
|
||||
|
||||
struct Params : public QueryParams
|
||||
{
|
||||
// If valid, only show results with bookmarks attached to |m_groupId|.
|
||||
GroupId m_groupId = kInvalidGroupId;
|
||||
|
||||
size_t m_maxNumResults = SearchParams::kDefaultNumResultsEverywhere;
|
||||
};
|
||||
|
||||
Processor(Emitter & emitter, base::Cancellable const & cancellable);
|
||||
~Processor() override = default;
|
||||
|
||||
void Reset();
|
||||
|
||||
// By default, only bookmark names are indexed. This method
|
||||
// should be used to enable or disable indexing bookmarks
|
||||
// by their descriptions.
|
||||
void EnableIndexingOfDescriptions(bool enable);
|
||||
|
||||
void EnableIndexingOfBookmarkGroup(GroupId const & groupId, bool enable);
|
||||
|
||||
// Adds a bookmark to Processor but does not index it.
|
||||
void Add(Id const & id, Doc const & doc);
|
||||
// Indexes an already added bookmark.
|
||||
void AddToIndex(Id const & id);
|
||||
// Updates a bookmark with a new |doc|. Re-indexes if the bookmarks
|
||||
// is already attached to an indexable group.
|
||||
void Update(Id const & id, Doc const & doc);
|
||||
|
||||
void Erase(Id const & id);
|
||||
void EraseFromIndex(Id const & id);
|
||||
|
||||
void AttachToGroup(Id const & id, GroupId const & group);
|
||||
void DetachFromGroup(Id const & id, GroupId const & group);
|
||||
|
||||
void Search(Params const & params) const;
|
||||
|
||||
void Finish(bool cancelled);
|
||||
|
||||
// IdfMap::Delegate overrides:
|
||||
uint64_t GetNumDocs(strings::UniString const & token, bool isPrefix) const override;
|
||||
|
||||
private:
|
||||
void BailIfCancelled() const { ::search::BailIfCancelled(m_cancellable); }
|
||||
|
||||
template <typename DFA, typename Fn>
|
||||
void Retrieve(QueryParams::Token const & token, Fn && fn) const
|
||||
{
|
||||
SearchTrieRequest<DFA> request;
|
||||
FillRequestFromToken(token, request);
|
||||
request.m_langs.insert(StringUtf8Multilang::kDefaultCode);
|
||||
|
||||
MatchFeaturesInTrie(request, m_index.GetRootIterator(), [](Id const & /* id */) { return true; } /* filter */,
|
||||
std::forward<Fn>(fn));
|
||||
}
|
||||
|
||||
QueryVec GetQueryVec(IdfMap & idfs, QueryParams const & params) const;
|
||||
|
||||
Emitter & m_emitter;
|
||||
base::Cancellable const & m_cancellable;
|
||||
|
||||
Index m_index;
|
||||
std::unordered_map<Id, DocVec> m_docs;
|
||||
|
||||
bool m_indexDescriptions = false;
|
||||
std::unordered_set<GroupId> m_indexableGroups;
|
||||
|
||||
// Currently a bookmark can belong to at most one group
|
||||
// but in the future it is possible for a single bookmark to be
|
||||
// attached to multiple groups.
|
||||
std::unordered_map<Id, GroupId> m_idToGroup;
|
||||
std::unordered_map<GroupId, std::unordered_set<Id>> m_bookmarksInGroup;
|
||||
};
|
||||
} // namespace bookmarks
|
||||
} // namespace search
|
||||
20
libs/search/bookmarks/results.hpp
Normal file
20
libs/search/bookmarks/results.hpp
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
#pragma once
|
||||
|
||||
#include "search/bookmarks/types.hpp"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace bookmarks
|
||||
{
|
||||
struct Result
|
||||
{
|
||||
explicit Result(Id id) : m_id(id) {}
|
||||
|
||||
Id m_id = {};
|
||||
};
|
||||
|
||||
using Results = std::vector<Result>;
|
||||
} // namespace bookmarks
|
||||
} // namespace search
|
||||
9
libs/search/bookmarks/types.cpp
Normal file
9
libs/search/bookmarks/types.cpp
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
#include "search/bookmarks/types.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace bookmarks
|
||||
{
|
||||
// Definition of the sentinel declared in types.hpp: the largest value
// representable by GroupId.
constexpr GroupId kInvalidGroupId = std::numeric_limits<GroupId>::max();
|
||||
} // namespace bookmarks
|
||||
} // namespace search
|
||||
19
libs/search/bookmarks/types.hpp
Normal file
19
libs/search/bookmarks/types.hpp
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
#pragma once
|
||||
|
||||
#include "search/bookmarks/data.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace bookmarks
|
||||
{
|
||||
// todo(@m) s/Id/DocId/g ?
|
||||
using Id = uint64_t;
|
||||
using GroupId = uint64_t;
|
||||
using Doc = Data;
|
||||
|
||||
extern GroupId const kInvalidGroupId;
|
||||
} // namespace bookmarks
|
||||
} // namespace search
|
||||
Loading…
Add table
Add a link
Reference in a new issue