Repo created

This commit is contained in:
Fr4nz D13trich 2025-11-22 14:04:28 +01:00
parent 81b91f4139
commit f8c34fa5ee
22732 changed files with 4815320 additions and 2 deletions

View file

@ -0,0 +1 @@
jshin@chromium.org

View file

@ -0,0 +1,29 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_BASE_I18N_EXPORT_H_
#define BASE_I18N_BASE_I18N_EXPORT_H_
// Defines BASE_I18N_EXPORT, the annotation placed on symbols that must be
// visible outside the base/i18n library.
//
// The expansion depends on three preprocessor symbols:
//   COMPONENT_BUILD          - when undefined the macro expands to nothing.
//   WIN32                    - selects __declspec over the visibility attribute.
//   BASE_I18N_IMPLEMENTATION - selects the "exporting" side of the annotation.
#if defined(COMPONENT_BUILD)
#if defined(WIN32)
#if defined(BASE_I18N_IMPLEMENTATION)
// Exporting side on Windows.
#define BASE_I18N_EXPORT __declspec(dllexport)
#else
// Importing side on Windows.
#define BASE_I18N_EXPORT __declspec(dllimport)
#endif // defined(BASE_I18N_IMPLEMENTATION)
#else // defined(WIN32)
#if defined(BASE_I18N_IMPLEMENTATION)
// Exporting side elsewhere: give the symbol default visibility.
#define BASE_I18N_EXPORT __attribute__((visibility("default")))
#else
// Importing side elsewhere needs no annotation.
#define BASE_I18N_EXPORT
#endif
#endif
#else // defined(COMPONENT_BUILD)
// Not a component build: no annotation in either direction.
#define BASE_I18N_EXPORT
#endif
#endif // BASE_I18N_BASE_I18N_EXPORT_H_

View file

@ -0,0 +1,21 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/base_i18n_switches.h"
namespace switches {
// Force the UI to a specific direction. Valid values are "ltr" (left-to-right)
// and "rtl" (right-to-left).
const char kForceUIDirection[] = "force-ui-direction";
// Force the text rendering to a specific direction. Valid values are "ltr"
// (left-to-right) and "rtl" (right-to-left). Only tested meaningfully with
// RTL.
const char kForceTextDirection[] = "force-text-direction";
// The two values accepted by the direction switches above.
const char kForceDirectionLTR[] = "ltr";
const char kForceDirectionRTL[] = "rtl";
} // namespace switches

View file

@ -0,0 +1,21 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_BASE_I18N_SWITCHES_H_
#define BASE_I18N_BASE_I18N_SWITCHES_H_
#include "base/i18n/base_i18n_export.h"
namespace switches {
// Names of the command-line switches that force a direction. The values they
// accept are the kForceDirection* constants declared below.
BASE_I18N_EXPORT extern const char kForceUIDirection[];
BASE_I18N_EXPORT extern const char kForceTextDirection[];
// kForce*Direction choices for the switches above.
BASE_I18N_EXPORT extern const char kForceDirectionLTR[];
BASE_I18N_EXPORT extern const char kForceDirectionRTL[];
} // namespace switches
#endif // BASE_I18N_BASE_I18N_SWITCHES_H_

View file

@ -0,0 +1,203 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/break_iterator.h"
#include <stdint.h>
#include "base/logging.h"
#include "third_party/icu/source/common/unicode/ubrk.h"
#include "third_party/icu/source/common/unicode/uchar.h"
#include "third_party/icu/source/common/unicode/ustring.h"
namespace base {
namespace i18n {
// Sentinel position (the largest size_t value) meaning "no position".
const size_t npos = ~static_cast<size_t>(0);
// Builds an iterator over |str| using one of the predefined break types.
// Only records the arguments: |iter_| stays null until Init() opens the ICU
// iterator, and the positions start at prev_ == npos, pos_ == 0.
BreakIterator::BreakIterator(const StringPiece16& str, BreakType break_type)
: iter_(nullptr),
string_(str),
break_type_(break_type),
prev_(npos),
pos_(0) {}
// Builds a rule-based iterator over |str|. |rules| is copied into |rules_| and
// the break type is fixed to RULE_BASED. As with the other constructor, the
// ICU iterator itself is not opened until Init() is called.
BreakIterator::BreakIterator(const StringPiece16& str, const string16& rules)
: iter_(nullptr),
string_(str),
rules_(rules),
break_type_(RULE_BASED),
prev_(npos),
pos_(0) {}
// Closes the ICU iterator if one was ever opened.
BreakIterator::~BreakIterator() {
  if (!iter_)
    return;
  ubrk_close(static_cast<UBreakIterator*>(iter_));
}
// Opens the underlying ICU break iterator for |string_| and positions it at
// the first boundary. Returns false if |break_type_| is not a known value or
// if ICU reports a failure while opening the iterator.
//
// Calling Init() again closes the iterator opened by the previous call before
// opening a new one, so repeated initialisation does not leak ICU iterators.
bool BreakIterator::Init() {
  // Translate our break type into ICU's. RULE_BASED does not use the ICU
  // type at all; it shares the UBRK_LINE assignment only so that the variable
  // is always initialised.
  UBreakIteratorType break_type;
  switch (break_type_) {
    case BREAK_CHARACTER:
      break_type = UBRK_CHARACTER;
      break;
    case BREAK_WORD:
      break_type = UBRK_WORD;
      break;
    case BREAK_SENTENCE:
      break_type = UBRK_SENTENCE;
      break;
    case BREAK_LINE:
    case BREAK_NEWLINE:
    case RULE_BASED:
      break_type = UBRK_LINE;
      break;
    default:
      NOTREACHED() << "invalid break_type_";
      return false;
  }

  // Release any iterator left over from an earlier Init() so it is not
  // overwritten (and leaked) below.
  if (iter_) {
    ubrk_close(static_cast<UBreakIterator*>(iter_));
    iter_ = nullptr;
  }

  UErrorCode status = U_ZERO_ERROR;
  if (break_type_ == RULE_BASED) {
    UParseError parse_error;
    iter_ = ubrk_openRules(rules_.c_str(),
                           static_cast<int32_t>(rules_.length()),
                           string_.data(),
                           static_cast<int32_t>(string_.size()),
                           &parse_error,
                           &status);
    if (U_FAILURE(status)) {
      NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
                   << parse_error.line << ", offset " << parse_error.offset;
    }
  } else {
    iter_ = ubrk_open(break_type, nullptr, string_.data(),
                      static_cast<int32_t>(string_.size()), &status);
    if (U_FAILURE(status)) {
      NOTREACHED() << "ubrk_open failed for type " << break_type
                   << " with error " << status;
    }
  }

  if (U_FAILURE(status)) {
    return false;
  }

  // Move the iterator to the beginning of the string.
  ubrk_first(static_cast<UBreakIterator*>(iter_));
  return true;
}
// Steps to the next break. On entry the current position becomes |prev_|; on
// success |pos_| holds the new break position and true is returned. When
// there is nothing further to report, |pos_| is set to npos and false is
// returned.
bool BreakIterator::Advance() {
int32_t pos;
int32_t status;
prev_ = pos_;
switch (break_type_) {
case BREAK_CHARACTER:
case BREAK_WORD:
case BREAK_LINE:
case BREAK_SENTENCE:
case RULE_BASED:
// These modes report every boundary that ICU yields.
pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
if (pos == UBRK_DONE) {
pos_ = npos;
return false;
}
pos_ = static_cast<size_t>(pos);
return true;
case BREAK_NEWLINE:
// Keep stepping while the rule status of the boundary just reached lies in
// [UBRK_LINE_SOFT, UBRK_LINE_SOFT_LIMIT). The loop ends at the first
// boundary whose status is outside that range, or when ICU has no more
// boundaries (in which case |status| is never read).
do {
pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
if (pos == UBRK_DONE)
break;
pos_ = static_cast<size_t>(pos);
status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
} while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
// Running out of boundaries without |pos_| having moved means there is no
// further segment. If |pos_| did move, the span [prev_, pos_) is still
// reported as a final segment.
if (pos == UBRK_DONE && prev_ == pos_) {
pos_ = npos;
return false;
}
return true;
default:
NOTREACHED() << "invalid break_type_";
return false;
}
}
// Points the already-opened ICU iterator at a new buffer of |length| UTF-16
// code units starting at |text|, and resets the positions to their initial
// values (prev_ == npos, pos_ == 0). Returns false if ICU reports a failure;
// in that case |string_| keeps referring to the previous text.
bool BreakIterator::SetText(const base::char16* text, const size_t length) {
  UErrorCode status = U_ZERO_ERROR;
  // ICU takes a 32-bit length. Make the narrowing explicit, as every other
  // ICU call in this file does, rather than relying on an implicit conversion.
  ubrk_setText(static_cast<UBreakIterator*>(iter_), text,
               static_cast<int32_t>(length), &status);
  pos_ = 0;  // implicit when ubrk_setText is done
  prev_ = npos;
  if (U_FAILURE(status)) {
    NOTREACHED() << "ubrk_setText failed";
    return false;
  }
  string_ = StringPiece16(text, length);
  return true;
}
bool BreakIterator::IsWord() const {
return GetWordBreakStatus() == IS_WORD_BREAK;
}
// Classifies the break the iterator currently sits on. Outside BREAK_WORD and
// RULE_BASED modes the answer is always IS_LINE_OR_CHAR_BREAK.
BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const {
  const int32_t rule_status =
      ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
  const bool word_mode =
      break_type_ == BREAK_WORD || break_type_ == RULE_BASED;
  if (!word_mode)
    return IS_LINE_OR_CHAR_BREAK;
  // In ICU 60, trying to advance past the end of the text does not change the
  // rule status, so |pos_| has to be consulted as well.
  // See http://bugs.icu-project.org/trac/ticket/13447 .
  if (pos_ == npos || rule_status == UBRK_WORD_NONE)
    return IS_SKIPPABLE_WORD;
  return IS_WORD_BREAK;
}
bool BreakIterator::IsEndOfWord(size_t position) const {
if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
return false;
UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
int32_t status = ubrk_getRuleStatus(iter);
return (!!boundary && status != UBRK_WORD_NONE);
}
bool BreakIterator::IsStartOfWord(size_t position) const {
if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
return false;
UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
ubrk_next(iter);
int32_t next_status = ubrk_getRuleStatus(iter);
return (!!boundary && next_status != UBRK_WORD_NONE);
}
// Asks ICU whether |position| is a boundary. Only meaningful in
// BREAK_SENTENCE or RULE_BASED mode; every other mode yields false.
bool BreakIterator::IsSentenceBoundary(size_t position) const {
  const bool supported_mode =
      break_type_ == BREAK_SENTENCE || break_type_ == RULE_BASED;
  if (!supported_mode)
    return false;
  const int32_t offset = static_cast<int32_t>(position);
  return !!ubrk_isBoundary(static_cast<UBreakIterator*>(iter_), offset);
}
// Asks ICU whether |position| is a boundary. Only meaningful in
// BREAK_CHARACTER mode; every other mode yields false.
bool BreakIterator::IsGraphemeBoundary(size_t position) const {
  if (break_type_ == BREAK_CHARACTER) {
    const int32_t offset = static_cast<int32_t>(position);
    return !!ubrk_isBoundary(static_cast<UBreakIterator*>(iter_), offset);
  }
  return false;
}
// Returns an owned copy of the text between prev() and pos().
string16 BreakIterator::GetString() const {
  const StringPiece16 current_piece = GetStringPiece();
  return current_piece.as_string();
}
// Returns a view of the text between prev() and pos(). Both positions must be
// valid, i.e. neither may be npos.
StringPiece16 BreakIterator::GetStringPiece() const {
  DCHECK(prev_ != npos && pos_ != npos);
  const size_t piece_length = pos_ - prev_;
  return string_.substr(prev_, piece_length);
}
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,195 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_BREAK_ITERATOR_H_
#define BASE_I18N_BREAK_ITERATOR_H_
#include <stddef.h>
#include "base/i18n/base_i18n_export.h"
#include "base/macros.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
// The BreakIterator class iterates through the words, word breaks, and
// line breaks in a UTF-16 string.
//
// It provides several modes, BREAK_WORD, BREAK_LINE, BREAK_NEWLINE, and
// BREAK_SENTENCE which modify how characters are aggregated into the returned
// string.
//
// Under BREAK_WORD mode, once a word is encountered any non-word
// characters are not included in the returned string (e.g. in the
// UTF-16 equivalent of the string " foo bar! ", the word breaks are at
// the periods in ". .foo. .bar.!. .").
// Note that Chinese/Japanese/Thai do not use spaces between words so that
// boundaries can fall in the middle of a continuous run of non-space /
// non-punctuation characters.
//
// Under BREAK_LINE mode, once a line breaking opportunity is encountered,
// any non-word characters are included in the returned string, breaking
// only when a space-equivalent character or a line breaking opportunity
// is encountered (e.g. in the UTF16-equivalent of the string " foo bar! ",
// the breaks are at the periods in ". .foo .bar! .").
//
// Note that lines can be broken at any character/syllable/grapheme cluster
// boundary in Chinese/Japanese/Korean and at word boundaries in Thai
// (Thai does not use spaces between words). Therefore, this is NOT the same
// as breaking only at space-equivalent characters where its former
// name (BREAK_SPACE) implied.
//
// Under BREAK_NEWLINE mode, all characters are included in the returned
// string, breaking only when a newline-equivalent character is encountered
// (eg. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line
// breaks are at the periods in ".foo\n.bar!\n.\n.").
//
// Under BREAK_SENTENCE mode, all characters are included in the returned
// string, breaking only on sentence boundaries defined in "Unicode Standard
// Annex #29: Text Segmentation." Whitespace immediately following the sentence
// is also included. For example, in the UTF-16 equivalent of the string
// "foo bar! baz qux?" the breaks are at the periods in ".foo bar! .baz qux?."
//
// To extract the words from a string, move a BREAK_WORD BreakIterator
// through the string and test whether IsWord() is true. E.g.,
// BreakIterator iter(str, BreakIterator::BREAK_WORD);
// if (!iter.Init())
// return false;
// while (iter.Advance()) {
// if (iter.IsWord()) {
// // Region [iter.prev(), iter.pos()) contains a word.
// VLOG(1) << "word: " << iter.GetString();
// }
// }
namespace base {
namespace i18n {
// Iterates over the breaks of a UTF-16 string. Construct it, call Init(), and
// then call Advance() repeatedly; after each successful Advance() the span
// [prev(), pos()) is the piece of text between the last two breaks.
class BASE_I18N_EXPORT BreakIterator {
public:
enum BreakType {
BREAK_WORD,
BREAK_LINE,
// TODO(jshin): Remove this after reviewing call sites.
// If call sites really need break only on space-like characters
// implement it separately.
BREAK_SPACE = BREAK_LINE,
BREAK_NEWLINE,
BREAK_CHARACTER,
// But don't remove this one!
RULE_BASED,
BREAK_SENTENCE,
};
enum WordBreakStatus {
// The end of text that the iterator recognizes as word characters.
// Non-word characters are things like punctuation and spaces.
IS_WORD_BREAK,
// Characters that the iterator can skip past, such as punctuation,
// whitespace, and, if using RULE_BASED mode, characters from another
// character set.
IS_SKIPPABLE_WORD,
// Only used if not in BREAK_WORD or RULE_BASED mode. This is returned for
// newlines, line breaks, and character breaks.
IS_LINE_OR_CHAR_BREAK
};
// Requires |str| to live as long as the BreakIterator does.
BreakIterator(const StringPiece16& str, BreakType break_type);
// Make a rule-based iterator. BreakType == RULE_BASED is implied.
// TODO(andrewhayden): This signature could easily be misinterpreted as
// "(const string16& str, const string16& locale)". We should do something
// better.
BreakIterator(const StringPiece16& str, const string16& rules);
~BreakIterator();
// Init() must be called before any of the iterators are valid.
// Returns false if ICU failed to initialize.
bool Init();
// Advance to the next break. Returns false if we've run past the end of
// the string. (Note that the very last "break" is after the final
// character in the string, and when we advance to that position it's the
// last time Advance() returns true.)
bool Advance();
// Updates the text used by the iterator, resetting the iterator as if
// Init() had been called again. Any old state is lost. Returns true
// unless there is an error setting the text.
bool SetText(const base::char16* text, const size_t length);
// Under BREAK_WORD mode, returns true if the break we just hit is the
// end of a word. (Otherwise, the break iterator just skipped over e.g.
// whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes,
// this distinction doesn't apply and it always returns false.
bool IsWord() const;
// Under BREAK_WORD mode:
// - Returns IS_SKIPPABLE_WORD if non-word characters, such as punctuation or
// spaces, are found.
// - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence
// of word characters.
// Under RULE_BASED mode:
// - Returns IS_SKIPPABLE_WORD if characters outside the rules' character set
// or non-word characters, such as punctuation or spaces, are found.
// - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence
// of word characters that are in the rules' character set.
// Not under BREAK_WORD or RULE_BASED mode:
// - Returns IS_LINE_OR_CHAR_BREAK.
BreakIterator::WordBreakStatus GetWordBreakStatus() const;
// Under BREAK_WORD or RULE_BASED mode, IsEndOfWord() returns true if
// |position| is at the end of a word and IsStartOfWord() returns true if
// |position| is at the start of a word. Both always return false under any
// other mode.
bool IsEndOfWord(size_t position) const;
bool IsStartOfWord(size_t position) const;
// Under BREAK_SENTENCE mode, returns true if |position| is at a sentence
// boundary. It always returns false under modes that are not BREAK_SENTENCE
// or RULE_BASED.
bool IsSentenceBoundary(size_t position) const;
// Under BREAK_CHARACTER mode, returns whether |position| is a Unicode
// grapheme boundary. Always returns false under any other mode.
bool IsGraphemeBoundary(size_t position) const;
// Returns the string between prev() and pos().
// Advance() must have been called successfully at least once for pos() to
// have advanced to somewhere useful.
string16 GetString() const;
StringPiece16 GetStringPiece() const;
// Returns the value of pos() returned before Advance() was last called.
size_t prev() const { return prev_; }
// Returns the current break position within the string, or
// static_cast<size_t>(-1) (the file-local |npos| in break_iterator.cc) when
// done.
size_t pos() const { return pos_; }
private:
// ICU iterator, avoiding ICU ubrk.h dependence.
// This is actually an ICU UBreakIterator* type, which turns out to be
// a typedef for a void* in the ICU headers. Using void* directly prevents
// callers from needing access to the ICU public headers directory.
void* iter_;
// The string we're iterating over. Can be changed with SetText(...)
StringPiece16 string_;
// Rules for our iterator. Only consulted when break_type_ is RULE_BASED.
const string16 rules_;
// The breaking style (word/space/newline). RULE_BASED when the iterator was
// constructed from rules_.
BreakType break_type_;
// Previous and current iterator positions.
size_t prev_, pos_;
DISALLOW_COPY_AND_ASSIGN(BreakIterator);
};
} // namespace i18n
} // namespace base
#endif // BASE_I18N_BREAK_ITERATOR_H_

View file

@ -0,0 +1,465 @@
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Create a state machine for validating UTF-8. The algorithm in brief:
// 1. Convert the complete unicode range of code points, except for the
// surrogate code points, to an ordered array of sequences of bytes in
// UTF-8.
// 2. Convert individual bytes to ranges, starting from the right of each byte
// sequence. For each range, ensure the bytes on the left and the ranges
// on the right are identical.
// 3. Convert the resulting list of ranges into a state machine, collapsing
// identical states.
// 4. Convert the state machine to an array of bytes.
// 5. Output as a C++ file.
//
// To use:
// $ ninja -C out/Release build_utf8_validator_tables
// $ out/Release/build_utf8_validator_tables
// --output=base/i18n/utf8_validator_tables.cc
// $ git add base/i18n/utf8_validator_tables.cc
//
// Because the table is not expected to ever change, it is checked into the
// repository rather than being regenerated at build time.
//
// This code uses type uint8_t throughout to represent bytes, to avoid
// signed/unsigned char confusion.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "base/command_line.h"
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/numerics/safe_conversions.h"
#include "base/stl_util.h"
#include "base/strings/stringprintf.h"
#include "third_party/icu/source/common/unicode/utf8.h"
namespace {
// Usage message printed when the tool is run with --help.
const char kHelpText[] =
"Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
// Text written before the table values: the generated file's header, its
// include, the opening of namespaces base::internal and the start of the
// kUtf8ValidatorTables array initialiser.
const char kProlog[] =
"// Copyright 2013 The Chromium Authors. All rights reserved.\n"
"// Use of this source code is governed by a BSD-style license that can "
"be\n"
"// found in the LICENSE file.\n"
"\n"
"// This file is auto-generated by build_utf8_validator_tables.\n"
"// DO NOT EDIT.\n"
"\n"
"#include \"base/i18n/utf8_validator_tables.h\"\n"
"\n"
"namespace base {\n"
"namespace internal {\n"
"\n"
"const uint8_t kUtf8ValidatorTables[] = {\n";
// Text written after the table values: closes the array, defines
// kUtf8ValidatorTablesSize and closes both namespaces.
const char kEpilog[] =
"};\n"
"\n"
"const size_t kUtf8ValidatorTablesSize = "
"base::size(kUtf8ValidatorTables);\n"
"\n"
"} // namespace internal\n"
"} // namespace base\n";
// An inclusive byte range [from, to].
class Range {
 public:
  // A freshly built range covers exactly one byte.
  explicit Range(uint8_t value) : from_(value), to_(value) {}

  // Both members are plain bytes, so the compiler-generated copy constructor,
  // assignment operator and destructor are what STL containers need.

  // Grows the range by one byte at its upper end. |to| must be the byte
  // immediately after the current upper bound; that is the only kind of
  // growth the generator requires.
  void AddByte(uint8_t to) {
    CHECK(to == to_ + 1);
    to_ = to;
  }

  uint8_t from() const { return from_; }
  uint8_t to() const { return to_; }

  // Orders by lower bound first, then by upper bound.
  bool operator<(const Range& rhs) const {
    if (from_ != rhs.from_)
      return from_ < rhs.from_;
    return to_ < rhs.to_;
  }

  bool operator==(const Range& rhs) const {
    return !(from_ != rhs.from_ || to_ != rhs.to_);
  }

 private:
  uint8_t from_;
  uint8_t to_;
};
// A vector of Ranges is like a simple regular expression--it corresponds to
// a set of strings of the same length that have bytes in each position in
// the appropriate range.
typedef std::vector<Range> StringSet;
// A UTF-8 "character" is represented by a sequence of bytes.
typedef std::vector<uint8_t> Character;
// In the second stage of the algorithm, we want to convert a large list of
// Characters into a small list of StringSets.
struct Pair {
Character character;
StringSet set;
};
typedef std::vector<Pair> PairVector;
// Writes a table of byte values to a stdio stream, kMaxValuesPerLine values
// per line, ending each line with a comment holding the running offset.
class TablePrinter {
 public:
  explicit TablePrinter(FILE* stream)
      : stream_(stream), values_on_this_line_(0), current_offset_(0) {}

  // Emits one value, first closing the current line if it is already full or
  // writing the line prefix if the line is empty.
  void PrintValue(uint8_t value) {
    if (values_on_this_line_ == kMaxValuesPerLine) {
      fprintf(stream_, " // 0x%02x\n ", current_offset_);
      values_on_this_line_ = 0;
    } else if (values_on_this_line_ == 0) {
      fputs(" ", stream_);
    }
    fprintf(stream_, " 0x%02x,", static_cast<int>(value));
    ++values_on_this_line_;
    ++current_offset_;
  }

  // Pads the current line out to full width, writes the trailing offset
  // comment and starts a fresh line.
  void NewLine() {
    for (; values_on_this_line_ < kMaxValuesPerLine; ++values_on_this_line_)
      fputs(" ", stream_);
    fprintf(stream_, " // 0x%02x\n", current_offset_);
    values_on_this_line_ = 0;
  }

 private:
  // stdio stream. Not owned.
  FILE* stream_;

  // How many values the current line already holds.
  int values_on_this_line_;

  // How many values have been printed in total.
  int current_offset_;

  static const int kMaxValuesPerLine = 8;

  DISALLOW_COPY_AND_ASSIGN(TablePrinter);
};
// Builds the initial PairVector: one entry per code point from 0 to 0x10FFFF,
// in increasing order, each holding the code point's UTF-8 bytes and an empty
// StringSet. The result therefore runs from "\x00" to "\xf4\x8f\xbf\xbf".
PairVector InitializeCharacters() {
  PairVector characters;
  for (int code_point = 0; code_point <= 0x10FFFF; ++code_point) {
    // Surrogate code points are not permitted. Non-character code points are
    // explicitly permitted.
    const bool is_surrogate = code_point >= 0xD800 && code_point < 0xE000;
    if (!is_surrogate) {
      uint8_t bytes[4];
      unsigned int offset = 0;
      UBool is_error = false;
      U8_APPEND(bytes, offset, base::size(bytes), code_point, is_error);
      DCHECK(!is_error);
      DCHECK_GT(offset, 0u);
      DCHECK_LE(offset, base::size(bytes));
      Pair entry = {Character(bytes, bytes + offset), StringSet()};
      characters.push_back(entry);
    }
  }
  return characters;
}
// Construct a new Pair from |character| and the concatenation of |new_range|
// and |existing_set|, and append it to |pairs|.
void ConstructPairAndAppend(const Character& character,
const Range& new_range,
const StringSet& existing_set,
PairVector* pairs) {
Pair new_pair = {character, StringSet(1, new_range)};
new_pair.set.insert(
new_pair.set.end(), existing_set.begin(), existing_set.end());
pairs->push_back(new_pair);
}
// Each pass over the PairVector strips one byte off the right-hand-side of the
// characters and adds a range to the set on the right. For example, the first
// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
// [\xa0-\xbf][\x80-\xbf]).
//
// |pairs| is replaced in place with the converted list. At least one entry
// must still have unconverted bytes; otherwise the CHECK below fails.
void MoveRightMostCharToSet(PairVector* pairs) {
PairVector new_pairs;
PairVector::const_iterator it = pairs->begin();
// Entries whose bytes have all been converted already are copied unchanged.
while (it != pairs->end() && it->character.empty()) {
new_pairs.push_back(*it);
++it;
}
CHECK(it != pairs->end());
// Start the first run: remember the prefix (everything but the last byte),
// a one-byte range for the last byte, and the ranges already converted.
Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
Range new_range(it->character.back());
StringSet converted = it->set;
++it;
while (it != pairs->end()) {
const Pair& current_pair = *it++;
// The entry extends the current run when it has the same prefix and the
// same already-converted set; only its last byte differs.
if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
std::equal(unconverted_bytes.begin(),
unconverted_bytes.end(),
current_pair.character.begin()) &&
converted == current_pair.set) {
// The particular set of UTF-8 codepoints we are validating guarantees
// that each byte range will be contiguous. This would not necessarily be
// true for an arbitrary set of UTF-8 codepoints.
DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
new_range.AddByte(current_pair.character.back());
continue;
}
// Otherwise the run is over: emit it and start a new one from this entry.
ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
unconverted_bytes = Character(current_pair.character.begin(),
current_pair.character.end() - 1);
new_range = Range(current_pair.character.back());
converted = current_pair.set;
}
// Emit the run that was still open when the input ended.
ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
new_pairs.swap(*pairs);
}
// Converts every Character in |pairs| completely into ranges. One pass moves
// a single byte, and a UTF-8 sequence has at most four bytes, so four passes
// are run.
void MoveAllCharsToSets(PairVector* pairs) {
  int passes_left = 4;
  while (passes_left > 0) {
    MoveRightMostCharToSet(pairs);
    --passes_left;
  }
#if DCHECK_IS_ON()
  // After the final pass no entry may have unconverted bytes left.
  PairVector::const_iterator it = pairs->begin();
  while (it != pairs->end()) {
    DCHECK(it->character.empty());
    ++it;
  }
#endif
}
// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
// algorithm is working. Use the command-line option
// --vmodule=build_utf8_validator_tables=1 to see this output.
void LogStringSets(const PairVector& pairs) {
for (const auto& pair_it : pairs) {
std::string set_as_string;
for (auto set_it = pair_it.set.begin(); set_it != pair_it.set.end();
++set_it) {
set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
static_cast<int>(set_it->from()),
static_cast<int>(set_it->to()));
}
VLOG(1) << set_as_string;
}
}
// One transition entry of a state: every input byte from |from| up to (but
// not including) the |from| of the next entry, or up to 0xFF for the last
// entry, moves the machine to |target_state|.
struct StateRange {
  uint8_t from;
  uint8_t target_state;
};

// A state is its transition entries, kept sorted by start byte.
using State = std::vector<StateRange>;

// Returns a state in which every byte leads to state 1 (invalid). It doubles
// as the starting point for other states, because bytes outside the ranges a
// state accepts are invalid.
State GenerateInvalidState() {
  State invalid;
  const StateRange everything_invalid = {0, 1};
  invalid.push_back(everything_invalid);
  return invalid;
}
// Maps a state, identified by the set of strings that match from it, to that
// state's index in the array of states.
using StateMap = std::map<StringSet, uint8_t>;
// Create a new state corresponding to |set|, add it |states| and |state_map|
// and return the index it was given in |states|.
uint8_t MakeState(const StringSet& set,
std::vector<State>* states,
StateMap* state_map) {
DCHECK(!set.empty());
const Range& range = set.front();
const StringSet rest(set.begin() + 1, set.end());
const StateMap::const_iterator where = state_map->find(rest);
const uint8_t target_state = where == state_map->end()
? MakeState(rest, states, state_map)
: where->second;
DCHECK_LT(0, range.from());
DCHECK_LT(range.to(), 0xFF);
const StateRange new_state_initializer[] = {
{0, 1},
{range.from(), target_state},
{static_cast<uint8_t>(range.to() + 1), 1}};
states->push_back(
State(new_state_initializer,
new_state_initializer + base::size(new_state_initializer)));
const uint8_t new_state_number =
base::checked_cast<uint8_t>(states->size() - 1);
CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
return new_state_number;
}
std::vector<State> GenerateStates(const PairVector& pairs) {
// States 0 and 1 are the initial/valid state and invalid state, respectively.
std::vector<State> states(2, GenerateInvalidState());
StateMap state_map;
state_map.insert(std::make_pair(StringSet(), 0));
for (auto it = pairs.begin(); it != pairs.end(); ++it) {
DCHECK(it->character.empty());
DCHECK(!it->set.empty());
const Range& range = it->set.front();
const StringSet rest(it->set.begin() + 1, it->set.end());
const StateMap::const_iterator where = state_map.find(rest);
const uint8_t target_state = where == state_map.end()
? MakeState(rest, &states, &state_map)
: where->second;
if (states[0].back().from == range.from()) {
DCHECK_EQ(1, states[0].back().target_state);
states[0].back().target_state = target_state;
DCHECK_LT(range.to(), 0xFF);
const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1};
states[0].push_back(new_range);
} else {
DCHECK_LT(range.to(), 0xFF);
const StateRange new_range_initializer[] = {
{range.from(), target_state},
{static_cast<uint8_t>(range.to() + 1), 1}};
states[0].insert(
states[0].end(), new_range_initializer,
new_range_initializer + base::size(new_range_initializer));
}
}
return states;
}
// Output the generated states as a C++ table. Two tricks are used to compact
// the table: each state in the table starts with a shift value which indicates
// how many bits we can discard from the right-hand-side of the byte before
// doing the table lookup. Secondly, only the state-transitions for bytes
// with the top-bit set are included in the table; bytes without the top-bit set
// are just ASCII and are handled directly by the code.
//
// |states| is the state list to print and |stream| the stdio stream that
// receives kProlog, the table rows and kEpilog.
void PrintStates(const std::vector<State>& states, FILE* stream) {
// First calculate the start-offset of each state. This allows the state
// machine to jump directly to the correct offset, avoiding an extra
// indirection. State 0 starts at offset 0.
std::vector<uint8_t> state_offset(1, 0);
std::vector<uint8_t> shifts;
// NOTE(review): |pos| is a uint8_t, so offsets wrap silently if the table
// grows past 255 bytes; nothing here checks for that.
uint8_t pos = 0;
for (const auto& state_it : states) {
// We want to set |shift| to the (0-based) index of the least-significant
// set bit in any of the ranges for this state, since this tells us how many
// bits we can discard and still determine what range a byte lies in. Sadly
// it appears that ffs() is not portable, so we do it clumsily.
uint8_t shift = 7;
for (auto range_it = state_it.begin(); range_it != state_it.end();
++range_it) {
while (shift > 0 && range_it->from % (1 << shift) != 0) {
--shift;
}
}
shifts.push_back(shift);
// A state occupies one byte for the shift plus one byte per table entry.
pos += 1 + (1 << (7 - shift));
state_offset.push_back(pos);
}
// State 0 is expected to take 129 bytes, i.e. to have a shift of 0.
DCHECK_EQ(129, state_offset[1]);
fputs(kProlog, stream);
TablePrinter table_printer(stream);
for (uint8_t state_index = 0; state_index < states.size(); ++state_index) {
const uint8_t shift = shifts[state_index];
// Index of the next entry of this state that has not been reached yet, and
// the target that applies to the byte currently being visited.
uint8_t next_range = 0;
uint8_t target_state = 1;
fprintf(stream,
" // State %d, offset 0x%02x\n",
static_cast<int>(state_index),
static_cast<int>(state_offset[state_index]));
table_printer.PrintValue(shift);
// Walk the whole byte range in steps of 2^shift so the current target is
// tracked from 0, but only emit entries for bytes with the top bit set.
for (int i = 0; i < 0x100; i += (1 << shift)) {
if (next_range < states[state_index].size() &&
states[state_index][next_range].from == i) {
target_state = states[state_index][next_range].target_state;
++next_range;
}
if (i >= 0x80) {
// The table stores the target's byte offset, not its state number.
table_printer.PrintValue(state_offset[target_state]);
}
}
table_printer.NewLine();
}
fputs(kEpilog, stream);
}
} // namespace
// Entry point of the table generator.
//   --help           prints the usage text and exits successfully.
//   --output=<file>  writes the generated C++ source to <file>; without it
//                    the source goes to stdout.
// Failing to open or to close the output file is fatal.
int main(int argc, char* argv[]) {
  base::CommandLine::Init(argc, argv);
  logging::LoggingSettings settings;
  settings.logging_dest =
      logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
  logging::InitLogging(settings);

  if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {
    // fputs() stops at the terminating NUL. Writing base::size(kHelpText)
    // bytes with fwrite() would also emit that NUL byte to stdout.
    fputs(kHelpText, stdout);
    exit(EXIT_SUCCESS);
  }

  base::FilePath filename =
      base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
  FILE* output = stdout;
  if (!filename.empty()) {
    output = base::OpenFile(filename, "wb");
    if (!output)
      PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
                  << "' for writing";
  }

  // Step 1: Enumerate the characters
  PairVector pairs = InitializeCharacters();

  // Step 2: Convert to sets.
  MoveAllCharsToSets(&pairs);
  if (VLOG_IS_ON(1)) {
    LogStringSets(pairs);
  }

  // Step 3: Generate states.
  std::vector<State> states = GenerateStates(pairs);

  // Step 4/5: Print output
  PrintStates(states, output);

  // Only a file opened here is closed here; stdout is left alone.
  if (!filename.empty()) {
    if (!base::CloseFile(output))
      PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
                  << "'";
  }
  return EXIT_SUCCESS;
}

View file

@ -0,0 +1,90 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/case_conversion.h"
#include <stdint.h>
#include "base/numerics/safe_conversions.h"
#include "base/strings/string16.h"
#include "base/strings/string_util.h"
#include "third_party/icu/source/common/unicode/uchar.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/ustring.h"
namespace base {
namespace i18n {
namespace {
// Provides a uniform interface for upper/lower/folding which take take
// slightly varying parameters.
typedef int32_t (*CaseMapperFunction)(UChar* dest, int32_t dest_capacity,
const UChar* src, int32_t src_length,
UErrorCode* error);
// CaseMapperFunction adapter around u_strToUpper().
int32_t ToUpperMapper(UChar* dest, int32_t dest_capacity,
                      const UChar* src, int32_t src_length,
                      UErrorCode* error) {
  // A null locale argument makes ICU use the default locale.
  const char* const default_locale = nullptr;
  return u_strToUpper(dest, dest_capacity, src, src_length, default_locale,
                      error);
}
// CaseMapperFunction adapter around u_strToLower().
int32_t ToLowerMapper(UChar* dest, int32_t dest_capacity,
                      const UChar* src, int32_t src_length,
                      UErrorCode* error) {
  // A null locale argument makes ICU use the default locale.
  const char* const default_locale = nullptr;
  return u_strToLower(dest, dest_capacity, src, src_length, default_locale,
                      error);
}
// CaseMapperFunction adapter around u_strFoldCase(), using
// U_FOLD_CASE_DEFAULT as the folding options.
int32_t FoldCaseMapper(UChar* dest, int32_t dest_capacity,
                       const UChar* src, int32_t src_length,
                       UErrorCode* error) {
  const uint32_t fold_options = U_FOLD_CASE_DEFAULT;
  return u_strFoldCase(dest, dest_capacity, src, src_length, fold_options,
                       error);
}
// Runs |case_mapper| over |string| and returns the mapped text as a string16,
// in the spirit of UnicodeString::caseMap but for string16. An empty input
// yields an empty result without calling |case_mapper|.
string16 CaseMap(StringPiece16 string, CaseMapperFunction case_mapper) {
  string16 result;
  if (!string.empty()) {
    // First guess: the mapping keeps the length unchanged. That is by far the
    // common case for the strings we handle, so the (rare) length-changing
    // case simply costs another pass through the loop.
    result.resize(string.size());

    for (;;) {
      UErrorCode status = U_ZERO_ERROR;
      // ICU skips the terminating null when the buffer has no room for it, so
      // the buffer is sized for the characters only. WriteInto is avoided
      // because it assumes null terminators.
      const int32_t mapped_length = case_mapper(
          &result[0], saturated_cast<int32_t>(result.size()),
          string.data(), saturated_cast<int32_t>(string.size()),
          &status);
      // Either trims to the real length, or grows to the length ICU asked for
      // before the retry.
      result.resize(mapped_length);
      if (status != U_BUFFER_OVERFLOW_ERROR)
        break;
    }
  }
  return result;
}
} // namespace
// Returns |string| converted to lower case via ToLowerMapper.
string16 ToLower(StringPiece16 string) {
  const CaseMapperFunction mapper = ToLowerMapper;
  return CaseMap(string, mapper);
}
// Returns |string| converted to upper case via ToUpperMapper.
string16 ToUpper(StringPiece16 string) {
  const CaseMapperFunction mapper = ToUpperMapper;
  return CaseMap(string, mapper);
}
// Returns the case-folded form of |string| via FoldCaseMapper.
string16 FoldCase(StringPiece16 string) {
  const CaseMapperFunction mapper = FoldCaseMapper;
  return CaseMap(string, mapper);
}
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,48 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_CASE_CONVERSION_H_
#define BASE_I18N_CASE_CONVERSION_H_
#include "base/i18n/base_i18n_export.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
namespace base {
namespace i18n {
// UNICODE CASE-HANDLING ADVICE
//
// In English it's always safe to convert to upper-case or lower-case text
// and get a good answer. But some languages have rules specific to those
// locales. One example is the Turkish I:
// http://www.i18nguy.com/unicode/turkish-i18n.html
//
// ToLower/ToUpper use the current ICU locale which will take into account
// the user language preference. Use this when dealing with user typing.
//
// FoldCase canonicalizes to a standardized form independent of the current
// locale. Use this when comparing general Unicode strings that don't
// necessarily belong in the user's current locale (like commands, protocol
// names, other strings from the web) for case-insensitive equality.
//
// Note that case conversions will change the length of the string in some
// not-uncommon cases. Never assume that the output is the same length as
// the input.
// Returns the lower case equivalent of string. Uses ICU's current locale.
BASE_I18N_EXPORT string16 ToLower(StringPiece16 string);
// Returns the upper case equivalent of string. Uses ICU's current locale.
BASE_I18N_EXPORT string16 ToUpper(StringPiece16 string);
// Convert the given string to a canonical case, independent of the current
// locale. For ASCII the canonical form is lower case.
// See http://unicode.org/faq/casemap_charprop.html#2
BASE_I18N_EXPORT string16 FoldCase(StringPiece16 string);
} // namespace i18n
} // namespace base
#endif // BASE_I18N_CASE_CONVERSION_H_

View file

@ -0,0 +1,152 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/char_iterator.h"
#include "base/logging.h"
#include "third_party/icu/source/common/unicode/utf16.h"
#include "third_party/icu/source/common/unicode/utf8.h"
namespace base {
namespace i18n {
// UTF8CharIterator ------------------------------------------------------------
// Points the iterator at the bytes of |str| (which are not copied) and, when
// the string is non-empty, decodes its first character into |char_|.
UTF8CharIterator::UTF8CharIterator(const std::string* str)
    : str_(reinterpret_cast<const uint8_t*>(str->data())),
      len_(str->size()),
      array_pos_(0),
      next_pos_(0),
      char_pos_(0),
      char_(0) {
  // Nothing to decode for an empty string; |char_| keeps its initial 0.
  if (len_ != 0) {
    // Leaves |next_pos_| just past the character that was read.
    U8_NEXT(str_, next_pos_, len_, char_);
  }
}
// Defaulted destructor; defined out of line in this file rather than in the
// header.
UTF8CharIterator::~UTF8CharIterator() = default;
// Steps to the next character. Returns false, leaving the iterator untouched,
// when it is already at the end of the string.
bool UTF8CharIterator::Advance() {
  const bool at_end = array_pos_ >= len_;
  if (!at_end) {
    array_pos_ = next_pos_;
    ++char_pos_;

    // Decode the new current character unless this step reached the end, in
    // which case |char_| keeps the last character read.
    if (next_pos_ < len_) {
      U8_NEXT(str_, next_pos_, len_, char_);
    }
  }
  return !at_end;
}
// UTF16CharIterator -----------------------------------------------------------
// Public constructors: both delegate to the private position-taking
// constructors with an initial array index of 0, i.e. they start at the
// beginning of the string.
UTF16CharIterator::UTF16CharIterator(const string16* str)
: UTF16CharIterator(str, 0) {}
UTF16CharIterator::UTF16CharIterator(const char16* str, size_t str_len)
: UTF16CharIterator(str, str_len, 0) {}
// Move construction, destruction and move assignment all use the
// compiler-generated behaviour; they are defined out of line in this file.
UTF16CharIterator::UTF16CharIterator(UTF16CharIterator&& to_move) = default;
UTF16CharIterator::~UTF16CharIterator() = default;
UTF16CharIterator& UTF16CharIterator::operator=(UTF16CharIterator&& to_move) =
default;
// static
// Convenience overload: forwards the buffer and length of |str| to the
// (pointer, length, index) overload.
UTF16CharIterator UTF16CharIterator::LowerBound(const string16* str,
                                                size_t array_index) {
  const char16* const chars = str->data();
  const size_t char_count = str->length();
  return LowerBound(chars, char_count, array_index);
}
// static
// Returns an iterator positioned at |array_index|, or one code unit earlier
// when |array_index| falls on the trail half of a surrogate pair.
// |array_index| must not exceed |length|; |array_index| == |length| yields an
// iterator at the end of the string.
UTF16CharIterator UTF16CharIterator::LowerBound(const char16* str,
                                                size_t length,
                                                size_t array_index) {
  DCHECK_LE(array_index, length);
  // U16_SET_CP_START inspects str[array_index] before anything else, so it is
  // only applied to indices inside the buffer. The end position is always a
  // code point boundary and needs no adjustment, and |str| is not required to
  // have a readable element at |length| (it need not be null-terminated).
  if (array_index < length) {
    U16_SET_CP_START(str, 0, array_index);
  }
  return UTF16CharIterator(str, length, array_index);
}
// static
// Convenience overload: forwards the buffer and length of |str| to the
// (pointer, length, index) overload.
UTF16CharIterator UTF16CharIterator::UpperBound(const string16* str,
                                                size_t array_index) {
  const char16* const chars = str->data();
  const size_t char_count = str->length();
  return UpperBound(chars, char_count, array_index);
}
// static
// Returns an iterator positioned at |array_index| after U16_SET_CP_LIMIT has
// had the chance to move it forward off the middle of a surrogate pair.
// |array_index| must not exceed |length|.
UTF16CharIterator UTF16CharIterator::UpperBound(const char16* str,
                                                size_t length,
                                                size_t array_index) {
  DCHECK_LE(array_index, length);
  size_t limit_index = array_index;
  U16_SET_CP_LIMIT(str, 0, limit_index, length);
  return UTF16CharIterator(str, length, limit_index);
}
int32_t UTF16CharIterator::NextCodePoint() const {
if (next_pos_ >= len_)
return 0;
UChar32 c;
U16_GET(str_, 0, next_pos_, len_, c);
return c;
}
int32_t UTF16CharIterator::PreviousCodePoint() const {
if (array_pos_ <= 0)
return 0;
uint32_t pos = array_pos_;
UChar32 c;
U16_PREV(str_, 0, pos, c);
return c;
}
// Steps to the next character. Returns false, leaving the iterator untouched,
// when it is already at the end of the string.
bool UTF16CharIterator::Advance() {
  if (array_pos_ < len_) {
    array_pos_ = next_pos_;
    ++char_offset_;
    // Only read a new current character if this step did not reach the end.
    if (next_pos_ < len_)
      ReadChar();
    return true;
  }
  return false;
}
// Steps back to the previous character. Returns false, leaving the iterator
// untouched, when it is already at the start of the string.
bool UTF16CharIterator::Rewind() {
  const bool can_rewind = array_pos_ > 0;
  if (can_rewind) {
    // The old current position becomes the "next" one.
    next_pos_ = array_pos_;
    --char_offset_;
    // Moves |array_pos_| back over one code point and stores it in |char_|.
    U16_PREV(str_, 0, array_pos_, char_);
  }
  return can_rewind;
}
// Private: starts iterating |str| at array index |initial_pos| by delegating
// to the (pointer, length, position) constructor.
UTF16CharIterator::UTF16CharIterator(const string16* str, int32_t initial_pos)
: UTF16CharIterator(str->data(), str->length(), initial_pos) {}
// Private: iterates the |str_len| code units at |str|, starting at array
// index |initial_pos|. The character offset always starts at zero, whatever
// the initial position.
UTF16CharIterator::UTF16CharIterator(const char16* str,
                                     size_t str_len,
                                     int32_t initial_pos)
    : str_(str),
      len_(str_len),
      array_pos_(initial_pos),
      next_pos_(initial_pos),
      char_offset_(0),
      char_(0) {
  // Starting at the end: there is no current character to load.
  if (array_pos_ >= len_)
    return;
  // Loads the current character; as a side effect |next_pos_| moves past it.
  ReadChar();
}
// Reads the code point starting at |next_pos_| into |char_| and advances
// |next_pos_| past it. Callers in this file only invoke it while a position
// inside the string remains to be read.
void UTF16CharIterator::ReadChar() {
// This is actually a huge macro, so is worth having in a separate function.
U16_NEXT(str_, next_pos_, len_, char_);
}
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,175 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_CHAR_ITERATOR_H_
#define BASE_I18N_CHAR_ITERATOR_H_
#include <stddef.h>
#include <stdint.h>
#include <string>
#include "base/gtest_prod_util.h"
#include "base/i18n/base_i18n_export.h"
#include "base/macros.h"
#include "base/strings/string16.h"
#include "build/build_config.h"
// The CharIterator classes iterate through the characters in UTF8 and
// UTF16 strings. Example usage:
//
// UTF8CharIterator iter(&str);
// while (!iter.end()) {
// VLOG(1) << iter.get();
// iter.Advance();
// }
#if defined(OS_WIN)
typedef unsigned char uint8_t;
#endif
namespace base {
namespace i18n {
// Forward-only iterator over the characters of a UTF-8 encoded std::string.
// The iterator refers to the caller's string and does not copy it.
class BASE_I18N_EXPORT UTF8CharIterator {
public:
// Requires |str| to live as long as the UTF8CharIterator does.
explicit UTF8CharIterator(const std::string* str);
~UTF8CharIterator();
// Return the starting array index of the current character within the
// string.
int32_t array_pos() const { return array_pos_; }
// Return the logical index of the current character, independent of the
// number of bytes each character takes.
int32_t char_pos() const { return char_pos_; }
// Return the current char. Once end() is true no further character is
// decoded, so this keeps returning the last character read (0 for an empty
// string).
int32_t get() const { return char_; }
// Returns true if we're at the end of the string.
bool end() const { return array_pos_ == len_; }
// Advance to the next actual character. Returns false if we're at the
// end of the string.
bool Advance();
private:
// The string we're iterating over.
const uint8_t* str_;
// The length of the encoded string.
int32_t len_;
// Array index.
int32_t array_pos_;
// The next array index.
int32_t next_pos_;
// Character index.
int32_t char_pos_;
// The current character.
int32_t char_;
DISALLOW_COPY_AND_ASSIGN(UTF8CharIterator);
};
// Bidirectional iterator over the code points of a UTF-16 string. It can be
// moved (see the move constructor and move assignment below); copying is
// disabled through DISALLOW_COPY_AND_ASSIGN. The iterator refers to the
// caller's string and does not copy it.
class BASE_I18N_EXPORT UTF16CharIterator {
public:
// Requires |str| to live as long as the UTF16CharIterator does.
// Both constructors start at the beginning of the string; the second one
// iterates over the |str_len| code units starting at |str|.
explicit UTF16CharIterator(const string16* str);
UTF16CharIterator(const char16* str, size_t str_len);
UTF16CharIterator(UTF16CharIterator&& to_move);
~UTF16CharIterator();
UTF16CharIterator& operator=(UTF16CharIterator&& to_move);
// Returns an iterator starting on the unicode character at offset
// |array_index| into the string, or the previous array offset if
// |array_index| is the second half of a surrogate pair.
// |array_index| must not be greater than the string length.
static UTF16CharIterator LowerBound(const string16* str, size_t array_index);
static UTF16CharIterator LowerBound(const char16* str,
size_t str_len,
size_t array_index);
// Returns an iterator starting on the unicode character at offset
// |array_index| into the string, or the next offset if |array_index| is the
// second half of a surrogate pair.
// |array_index| must not be greater than the string length.
static UTF16CharIterator UpperBound(const string16* str, size_t array_index);
static UTF16CharIterator UpperBound(const char16* str,
size_t str_len,
size_t array_index);
// Return the starting array index of the current character within the
// string.
int32_t array_pos() const { return array_pos_; }
// Returns the offset in code points from the initial iterator position, which
// could be negative if Rewind() is called. The initial value is always zero,
// regardless of how the iterator is constructed.
int32_t char_offset() const { return char_offset_; }
// Returns the code point at the current position.
int32_t get() const { return char_; }
// Returns the code point (i.e. the full Unicode character, not half of a
// surrogate pair) following the current one. Should not be called if end() is
// true. If the current code point is the last one in the string, returns
// zero.
int32_t NextCodePoint() const;
// Returns the code point (i.e. the full Unicode character, not half of a
// surrogate pair) preceding the current one. Should not be called if start()
// is true (the implementation returns zero in that case).
int32_t PreviousCodePoint() const;
// Returns true if we're at the start of the string.
bool start() const { return array_pos_ == 0; }
// Returns true if we're at the end of the string.
bool end() const { return array_pos_ == len_; }
// Advances to the next actual character. Returns false if we're at the
// end of the string.
bool Advance();
// Moves to the previous actual character. Returns false if we're at the start
// of the string.
bool Rewind();
private:
// Construct an iterator whose current position is the array index
// |initial_pos|. The public constructors pass 0; LowerBound() and
// UpperBound() pass the adjusted index.
UTF16CharIterator(const string16* str, int32_t initial_pos);
UTF16CharIterator(const char16* str, size_t str_len, int32_t initial_pos);
// Fills in the current character we found and advances to the next
// character, updating all flags as necessary.
void ReadChar();
// The string we're iterating over.
const char16* str_;
// The length of the encoded string.
int32_t len_;
// Array index.
int32_t array_pos_;
// The next array index.
int32_t next_pos_;
// Character offset from the initial position of the iterator.
int32_t char_offset_;
// The current character.
int32_t char_;
DISALLOW_COPY_AND_ASSIGN(UTF16CharIterator);
};
} // namespace i18n
} // namespace base
#endif // BASE_I18N_CHAR_ITERATOR_H_

View file

@ -0,0 +1,42 @@
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/character_encoding.h"
#include "base/macros.h"
#include "third_party/icu/source/common/unicode/ucnv.h"
namespace base {
namespace {
// An array of all supported canonical encoding names.
// GetCanonicalEncodingNameByAliasName() compares its argument against these
// entries with an exact (case-sensitive) string comparison and returns the
// argument unchanged on a match, before consulting ICU.
const char* const kCanonicalEncodingNames[] = {
"Big5", "EUC-JP", "EUC-KR", "gb18030",
"GBK", "IBM866", "ISO-2022-JP", "ISO-8859-10",
"ISO-8859-13", "ISO-8859-14", "ISO-8859-15", "ISO-8859-16",
"ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5",
"ISO-8859-6", "ISO-8859-7", "ISO-8859-8", "ISO-8859-8-I",
"KOI8-R", "KOI8-U", "macintosh", "Shift_JIS",
"UTF-16LE", "UTF-8", "windows-1250", "windows-1251",
"windows-1252", "windows-1253", "windows-1254", "windows-1255",
"windows-1256", "windows-1257", "windows-1258", "windows-874"};
} // namespace
std::string GetCanonicalEncodingNameByAliasName(const std::string& alias_name) {
for (auto* encoding_name : kCanonicalEncodingNames) {
if (alias_name == encoding_name)
return alias_name;
}
static const char* kStandards[3] = {"HTML", "MIME", "IANA"};
for (auto* standard : kStandards) {
UErrorCode error_code = U_ZERO_ERROR;
const char* canonical_name =
ucnv_getStandardName(alias_name.c_str(), standard, &error_code);
if (U_SUCCESS(error_code) && canonical_name)
return canonical_name;
}
return std::string();
}
} // namespace base

View file

@ -0,0 +1,20 @@
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_CHARACTER_ENCODING_H_
#define BASE_I18N_CHARACTER_ENCODING_H_
#include <string>
#include "base/i18n/base_i18n_export.h"
namespace base {
// Return canonical encoding name according to the encoding alias name.
BASE_I18N_EXPORT std::string GetCanonicalEncodingNameByAliasName(
const std::string& alias_name);
} // namespace base
#endif // BASE_I18N_CHARACTER_ENCODING_H_

View file

@ -0,0 +1,40 @@
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/encoding_detection.h"
#include "build/build_config.h"
#include "third_party/ced/src/compact_enc_det/compact_enc_det.h"
// third_party/ced/src/util/encodings/encodings.h, which is included
// by the include above, undefs UNICODE because that is a macro used
// internally in ced. If we later in the same translation unit do
// anything related to Windows or Windows headers those will then use
// the ASCII versions which we do not want. To avoid that happening in
// jumbo builds, we redefine UNICODE again here.
#if defined(OS_WIN)
#define UNICODE 1
#endif // OS_WIN
namespace base {
bool DetectEncoding(const std::string& text, std::string* encoding) {
int consumed_bytes;
bool is_reliable;
Encoding enc = CompactEncDet::DetectEncoding(
text.c_str(), text.length(), nullptr, nullptr, nullptr,
UNKNOWN_ENCODING,
UNKNOWN_LANGUAGE,
CompactEncDet::QUERY_CORPUS, // plain text
false, // Include 7-bit encodings
&consumed_bytes,
&is_reliable);
if (enc == UNKNOWN_ENCODING)
return false;
*encoding = MimeEncodingName(enc);
return true;
}
} // namespace base

View file

@ -0,0 +1,21 @@
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_ENCODING_DETECTION_H_
#define BASE_I18N_ENCODING_DETECTION_H_
#include <string>
#include "base/compiler_specific.h"
#include "base/i18n/base_i18n_export.h"
namespace base {
// Detect encoding of |text| and put the name of encoding in |encoding|.
// Returns true on success.
BASE_I18N_EXPORT bool DetectEncoding(const std::string& text,
std::string* encoding) WARN_UNUSED_RESULT;
} // namespace base
#endif // BASE_I18N_ENCODING_DETECTION_H_

View file

@ -0,0 +1,179 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// File utilities that use the ICU library go in this file.
#include "base/i18n/file_util_icu.h"
#include <stdint.h>
#include <memory>
#include "base/files/file_path.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/i18n/string_compare.h"
#include "base/logging.h"
#include "base/macros.h"
#include "base/memory/singleton.h"
#include "base/strings/string_util.h"
#include "base/strings/sys_string_conversions.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "third_party/icu/source/common/unicode/uniset.h"
#include "third_party/icu/source/i18n/unicode/coll.h"
namespace base {
namespace i18n {
namespace {
// Holds the ICU character sets used to decide whether a file name contains
// characters that are illegal anywhere, or illegal at its first/last position.
// Instances are obtained through GetInstance().
class IllegalCharacters {
 public:
  // Returns the instance provided by Singleton<IllegalCharacters>.
  static IllegalCharacters* GetInstance() {
    return Singleton<IllegalCharacters>::get();
  }

  // True if |ucs4| is in the set of characters illegal anywhere in a name.
  bool DisallowedEverywhere(UChar32 ucs4) {
    return illegal_anywhere_->contains(ucs4) != 0;
  }

  // True if |ucs4| is in the set of characters illegal at either end of a
  // name.
  bool DisallowedLeadingOrTrailing(UChar32 ucs4) {
    return illegal_at_ends_->contains(ucs4) != 0;
  }

  // True if |s| is empty, or if it contains no character from the
  // "illegal anywhere" set and neither its first nor its last element is in
  // the "illegal at ends" set.
  bool IsAllowedName(const string16& s) {
    if (s.empty())
      return true;
    if (!illegal_anywhere_->containsNone(
            icu::UnicodeString(s.c_str(), s.size()))) {
      return false;
    }
    if (illegal_at_ends_->contains(*s.begin()))
      return false;
    return !illegal_at_ends_->contains(*s.rbegin());
  }

 private:
  friend class Singleton<IllegalCharacters>;
  friend struct DefaultSingletonTraits<IllegalCharacters>;

  IllegalCharacters();
  ~IllegalCharacters() = default;

  // Characters that are invalid at any position inside a filename.
  std::unique_ptr<icu::UnicodeSet> illegal_anywhere_;

  // Characters that are invalid as the first or last character of a filename.
  std::unique_ptr<icu::UnicodeSet> illegal_at_ends_;

  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
};
// Builds and freezes the two character sets.
IllegalCharacters::IllegalCharacters() {
  UErrorCode everywhere_status = U_ZERO_ERROR;
  UErrorCode ends_status = U_ZERO_ERROR;

  // Illegal anywhere: control characters, formatting characters, path
  // separators, and some printable ASCII characters regarded as dangerous
  // ('"*/:<>?\\'); non-characters are added further down.
  // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
  // Note that code points in the "Other, Format" (Cf) category are ignored on
  // HFS+ despite the ZERO_WIDTH_JOINER and ZERO_WIDTH_NON-JOINER being
  // legitimate in Arabic and some S/SE Asian scripts. In addition, tilde (~)
  // is also excluded due to the possibility of interacting poorly with short
  // filenames on VFAT. (Related to CVE-2014-9390)
  illegal_anywhere_.reset(new icu::UnicodeSet(
      UNICODE_STRING_SIMPLE("[[\"~*/:<>?\\\\|][:Cc:][:Cf:]]"),
      everywhere_status));

  // Illegal at either end: whitespace and the period.
  illegal_at_ends_.reset(new icu::UnicodeSet(
      UNICODE_STRING_SIMPLE("[[:WSpace:][.]]"), ends_status));

  DCHECK(U_SUCCESS(everywhere_status));
  DCHECK(U_SUCCESS(ends_status));

  // Add non-characters. If this ever becomes a performance bottleneck, do not
  // add these to |illegal_anywhere_| and instead change IsFilenameLegal() to
  // check |(ucs4 & 0xFFFEu) == 0xFFFEu|, in addition to calling
  // IsAllowedName().
  illegal_anywhere_->add(0xFDD0, 0xFDEF);
  // U+xxFFFE and U+xxFFFF of each of the 17 planes (0x00 through 0x10).
  for (int plane_base = 0; plane_base <= 0x100000; plane_base += 0x10000) {
    illegal_anywhere_->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
  }

  illegal_anywhere_->freeze();
  illegal_at_ends_->freeze();
}
} // namespace
bool IsFilenameLegal(const string16& file_name) {
return IllegalCharacters::GetInstance()->IsAllowedName(file_name);
}
// Walks |file_name| one code point at a time and overwrites, in place, every
// code point that is illegal anywhere, plus the first and/or last code point
// if it is illegal at the ends of a name, with the single |replace_char|.
// |replace_char| itself must be legal in both positions (DCHECKed).
void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
char replace_char) {
IllegalCharacters* illegal = IllegalCharacters::GetInstance();
DCHECK(!(illegal->DisallowedEverywhere(replace_char)));
DCHECK(!(illegal->DisallowedLeadingOrTrailing(replace_char)));
int cursor = 0; // The ICU macros expect an int.
while (cursor < static_cast<int>(file_name->size())) {
// Remember where this code point starts; the decoding macro below moves
// |cursor| to the start of the following one.
int char_begin = cursor;
// NOTE(review): |code_point| is unsigned, while ICU's decoding macros are
// expected to signal an ill-formed sequence with a negative value; how such
// input is classified by the set lookups below should be confirmed.
uint32_t code_point;
#if defined(OS_WIN)
// Windows uses UTF-16 encoding for filenames.
U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
code_point);
#elif defined(OS_POSIX) || defined(OS_FUCHSIA)
// Mac and Chrome OS use UTF-8 encoding for filenames.
// Linux doesn't actually define file system encoding. Try to parse as
// UTF-8.
U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
code_point);
#else
#error Unsupported platform
#endif
// The "ends" rule is applied only to the code point that starts at index 0
// and to the one that ends exactly at the end of the string.
if (illegal->DisallowedEverywhere(code_point) ||
((char_begin == 0 || cursor == static_cast<int>(file_name->length())) &&
illegal->DisallowedLeadingOrTrailing(code_point))) {
file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
// We just made the potentially multi-byte/word char into one that only
// takes one byte/word, so need to adjust the cursor to point to the next
// character again.
cursor = char_begin + 1;
}
}
}
bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
UErrorCode error_code = U_ZERO_ERROR;
// Use the default collator. The default locale should have been properly
// set by the time this constructor is called.
std::unique_ptr<icu::Collator> collator(
icu::Collator::createInstance(error_code));
DCHECK(U_SUCCESS(error_code));
// Make it case-sensitive.
collator->setStrength(icu::Collator::TERTIARY);
#if defined(OS_WIN)
return CompareString16WithCollator(*collator, AsStringPiece16(a.value()),
AsStringPiece16(b.value())) == UCOL_LESS;
#elif defined(OS_POSIX) || defined(OS_FUCHSIA)
// On linux, the file system encoding is not defined. We assume
// SysNativeMBToWide takes care of it.
return CompareString16WithCollator(
*collator, WideToUTF16(SysNativeMBToWide(a.value())),
WideToUTF16(SysNativeMBToWide(b.value()))) == UCOL_LESS;
#endif
}
// On Chrome OS, replaces the base name of |file_name| with its normalized
// UTF-8 form when the conversion succeeds and yields a non-empty string.
// Compiles to a no-op on every other platform.
void NormalizeFileNameEncoding(FilePath* file_name) {
#if defined(OS_CHROMEOS)
  std::string normalized_str;
  const bool converted = ConvertToUtf8AndNormalize(
      file_name->BaseName().value(), kCodepageUTF8, &normalized_str);
  // Keep the original name if it could not be converted or normalized to
  // something non-empty.
  if (!converted || normalized_str.empty())
    return;
  *file_name = file_name->DirName().Append(FilePath(normalized_str));
#endif
}
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,58 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_FILE_UTIL_ICU_H_
#define BASE_I18N_FILE_UTIL_ICU_H_
// File utilities that use the ICU library go in this file.
#include "base/files/file_path.h"
#include "base/i18n/base_i18n_export.h"
#include "base/strings/string16.h"
namespace base {
namespace i18n {
// Returns true if file_name does not have any illegal character. The input
// param has the same restriction as that for ReplaceIllegalCharactersInPath.
BASE_I18N_EXPORT bool IsFilenameLegal(const string16& file_name);
// Replaces characters in |file_name| that are illegal for file names with
// |replace_char|. |file_name| must not be a full or relative path, but just the
// file name component (since slashes are considered illegal). Any leading or
// trailing whitespace or periods in |file_name| is also replaced with the
// |replace_char|.
//
// Example:
// "bad:file*name?.txt" will be turned into "bad_file_name_.txt" when
// |replace_char| is '_'.
//
// Warning: Do not use this function as the sole means of sanitizing a filename.
// While the resulting filename itself would be legal, it doesn't necessarily
// mean that the file will behave safely. On Windows, certain reserved names
// refer to devices rather than files (E.g. LPT1), and some filenames could be
// interpreted as shell namespace extensions (E.g. Foo.{<GUID>}).
//
// On Windows the file name is parsed as UTF-16. On Mac and Chrome OS, which
// use UTF-8 for file names, it is parsed as UTF-8.
// On Linux, which does not define a file system encoding, the file name is
// also parsed as UTF-8.
// TODO(asanka): Move full filename sanitization logic here.
BASE_I18N_EXPORT void ReplaceIllegalCharactersInPath(
FilePath::StringType* file_name,
char replace_char);
// Compares two filenames using the current locale information. This can be
// used to sort directory listings. It behaves like "operator<" for use in
// std::sort.
BASE_I18N_EXPORT bool LocaleAwareCompareFilenames(const FilePath& a,
const FilePath& b);
// Calculates the canonical file-system representation of |file_name| base name.
// Modifies |file_name| in place. No-op if not on ChromeOS.
BASE_I18N_EXPORT void NormalizeFileNameEncoding(FilePath* file_name);
} // namespace i18n
} // namespace base
#endif // BASE_I18N_FILE_UTIL_ICU_H_

View file

@ -0,0 +1,13 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/i18n_constants.h"
namespace base {
// Code page name for ISO 8859-1 (Latin-1), as passed to the ICU-based
// conversion functions.
const char kCodepageLatin1[] = "ISO-8859-1";
// Code page name for UTF-8, as passed to the ICU-based conversion functions.
const char kCodepageUTF8[] = "UTF-8";
} // namespace base

View file

@ -0,0 +1,21 @@
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_I18N_CONSTANTS_H_
#define BASE_I18N_I18N_CONSTANTS_H_
#include "base/i18n/base_i18n_export.h"
namespace base {
// Names of codepages (charsets) understood by icu.
BASE_I18N_EXPORT extern const char kCodepageLatin1[]; // a.k.a. ISO 8859-1
BASE_I18N_EXPORT extern const char kCodepageUTF8[];
// The other possible options are UTF-16BE and UTF-16LE, but they are unused in
// Chromium as of this writing.
} // namespace base
#endif // BASE_I18N_I18N_CONSTANTS_H_

View file

@ -0,0 +1,224 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/icu_string_conversions.h"
#include <stddef.h>
#include <stdint.h>
#include <memory>
#include <vector>
#include "base/logging.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "third_party/icu/source/common/unicode/normalizer2.h"
#include "third_party/icu/source/common/unicode/ucnv.h"
#include "third_party/icu/source/common/unicode/ucnv_cb.h"
#include "third_party/icu/source/common/unicode/ucnv_err.h"
#include "third_party/icu/source/common/unicode/ustring.h"
namespace base {
namespace {
// ToUnicodeCallbackSubstitute() is based on UCNV_TO_U_CALLBACK_SUBSTITUTE
// in source/common/ucnv_err.c.
// Copyright (c) 1995-2006 International Business Machines Corporation
// and others
//
// All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, and/or
// sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, provided that the above copyright notice(s) and
// this permission notice appear in all copies of the Software and that
// both the above copyright notice(s) and this permission notice appear in
// supporting documentation.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
// INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
// OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
// OR PERFORMANCE OF THIS SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale, use
// or other dealings in this Software without prior written authorization
// of the copyright holder.
// ___________________________________________________________________________
//
// All trademarks and registered trademarks mentioned herein are the property
// of their respective owners.
// Converter-to-Unicode error callback that writes U+FFFD in place of the
// offending input. It substitutes when |context| is null, or when |context|
// points at the character 'i' and |reason| is UCNV_UNASSIGNED; in every other
// case |err| is left as the caller set it.
void ToUnicodeCallbackSubstitute(const void* context,
                                 UConverterToUnicodeArgs* to_args,
                                 const char* code_units,
                                 int32_t length,
                                 UConverterCallbackReason reason,
                                 UErrorCode* err) {
  static const UChar kReplacementChar = 0xFFFD;

  // Reasons past UCNV_IRREGULAR are the reset, close and clone notifications,
  // which are ignored.
  if (reason > UCNV_IRREGULAR)
    return;

  const bool should_substitute =
      context == nullptr ||
      (*(static_cast<const char*>(context)) == 'i' &&
       reason == UCNV_UNASSIGNED);
  // Otherwise the caller must have set the error code accordingly.
  if (!should_substitute)
    return;

  *err = U_ZERO_ERROR;
  ucnv_cbToUWriteUChars(to_args, &kReplacementChar, 1, 0, err);
}
// Encodes |src| with |converter| into |encoded|, handling unconvertible
// characters according to |on_error|. Closes |converter| before returning,
// whatever the outcome. Returns true on success; on failure |encoded| is
// emptied and false is returned.
bool ConvertFromUTF16(UConverter* converter,
                      base::StringPiece16 src,
                      OnStringConversionError::Type on_error,
                      std::string* encoded) {
  // Size the output for the converter's worst case.
  int encoded_max_length = UCNV_GET_MAX_BYTES_FOR_STRING(
      src.length(), ucnv_getMaxCharSize(converter));
  encoded->resize(encoded_max_length);

  UErrorCode status = U_ZERO_ERROR;

  // Install the error handler matching |on_error|. When converting from
  // Unicode, SUBSTITUTE is handled exactly like SKIP.
  if (on_error == OnStringConversionError::FAIL) {
    ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_STOP, nullptr,
                          nullptr, nullptr, &status);
  } else if (on_error == OnStringConversionError::SKIP ||
             on_error == OnStringConversionError::SUBSTITUTE) {
    ucnv_setFromUCallBack(converter, UCNV_FROM_U_CALLBACK_SKIP, nullptr,
                          nullptr, nullptr, &status);
  } else {
    NOTREACHED();
  }

  // The size returned by ucnv_fromUChars excludes the terminating null.
  int actual_size =
      ucnv_fromUChars(converter, &(*encoded)[0], encoded_max_length, src.data(),
                      src.length(), &status);
  encoded->resize(actual_size);
  ucnv_close(converter);

  const bool succeeded = U_SUCCESS(status);
  if (!succeeded)
    encoded->clear();  // Make sure the output is empty on error.
  return succeeded;
}
// Installs on |converter| the to-Unicode error callback that corresponds to
// |on_error|: stop for FAIL, skip for SKIP, and ToUnicodeCallbackSubstitute
// for SUBSTITUTE. The outcome of the installation is reported via |status|.
void SetUpErrorHandlerForToUChars(OnStringConversionError::Type on_error,
                                  UConverter* converter, UErrorCode* status) {
  if (on_error == OnStringConversionError::FAIL) {
    ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr,
                        nullptr, status);
  } else if (on_error == OnStringConversionError::SKIP) {
    ucnv_setToUCallBack(converter, UCNV_TO_U_CALLBACK_SKIP, nullptr, nullptr,
                        nullptr, status);
  } else if (on_error == OnStringConversionError::SUBSTITUTE) {
    ucnv_setToUCallBack(converter, ToUnicodeCallbackSubstitute, nullptr,
                        nullptr, nullptr, status);
  } else {
    NOTREACHED();
  }
}
} // namespace
// Codepage <-> Wide/UTF-16 ---------------------------------------------------
// Encodes |utf16| in the code page named |codepage_name| and stores the bytes
// in |encoded|, which is emptied first. Returns false if the converter cannot
// be opened or the conversion fails.
bool UTF16ToCodepage(base::StringPiece16 utf16,
                     const char* codepage_name,
                     OnStringConversionError::Type on_error,
                     std::string* encoded) {
  encoded->clear();

  UErrorCode open_status = U_ZERO_ERROR;
  UConverter* const converter = ucnv_open(codepage_name, &open_status);
  if (U_FAILURE(open_status))
    return false;

  // ConvertFromUTF16 closes |converter| for us.
  return ConvertFromUTF16(converter, utf16, on_error, encoded);
}
// Decodes |encoded|, interpreted in the code page named |codepage_name|, into
// |utf16|, which is emptied first. Invalid input is handled according to
// |on_error|. Returns false, with |utf16| empty, if the converter cannot be
// opened or the conversion fails.
bool CodepageToUTF16(base::StringPiece encoded,
                     const char* codepage_name,
                     OnStringConversionError::Type on_error,
                     string16* utf16) {
  utf16->clear();

  UErrorCode status = U_ZERO_ERROR;
  UConverter* converter = ucnv_open(codepage_name, &status);
  if (U_FAILURE(status))
    return false;

  // Even in the worst case, the maximum length in 2-byte units of UTF-16
  // output would be at most the same as the number of bytes in input. There
  // is no single-byte encoding in which a character is mapped to a
  // non-BMP character requiring two 2-byte units.
  //
  // Moreover, non-BMP characters in legacy multibyte encodings
  // (e.g. EUC-JP, GB18030) take at least 2 bytes. The only exceptions are
  // BOCU and SCSU, but we don't care about them.
  const size_t buffer_length = encoded.length() + 1;

  SetUpErrorHandlerForToUChars(on_error, converter, &status);
  std::unique_ptr<char16[]> buffer(new char16[buffer_length]);
  const int converted_length = ucnv_toUChars(
      converter, buffer.get(), static_cast<int>(buffer_length),
      encoded.data(), static_cast<int>(encoded.length()), &status);
  ucnv_close(converter);

  if (U_SUCCESS(status)) {
    utf16->assign(buffer.get(), converted_length);
    return true;
  }
  utf16->clear();  // Make sure the output is empty on error.
  return false;
}
// Decodes |text| from |charset|, normalizes it to NFC and stores the UTF-8
// form in |result|, which is emptied first. Returns false if decoding fails
// (undecodable input is an error, not skipped), if the NFC normalizer is
// unavailable, or if normalization reports an error.
bool ConvertToUtf8AndNormalize(base::StringPiece text,
                               const std::string& charset,
                               std::string* result) {
  result->clear();

  string16 utf16;
  const bool decoded = CodepageToUTF16(
      text, charset.c_str(), OnStringConversionError::FAIL, &utf16);
  if (!decoded)
    return false;

  UErrorCode status = U_ZERO_ERROR;
  const icu::Normalizer2* normalizer = icu::Normalizer2::getNFCInstance(status);
  DCHECK(U_SUCCESS(status));
  if (U_FAILURE(status))
    return false;

  const int32_t utf16_length = static_cast<int32_t>(utf16.length());
  icu::UnicodeString normalized(utf16.data(), utf16_length);

  // Find how much of the text the quick check accepts as is; only the
  // remainder has to go through normalization.
  const int32_t checked_prefix_length =
      normalizer->spanQuickCheckYes(normalized, status);
  if (checked_prefix_length < utf16_length) {
    icu::UnicodeString remainder(normalized, checked_prefix_length);
    normalized.truncate(checked_prefix_length);
    normalizer->normalizeSecondAndAppend(normalized, remainder, status);
  }
  if (U_FAILURE(status))
    return false;

  normalized.toUTF8String(*result);
  return true;
}
} // namespace base

View file

@ -0,0 +1,58 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_ICU_STRING_CONVERSIONS_H_
#define BASE_I18N_ICU_STRING_CONVERSIONS_H_
#include <string>
#include "base/i18n/base_i18n_export.h"
#include "base/i18n/i18n_constants.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
namespace base {
// Defines the error handling modes of UTF16ToCodepage and CodepageToUTF16.
// Scope holder for the error-handling policy accepted by UTF16ToCodepage()
// and CodepageToUTF16(). The class itself can never be instantiated.
class OnStringConversionError {
 public:
  // Not constructible: this type only exists to namespace |Type|.
  OnStringConversionError() = delete;

  enum Type {
    // Conversion reports failure and the output buffer is left empty.
    FAIL,

    // Offending characters are dropped; conversion carries on as though they
    // had never been present.
    SKIP,

    // Towards Unicode: each offending byte sequence becomes the Unicode
    // replacement character (U+FFFD). Away from Unicode: identical to SKIP.
    SUBSTITUTE,
  };
};
// Converts between UTF-16 strings and the encoding specified. If the
// encoding doesn't exist or the encoding fails (when on_error is FAIL),
// returns false.
//
// |codepage_name| is the encoding name handed to ICU's converter lookup.
// The output string is cleared before conversion starts.

// Encodes |utf16| into |codepage_name|, writing the bytes to |encoded|.
BASE_I18N_EXPORT bool UTF16ToCodepage(base::StringPiece16 utf16,
const char* codepage_name,
OnStringConversionError::Type on_error,
std::string* encoded);
// Decodes |encoded| (bytes in |codepage_name|) into |utf16|. On failure
// |utf16| is left empty.
BASE_I18N_EXPORT bool CodepageToUTF16(base::StringPiece encoded,
const char* codepage_name,
OnStringConversionError::Type on_error,
string16* utf16);
// Converts from any codepage to UTF-8 and ensures the resulting UTF-8 is
// normalized.
// Decoding uses the FAIL policy and normalization is NFC. Returns false, with
// |result| empty, if decoding or normalization fails.
BASE_I18N_EXPORT bool ConvertToUtf8AndNormalize(base::StringPiece text,
const std::string& charset,
std::string* result);
} // namespace base
#endif // BASE_I18N_ICU_STRING_CONVERSIONS_H_

View file

@ -0,0 +1,612 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/icu_util.h"
#if defined(OS_WIN)
#include <windows.h>
#endif
#include <string>
#include "base/debug/alias.h"
#include "base/environment.h"
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/files/memory_mapped_file.h"
#include "base/logging.h"
#include "base/metrics/histogram_functions.h"
#include "base/metrics/metrics_hashes.h"
#include "base/path_service.h"
#include "base/strings/string_util.h"
#include "base/strings/sys_string_conversions.h"
#include "build/build_config.h"
#include "build/chromecast_buildflags.h"
#include "third_party/icu/source/common/unicode/putil.h"
#include "third_party/icu/source/common/unicode/udata.h"
#include "third_party/icu/source/common/unicode/utrace.h"
#if defined(OS_ANDROID)
#include "base/android/apk_assets.h"
#include "base/android/timezone_utils.h"
#endif
#if defined(OS_IOS)
#include "base/ios/ios_util.h"
#endif
#if defined(OS_MACOSX)
#include "base/mac/foundation_util.h"
#endif
#if defined(OS_FUCHSIA)
#include "base/fuchsia/intl_profile_watcher.h"
#endif
#if defined(OS_ANDROID) || defined(OS_FUCHSIA)
#include "third_party/icu/source/common/unicode/unistr.h"
#endif
#if defined(OS_ANDROID) || defined(OS_FUCHSIA) || \
(defined(OS_LINUX) && !BUILDFLAG(IS_CHROMECAST))
#include "third_party/icu/source/i18n/unicode/timezone.h"
#endif
namespace base {
namespace i18n {
#if !defined(OS_NACL)
namespace {
#if DCHECK_IS_ON()
// Assert that we are not called more than once. Even though calling this
// function isn't harmful (ICU can handle it), being called twice probably
// indicates a programming error.
// |g_check_called_once| enables the check (tests can switch it off);
// |g_called_once| records that an initialization entry point has run.
bool g_check_called_once = true;
bool g_called_once = false;
#endif // DCHECK_IS_ON()
#if (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE)
// To debug http://crbug.com/445616.
// These record the outcome of the most recent data-file open/load attempt so
// the values can be copied onto the stack before a CHECK failure.
int g_debug_icu_last_error;
int g_debug_icu_load;
int g_debug_icu_pf_error_details;
int g_debug_icu_pf_last_error;
#if defined(OS_WIN)
wchar_t g_debug_icu_pf_filename[_MAX_PATH];
#endif // OS_WIN
// Use an unversioned file name to simplify an ICU version update down the
// road. No need to change the filename in multiple places (gyp files, windows
// build pkg configurations, etc). 'l' stands for Little Endian.
// This variable is exported through the header file.
// NOTE(review): as written here the constant sits inside the unnamed
// namespace, so the "exported" remark above may be stale -- confirm.
const char kIcuDataFileName[] = "icudtl.dat";
const char kIcuExtraDataFileName[] = "icudtl_extra.dat";
// Time zone data loading.
// For now, only Fuchsia has a meaningful use case for this feature, so it is
// only implemented for OS_FUCHSIA.
#if defined(OS_FUCHSIA)
// The environment variable used to point the ICU data loader to the directory
// containing time zone data. This is available from ICU version 54. The env
// variable approach is antiquated by today's standards (2019), but is the
// recommended way to configure ICU.
//
// See for details: http://userguide.icu-project.org/datetime/timezone
const char kIcuTimeZoneEnvVariable[] = "ICU_TIMEZONE_FILES_DIR";
// We assume that Fuchsia will provide time zone data at this path for Chromium
// to load, and that the path will be timely updated when Fuchsia needs to
// uprev the ICU version it is using. There are unit tests that will fail at
// Fuchsia roll time in case either Chromium or Fuchsia get upgraded to
// mutually incompatible ICU versions. That should be enough to alert the
// developers of the need to keep ICU library versions in Chromium and Fuchsia
// in reasonable sync.
const char kIcuTimeZoneDataDir[] = "/config/data/tzdata/icu/44/le";
#endif // defined(OS_FUCHSIA)
#if defined(OS_ANDROID)
// Prefix prepended to the data file name when looking it up as an APK asset.
const char kAssetsPathPrefix[] = "assets/";
#endif // defined(OS_ANDROID)
// File handle intentionally never closed. Not using File here because its
// Windows implementation guards against two instances owning the same
// PlatformFile (which we allow since we know it is never freed).
// The |*_pf| / |*_region| pairs describe the main and the optional "extra"
// data file; the |*_mapped_file| pointers hold the mappings handed to ICU.
PlatformFile g_icudtl_pf = kInvalidPlatformFile;
MemoryMappedFile* g_icudtl_mapped_file = nullptr;
MemoryMappedFile::Region g_icudtl_region;
PlatformFile g_icudtl_extra_pf = kInvalidPlatformFile;
MemoryMappedFile* g_icudtl_extra_mapped_file = nullptr;
MemoryMappedFile::Region g_icudtl_extra_region;
#if defined(OS_FUCHSIA)
// The directory from which the ICU data loader will be configured to load time
// zone data. It is only changed by SetIcuTimeZoneDataDirForTesting().
const char* g_icu_time_zone_data_dir = kIcuTimeZoneDataDir;
#endif // defined(OS_FUCHSIA)
// Result of opening an ICU data file: the platform file handle together with
// the region of that file which holds the ICU data.
struct PfRegion {
  // Starts out as the invalid handle. OpenIcuDataFile() only assigns |pf|
  // when the file was opened successfully, so without this default a failed
  // open would hand back a value-initialized handle (0 / null), which callers
  // comparing against kInvalidPlatformFile would mistake for a usable file.
  PlatformFile pf = kInvalidPlatformFile;
  MemoryMappedFile::Region region;
};
// Locates and opens the ICU data file called |filename|.
//
// Lookup order:
//  - Android: the APK asset "assets/<filename>" is tried first.
//  - Non-Mac platforms: <DIR_ASSETS>/<filename> on disk.
//  - Mac/iOS: the framework bundle's resources (iOS may override the path).
//
// Returns nullptr when no candidate path can be determined. Otherwise returns
// a PfRegion; note that if the file at the chosen path cannot be opened the
// PfRegion is still returned, with |pf| never assigned by this function (it
// keeps whatever value it had after construction).
std::unique_ptr<PfRegion> OpenIcuDataFile(const std::string& filename) {
auto result = std::make_unique<PfRegion>();
#if defined(OS_ANDROID)
result->pf =
android::OpenApkAsset(kAssetsPathPrefix + filename, &result->region);
if (result->pf != -1) {
return result;
}
#endif // defined(OS_ANDROID)
// For unit tests, data file is located on disk, so try there as a fallback.
#if !defined(OS_MACOSX)
FilePath data_path;
if (!PathService::Get(DIR_ASSETS, &data_path)) {
LOG(ERROR) << "Can't find " << filename;
return nullptr;
}
#if defined(OS_WIN)
// TODO(brucedawson): http://crbug.com/445616
// Keep a stack copy of the directory path so it shows up in crash dumps.
wchar_t tmp_buffer[_MAX_PATH] = {0};
wcscpy_s(tmp_buffer, data_path.value().c_str());
debug::Alias(tmp_buffer);
#endif
data_path = data_path.AppendASCII(filename);
#if defined(OS_WIN)
// TODO(brucedawson): http://crbug.com/445616
// Same as above, for the full path including the file name.
wchar_t tmp_buffer2[_MAX_PATH] = {0};
wcscpy_s(tmp_buffer2, data_path.value().c_str());
debug::Alias(tmp_buffer2);
#endif
#else // !defined(OS_MACOSX)
// Assume it is in the framework bundle's Resources directory.
ScopedCFTypeRef<CFStringRef> data_file_name(SysUTF8ToCFStringRef(filename));
FilePath data_path = mac::PathForFrameworkBundleResource(data_file_name);
#if defined(OS_IOS)
FilePath override_data_path = ios::FilePathOfEmbeddedICU();
if (!override_data_path.empty()) {
data_path = override_data_path;
}
#endif // defined(OS_IOS)
if (data_path.empty()) {
LOG(ERROR) << filename << " not found in bundle";
return nullptr;
}
#endif // !defined(OS_MACOSX)
File file(data_path, File::FLAG_OPEN | File::FLAG_READ);
if (file.IsValid()) {
// TODO(brucedawson): http://crbug.com/445616.
// Success: clear the debugging state left by any earlier failed attempt.
g_debug_icu_pf_last_error = 0;
g_debug_icu_pf_error_details = 0;
#if defined(OS_WIN)
g_debug_icu_pf_filename[0] = 0;
#endif // OS_WIN
// Ownership of the handle moves out of |file|, so it stays open after
// |file| goes out of scope.
result->pf = file.TakePlatformFile();
result->region = MemoryMappedFile::Region::kWholeFile;
}
#if defined(OS_WIN)
else {
// TODO(brucedawson): http://crbug.com/445616.
// Failure: remember why and which path, for later crash-dump inspection.
g_debug_icu_pf_last_error = ::GetLastError();
g_debug_icu_pf_error_details = file.error_details();
wcscpy_s(g_debug_icu_pf_filename, data_path.value().c_str());
}
#endif // OS_WIN
return result;
}
// Opens the main ICU data file the first time it is needed and caches the
// handle and region in |g_icudtl_pf| / |g_icudtl_region|. Does nothing when a
// handle is already cached or when OpenIcuDataFile() yields no result.
void LazyOpenIcuDataFile() {
  const bool already_opened = g_icudtl_pf != kInvalidPlatformFile;
  if (already_opened)
    return;

  if (auto opened = OpenIcuDataFile(kIcuDataFileName)) {
    g_icudtl_pf = opened->pf;
    g_icudtl_region = opened->region;
  }
}
// Points ICU at an external time zone data directory where that is supported.
// Only Fuchsia does anything here; on every other platform this is a no-op.
void InitializeExternalTimeZoneData() {
#if defined(OS_FUCHSIA)
  if (base::DirectoryExists(base::FilePath(g_icu_time_zone_data_dir))) {
    // Override the location ICU reads time zone data from via its
    // environment variable. Loading may still fail later on if the directory
    // turns out to be empty or to contain invalid data.
    base::Environment::Create()->SetVar(kIcuTimeZoneEnvVariable,
                                        g_icu_time_zone_data_dir);
    return;
  }

  // TODO(https://crbug.com/1061262): Make this FATAL unless expected.
  PLOG(WARNING) << "Could not open: '" << g_icu_time_zone_data_dir
                << "'. Using built-in timezone database";
#endif  // defined(OS_FUCHSIA)
}
// Memory-maps |data_region| of |data_fd| and registers the mapping with ICU
// as its common data.
//
// The mapping is returned through |out_mapped_data_file|; the ICU status of
// the registration step is written to |out_error_code| (it is only written
// once the mapping succeeded).
//
// Return value (kept numeric to help debug http://crbug.com/445616):
//   0 - success
//   1 - |data_fd| is the invalid handle
//   2 - the file could not be memory-mapped
//   3 - ICU rejected the data
int LoadIcuData(PlatformFile data_fd,
                const MemoryMappedFile::Region& data_region,
                std::unique_ptr<MemoryMappedFile>* out_mapped_data_file,
                UErrorCode* out_error_code) {
  InitializeExternalTimeZoneData();

  if (data_fd == kInvalidPlatformFile) {
    LOG(ERROR) << "Invalid file descriptor to ICU data received.";
    return 1;
  }

  *out_mapped_data_file = std::make_unique<MemoryMappedFile>();
  MemoryMappedFile* const mapping = out_mapped_data_file->get();
  if (!mapping->Initialize(File(data_fd), data_region)) {
    LOG(ERROR) << "Couldn't mmap icu data file";
    return 2;
  }

  *out_error_code = U_ZERO_ERROR;
  udata_setCommonData(const_cast<uint8_t*>(mapping->data()), out_error_code);
  if (U_SUCCESS(*out_error_code))
    return 0;

  LOG(ERROR) << "Failed to initialize ICU with data file: "
             << u_errorName(*out_error_code);
  return 3;
}
// Loads the main ICU data from |data_fd| / |data_region| and restricts ICU to
// packaged data only. Returns true straight away if a mapping has already
// been installed. The LoadIcuData() result code is kept in |g_debug_icu_load|
// to help debug http://crbug.com/445616.
bool InitializeICUWithFileDescriptorInternal(
    PlatformFile data_fd,
    const MemoryMappedFile::Region& data_region) {
  // Tests may call this more than once; the first mapping wins.
  if (g_icudtl_mapped_file != nullptr) {
    g_debug_icu_load = 0;
    return true;
  }

  std::unique_ptr<MemoryMappedFile> mapping;
  UErrorCode icu_error;
  g_debug_icu_load = LoadIcuData(data_fd, data_region, &mapping, &icu_error);
  switch (g_debug_icu_load) {
    case 1:  // Invalid handle.
    case 2:  // mmap failed.
      return false;
    case 3:  // ICU rejected the data: remember its status code.
      g_debug_icu_last_error = icu_error;
      break;
    default:
      break;
  }

  // The mapping is kept from here on (including in the "rejected" case) and
  // is never deleted by this function.
  g_icudtl_mapped_file = mapping.release();

  // Never try to load ICU data from files.
  udata_setFileAccess(UDATA_ONLY_PACKAGES, &icu_error);
  return U_SUCCESS(icu_error);
}
// Opens the main ICU data file (if not already open) and initializes ICU from
// it. Returns whether initialization succeeded; on Windows a failure is fatal
// (CHECK) after the debugging state has been copied onto the stack.
bool InitializeICUFromDataFile() {
// If the ICU data directory is set, ICU won't actually load the data until
// it is needed. This can fail if the process is sandboxed at that time.
// Instead, we map the file in and hand off the data so the sandbox won't
// cause any problems.
LazyOpenIcuDataFile();
bool result =
InitializeICUWithFileDescriptorInternal(g_icudtl_pf, g_icudtl_region);
#if defined(OS_WIN)
// Copy the debugging globals into locals and alias them so they are kept on
// the stack for inspection when the CHECK below fires.
int debug_icu_load = g_debug_icu_load;
debug::Alias(&debug_icu_load);
int debug_icu_last_error = g_debug_icu_last_error;
debug::Alias(&debug_icu_last_error);
int debug_icu_pf_last_error = g_debug_icu_pf_last_error;
debug::Alias(&debug_icu_pf_last_error);
int debug_icu_pf_error_details = g_debug_icu_pf_error_details;
debug::Alias(&debug_icu_pf_error_details);
wchar_t debug_icu_pf_filename[_MAX_PATH] = {0};
wcscpy_s(debug_icu_pf_filename, g_debug_icu_pf_filename);
debug::Alias(&debug_icu_pf_filename);
CHECK(result); // TODO(brucedawson): http://crbug.com/445616
#endif // defined(OS_WIN)
return result;
}
#endif // (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE)
// Explicitly initializes ICU's default time zone if necessary.
// On some platforms, the time zone must be explicitly initialized rather
// than relying on ICU's internal initialization.
// Platforms not covered by any branch below compile to an empty function.
void InitializeIcuTimeZone() {
#if defined(OS_ANDROID)
// On Android, we can't leave it up to ICU to set the default time zone
// because ICU's time zone detection does not work in many time zones (e.g.
// Australia/Sydney, Asia/Seoul, Europe/Paris ). Use JNI to detect the host
// time zone and set the ICU default time zone accordingly in advance of
// actual use. See crbug.com/722821 and
// https://ssl.icu-project.org/trac/ticket/13208 .
string16 zone_id = android::GetDefaultTimeZoneId();
icu::TimeZone::adoptDefault(icu::TimeZone::createTimeZone(
icu::UnicodeString(FALSE, zone_id.data(), zone_id.length())));
#elif defined(OS_FUCHSIA)
// The platform-specific mechanisms used by ICU's detectHostTimeZone() to
// determine the default time zone will not work on Fuchsia. Therefore,
// proactively set the default system.
// This is also required by TimeZoneMonitorFuchsia::ProfileMayHaveChanged(),
// which uses the current default to detect whether the time zone changed in
// the new profile.
// If the system time zone cannot be obtained or is not understood by ICU,
// the "unknown" time zone will be returned by createTimeZone() and used.
std::string zone_id =
fuchsia::IntlProfileWatcher::GetPrimaryTimeZoneIdForIcuInitialization();
icu::TimeZone::adoptDefault(
icu::TimeZone::createTimeZone(icu::UnicodeString::fromUTF8(zone_id)));
#elif defined(OS_LINUX) && !BUILDFLAG(IS_CHROMECAST)
// To respond to the time zone change properly, the default time zone
// cache in ICU has to be populated on starting up.
// See TimeZoneMonitorLinux::NotifyClientsFromImpl().
// The returned object itself is not needed and is released right away.
std::unique_ptr<icu::TimeZone> zone(icu::TimeZone::createDefault());
#endif // defined(OS_ANDROID)
}
// Histogram names used by the ICU trace callbacks below: one for opened ICU
// data files and one for created ICU objects.
const char kICUDataFile[] = "ICU.DataFile";
const char kICUCreateInstance[] = "ICU.CreateInstance";
// Buckets recorded under |kICUCreateInstance|, one per kind of ICU object
// whose creation is traced.
// NOTE(review): these values are written to a histogram, so they presumably
// must stay stable (append only, never renumber) -- confirm against the
// histogram definition.
enum class ICUCreateInstance {
kCharacterBreakIterator = 0,
kWordBreakIterator = 1,
kLineBreakIterator = 2,
kLineBreakIteratorTypeLoose = 3,
kLineBreakIteratorTypeNormal = 4,
kLineBreakIteratorTypeStrict = 5,
kSentenceBreakIterator = 6,
kTitleBreakIterator = 7,
kThaiBreakEngine = 8,
kLaoBreakEngine = 9,
kBurmeseBreakEngine = 10,
kKhmerBreakEngine = 11,
kChineseJapaneseBreakEngine = 12,
// Must alias the last real enumerator (used by UmaHistogramEnumeration).
kMaxValue = kChineseJapaneseBreakEngine
};
// Callback functions to report the opening of ICU data files, and the
// creation of key objects, to UMA. This helps us understand which built-in
// ICU data files are rarely used on users' machines and how ICU usage is
// distributed.

// Function-entry trace callback: records the creation of character, word,
// sentence and title break iterators. Every other traced function is ignored.
static void U_CALLCONV TraceICUEntry(const void*, int32_t fn_number) {
  ICUCreateInstance created;
  if (fn_number == UTRACE_UBRK_CREATE_CHARACTER) {
    created = ICUCreateInstance::kCharacterBreakIterator;
  } else if (fn_number == UTRACE_UBRK_CREATE_SENTENCE) {
    created = ICUCreateInstance::kSentenceBreakIterator;
  } else if (fn_number == UTRACE_UBRK_CREATE_TITLE) {
    created = ICUCreateInstance::kTitleBreakIterator;
  } else if (fn_number == UTRACE_UBRK_CREATE_WORD) {
    created = ICUCreateInstance::kWordBreakIterator;
  } else {
    return;
  }
  base::UmaHistogramEnumeration(kICUCreateInstance, created);
}
// Data trace callback: reports to UMA which ICU data files get opened and
// which line break iterators / dictionary break engines get created.
//
// |fn_number| identifies the traced ICU function; |args| carries that trace
// point's arguments (for all cases handled here the first argument is read as
// a C string). |context|, |level| and |fmt| are unused. Trace points other
// than the three handled below are ignored.
//
// |args| is intentionally never passed to va_end() in this function: va_end()
// must be invoked by the function that invoked va_start(), i.e. by whoever
// built the va_list before calling this callback.
static void U_CALLCONV TraceICUData(const void* context,
                                    int32_t fn_number,
                                    int32_t level,
                                    const char* fmt,
                                    va_list args) {
  switch (fn_number) {
    case UTRACE_UDATA_DATA_FILE: {
      std::string icu_data_file_name(va_arg(args, const char*));
      // Skip icu version specified prefix if exist.
      // path is prefixed with icu version prefix such as "icudt65l-".
      // Histogram only the part after the -.
      if (icu_data_file_name.find("icudt") == 0) {
        size_t dash = icu_data_file_name.find('-');
        if (dash != std::string::npos) {
          icu_data_file_name = icu_data_file_name.substr(dash + 1);
        }
      }
      // UmaHistogramSparse should track less than 100 values.
      // We currently have about total 55 built-in data files inside ICU
      // so it fit the UmaHistogramSparse usage.
      int hash = base::HashMetricName(icu_data_file_name);
      base::UmaHistogramSparse(kICUDataFile, hash);
      return;
    }
    case UTRACE_UBRK_CREATE_LINE: {
      // The line break type is identified by its first character only; the
      // DCHECKs verify the full name in debug builds.
      const char* lb_type = va_arg(args, const char*);
      ICUCreateInstance value;
      switch (lb_type[0]) {
        case '\0':
          value = ICUCreateInstance::kLineBreakIterator;
          break;
        case 'l':
          DCHECK(strcmp(lb_type, "loose") == 0);
          value = ICUCreateInstance::kLineBreakIteratorTypeLoose;
          break;
        case 'n':
          DCHECK(strcmp(lb_type, "normal") == 0);
          value = ICUCreateInstance::kLineBreakIteratorTypeNormal;
          break;
        case 's':
          DCHECK(strcmp(lb_type, "strict") == 0);
          value = ICUCreateInstance::kLineBreakIteratorTypeStrict;
          break;
        default:
          return;
      }
      base::UmaHistogramEnumeration(kICUCreateInstance, value);
      return;
    }
    case UTRACE_UBRK_CREATE_BREAK_ENGINE: {
      // Same first-character dispatch, this time on the script code.
      const char* script = va_arg(args, const char*);
      ICUCreateInstance value;
      switch (script[0]) {
        case 'H':
          DCHECK(strcmp(script, "Hani") == 0);
          value = ICUCreateInstance::kChineseJapaneseBreakEngine;
          break;
        case 'K':
          DCHECK(strcmp(script, "Khmr") == 0);
          value = ICUCreateInstance::kKhmerBreakEngine;
          break;
        case 'L':
          DCHECK(strcmp(script, "Laoo") == 0);
          value = ICUCreateInstance::kLaoBreakEngine;
          break;
        case 'M':
          DCHECK(strcmp(script, "Mymr") == 0);
          value = ICUCreateInstance::kBurmeseBreakEngine;
          break;
        case 'T':
          DCHECK(strcmp(script, "Thai") == 0);
          value = ICUCreateInstance::kThaiBreakEngine;
          break;
        default:
          return;
      }
      base::UmaHistogramEnumeration(kICUCreateInstance, value);
      return;
    }
  }
}
// Initialization shared by every public InitializeICU* entry point,
// regardless of how the ICU data was supplied. It is meant to run at the end
// of (the last function in the sequence of) those entry points. Always
// returns true.
bool DoCommonInitialization() {
  // TODO(jungshik): Some callers do not care about tz at all. If necessary,
  // add a boolean argument to this function to init the default tz only
  // when requested.
  InitializeIcuTimeZone();

  // Install the UMA-reporting trace callbacks (no context, no exit callback)
  // and turn tracing up to verbose.
  utrace_setFunctions(nullptr, TraceICUEntry, nullptr, TraceICUData);
  utrace_setLevel(UTRACE_VERBOSE);
  return true;
}
} // namespace
#if (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE)
// Loads the extra ICU data from an already-open file handle. Fails (returns
// false) if the main data file handle has already been set, since the extra
// data has to be registered first, or if loading the data fails.
bool InitializeExtraICUWithFileDescriptor(
    PlatformFile data_fd,
    const MemoryMappedFile::Region& data_region) {
  // Must call InitializeExtraICUWithFileDescriptor() before
  // InitializeICUWithFileDescriptor().
  const bool main_data_already_opened = g_icudtl_pf != kInvalidPlatformFile;
  if (main_data_already_opened)
    return false;

  std::unique_ptr<MemoryMappedFile> extra_mapping;
  UErrorCode ignored_error;
  const int load_result =
      LoadIcuData(data_fd, data_region, &extra_mapping, &ignored_error);
  if (load_result == 0) {
    // Keep the mapping for good: ICU now refers to it.
    g_icudtl_extra_mapped_file = extra_mapping.release();
    return true;
  }
  return false;
}
// Initializes ICU from an already-open data file handle, then runs the
// common initialization. Returns false if loading the data fails. In DCHECK
// builds, asserts that no initialization entry point ran before (unless
// tests disabled that check).
bool InitializeICUWithFileDescriptor(
    PlatformFile data_fd,
    const MemoryMappedFile::Region& data_region) {
#if DCHECK_IS_ON()
  DCHECK(!g_check_called_once || !g_called_once);
  g_called_once = true;
#endif
  return InitializeICUWithFileDescriptorInternal(data_fd, data_region) &&
         DoCommonInitialization();
}
// Returns the cached handle of the main ICU data file and stores the
// associated region in |out_region|. Crashes (CHECK) if no valid handle has
// been cached yet.
PlatformFile GetIcuDataFileHandle(MemoryMappedFile::Region* out_region) {
CHECK_NE(g_icudtl_pf, kInvalidPlatformFile);
*out_region = g_icudtl_region;
return g_icudtl_pf;
}
// Returns the cached handle of the extra ICU data file, or
// kInvalidPlatformFile when none has been cached. |out_region| is only
// written when a valid handle is returned.
PlatformFile GetIcuExtraDataFileHandle(MemoryMappedFile::Region* out_region) {
  if (g_icudtl_extra_pf != kInvalidPlatformFile)
    *out_region = g_icudtl_extra_region;
  return g_icudtl_extra_pf;
}
bool InitializeExtraICU() {
if (g_icudtl_pf != kInvalidPlatformFile) {
// Must call InitializeExtraICU() before InitializeICU().
return false;
}
auto pf_region = OpenIcuDataFile(kIcuExtraDataFileName);
if (!pf_region) {
return false;
}
g_icudtl_extra_pf = pf_region->pf;
g_icudtl_extra_region = pf_region->region;
std::unique_ptr<MemoryMappedFile> mapped_file;
UErrorCode err;
if (LoadIcuData(g_icudtl_extra_pf, g_icudtl_extra_region, &mapped_file,
&err) != 0) {
return false;
}
g_icudtl_extra_mapped_file = mapped_file.release();
return true;
}
// Test-only: puts the cached file handles, mapping pointers and (on Fuchsia)
// the time zone data directory back to their initial values. The mapping
// pointers are merely forgotten, not deleted, and the cached regions are left
// as they are.
void ResetGlobalsForTesting() {
#if defined(OS_FUCHSIA)
  g_icu_time_zone_data_dir = kIcuTimeZoneDataDir;
#endif  // defined(OS_FUCHSIA)

  // Extra data file state.
  g_icudtl_extra_mapped_file = nullptr;
  g_icudtl_extra_pf = kInvalidPlatformFile;

  // Main data file state.
  g_icudtl_mapped_file = nullptr;
  g_icudtl_pf = kInvalidPlatformFile;
}
#if defined(OS_FUCHSIA)
// Test-only: overrides the directory ICU is pointed at for external time zone
// data. Only the pointer is stored (no copy is made), hence:
// |dir| must remain valid until ResetGlobalsForTesting() is called.
void SetIcuTimeZoneDataDirForTesting(const char* dir) {
g_icu_time_zone_data_dir = dir;
}
#endif // defined(OS_FUCHSIA)
#endif // (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE)
// Main entry point: makes ICU's data available according to the build-time
// ICU_UTIL_DATA_IMPL setting and then runs the common initialization.
// Returns false if the data file based initialization fails. In DCHECK builds
// it asserts that no initialization entry point ran before (unless tests
// disabled that check).
bool InitializeICU() {
#if DCHECK_IS_ON()
DCHECK(!g_check_called_once || !g_called_once);
g_called_once = true;
#endif
#if (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_STATIC)
// The ICU data is statically linked.
// Nothing to load in that configuration.
#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE)
if (!InitializeICUFromDataFile())
return false;
#else
#error Unsupported ICU_UTIL_DATA_IMPL value
#endif // (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_STATIC)
return DoCommonInitialization();
}
// Test-only: disables the DCHECK that guards against the initialization
// entry points being called more than once. Has no effect when DCHECKs are
// compiled out.
void AllowMultipleInitializeCallsForTesting() {
#if DCHECK_IS_ON()
g_check_called_once = false;
#endif
}
#endif // !defined(OS_NACL)
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,66 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_ICU_UTIL_H_
#define BASE_I18N_ICU_UTIL_H_
#include <stdint.h>
#include "base/files/memory_mapped_file.h"
#include "base/i18n/base_i18n_export.h"
#include "build/build_config.h"
#define ICU_UTIL_DATA_FILE 0
#define ICU_UTIL_DATA_STATIC 1
namespace base {
namespace i18n {
#if !defined(OS_NACL)
// Call this function to load ICU's data tables for the current process. This
// function should be called before ICU is used.
// Returns false if the data could not be loaded.
BASE_I18N_EXPORT bool InitializeICU();
#if ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE
// Loads ICU's extra data tables from disk for the current process. If used must
// be called before InitializeICU().
// Returns false if called too late or if the extra data could not be loaded.
BASE_I18N_EXPORT bool InitializeExtraICU();
// Returns the PlatformFile and Region that was initialized by InitializeICU()
// or InitializeExtraICU(). Use with InitializeICUWithFileDescriptor() or
// InitializeExtraICUWithFileDescriptor().
// GetIcuDataFileHandle() CHECK-fails if the main data file has not been
// opened; GetIcuExtraDataFileHandle() returns kInvalidPlatformFile (and leaves
// |out_region| untouched) if the extra data file has not been opened.
BASE_I18N_EXPORT PlatformFile GetIcuDataFileHandle(
MemoryMappedFile::Region* out_region);
BASE_I18N_EXPORT PlatformFile
GetIcuExtraDataFileHandle(MemoryMappedFile::Region* out_region);
// Loads ICU data file from file descriptor passed by browser process to
// initialize ICU in render processes.
BASE_I18N_EXPORT bool InitializeICUWithFileDescriptor(
PlatformFile data_fd,
const MemoryMappedFile::Region& data_region);
// Loads ICU extra data file from file descriptor passed by browser process to
// initialize ICU in render processes. If used must be called before
// InitializeICUWithFileDescriptor().
BASE_I18N_EXPORT bool InitializeExtraICUWithFileDescriptor(
PlatformFile data_fd,
const MemoryMappedFile::Region& data_region);
// Test-only: resets the cached data file handles and mapping pointers (and,
// on Fuchsia, the time zone data directory override) to their initial values.
BASE_I18N_EXPORT void ResetGlobalsForTesting();
#if defined(OS_FUCHSIA)
// Overrides the directory used by ICU for external time zone data.
// |dir| is stored by pointer and must stay valid until
// ResetGlobalsForTesting() is called.
BASE_I18N_EXPORT void SetIcuTimeZoneDataDirForTesting(const char* dir);
#endif // defined(OS_FUCHSIA)
#endif // ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE
// In a test binary, initialize functions might be called twice.
// Calling this turns off the debug-build assertion against that.
BASE_I18N_EXPORT void AllowMultipleInitializeCallsForTesting();
#endif // !defined(OS_NACL)
} // namespace i18n
} // namespace base
#endif // BASE_I18N_ICU_UTIL_H_

View file

@ -0,0 +1,142 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/message_formatter.h"
#include "base/i18n/unicodestring.h"
#include "base/logging.h"
#include "base/numerics/safe_conversions.h"
#include "base/time/time.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/utypes.h"
#include "third_party/icu/source/i18n/unicode/fmtable.h"
#include "third_party/icu/source/i18n/unicode/msgfmt.h"
using icu::UnicodeString;
namespace base {
namespace i18n {
namespace {
// Builds an ICU UnicodeString from the UTF-8 bytes viewed by |str|. The
// length is range-checked when narrowed to ICU's 32-bit size type.
UnicodeString UnicodeStringFromStringPiece(StringPiece str) {
  const int32_t byte_count = base::checked_cast<int32_t>(str.size());
  const icu::StringPiece utf8_view(str.data(), byte_count);
  return UnicodeString::fromUTF8(utf8_view);
}
} // anonymous namespace
namespace internal {
// An argument without a value: |formattable| stays null. This is what the
// defaulted parameters of the MessageFormatter functions produce.
MessageArg::MessageArg() = default;

// String arguments. UTF-8 inputs are converted to ICU's UTF-16 string type.
MessageArg::MessageArg(const char* s)
    : formattable(std::make_unique<icu::Formattable>(
          UnicodeStringFromStringPiece(s))) {}

MessageArg::MessageArg(StringPiece s)
    : formattable(std::make_unique<icu::Formattable>(
          UnicodeStringFromStringPiece(s))) {}

MessageArg::MessageArg(const std::string& s)
    : formattable(
          std::make_unique<icu::Formattable>(UnicodeString::fromUTF8(s))) {}

MessageArg::MessageArg(const string16& s)
    : formattable(std::make_unique<icu::Formattable>(
          UnicodeString(s.data(), s.size()))) {}

// Numeric arguments.
MessageArg::MessageArg(int i)
    : formattable(std::make_unique<icu::Formattable>(i)) {}

MessageArg::MessageArg(int64_t i)
    : formattable(std::make_unique<icu::Formattable>(i)) {}

MessageArg::MessageArg(double d)
    : formattable(std::make_unique<icu::Formattable>(d)) {}

// Time argument, passed to ICU as the value of Time::ToJsTime().
MessageArg::MessageArg(const Time& t)
    : formattable(std::make_unique<icu::Formattable>(
          static_cast<UDate>(t.ToJsTime()))) {}

MessageArg::~MessageArg() = default;
// Reports whether this argument carries a value. When it does, |*count| is
// incremented by one; otherwise |*count| is left alone.
bool MessageArg::has_value(int* count) const {
  const bool present = formattable != nullptr;
  if (present)
    ++*count;
  return present;
}
} // namespace internal
// Formats the ICU message pattern |msg| with up to seven positional
// arguments. Arguments left at their default carry no value and are not
// counted. Returns the formatted text, or an empty string (after logging an
// error) if ICU reports a failure.
string16 MessageFormatter::FormatWithNumberedArgs(
    StringPiece16 msg,
    const internal::MessageArg& arg0,
    const internal::MessageArg& arg1,
    const internal::MessageArg& arg2,
    const internal::MessageArg& arg3,
    const internal::MessageArg& arg4,
    const internal::MessageArg& arg5,
    const internal::MessageArg& arg6) {
  // has_value() bumps |supplied_count| for each argument that is present, so
  // once the array is built the counter holds the number of supplied values.
  // Absent arguments are filled with a default-constructed Formattable.
  int32_t supplied_count = 0;
  icu::Formattable values[] = {
      arg0.has_value(&supplied_count) ? *arg0.formattable : icu::Formattable(),
      arg1.has_value(&supplied_count) ? *arg1.formattable : icu::Formattable(),
      arg2.has_value(&supplied_count) ? *arg2.formattable : icu::Formattable(),
      arg3.has_value(&supplied_count) ? *arg3.formattable : icu::Formattable(),
      arg4.has_value(&supplied_count) ? *arg4.formattable : icu::Formattable(),
      arg5.has_value(&supplied_count) ? *arg5.formattable : icu::Formattable(),
      arg6.has_value(&supplied_count) ? *arg6.formattable : icu::Formattable(),
  };

  UnicodeString pattern(msg.data(), msg.size());
  UErrorCode status = U_ZERO_ERROR;
  icu::MessageFormat formatter(pattern, status);

  icu::UnicodeString output;
  icu::FieldPosition unused_position(icu::FieldPosition::DONT_CARE);
  formatter.format(values, supplied_count, output, unused_position, status);

  if (U_SUCCESS(status))
    return i18n::UnicodeStringToString16(output);

  LOG(ERROR) << "MessageFormat(" << msg.as_string() << ") failed with "
             << u_errorName(status);
  return string16();
}
// Formats the ICU message pattern |msg| with up to seven (name, value) pairs.
// Pairs whose value was left at its default carry no value and are not
// counted. Returns the formatted text, or an empty string (after logging an
// error) if ICU reports a failure.
string16 MessageFormatter::FormatWithNamedArgs(
    StringPiece16 msg,
    StringPiece name0, const internal::MessageArg& arg0,
    StringPiece name1, const internal::MessageArg& arg1,
    StringPiece name2, const internal::MessageArg& arg2,
    StringPiece name3, const internal::MessageArg& arg3,
    StringPiece name4, const internal::MessageArg& arg4,
    StringPiece name5, const internal::MessageArg& arg5,
    StringPiece name6, const internal::MessageArg& arg6) {
  // Argument names, converted to ICU strings; index i names values[i].
  icu::UnicodeString arg_names[] = {
      UnicodeStringFromStringPiece(name0),
      UnicodeStringFromStringPiece(name1),
      UnicodeStringFromStringPiece(name2),
      UnicodeStringFromStringPiece(name3),
      UnicodeStringFromStringPiece(name4),
      UnicodeStringFromStringPiece(name5),
      UnicodeStringFromStringPiece(name6),
  };

  // has_value() bumps |supplied_count| for each argument that is present.
  // Absent arguments are filled with a default-constructed Formattable.
  int32_t supplied_count = 0;
  icu::Formattable values[] = {
      arg0.has_value(&supplied_count) ? *arg0.formattable : icu::Formattable(),
      arg1.has_value(&supplied_count) ? *arg1.formattable : icu::Formattable(),
      arg2.has_value(&supplied_count) ? *arg2.formattable : icu::Formattable(),
      arg3.has_value(&supplied_count) ? *arg3.formattable : icu::Formattable(),
      arg4.has_value(&supplied_count) ? *arg4.formattable : icu::Formattable(),
      arg5.has_value(&supplied_count) ? *arg5.formattable : icu::Formattable(),
      arg6.has_value(&supplied_count) ? *arg6.formattable : icu::Formattable(),
  };

  UnicodeString pattern(msg.data(), msg.size());
  UErrorCode status = U_ZERO_ERROR;
  icu::MessageFormat formatter(pattern, status);

  icu::UnicodeString output;
  formatter.format(arg_names, values, supplied_count, output, status);

  if (U_SUCCESS(status))
    return i18n::UnicodeStringToString16(output);

  LOG(ERROR) << "MessageFormat(" << msg.as_string() << ") failed with "
             << u_errorName(status);
  return string16();
}
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,128 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_MESSAGE_FORMATTER_H_
#define BASE_I18N_MESSAGE_FORMATTER_H_
#include <stdint.h>
#include <memory>
#include <string>
#include "base/i18n/base_i18n_export.h"
#include "base/macros.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
#include "third_party/icu/source/common/unicode/uversion.h"
U_NAMESPACE_BEGIN
class Formattable;
U_NAMESPACE_END
namespace base {
class Time;
namespace i18n {
class MessageFormatter;
namespace internal {
// Wraps one argument for MessageFormatter as an ICU Formattable. The
// converting constructors are intentionally non-explicit so callers can pass
// strings, numbers and base::Time values directly. Only MessageFormatter can
// create an empty (value-less) argument or inspect the wrapped value.
class BASE_I18N_EXPORT MessageArg {
public:
// UTF-8 string arguments.
MessageArg(const char* s);
MessageArg(StringPiece s);
MessageArg(const std::string& s);
// UTF-16 string argument.
MessageArg(const string16& s);
// Numeric arguments.
MessageArg(int i);
MessageArg(int64_t i);
MessageArg(double d);
// Date/time argument.
MessageArg(const Time& t);
~MessageArg();
private:
friend class base::i18n::MessageFormatter;
// Creates an argument without a value (|formattable| is null).
MessageArg();
// Tests if this argument has a value, and if so increments *count.
bool has_value(int* count) const;
// The wrapped value; null when the argument was not supplied.
std::unique_ptr<icu::Formattable> formattable;
DISALLOW_COPY_AND_ASSIGN(MessageArg);
};
} // namespace internal
// Message Formatter with the ICU message format syntax support.
// It can format strings (UTF-8 and UTF-16), numbers and base::Time with
// plural, gender and other 'selectors' support. This is handy if you
// have multiple parameters of different types and some of them require
// plural or gender/selector support.
//
// To use this API for locale-sensitive formatting, retrieve a 'message
// template' in the ICU message format from a message bundle (e.g. with
// l10n_util::GetStringUTF16()) and pass it to FormatWith{Named,Numbered}Args.
//
// MessageFormat specs:
// http://icu-project.org/apiref/icu4j/com/ibm/icu/text/MessageFormat.html
// http://icu-project.org/apiref/icu4c/classicu_1_1DecimalFormat.html#details
// Examples:
// http://userguide.icu-project.org/formatparse/messages
// message_formatter_unittest.cc
// go/plurals inside Google.
// TODO(jshin): Document this API in md format docs.
// Caveat:
// When plural/select/gender is used along with other format specifiers such
// as date or number, plural/select/gender should be at the top level. It's
// not an ICU restriction but a constraint imposed by Google's translation
// infrastructure. Message A does not work. It must be revised to Message B.
//
// A.
// Rated <ph name="RATING">{0, number,0.0}<ex>3.2</ex></ph>
// by {1, plural, =1{a user} other{# users}}
//
// B.
// {1, plural,
// =1{Rated <ph name="RATING">{0, number,0.0}<ex>3.2</ex></ph>
// by a user.}
// other{Rated <ph name="RATING">{0, number,0.0}<ex>3.2</ex></ph>
// by # users.}}
// Static-only helper class; it cannot be instantiated. Both functions take an
// ICU message pattern |msg| plus up to seven arguments and return the
// formatted string, or an empty string if formatting fails.
class BASE_I18N_EXPORT MessageFormatter {
public:
// Formats |msg| using named arguments: each |nameN| is the argument name
// used in the pattern and |argN| its value. Trailing pairs may be omitted.
static string16 FormatWithNamedArgs(
StringPiece16 msg,
StringPiece name0 = StringPiece(),
const internal::MessageArg& arg0 = internal::MessageArg(),
StringPiece name1 = StringPiece(),
const internal::MessageArg& arg1 = internal::MessageArg(),
StringPiece name2 = StringPiece(),
const internal::MessageArg& arg2 = internal::MessageArg(),
StringPiece name3 = StringPiece(),
const internal::MessageArg& arg3 = internal::MessageArg(),
StringPiece name4 = StringPiece(),
const internal::MessageArg& arg4 = internal::MessageArg(),
StringPiece name5 = StringPiece(),
const internal::MessageArg& arg5 = internal::MessageArg(),
StringPiece name6 = StringPiece(),
const internal::MessageArg& arg6 = internal::MessageArg());
// Formats |msg| using positional arguments ({0}, {1}, ...). Trailing
// arguments may be omitted.
static string16 FormatWithNumberedArgs(
StringPiece16 msg,
const internal::MessageArg& arg0 = internal::MessageArg(),
const internal::MessageArg& arg1 = internal::MessageArg(),
const internal::MessageArg& arg2 = internal::MessageArg(),
const internal::MessageArg& arg3 = internal::MessageArg(),
const internal::MessageArg& arg4 = internal::MessageArg(),
const internal::MessageArg& arg5 = internal::MessageArg(),
const internal::MessageArg& arg6 = internal::MessageArg());
private:
MessageFormatter() = delete;
DISALLOW_COPY_AND_ASSIGN(MessageFormatter);
};
} // namespace i18n
} // namespace base
#endif // BASE_I18N_MESSAGE_FORMATTER_H_

View file

@ -0,0 +1,93 @@
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/number_formatting.h"
#include <stddef.h>
#include <memory>
#include "base/format_macros.h"
#include "base/i18n/message_formatter.h"
#include "base/i18n/unicodestring.h"
#include "base/lazy_instance.h"
#include "base/logging.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "third_party/icu/source/common/unicode/ustring.h"
#include "third_party/icu/source/i18n/unicode/numfmt.h"
namespace base {
namespace {
// A simple wrapper around icu::NumberFormat that allows for resetting it
// (as LazyInstance does not).
struct NumberFormatWrapper {
NumberFormatWrapper() {
Reset();
}
void Reset() {
// There's no ICU call to destroy a NumberFormat object other than
// operator delete, so use the default Delete, which calls operator delete.
// This can cause problems if a different allocator is used by this file
// than by ICU.
UErrorCode status = U_ZERO_ERROR;
number_format.reset(icu::NumberFormat::createInstance(status));
DCHECK(U_SUCCESS(status));
}
std::unique_ptr<icu::NumberFormat> number_format;
};
// LazyInstance-held formatter wrappers local to this file. FormatNumber()
// uses |g_number_format_int| and FormatDouble() uses |g_number_format_float|;
// FormatDouble() changes the fraction-digit settings on its instance.
LazyInstance<NumberFormatWrapper>::DestructorAtExit g_number_format_int =
LAZY_INSTANCE_INITIALIZER;
LazyInstance<NumberFormatWrapper>::DestructorAtExit g_number_format_float =
LAZY_INSTANCE_INITIALIZER;
} // namespace
// Formats |number| through the cached integer formatter. If no formatter is
// available, the plain decimal digits are returned instead.
string16 FormatNumber(int64_t number) {
  icu::NumberFormat* const formatter =
      g_number_format_int.Get().number_format.get();
  if (formatter) {
    icu::UnicodeString formatted;
    formatter->format(number, formatted);
    return i18n::UnicodeStringToString16(formatted);
  }

  // No ICU formatter: fall back to the raw number rendered as a string.
  return ASCIIToUTF16(StringPrintf("%" PRId64, number));
}
// Formats |number| with exactly |fractional_digits| digits after the decimal
// separator, using the cached floating-point formatter.
//
// If no ICU formatter is available, the number is rendered with printf-style
// formatting instead. That fallback also honors |fractional_digits|;
// previously it always printed six decimals ("%f"), which disagreed with the
// documented contract of this function.
string16 FormatDouble(double number, int fractional_digits) {
  icu::NumberFormat* number_format =
      g_number_format_float.Get().number_format.get();
  if (!number_format) {
    // As a fallback, return the raw number in a string. A negative precision
    // passed through "%.*f" is treated by printf as if it were omitted, so a
    // negative |fractional_digits| still yields the default six decimals.
    return ASCIIToUTF16(StringPrintf("%.*f", fractional_digits, number));
  }

  // Pin both bounds so the output has exactly |fractional_digits| decimals.
  number_format->setMaximumFractionDigits(fractional_digits);
  number_format->setMinimumFractionDigits(fractional_digits);
  icu::UnicodeString ustr;
  number_format->format(number, ustr);
  return i18n::UnicodeStringToString16(ustr);
}
// Formats |number| (a whole-number percentage such as 12 for 12%) by handing
// the corresponding fraction to the ICU "percent" number style.
string16 FormatPercent(int number) {
  const double fraction = static_cast<double>(number) / 100.0;
  const string16 pattern = ASCIIToUTF16("{0,number,percent}");
  return i18n::MessageFormatter::FormatWithNumberedArgs(pattern, fraction);
}
// Rebuilds both cached formatter wrappers by calling Reset() on each of them.
void ResetFormattersForTesting() {
g_number_format_int.Get().Reset();
g_number_format_float.Get().Reset();
}
} // namespace base

View file

@ -0,0 +1,34 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_NUMBER_FORMATTING_H_
#define BASE_I18N_NUMBER_FORMATTING_H_
#include <stdint.h>
#include "base/i18n/base_i18n_export.h"
#include "base/strings/string16.h"
namespace base {
// Return a number formatted with separators in the user's locale.
// Ex: FormatNumber(1234567) => "1,234,567" in English, "1.234.567" in German
BASE_I18N_EXPORT string16 FormatNumber(int64_t number);
// Return a number formatted with separators in the user's locale.
// Ex: FormatDouble(1234567.8, 1)
// => "1,234,567.8" in English, "1.234.567,8" in German
BASE_I18N_EXPORT string16 FormatDouble(double number, int fractional_digits);
// Return a percentage formatted with space and symbol in the user's locale.
// Ex: FormatPercent(12) => "12%" in English, "12 %" in Romanian
BASE_I18N_EXPORT string16 FormatPercent(int number);
// Causes cached formatters to be discarded and recreated. Only useful for
// testing.
BASE_I18N_EXPORT void ResetFormattersForTesting();
} // namespace base
#endif // BASE_I18N_NUMBER_FORMATTING_H_

View file

@ -0,0 +1,497 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/rtl.h"
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include "base/command_line.h"
#include "base/files/file_path.h"
#include "base/i18n/base_i18n_switches.h"
#include "base/logging.h"
#include "base/stl_util.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/sys_string_conversions.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "third_party/icu/source/common/unicode/locid.h"
#include "third_party/icu/source/common/unicode/uchar.h"
#include "third_party/icu/source/common/unicode/uscript.h"
#include "third_party/icu/source/i18n/unicode/coll.h"
#if defined(OS_IOS)
#include "base/debug/crash_logging.h"
#include "base/ios/ios_util.h"
#endif
namespace {
// Builds a locale string of the form language[-country][@variant] from
// |locale|, dropping any keywords. Examples: en-US, ca@valencia,
// ca-ES@valencia. A missing language is reported as "und" and the variant is
// lower-cased.
std::string GetLocaleString(const icu::Locale& locale) {
  const auto is_present = [](const char* part) {
    return part != nullptr && *part != '\0';
  };

  const char* const lang = locale.getLanguage();
  const char* const region = locale.getCountry();
  const char* const variant_tag = locale.getVariant();

  std::string locale_string = is_present(lang) ? lang : "und";
  if (is_present(region)) {
    locale_string.push_back('-');
    locale_string.append(region);
  }
  if (is_present(variant_tag)) {
    locale_string.push_back('@');
    locale_string += base::ToLowerASCII(variant_tag);
  }
  return locale_string;
}
// Classifies |character| by its strong bidi directionality: LEFT_TO_RIGHT or
// RIGHT_TO_LEFT for strongly directional characters, UNKNOWN_DIRECTION
// otherwise. See http://unicode.org/reports/tr9/ for background. When the
// force-text-direction switch carries a recognized value, that value wins
// regardless of |character|.
base::i18n::TextDirection GetCharacterDirection(UChar32 character) {
  // Presence of the switch is looked up once; its value is read on each call.
  static bool direction_switch_present =
      base::CommandLine::ForCurrentProcess()->HasSwitch(
          switches::kForceTextDirection);
  if (direction_switch_present) {
    const std::string forced_value =
        base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
            switches::kForceTextDirection);
    if (forced_value == switches::kForceDirectionRTL)
      return base::i18n::RIGHT_TO_LEFT;
    if (forced_value == switches::kForceDirectionLTR)
      return base::i18n::LEFT_TO_RIGHT;
    // Unrecognized value: fall through to the per-character lookup.
  }

  // Ask ICU for the Unicode bidi class of the character.
  const int32_t bidi_class =
      u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);

  const bool is_strong_rtl = bidi_class == U_RIGHT_TO_LEFT ||
                             bidi_class == U_RIGHT_TO_LEFT_ARABIC ||
                             bidi_class == U_RIGHT_TO_LEFT_EMBEDDING ||
                             bidi_class == U_RIGHT_TO_LEFT_OVERRIDE;
  if (is_strong_rtl)
    return base::i18n::RIGHT_TO_LEFT;

  const bool is_strong_ltr = bidi_class == U_LEFT_TO_RIGHT ||
                             bidi_class == U_LEFT_TO_RIGHT_EMBEDDING ||
                             bidi_class == U_LEFT_TO_RIGHT_OVERRIDE;
  if (is_strong_ltr)
    return base::i18n::LEFT_TO_RIGHT;

  return base::i18n::UNKNOWN_DIRECTION;
}
} // namespace
namespace base {
namespace i18n {
// Represents the locale-specific ICU text direction.
// Used as a cache: SetICUDefaultLocale() resets it to UNKNOWN_DIRECTION and
// ICUIsRTL() recomputes it the next time it finds it unset.
static TextDirection g_icu_text_direction = UNKNOWN_DIRECTION;
// Convert the ICU default locale to a string.
std::string GetConfiguredLocale() {
return GetLocaleString(icu::Locale::getDefault());
}
// Convert the ICU canonicalized locale to a string.
std::string GetCanonicalLocale(const std::string& locale) {
return GetLocaleString(icu::Locale::createCanonical(locale.c_str()));
}
// Convert Chrome locale name to ICU locale name
std::string ICULocaleName(const std::string& locale_string) {
// If not Spanish, just return it.
if (locale_string.substr(0, 2) != "es")
return locale_string;
// Expand es to es-ES.
if (LowerCaseEqualsASCII(locale_string, "es"))
return "es-ES";
// Map es-419 (Latin American Spanish) to es-FOO depending on the system
// locale. If it's es-RR other than es-ES, map to es-RR. Otherwise, map
// to es-MX (the most populous in Spanish-speaking Latin America).
if (LowerCaseEqualsASCII(locale_string, "es-419")) {
const icu::Locale& locale = icu::Locale::getDefault();
std::string language = locale.getLanguage();
const char* country = locale.getCountry();
if (LowerCaseEqualsASCII(language, "es") &&
!LowerCaseEqualsASCII(country, "es")) {
language += '-';
language += country;
return language;
}
return "es-MX";
}
// Currently, Chrome has only "es" and "es-419", but later we may have
// more specific "es-RR".
return locale_string;
}
void SetICUDefaultLocale(const std::string& locale_string) {
#if defined(OS_IOS)
static base::debug::CrashKeyString* crash_key_locale =
base::debug::AllocateCrashKeyString("icu_locale_input",
base::debug::CrashKeySize::Size256);
base::debug::SetCrashKeyString(crash_key_locale, locale_string);
#endif
icu::Locale locale(ICULocaleName(locale_string).c_str());
UErrorCode error_code = U_ZERO_ERROR;
const char* lang = locale.getLanguage();
if (lang != nullptr && *lang != '\0') {
icu::Locale::setDefault(locale, error_code);
} else {
LOG(ERROR) << "Failed to set the ICU default locale to " << locale_string
<< ". Falling back to en-US.";
icu::Locale::setDefault(icu::Locale::getUS(), error_code);
}
g_icu_text_direction = UNKNOWN_DIRECTION;
}
// Reports whether the application text direction is right-to-left. This
// simply forwards to ICUIsRTL().
bool IsRTL() {
return ICUIsRTL();
}
// Test helper: switches the ICU default locale to Hebrew ("he") when |rtl| is
// true and to English ("en") otherwise, then checks that IsRTL() agrees.
void SetRTLForTesting(bool rtl) {
  const char* const test_locale = rtl ? "he" : "en";
  SetICUDefaultLocale(test_locale);
  DCHECK_EQ(rtl, IsRTL());
}
bool ICUIsRTL() {
if (g_icu_text_direction == UNKNOWN_DIRECTION) {
const icu::Locale& locale = icu::Locale::getDefault();
g_icu_text_direction = GetTextDirectionForLocaleInStartUp(locale.getName());
}
return g_icu_text_direction == RIGHT_TO_LEFT;
}
// Returns the UI direction that has been explicitly forced, or
// UNKNOWN_DIRECTION when no (recognized) forcing is in effect.
TextDirection GetForcedTextDirection() {
#if defined(OS_IOS)
  // iOS has its own RTL forcing mechanism, which takes precedence.
  if (base::ios::IsInForcedRTL())
    return base::i18n::RIGHT_TO_LEFT;
#endif
  base::CommandLine* const cmd = base::CommandLine::ForCurrentProcess();
  if (!cmd->HasSwitch(switches::kForceUIDirection))
    return base::i18n::UNKNOWN_DIRECTION;

  const std::string requested =
      cmd->GetSwitchValueASCII(switches::kForceUIDirection);
  if (requested == switches::kForceDirectionLTR)
    return base::i18n::LEFT_TO_RIGHT;
  if (requested == switches::kForceDirectionRTL)
    return base::i18n::RIGHT_TO_LEFT;

  // The switch is present but its value is neither "ltr" nor "rtl".
  return base::i18n::UNKNOWN_DIRECTION;
}
// Returns the text direction for |locale_name| without consulting ICU data:
// the language part of the name is checked against a fixed list of RTL
// language codes. A forced direction (see GetForcedTextDirection()) takes
// precedence. A locale name that yields no components (for example an empty
// name) is treated as LEFT_TO_RIGHT.
TextDirection GetTextDirectionForLocaleInStartUp(const char* locale_name) {
  // Check for direction forcing.
  TextDirection forced_direction = GetForcedTextDirection();
  if (forced_direction != UNKNOWN_DIRECTION)
    return forced_direction;

  // This list needs to be updated in alphabetical order if we add more RTL
  // locales (binary_search below relies on the ordering).
  static const char kRTLLanguageCodes[][3] = {"ar", "fa", "he", "iw", "ur"};
  std::vector<StringPiece> locale_split =
      SplitStringPiece(locale_name, "-_", KEEP_WHITESPACE, SPLIT_WANT_ALL);

  // Splitting an empty name produces no pieces at all; indexing element 0
  // would then be undefined behaviour, so bail out with the default.
  if (locale_split.empty())
    return LEFT_TO_RIGHT;

  const StringPiece& language_code = locale_split[0];
  if (std::binary_search(kRTLLanguageCodes,
                         kRTLLanguageCodes + base::size(kRTLLanguageCodes),
                         language_code))
    return RIGHT_TO_LEFT;
  return LEFT_TO_RIGHT;
}
// Returns the text direction for |locale_name| as reported by ICU's character
// orientation data, unless a direction has been forced.
TextDirection GetTextDirectionForLocale(const char* locale_name) {
  // A forced direction overrides whatever the locale says.
  const TextDirection forced = GetForcedTextDirection();
  if (forced != UNKNOWN_DIRECTION)
    return forced;

  UErrorCode icu_status = U_ZERO_ERROR;
  const ULayoutType orientation =
      uloc_getCharacterOrientation(locale_name, &icu_status);
  DCHECK(U_SUCCESS(icu_status));

  // Only an explicit RTL layout counts as RTL; everything else is LTR.
  if (orientation == ULOC_LAYOUT_RTL)
    return RIGHT_TO_LEFT;
  return LEFT_TO_RIGHT;
}
// Scans |text| from the start and returns the direction of the first code
// point with strong directionality; LEFT_TO_RIGHT if there is none.
TextDirection GetFirstStrongCharacterDirection(const string16& text) {
  const UChar* const code_units = text.c_str();
  const size_t unit_count = text.length();
  for (size_t offset = 0; offset < unit_count;) {
    UChar32 code_point;
    // U16_NEXT decodes one code point and advances |offset| past it.
    U16_NEXT(code_units, offset, unit_count, code_point);
    const TextDirection strong = GetCharacterDirection(code_point);
    if (strong != UNKNOWN_DIRECTION)
      return strong;
  }
  return LEFT_TO_RIGHT;
}
// Scans |text| from the end and returns the direction of the last code point
// with strong directionality; LEFT_TO_RIGHT if there is none.
TextDirection GetLastStrongCharacterDirection(const string16& text) {
  const UChar* const code_units = text.c_str();
  for (size_t offset = text.length(); offset > 0;) {
    UChar32 code_point;
    // U16_PREV steps |offset| back over one code point and decodes it.
    U16_PREV(code_units, 0, offset, code_point);
    const TextDirection strong = GetCharacterDirection(code_point);
    if (strong != UNKNOWN_DIRECTION)
      return strong;
  }
  return LEFT_TO_RIGHT;
}
// Returns the single direction shared by all strongly directional code points
// in |text|. Mixed directions yield UNKNOWN_DIRECTION; a string without any
// strongly directional code point yields LEFT_TO_RIGHT.
TextDirection GetStringDirection(const string16& text) {
  const UChar* const code_units = text.c_str();
  const size_t unit_count = text.length();
  TextDirection overall = UNKNOWN_DIRECTION;
  for (size_t offset = 0; offset < unit_count;) {
    UChar32 code_point;
    // U16_NEXT decodes one code point and advances |offset| past it.
    U16_NEXT(code_units, offset, unit_count, code_point);
    const TextDirection strong = GetCharacterDirection(code_point);
    if (strong == UNKNOWN_DIRECTION)
      continue;
    // A second, different strong direction makes the string mixed.
    if (overall != UNKNOWN_DIRECTION && overall != strong)
      return UNKNOWN_DIRECTION;
    overall = strong;
  }
  // No strong characters at all: default to LEFT_TO_RIGHT.
  return overall == UNKNOWN_DIRECTION ? LEFT_TO_RIGHT : overall;
}
#if defined(OS_WIN)
// Windows variant: only acts when the UI is RTL and |text| is non-empty.
// Returns true if |text| was wrapped, false if it was left untouched.
bool AdjustStringForLocaleDirection(string16* text) {
  if (!IsRTL() || text->empty())
    return false;

  // In an RTL locale, strings containing strong RTL characters are marked as
  // RTL; all other strings are marked as LTR.
  if (StringContainsStrongRTLChars(*text))
    WrapStringWithRTLFormatting(text);
  else
    WrapStringWithLTRFormatting(text);
  return true;
}
// Windows variant: strips the wrapping bidi control characters from |text|
// when the UI is RTL and |text| is non-empty. Returns whether it did so.
bool UnadjustStringForLocaleDirection(string16* text) {
  const bool was_adjustable = IsRTL() && !text->empty();
  if (was_adjustable)
    *text = StripWrappingBidiControlCharacters(*text);
  return was_adjustable;
}
#else
// Non-Windows variant. Returns true if |text| was modified.
bool AdjustStringForLocaleDirection(string16* text) {
  // On OS X and GTK a label takes its direction from its first strongly
  // directional character, yet we want every string in an LTR UI to be left
  // aligned (and vice versa). Trouble starts when a string begins with user
  // input whose direction is opposite to the UI's, because the whole label
  // then flips. For instance, in an LTR UI such as US English,
  //
  //   EMAN_NOISNETXE is now installed.
  //
  // begins with a strong RTL character (EXTENSION_NAME), so the label becomes
  // RTL and is shown as
  //
  //   .is now installed EMAN_NOISNETXE
  //
  // The cure is to put a mark matching the UI direction in front of the
  // string (an LRM is a strongly directional LTR character) and another one
  // at the end, which keeps us in the UI's directional context.
  //
  // Unlike Windows, Linux and OS X render RTL glyphs out of the box, so
  // zero-width bidi control characters are harmless on any system and no
  // !IsRTL() early-out is needed here.
  if (text->empty())
    return false;

  const bool rtl_ui = IsRTL();
  const bool rtl_content = StringContainsStrongRTLChars(*text);

  // Text without strong RTL characters in an LTR UI needs no adjustment.
  if (!rtl_ui && !rtl_content)
    return false;

  // The embedding follows the content...
  if (rtl_content)
    WrapStringWithRTLFormatting(text);
  else
    WrapStringWithLTRFormatting(text);

  // ...while the surrounding marks follow the UI direction.
  const char16 ui_mark = rtl_ui ? kRightToLeftMark : kLeftToRightMark;
  text->insert(static_cast<size_t>(0), static_cast<size_t>(1), ui_mark);
  text->push_back(ui_mark);
  return true;
}
// Non-Windows variant: removes a leading and/or trailing LRM/RLM from |text|
// and then strips the wrapping bidi control characters. Returns false only
// when |text| is empty.
bool UnadjustStringForLocaleDirection(string16* text) {
  if (text->empty())
    return false;

  const auto is_direction_mark = [](char16 unit) {
    return unit == kLeftToRightMark || unit == kRightToLeftMark;
  };

  // Half-open range [first, past_last) of the text between the marks.
  size_t first = 0;
  if (is_direction_mark(text->at(0)))
    first = 1;
  size_t past_last = text->length();
  if (is_direction_mark(text->at(past_last - 1)))
    --past_last;

  // For a string consisting of one mark only, |past_last| ends up below
  // |first| and the unsigned difference wraps; substr() clamps the count, so
  // the result is simply empty.
  const string16 without_marks = text->substr(first, past_last - first);
  *text = StripWrappingBidiControlCharacters(without_marks);
  return true;
}
#endif // !OS_WIN
// Appends one PDF (pop directional formatting) to |text| for every embedding
// or override character that is still open at the end of the string.
void EnsureTerminatedDirectionalFormatting(string16* text) {
  int open_scopes = 0;
  for (const auto unit : *text) {
    switch (unit) {
      case kLeftToRightEmbeddingMark:
      case kRightToLeftEmbeddingMark:
      case kLeftToRightOverride:
      case kRightToLeftOverride:
        ++open_scopes;
        break;
      case kPopDirectionalFormatting:
        // A PDF with nothing open is ignored.
        if (open_scopes > 0)
          --open_scopes;
        break;
      default:
        break;
    }
  }
  text->append(static_cast<size_t>(open_scopes), kPopDirectionalFormatting);
}
// Terminates any unbalanced directional formatting in |text| first, then
// adjusts the string for the locale direction. The boolean result of
// AdjustStringForLocaleDirection() is not propagated to the caller.
void SanitizeUserSuppliedString(string16* text) {
EnsureTerminatedDirectionalFormatting(text);
AdjustStringForLocaleDirection(text);
}
bool StringContainsStrongRTLChars(const string16& text) {
const UChar* string = text.c_str();
size_t length = text.length();
size_t position = 0;
while (position < length) {
UChar32 character;
size_t next_position = position;
U16_NEXT(string, next_position, length, character);
// Now that we have the character, we use ICU in order to query for the
// appropriate Unicode BiDi character type.
int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
if ((property == U_RIGHT_TO_LEFT) || (property == U_RIGHT_TO_LEFT_ARABIC))
return true;
position = next_position;
}
return false;
}
// Surrounds a non-empty |text| with LRE (Left-To-Right Embedding) in front
// and PDF (Pop Directional Formatting) at the end. Empty strings are left
// alone.
void WrapStringWithLTRFormatting(string16* text) {
  if (!text->empty()) {
    text->insert(text->begin(), kLeftToRightEmbeddingMark);
    text->push_back(kPopDirectionalFormatting);
  }
}
// Surrounds a non-empty |text| with RLE (Right-To-Left Embedding) in front
// and PDF (Pop Directional Formatting) at the end. Empty strings are left
// alone.
void WrapStringWithRTLFormatting(string16* text) {
  if (!text->empty()) {
    text->insert(text->begin(), kRightToLeftEmbeddingMark);
    text->push_back(kPopDirectionalFormatting);
  }
}
// Appends |path| to |rtl_safe_path| enclosed in an LRE-PDF pair, which
// essentially marks the path as a Left-To-Right string.
void WrapPathWithLTRFormatting(const FilePath& path,
                               string16* rtl_safe_path) {
  // Open the embedding with an LRE (Left-To-Right Embedding) mark.
  rtl_safe_path->push_back(kLeftToRightEmbeddingMark);

  // Append the path itself, converted to UTF-16 in the way that suits the
  // platform's native path encoding.
#if defined(OS_WIN)
  rtl_safe_path->append(AsString16(path.value()));
#elif defined(OS_MACOSX)
  rtl_safe_path->append(UTF8ToUTF16(path.value()));
#else  // defined(OS_POSIX) && !defined(OS_MACOSX)
  rtl_safe_path->append(WideToUTF16(base::SysNativeMBToWide(path.value())));
#endif

  // Close the embedding with a PDF (Pop Directional Formatting) mark.
  rtl_safe_path->push_back(kPopDirectionalFormatting);
}
// Returns |text| wrapped in LRE/PDF when that is needed for display, and an
// unmodified copy otherwise.
string16 GetDisplayStringInLTRDirectionality(const string16& text) {
  // In an RTL UI the string is always wrapped (it may end up appended to an
  // RTL string). In an LTR UI it is wrapped only when its first strong
  // character is RTL.
  const bool needs_wrapping =
      IsRTL() || GetFirstStrongCharacterDirection(text) == RIGHT_TO_LEFT;
  if (!needs_wrapping)
    return text;

  string16 wrapped(text);
  WrapStringWithLTRFormatting(&wrapped);
  return wrapped;
}
// Returns |text| without a leading LRE/RLE/LRO/RLO and without a trailing
// PDF, whichever of them are present.
string16 StripWrappingBidiControlCharacters(const string16& text) {
  if (text.empty())
    return text;

  // Half-open range [first, past_last) of what survives the stripping.
  size_t first = 0;
  switch (text[0]) {
    case kLeftToRightEmbeddingMark:
    case kRightToLeftEmbeddingMark:
    case kLeftToRightOverride:
    case kRightToLeftOverride:
      first = 1;
      break;
    default:
      break;
  }

  size_t past_last = text.length();
  if (text[past_last - 1] == kPopDirectionalFormatting)
    --past_last;

  return text.substr(first, past_last - first);
}
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,171 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_RTL_H_
#define BASE_I18N_RTL_H_
#include <string>
#include "base/compiler_specific.h"
#include "base/i18n/base_i18n_export.h"
#include "base/strings/string16.h"
#include "build/build_config.h"
namespace base {
class FilePath;
namespace i18n {
// Unicode bidirectional control characters used by the helpers declared below.
// U+200F RIGHT-TO-LEFT MARK (RLM).
const char16 kRightToLeftMark = 0x200F;
// U+200E LEFT-TO-RIGHT MARK (LRM).
const char16 kLeftToRightMark = 0x200E;
// U+202A LEFT-TO-RIGHT EMBEDDING (LRE).
const char16 kLeftToRightEmbeddingMark = 0x202A;
// U+202B RIGHT-TO-LEFT EMBEDDING (RLE).
const char16 kRightToLeftEmbeddingMark = 0x202B;
// U+202C POP DIRECTIONAL FORMATTING (PDF).
const char16 kPopDirectionalFormatting = 0x202C;
// U+202D LEFT-TO-RIGHT OVERRIDE (LRO).
const char16 kLeftToRightOverride = 0x202D;
// U+202E RIGHT-TO-LEFT OVERRIDE (RLO).
const char16 kRightToLeftOverride = 0x202E;
// Locale.java mirrors this TextDirection enum. Please keep the two in sync.
enum TextDirection {
// No direction determined (also returned when no direction is forced).
UNKNOWN_DIRECTION = 0,
RIGHT_TO_LEFT = 1,
LEFT_TO_RIGHT = 2,
// Alias of the largest enumerator value.
TEXT_DIRECTION_MAX = LEFT_TO_RIGHT,
};
// Get the locale that the currently running process has been configured to use.
// The return value is of the form language[-country] (e.g., en-US) where the
// language is the 2 or 3 letter code from ISO-639.
BASE_I18N_EXPORT std::string GetConfiguredLocale();
// Canonicalize a string (eg. a POSIX locale string) to a Chrome locale name.
BASE_I18N_EXPORT std::string GetCanonicalLocale(const std::string& locale);
// Sets the default locale of ICU.
// Once the application locale of Chrome in GetApplicationLocale is determined,
// the default locale of ICU needs to be changed to match the application locale
// so that ICU functions work correctly in a locale-dependent manner.
// This is handy in that we don't have to call GetApplicationLocale()
// every time we call locale-dependent ICU APIs as long as we make sure
// that this is called before any locale-dependent API is called.
BASE_I18N_EXPORT void SetICUDefaultLocale(const std::string& locale_string);
// Returns true if the application text direction is right-to-left.
BASE_I18N_EXPORT bool IsRTL();
// A test utility function to set the application default text direction.
BASE_I18N_EXPORT void SetRTLForTesting(bool rtl);
// Returns whether the text direction for the default ICU locale is RTL. This
// assumes that SetICUDefaultLocale has been called to set the default locale to
// the UI locale of Chrome.
// NOTE: Generally, you should call IsRTL() instead of this.
BASE_I18N_EXPORT bool ICUIsRTL();
// Gets the explicitly forced text direction for debugging. If no forcing is
// applied, returns UNKNOWN_DIRECTION.
BASE_I18N_EXPORT TextDirection GetForcedTextDirection();
// Returns the text direction for |locale_name|.
// As a startup optimization, this method checks the locale against a list of
// Chrome-supported RTL locales.
BASE_I18N_EXPORT TextDirection
GetTextDirectionForLocaleInStartUp(const char* locale_name);
// Returns the text direction for |locale_name|.
BASE_I18N_EXPORT TextDirection GetTextDirectionForLocale(
const char* locale_name);
// Given the string in |text|, returns the directionality of the first or last
// character with strong directionality in the string. If no character in the
// text has strong directionality, LEFT_TO_RIGHT is returned. The Bidi
// character types L, LRE, LRO, R, AL, RLE, and RLO are considered as strong
// directionality characters. Please refer to http://unicode.org/reports/tr9/
// for more information.
BASE_I18N_EXPORT TextDirection GetFirstStrongCharacterDirection(
const string16& text);
BASE_I18N_EXPORT TextDirection GetLastStrongCharacterDirection(
const string16& text);
// Given the string in |text|, returns LEFT_TO_RIGHT or RIGHT_TO_LEFT if all the
// strong directionality characters in the string are of the same
// directionality. It returns UNKNOWN_DIRECTION if the string contains a mix of
// LTR and RTL strong directionality characters. Defaults to LEFT_TO_RIGHT if
// the string does not contain directionality characters. Please refer to
// http://unicode.org/reports/tr9/ for more information.
BASE_I18N_EXPORT TextDirection GetStringDirection(const string16& text);
// Given the string in |text|, this function modifies the string in place with
// the appropriate Unicode formatting marks that mark the string direction
// (either left-to-right or right-to-left). The function checks both the current
// locale and the contents of the string in order to determine the direction of
// the returned string. The function returns true if the string in |text| was
// properly adjusted.
//
// Certain LTR strings are not rendered correctly when the context is RTL. For
// example, the string "Foo!" will appear as "!Foo" if it is rendered as is in
// an RTL context. Calling this function will make sure the returned localized
// string is always treated as a right-to-left string. This is done by
// inserting certain Unicode formatting marks into the returned string.
//
// ** Notes about the Windows version of this function:
// TODO(idana) bug 6806: this function adjusts the string in question only
// if the current locale is right-to-left. The function does not take care of
// the opposite case (an RTL string displayed in an LTR context) since
// adjusting the string involves inserting Unicode formatting characters that
// Windows does not handle well unless right-to-left language support is
// installed. Since the English version of Windows doesn't have right-to-left
// language support installed by default, inserting the direction Unicode mark
// results in Windows displaying squares.
BASE_I18N_EXPORT bool AdjustStringForLocaleDirection(string16* text);
// Undoes the actions of the above function (AdjustStringForLocaleDirection).
BASE_I18N_EXPORT bool UnadjustStringForLocaleDirection(string16* text);
// Ensures |text| contains no unterminated directional formatting characters, by
// appending the appropriate pop-directional-formatting characters to the end of
// |text|.
BASE_I18N_EXPORT void EnsureTerminatedDirectionalFormatting(string16* text);
// Sanitizes the |text| by terminating any directional override/embedding
// characters and then adjusting the string for locale direction.
BASE_I18N_EXPORT void SanitizeUserSuppliedString(string16* text);
// Returns true if the string contains at least one character with strong right
// to left directionality; that is, a character with either R or AL Unicode
// BiDi character type.
BASE_I18N_EXPORT bool StringContainsStrongRTLChars(const string16& text);
// Wraps a string with an LRE-PDF pair which essentially marks the string as a
// Left-To-Right string. Doing this is useful in order to make sure LTR
// strings are rendered properly in an RTL context.
BASE_I18N_EXPORT void WrapStringWithLTRFormatting(string16* text);
// Wraps a string with an RLE-PDF pair which essentially marks the string as a
// Right-To-Left string. Doing this is useful in order to make sure RTL
// strings are rendered properly in an LTR context.
BASE_I18N_EXPORT void WrapStringWithRTLFormatting(string16* text);
// Wraps file path to get it to display correctly in RTL UI. All filepaths
// should be passed through this function before display in UI for RTL locales.
BASE_I18N_EXPORT void WrapPathWithLTRFormatting(const FilePath& path,
string16* rtl_safe_path);
// Return the string in |text| wrapped with LRE (Left-To-Right Embedding) and
// PDF (Pop Directional Formatting) marks, if needed for UI display purposes.
BASE_I18N_EXPORT string16 GetDisplayStringInLTRDirectionality(
const string16& text) WARN_UNUSED_RESULT;
// Strip the beginning (U+202A..U+202B, U+202D..U+202E) and/or ending (U+202C)
// explicit bidi control characters from |text|, if there are any. Otherwise,
// return the text itself. Explicit bidi control characters display and have
// semantic effect. They can be deleted so they might not always appear in a
// pair.
BASE_I18N_EXPORT string16 StripWrappingBidiControlCharacters(
const string16& text) WARN_UNUSED_RESULT;
} // namespace i18n
} // namespace base
#endif // BASE_I18N_RTL_H_

View file

@ -0,0 +1,59 @@
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This implementation doesn't use ICU. The ICU macros are oriented towards
// character-at-a-time processing, whereas byte-at-a-time processing is easier
// with streaming input.
#include "base/i18n/streaming_utf8_validator.h"
#include "base/i18n/utf8_validator_tables.h"
#include "base/logging.h"
namespace base {
namespace {
// Returns the entry of internal::kUtf8ValidatorTables at |offset|. In
// DCHECK-enabled builds, verifies that |offset| is within the table.
uint8_t StateTableLookup(uint8_t offset) {
DCHECK_LT(offset, internal::kUtf8ValidatorTablesSize);
return internal::kUtf8ValidatorTables[offset];
}
} // namespace
// Feeds |size| bytes starting at |data| through the validator state machine,
// stores the resulting state and reports it as VALID_ENDPOINT, VALID_MIDPOINT
// or INVALID.
StreamingUtf8Validator::State StreamingUtf8Validator::AddBytes(const char* data,
                                                               size_t size) {
  // Work on a local copy of |state_| so the compiler need not worry about
  // aliasing inside the loop.
  uint8_t current = state_;
  const char* const end = data + size;
  for (const char* cursor = data; cursor != end; ++cursor) {
    const char byte = *cursor;
    if ((byte & 0x80) == 0) {
      // An ASCII byte is only acceptable between complete sequences.
      if (current != 0) {
        current = internal::I18N_UTF8_VALIDATOR_INVALID_INDEX;
        break;
      }
      continue;
    }

    // Non-ASCII byte: the first table entry of the current state gives the
    // shift to apply, the shifted byte then selects the next state.
    const uint8_t shift = StateTableLookup(current);
    const uint8_t column = (byte & 0x7F) >> shift;
    current = StateTableLookup(current + column + 1);
    // |current| may already be the invalid state here. The code is tuned for
    // valid UTF-8, and skipping an early exit (other than on ASCII bytes) was
    // measured to be about 2% faster.
  }
  state_ = current;

  if (current == 0)
    return VALID_ENDPOINT;
  if (current == internal::I18N_UTF8_VALIDATOR_INVALID_INDEX)
    return INVALID;
  return VALID_MIDPOINT;
}
// Puts the validator back into state 0, the same state the constructor sets.
void StreamingUtf8Validator::Reset() {
state_ = 0u;
}
bool StreamingUtf8Validator::Validate(const std::string& string) {
return StreamingUtf8Validator().AddBytes(string.data(), string.size()) ==
VALID_ENDPOINT;
}
} // namespace base

View file

@ -0,0 +1,66 @@
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// A streaming validator for UTF-8. Validation is based on the definition in
// RFC-3629. In particular, it does not reject the invalid characters rejected
// by base::IsStringUTF8().
//
// The implementation detects errors on the first possible byte.
#ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
#define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
#include <stddef.h>
#include <stdint.h>
#include <string>
#include "base/i18n/base_i18n_export.h"
#include "base/macros.h"
namespace base {
class BASE_I18N_EXPORT StreamingUtf8Validator {
public:
// The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it
// processes characters it alternates between VALID_ENDPOINT and
// VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the
// state changes permanently to INVALID.
enum State {
VALID_ENDPOINT,
VALID_MIDPOINT,
INVALID
};
// Starts in the initial state (0), which is reported as VALID_ENDPOINT.
StreamingUtf8Validator() : state_(0u) {}
// Trivial destructor intentionally omitted.
// Validate |size| bytes starting at |data|. If the concatenation of all calls
// to AddBytes() since this object was constructed or reset is a valid UTF-8
// string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8
// string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was
// present, returns INVALID.
State AddBytes(const char* data, size_t size);
// Return the object to a freshly-constructed state so that it can be re-used.
void Reset();
// Validate a complete string using the same criteria. Returns true if the
// string only contains complete, valid UTF-8 codepoints.
static bool Validate(const std::string& string);
private:
// The current state of the validator. Value 0 is the initial/valid state.
// The state is stored as an offset into |kUtf8ValidatorTables|. The special
// value |internal::I18N_UTF8_VALIDATOR_INVALID_INDEX| is the invalid state.
uint8_t state_;
// This type could be made copyable but there is currently no use-case for
// it.
DISALLOW_COPY_AND_ASSIGN(StreamingUtf8Validator);
};
} // namespace base
#endif // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_

View file

@ -0,0 +1,29 @@
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/string_compare.h"
#include "base/logging.h"
#include "base/strings/utf_string_conversions.h"
#include "third_party/icu/source/common/unicode/unistr.h"
namespace base {
namespace i18n {
// Compares the character data of |lhs| and |rhs| using |collator| and returns
// the collator's verdict.
//
// Each piece is wrapped in an icu::UnicodeString built with the
// (isTerminated, text, length) constructor, i.e. a read-only alias of the
// caller's buffer, with isTerminated == false because a StringPiece16 is not
// required to be NUL-terminated.
//
// A collation error is only checked via DCHECK; in release builds |result| is
// returned as produced by ICU.
UCollationResult CompareString16WithCollator(const icu::Collator& collator,
                                             StringPiece16 lhs,
                                             StringPiece16 rhs) {
  UErrorCode error = U_ZERO_ERROR;
  // Use the C++ |false| literal instead of ICU's FALSE macro: ICU 68 and later
  // no longer define FALSE/TRUE by default, while |false| converts to UBool
  // with every ICU version.
  UCollationResult result = collator.compare(
      icu::UnicodeString(false, lhs.data(), static_cast<int>(lhs.length())),
      icu::UnicodeString(false, rhs.data(), static_cast<int>(rhs.length())),
      error);
  DCHECK(U_SUCCESS(error));
  return result;
}
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,28 @@
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_STRING_COMPARE_H_
#define BASE_I18N_STRING_COMPARE_H_
#include <algorithm>
#include <string>
#include <vector>
#include "base/i18n/base_i18n_export.h"
#include "base/strings/string_piece.h"
#include "third_party/icu/source/i18n/unicode/coll.h"
namespace base {
namespace i18n {
// Compares the two strings using the specified collator.
BASE_I18N_EXPORT UCollationResult
CompareString16WithCollator(const icu::Collator& collator,
const StringPiece16 lhs,
const StringPiece16 rhs);
} // namespace i18n
} // namespace base
#endif // BASE_I18N_STRING_COMPARE_H_

View file

@ -0,0 +1,111 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <stdint.h>
#include "base/i18n/string_search.h"
#include "base/logging.h"
#include "third_party/icu/source/i18n/unicode/usearch.h"
namespace base {
namespace i18n {
// Copies |find_this| and opens an ICU string search for it using the default
// ICU locale. |case_sensitive| selects the collation strength used when
// matching.
FixedPatternStringSearch::FixedPatternStringSearch(const string16& find_this,
                                                   bool case_sensitive)
    : find_this_(find_this) {
  UErrorCode status = U_ZERO_ERROR;

  // usearch_open needs some valid text to search even though the real text is
  // installed later with usearch_setText, so the pattern itself doubles as a
  // placeholder text.
  const string16& placeholder_text = find_this_;
  search_ = usearch_open(find_this_.data(), find_this_.size(),
                         placeholder_text.data(), placeholder_text.size(),
                         uloc_getDefault(),
                         nullptr,  // breakiter
                         &status);
  if (!U_SUCCESS(status))
    return;

  // http://icu-project.org/apiref/icu4c40/ucol_8h.html#6a967f36248b0a1bc7654f538ee8ba96
  // UCOL_PRIMARY ignores secondary and tertiary differences; UCOL_TERTIARY
  // includes all comparison differences.
  //  - Diacritical differences on the same base letter are a secondary
  //    difference.
  //  - Uppercase vs. lowercase versions of the same character are a tertiary
  //    difference.
  ucol_setStrength(usearch_getCollator(search_),
                   case_sensitive ? UCOL_TERTIARY : UCOL_PRIMARY);
  usearch_reset(search_);
}
// Closes the ICU search handle opened by the constructor, if there is one.
FixedPatternStringSearch::~FixedPatternStringSearch() {
  if (search_ != nullptr) {
    usearch_close(search_);
  }
}
bool FixedPatternStringSearch::Search(const string16& in_this,
size_t* match_index,
size_t* match_length,
bool forward_search) {
UErrorCode status = U_ZERO_ERROR;
usearch_setText(search_, in_this.data(), in_this.size(), &status);
// Default to basic substring search if usearch fails. According to
// http://icu-project.org/apiref/icu4c/usearch_8h.html, usearch_open will fail
// if either |find_this| or |in_this| are empty. In either case basic
// substring search will give the correct return value.
if (!U_SUCCESS(status)) {
size_t index = in_this.find(find_this_);
if (index == string16::npos)
return false;
if (match_index)
*match_index = index;
if (match_length)
*match_length = find_this_.size();
return true;
}
int32_t index = forward_search ? usearch_first(search_, &status)
: usearch_last(search_, &status);
if (!U_SUCCESS(status) || index == USEARCH_DONE)
return false;
if (match_index)
*match_index = static_cast<size_t>(index);
if (match_length)
*match_length = static_cast<size_t>(usearch_getMatchedLength(search_));
return true;
}
// Builds the underlying FixedPatternStringSearch for |find_this| with
// case-sensitivity switched off.
FixedPatternStringSearchIgnoringCaseAndAccents::
FixedPatternStringSearchIgnoringCaseAndAccents(const string16& find_this)
: base_search_(find_this, /*case_sensitive=*/false) {}
// Forwards to the wrapped FixedPatternStringSearch, always searching forward
// (i.e. for the first match). The out-parameters are passed through untouched.
bool FixedPatternStringSearchIgnoringCaseAndAccents::Search(
    const string16& in_this,
    size_t* match_index,
    size_t* match_length) {
  const bool found = base_search_.Search(in_this, match_index, match_length,
                                         /*forward_search=*/true);
  return found;
}
bool StringSearchIgnoringCaseAndAccents(const string16& find_this,
const string16& in_this,
size_t* match_index,
size_t* match_length) {
return FixedPatternStringSearchIgnoringCaseAndAccents(find_this).Search(
in_this, match_index, match_length);
}
bool StringSearch(const string16& find_this,
const string16& in_this,
size_t* match_index,
size_t* match_length,
bool case_sensitive,
bool forward_search) {
return FixedPatternStringSearch(find_this, case_sensitive)
.Search(in_this, match_index, match_length, forward_search);
}
} // namespace i18n
} // namespace base

View file

@ -0,0 +1,93 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_STRING_SEARCH_H_
#define BASE_I18N_STRING_SEARCH_H_
#include <stddef.h>
#include "base/i18n/base_i18n_export.h"
#include "base/strings/string16.h"
struct UStringSearch;
namespace base {
namespace i18n {
// Returns true if |in_this| contains |find_this|. If |match_index| or
// |match_length| are non-NULL, they are assigned the start position and total
// length of the match.
//
// Only differences between base letters are taken into consideration. Case and
// accent differences are ignored. Please refer to 'primary level' in
// http://userguide.icu-project.org/collation/concepts for additional details.
BASE_I18N_EXPORT
bool StringSearchIgnoringCaseAndAccents(const string16& find_this,
const string16& in_this,
size_t* match_index,
size_t* match_length);
// Returns true if |in_this| contains |find_this|. If |match_index| or
// |match_length| are non-NULL, they are assigned the start position and total
// length of the match.
//
// When |case_sensitive| is false, only differences between base letters are
// taken into consideration. Case and accent differences are ignored.
// Please refer to 'primary level' in
// http://userguide.icu-project.org/collation/concepts for additional details.
// When |forward_search| is true, finds the first instance of |find_this|,
// otherwise finds the last instance.
BASE_I18N_EXPORT
bool StringSearch(const string16& find_this,
const string16& in_this,
size_t* match_index,
size_t* match_length,
bool case_sensitive,
bool forward_search);
// This class is for speeding up multiple StringSearch()
// with the same |find_this| argument. |find_this| is passed as the constructor
// argument, and precomputation for searching is done only at that time.
class BASE_I18N_EXPORT FixedPatternStringSearch {
public:
explicit FixedPatternStringSearch(const string16& find_this,
bool case_sensitive);
~FixedPatternStringSearch();
// Returns true if |in_this| contains |find_this|. If |match_index| or
// |match_length| are non-NULL, they are assigned the start position and total
// length of the match.
bool Search(const string16& in_this,
size_t* match_index,
size_t* match_length,
bool forward_search);
private:
string16 find_this_;
UStringSearch* search_;
};
// This class is for speeding up multiple StringSearchIgnoringCaseAndAccents()
// calls with the same |find_this| argument. |find_this| is passed as the
// constructor argument, and precomputation for searching is done only at that
// time.
class BASE_I18N_EXPORT FixedPatternStringSearchIgnoringCaseAndAccents {
public:
// Prepares a case- and accent-insensitive search for |find_this|.
explicit FixedPatternStringSearchIgnoringCaseAndAccents(
const string16& find_this);
// Returns true if |in_this| contains |find_this|. If |match_index| or
// |match_length| are non-NULL, they are assigned the start position and total
// length of the match.
bool Search(const string16& in_this,
size_t* match_index,
size_t* match_length);
private:
// The general-purpose searcher that performs the actual matching.
FixedPatternStringSearch base_search_;
};
} // namespace i18n
} // namespace base
#endif // BASE_I18N_STRING_SEARCH_H_

View file

@ -0,0 +1,296 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/time_formatting.h"
#include <stddef.h>
#include <memory>
#include "base/i18n/unicodestring.h"
#include "base/logging.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "third_party/icu/source/common/unicode/utypes.h"
#include "third_party/icu/source/i18n/unicode/datefmt.h"
#include "third_party/icu/source/i18n/unicode/dtitvfmt.h"
#include "third_party/icu/source/i18n/unicode/dtptngen.h"
#include "third_party/icu/source/i18n/unicode/fmtable.h"
#include "third_party/icu/source/i18n/unicode/measfmt.h"
#include "third_party/icu/source/i18n/unicode/smpdtfmt.h"
namespace base {
namespace {
// Formats |time| with |formatter| (which must be non-null) and returns the
// result as a string16.
string16 TimeFormat(const icu::DateFormat* formatter,
                    const Time& time) {
  DCHECK(formatter);
  // The value handed to ICU is Time::ToDoubleT() scaled by 1000 (presumably
  // seconds -> milliseconds; confirm against base::Time).
  const UDate icu_date = static_cast<UDate>(time.ToDoubleT() * 1000);
  icu::UnicodeString formatted;
  formatter->format(icu_date, formatted);
  return i18n::UnicodeStringToString16(formatted);
}
// Formats |time| with |formatter| (which must be non-null) and then strips the
// AM/PM field from the output, together with the single character that
// precedes it when the field is not at the very start of the string.
string16 TimeFormatWithoutAmPm(const icu::DateFormat* formatter,
                               const Time& time) {
  DCHECK(formatter);
  const UDate icu_date = static_cast<UDate>(time.ToDoubleT() * 1000);

  // Ask ICU to report where the AM/PM field landed in the formatted output.
  icu::UnicodeString time_string;
  icu::FieldPosition ampm_field(icu::DateFormat::kAmPmField);
  formatter->format(icu_date, time_string, ampm_field);

  const int field_begin = ampm_field.getBeginIndex();
  const int field_end = ampm_field.getEndIndex();
  if (field_end - field_begin != 0) {
    // The reported range doesn't include any spacing before the field, so
    // step back one character unless the field starts the string.
    const int erase_from = (field_begin != 0) ? field_begin - 1 : field_begin;
    time_string.removeBetween(erase_from, field_end);
  }
  return i18n::UnicodeStringToString16(time_string);
}
// Builds a SimpleDateFormat from the skeleton |pattern|. ICU failures are only
// checked via DCHECK.
icu::SimpleDateFormat CreateSimpleDateFormatter(const char* pattern) {
  UErrorCode status = U_ZERO_ERROR;

  // Let ICU turn the skeleton into a locale-dependent format pattern. The
  // generator deals with locale-specific details such as which separator to
  // use (some locales use '.' instead of ':') and where the am/pm marker goes.
  std::unique_ptr<icu::DateTimePatternGenerator> pattern_generator(
      icu::DateTimePatternGenerator::createInstance(status));
  DCHECK(U_SUCCESS(status));

  const icu::UnicodeString skeleton(pattern);
  icu::UnicodeString localized_pattern =
      pattern_generator->getBestPattern(skeleton, status);
  DCHECK(U_SUCCESS(status));

  // The formatter itself is then created from the generated pattern.
  icu::SimpleDateFormat formatter(localized_pattern, status);
  DCHECK(U_SUCCESS(status));
  return formatter;
}
// Maps the ICU-independent DurationFormatWidth onto ICU's
// UMeasureFormatWidth, one enumerator at a time. A value outside the enum hits
// NOTREACHED() and yields UMEASFMT_WIDTH_COUNT.
UMeasureFormatWidth DurationWidthToMeasureWidth(DurationFormatWidth width) {
  switch (width) {
    case DURATION_WIDTH_WIDE:
      return UMEASFMT_WIDTH_WIDE;
    case DURATION_WIDTH_SHORT:
      return UMEASFMT_WIDTH_SHORT;
    case DURATION_WIDTH_NARROW:
      return UMEASFMT_WIDTH_NARROW;
    case DURATION_WIDTH_NUMERIC:
      return UMEASFMT_WIDTH_NUMERIC;
  }
  NOTREACHED();
  return UMEASFMT_WIDTH_COUNT;
}
// Maps a DateFormat value onto the matching ICU skeleton constant. A value
// outside the enum hits NOTREACHED() and yields UDAT_YEAR_MONTH_DAY.
const char* DateFormatToString(DateFormat format) {
  switch (format) {
    case DATE_FORMAT_YEAR_MONTH: return UDAT_YEAR_MONTH;
    case DATE_FORMAT_MONTH_WEEKDAY_DAY: return UDAT_MONTH_WEEKDAY_DAY;
  }
  NOTREACHED();
  return UDAT_YEAR_MONTH_DAY;
}
} // namespace
// Formats |time| with ICU's short time style.
string16 TimeFormatTimeOfDay(const Time& time) {
  // No locale is passed: the ICU default should match Chrome's application
  // locale.
  const icu::DateFormat::EStyle style = icu::DateFormat::kShort;
  std::unique_ptr<icu::DateFormat> time_formatter(
      icu::DateFormat::createTimeInstance(style));
  return TimeFormat(time_formatter.get(), time);
}
// Formats |time| from the skeleton "HmsSSS" and removes any AM/PM field from
// the result.
string16 TimeFormatTimeOfDayWithMilliseconds(const Time& time) {
  icu::SimpleDateFormat millis_formatter = CreateSimpleDateFormatter("HmsSSS");
  return TimeFormatWithoutAmPm(&millis_formatter, time);
}
// Formats the time of day of |time| using the hour clock |type|; |ampm|
// decides whether an AM/PM marker is kept in the output.
string16 TimeFormatTimeOfDayWithHourClockType(const Time& time,
                                              HourClockType type,
                                              AmPmClockType ampm) {
  // When the locale's own clock type is the requested one, and nothing would
  // have to be stripped, the plain locale formatting is already right.
  const bool is_locale_default = (GetHourClockType() == type);
  const bool nothing_to_strip = (type == k24HourClock || ampm == kKeepAmPm);
  if (is_locale_default && nothing_to_strip)
    return TimeFormatTimeOfDay(time);

  // Otherwise build a formatter from an explicit 12-hour or 24-hour skeleton.
  icu::SimpleDateFormat formatter =
      CreateSimpleDateFormatter(type == k12HourClock ? "ahm" : "Hm");
  return (ampm == kKeepAmPm) ? TimeFormat(&formatter, time)
                             : TimeFormatWithoutAmPm(&formatter, time);
}
// Formats the date of |time|. Note that, despite the function's name, the ICU
// style requested here is kMedium.
string16 TimeFormatShortDate(const Time& time) {
  const icu::DateFormat::EStyle style = icu::DateFormat::kMedium;
  std::unique_ptr<icu::DateFormat> date_formatter(
      icu::DateFormat::createDateInstance(style));
  return TimeFormat(date_formatter.get(), time);
}
// Formats the date of |time| with ICU's kShort date style.
string16 TimeFormatShortDateNumeric(const Time& time) {
  const icu::DateFormat::EStyle style = icu::DateFormat::kShort;
  std::unique_ptr<icu::DateFormat> date_formatter(
      icu::DateFormat::createDateInstance(style));
  return TimeFormat(date_formatter.get(), time);
}
// Formats the date and time of |time|. Only the first style argument of
// createDateTimeInstance() is given (kShort); the remaining arguments are left
// at ICU's defaults.
string16 TimeFormatShortDateAndTime(const Time& time) {
  const icu::DateFormat::EStyle style = icu::DateFormat::kShort;
  std::unique_ptr<icu::DateFormat> date_time_formatter(
      icu::DateFormat::createDateTimeInstance(style));
  return TimeFormat(date_time_formatter.get(), time);
}
// Formats the date and time of |time|, passing kShort as the first and kLong
// as the second style argument of createDateTimeInstance().
string16 TimeFormatShortDateAndTimeWithTimeZone(const Time& time) {
  const icu::DateFormat::EStyle first_style = icu::DateFormat::kShort;
  const icu::DateFormat::EStyle second_style = icu::DateFormat::kLong;
  std::unique_ptr<icu::DateFormat> date_time_formatter(
      icu::DateFormat::createDateTimeInstance(first_style, second_style));
  return TimeFormat(date_time_formatter.get(), time);
}
// Formats |time| using the skeleton associated with DATE_FORMAT_YEAR_MONTH.
string16 TimeFormatMonthAndYear(const Time& time) {
  const char* skeleton = DateFormatToString(DATE_FORMAT_YEAR_MONTH);
  icu::SimpleDateFormat month_year_formatter =
      CreateSimpleDateFormatter(skeleton);
  return TimeFormat(&month_year_formatter, time);
}
// Formats the date and time of |time|, passing kFull as the first style
// argument of createDateTimeInstance().
string16 TimeFormatFriendlyDateAndTime(const Time& time) {
  const icu::DateFormat::EStyle style = icu::DateFormat::kFull;
  std::unique_ptr<icu::DateFormat> date_time_formatter(
      icu::DateFormat::createDateTimeInstance(style));
  return TimeFormat(date_time_formatter.get(), time);
}
// Formats the date of |time| with ICU's kFull date style.
string16 TimeFormatFriendlyDate(const Time& time) {
  const icu::DateFormat::EStyle style = icu::DateFormat::kFull;
  std::unique_ptr<icu::DateFormat> date_formatter(
      icu::DateFormat::createDateInstance(style));
  return TimeFormat(date_formatter.get(), time);
}
// Formats |time| with a formatter generated from the caller-supplied skeleton
// |pattern|.
string16 TimeFormatWithPattern(const Time& time, const char* pattern) {
  icu::SimpleDateFormat custom_formatter = CreateSimpleDateFormatter(pattern);
  return TimeFormat(&custom_formatter, time);
}
bool TimeDurationFormat(const TimeDelta time,
const DurationFormatWidth width,
string16* out) {
DCHECK(out);
UErrorCode status = U_ZERO_ERROR;
const int total_minutes = static_cast<int>(time.InSecondsF() / 60 + 0.5);
const int hours = total_minutes / 60;
const int minutes = total_minutes % 60;
UMeasureFormatWidth u_width = DurationWidthToMeasureWidth(width);
// TODO(derat): Delete the |status| checks and LOG(ERROR) calls throughout
// this function once the cause of http://crbug.com/677043 is tracked down.
const icu::Measure measures[] = {
icu::Measure(hours, icu::MeasureUnit::createHour(status), status),
icu::Measure(minutes, icu::MeasureUnit::createMinute(status), status)};
if (U_FAILURE(status)) {
LOG(ERROR) << "Creating MeasureUnit or Measure for " << hours << "h"
<< minutes << "m failed: " << u_errorName(status);
return false;
}
icu::MeasureFormat measure_format(icu::Locale::getDefault(), u_width, status);
if (U_FAILURE(status)) {
LOG(ERROR) << "Creating MeasureFormat for "
<< icu::Locale::getDefault().getName()
<< " failed: " << u_errorName(status);
return false;
}
icu::UnicodeString formatted;
icu::FieldPosition ignore(icu::FieldPosition::DONT_CARE);
measure_format.formatMeasures(measures, 2, formatted, ignore, status);
if (U_FAILURE(status)) {
LOG(ERROR) << "formatMeasures failed: " << u_errorName(status);
return false;
}
*out = i18n::UnicodeStringToString16(formatted);
return true;
}
bool TimeDurationFormatWithSeconds(const TimeDelta time,
const DurationFormatWidth width,
string16* out) {
DCHECK(out);
UErrorCode status = U_ZERO_ERROR;
const int64_t total_seconds = static_cast<int64_t>(time.InSecondsF() + 0.5);
const int64_t hours = total_seconds / 3600;
const int64_t minutes = (total_seconds - hours * 3600) / 60;
const int64_t seconds = total_seconds % 60;
UMeasureFormatWidth u_width = DurationWidthToMeasureWidth(width);
const icu::Measure measures[] = {
icu::Measure(hours, icu::MeasureUnit::createHour(status), status),
icu::Measure(minutes, icu::MeasureUnit::createMinute(status), status),
icu::Measure(seconds, icu::MeasureUnit::createSecond(status), status)};
icu::MeasureFormat measure_format(icu::Locale::getDefault(), u_width, status);
icu::UnicodeString formatted;
icu::FieldPosition ignore(icu::FieldPosition::DONT_CARE);
measure_format.formatMeasures(measures, 3, formatted, ignore, status);
*out = i18n::UnicodeStringToString16(formatted);
return U_SUCCESS(status) == TRUE;
}
// Formats the interval from |begin_time| to |end_time| using the skeleton
// associated with |format|. Returns an empty string if ICU cannot create the
// formatter or fails while formatting.
string16 DateIntervalFormat(const Time& begin_time,
                            const Time& end_time,
                            DateFormat format) {
  UErrorCode status = U_ZERO_ERROR;

  std::unique_ptr<icu::DateIntervalFormat> formatter(
      icu::DateIntervalFormat::createInstance(DateFormatToString(format),
                                              status));
  // createInstance() reports failure through |status|, and the formatter must
  // not be dereferenced in that case.
  if (U_FAILURE(status) || !formatter) {
    LOG(ERROR) << "Creating DateIntervalFormat failed: "
               << u_errorName(status);
    return string16();
  }

  icu::FieldPosition pos(0);
  UDate start_date = static_cast<UDate>(begin_time.ToDoubleT() * 1000);
  UDate end_date = static_cast<UDate>(end_time.ToDoubleT() * 1000);
  icu::DateInterval interval(start_date, end_date);
  icu::UnicodeString formatted;
  formatter->format(&interval, formatted, pos, status);
  if (U_FAILURE(status)) {
    LOG(ERROR) << "Formatting date interval failed: " << u_errorName(status);
    return string16();
  }
  return i18n::UnicodeStringToString16(formatted);
}
// Reports whether ICU's short time format uses a 12-hour or a 24-hour clock,
// judged by whether its pattern contains the am/pm marker "a".
HourClockType GetHourClockType() {
  // TODO(satorux,jshin): Rework this with ures_getByKeyWithFallback()
  // once it becomes public. The short time format can be found at
  // "calendar/gregorian/DateTimePatterns/3" in the resources.
  std::unique_ptr<icu::SimpleDateFormat> short_time_formatter(
      static_cast<icu::SimpleDateFormat*>(
          icu::DateFormat::createTimeInstance(icu::DateFormat::kShort)));

  // Fetch the pattern string behind the short time format.
  icu::UnicodeString short_time_pattern;
  short_time_formatter->toPattern(short_time_pattern);

  // The presence of "a" (am/pm marker) tells the two clock types apart. This
  // is reliable because "a" appears in every 12-hour clock format and in none
  // of the 24-hour clock formats, as the following survey shows.
  //
  // % grep -A4 DateTimePatterns third_party/icu/source/data/locales/*.txt |
  //   grep -B1 -- -- |grep -v -- '--' |
  //   perl -nle 'print $1 if /^\S+\s+"(.*)"/' |sort -u
  //
  // H.mm
  // H:mm
  // HH.mm
  // HH:mm
  // a h:mm
  // ah:mm
  // ahh:mm
  // h-mm a
  // h:mm a
  // hh:mm a
  //
  // See http://userguide.icu-project.org/formatparse/datetime for details
  // about the date/time format syntax.
  const bool has_ampm_marker = short_time_pattern.indexOf('a') != -1;
  return has_ampm_marker ? k12HourClock : k24HourClock;
}
} // namespace base

View file

@ -0,0 +1,138 @@
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Basic time formatting methods. These methods use the current locale
// formatting for displaying the time.
#ifndef BASE_I18N_TIME_FORMATTING_H_
#define BASE_I18N_TIME_FORMATTING_H_
#include "base/compiler_specific.h"
#include "base/i18n/base_i18n_export.h"
#include "base/strings/string16.h"
namespace base {
class Time;
class TimeDelta;
// Argument type used to specify the hour clock type. Accepted by
// TimeFormatTimeOfDayWithHourClockType() and returned by GetHourClockType().
enum HourClockType {
k12HourClock, // Uses 1-12. e.g., "3:07 PM"
k24HourClock, // Uses 0-23. e.g., "15:07"
};
// Argument type used to specify whether or not to include AM/PM sign.
// Accepted by TimeFormatTimeOfDayWithHourClockType().
enum AmPmClockType {
kDropAmPm, // Drops AM/PM sign. e.g., "3:07"
kKeepAmPm, // Keeps AM/PM sign. e.g., "3:07 PM"
};
// Should match UMeasureFormatWidth in measfmt.h; replicated here to avoid
// requiring third_party/icu dependencies with this file.
// Accepted by TimeDurationFormat() and TimeDurationFormatWithSeconds().
enum DurationFormatWidth {
DURATION_WIDTH_WIDE, // "3 hours, 7 minutes"
DURATION_WIDTH_SHORT, // "3 hr, 7 min"
DURATION_WIDTH_NARROW, // "3h 7m"
DURATION_WIDTH_NUMERIC // "3:07"
};
// Date formats from third_party/icu/source/i18n/unicode/udat.h. Add more as
// necessary. Accepted by DateIntervalFormat().
enum DateFormat {
// November 2007
DATE_FORMAT_YEAR_MONTH,
// Tuesday, 7 November
DATE_FORMAT_MONTH_WEEKDAY_DAY,
};
// Returns the time of day, e.g., "3:07 PM".
BASE_I18N_EXPORT string16 TimeFormatTimeOfDay(const Time& time);
// Returns the time of day in 24-hour clock format with millisecond accuracy,
// e.g., "15:07:30.568"
BASE_I18N_EXPORT string16 TimeFormatTimeOfDayWithMilliseconds(const Time& time);
// Returns the time of day in the specified hour clock type. e.g.
// "3:07 PM" (type == k12HourClock, ampm == kKeepAmPm).
// "3:07" (type == k12HourClock, ampm == kDropAmPm).
// "15:07" (type == k24HourClock).
BASE_I18N_EXPORT string16 TimeFormatTimeOfDayWithHourClockType(
const Time& time,
HourClockType type,
AmPmClockType ampm);
// Returns a shortened date, e.g. "Nov 7, 2007"
BASE_I18N_EXPORT string16 TimeFormatShortDate(const Time& time);
// Returns a numeric date such as 12/13/52.
BASE_I18N_EXPORT string16 TimeFormatShortDateNumeric(const Time& time);
// Returns a numeric date and time such as "12/13/52 2:44:30 PM".
BASE_I18N_EXPORT string16 TimeFormatShortDateAndTime(const Time& time);
// Returns a month and year, e.g. "November 2007"
BASE_I18N_EXPORT string16 TimeFormatMonthAndYear(const Time& time);
// Returns a numeric date and time with time zone such as
// "12/13/52 2:44:30 PM PST".
BASE_I18N_EXPORT string16
TimeFormatShortDateAndTimeWithTimeZone(const Time& time);
// Formats a time in a friendly sentence format, e.g.
// "Monday, March 6, 2008 2:44:30 PM".
BASE_I18N_EXPORT string16 TimeFormatFriendlyDateAndTime(const Time& time);
// Formats a time in a friendly sentence format, e.g.
// "Monday, March 6, 2008".
BASE_I18N_EXPORT string16 TimeFormatFriendlyDate(const Time& time);
// Formats a time using a skeleton to produce a format for different locales
// when an unusual time format is needed, e.g. "Feb. 2, 18:00".
//
// See http://userguide.icu-project.org/formatparse/datetime for details.
BASE_I18N_EXPORT string16 TimeFormatWithPattern(const Time& time,
const char* pattern);
// Formats a time duration of hours and minutes into various formats, e.g.,
// "3:07" or "3 hours, 7 minutes", and returns true on success. See
// DurationFormatWidth for details.
//
// Please don't use width = DURATION_WIDTH_NUMERIC when the time duration
// can possibly be larger than 24h, as the hour value will be cut below 24
// after formatting.
// TODO(crbug.com/675791): fix function output when width =
// DURATION_WIDTH_NUMERIC.
BASE_I18N_EXPORT bool TimeDurationFormat(const TimeDelta time,
const DurationFormatWidth width,
string16* out) WARN_UNUSED_RESULT;
// Formats a time duration of hours, minutes and seconds into various formats,
// e.g., "3:07:30" or "3 hours, 7 minutes, 30 seconds", and returns true on
// success. See DurationFormatWidth for details.
//
// Please don't use width = DURATION_WIDTH_NUMERIC when the time duration
// can possibly be larger than 24h, as the hour value will be cut below 24
// after formatting.
// TODO(crbug.com/675791): fix function output when width =
// DURATION_WIDTH_NUMERIC.
BASE_I18N_EXPORT bool TimeDurationFormatWithSeconds(
const TimeDelta time,
const DurationFormatWidth width,
string16* out) WARN_UNUSED_RESULT;
// Formats a date interval into various formats, e.g. "2 December - 4 December"
// or "March 2016 - December 2016". See DateFormat for details.
BASE_I18N_EXPORT string16 DateIntervalFormat(const Time& begin_time,
const Time& end_time,
DateFormat format);
// Gets the hour clock type of the current locale. e.g.
// k12HourClock (en-US).
// k24HourClock (en-GB).
BASE_I18N_EXPORT HourClockType GetHourClockType();
} // namespace base
#endif // BASE_I18N_TIME_FORMATTING_H_

View file

@ -0,0 +1,34 @@
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/timezone.h"
#include <memory>
#include <string>
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/i18n/unicode/timezone.h"
namespace base {
// Returns a two-letter region code for ICU's default time zone, "GB" for
// Etc/GMT, or an empty string when no two-letter region is available.
std::string CountryCodeForCurrentTimezone() {
  std::unique_ptr<icu::TimeZone> zone(icu::TimeZone::createDefault());
  icu::UnicodeString id;
  zone->getID(id);

  // ICU returns '001' (world) for Etc/GMT. Preserve the old behavior
  // only for Etc/GMT while returning an empty string for Etc/UTC and
  // Etc/UCT because they're less likely to be chosen by mistake in UK in
  // place of Europe/London (British Time).
  if (id == UNICODE_STRING_SIMPLE("Etc/GMT"))
    return "GB";

  constexpr int kRegionCapacity = 4;
  char region_code[kRegionCapacity];
  UErrorCode status = U_ZERO_ERROR;
  const int length =
      zone->getRegion(id, region_code, kRegionCapacity, status);

  // Anything other than a successful two-character result, such as the
  // 3-digit numeric code 001 (World) for Etc/UTC and Etc/UCT, maps to an
  // empty string.
  if (!U_SUCCESS(status) || length != 2)
    return std::string();
  return std::string(region_code, static_cast<size_t>(length));
}
} // namespace base

View file

@ -0,0 +1,24 @@
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_TIMEZONE_H_
#define BASE_I18N_TIMEZONE_H_
#include <string>
#include "base/i18n/base_i18n_export.h"
namespace base {
// Checks the system timezone and turns it into a two-character ISO 3166 country
// code. This may fail (for example, it used to always fail on Android), in
// which case it will return an empty string. It'll also return an empty string
// when the timezone is Etc/UTC or Etc/UCT, but will return "GB" for Etc/GMT
// because people in the UK tend to select Etc/GMT by mistake instead of
// Europe/London (British Time).
BASE_I18N_EXPORT std::string CountryCodeForCurrentTimezone();
} // namespace base
#endif // BASE_I18N_TIMEZONE_H_

View file

@ -0,0 +1,32 @@
// Copyright (c) 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_UNICODESTRING_H_
#define BASE_I18N_UNICODESTRING_H_
#include "base/strings/string16.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/uvernum.h"
#if U_ICU_VERSION_MAJOR_NUM >= 59
#include "third_party/icu/source/common/unicode/char16ptr.h"
#endif
namespace base {
namespace i18n {
// Copies the contents of |unistr| into a new string16. The buffer is read
// together with an explicit length, so it does not have to be NUL-terminated.
inline string16 UnicodeStringToString16(const icu::UnicodeString& unistr) {
  const size_t length = static_cast<size_t>(unistr.length());
#if U_ICU_VERSION_MAJOR_NUM >= 59
  // From ICU 59 on the buffer pointer is passed through icu::toUCharPtr()
  // before it is handed to string16.
  return base::string16(icu::toUCharPtr(unistr.getBuffer()), length);
#else
  return base::string16(unistr.getBuffer(), length);
#endif
}
} // namespace i18n
} // namespace base
#endif // BASE_I18N_UNICODESTRING_H_

View file

@ -0,0 +1,56 @@
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is auto-generated by build_utf8_validator_tables.
// DO NOT EDIT.
#include "base/i18n/utf8_validator_tables.h"
#include "base/stl_util.h"
namespace base {
namespace internal {
// Transition tables for the streaming UTF-8 validator, one sub-table per
// state. Per utf8_validator_tables.h each sub-table has the form
// (right_shift, next_state, next_state, ...), where every next_state is the
// offset of another sub-table within this array. The trailing "// 0x.." on
// each row is the array offset just past that row. Offset 0x81 ("State 1")
// equals I18N_UTF8_VALIDATOR_INVALID_INDEX; its only transition leads back to
// 0x81.
const uint8_t kUtf8ValidatorTables[] = {
// State 0, offset 0x00
0x00, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x08
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x10
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x18
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x20
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x28
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x30
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x38
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x40
0x81, 0x81, 0x81, 0x83, 0x83, 0x83, 0x83, 0x83, // 0x48
0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, // 0x50
0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, // 0x58
0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, 0x83, // 0x60
0x83, 0x86, 0x8b, 0x8b, 0x8b, 0x8b, 0x8b, 0x8b, // 0x68
0x8b, 0x8b, 0x8b, 0x8b, 0x8b, 0x8b, 0x8e, 0x8b, // 0x70
0x8b, 0x93, 0x9c, 0x9c, 0x9c, 0x9f, 0x81, 0x81, // 0x78
0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0x80
0x81, // 0x81
// State 1, offset 0x81
0x07, 0x81, // 0x83
// State 2, offset 0x83
0x06, 0x00, 0x81, // 0x86
// State 3, offset 0x86
0x05, 0x81, 0x83, 0x81, 0x81, // 0x8b
// State 4, offset 0x8b
0x06, 0x83, 0x81, // 0x8e
// State 5, offset 0x8e
0x05, 0x83, 0x81, 0x81, 0x81, // 0x93
// State 6, offset 0x93
0x04, 0x81, 0x8b, 0x8b, 0x8b, 0x81, 0x81, 0x81, // 0x9b
0x81, // 0x9c
// State 7, offset 0x9c
0x06, 0x8b, 0x81, // 0x9f
// State 8, offset 0x9f
0x04, 0x8b, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0xa7
0x81, // 0xa8
};
// Size of kUtf8ValidatorTables as computed by base::size() (presumably the
// element count, i.e. 0xa8 entries - confirm against base/stl_util.h).
const size_t kUtf8ValidatorTablesSize = base::size(kUtf8ValidatorTables);
} // namespace internal
} // namespace base

View file

@ -0,0 +1,32 @@
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_I18N_UTF8_VALIDATOR_TABLES_H_
#define BASE_I18N_UTF8_VALIDATOR_TABLES_H_
#include <stddef.h>
#include <stdint.h>
#include "base/macros.h"
namespace base {
namespace internal {
// The tables for all states; a list of entries of the form (right_shift,
// next_state, next_state, ....). The right_shifts are used to reduce the
// overall size of the table. The table only covers bytes in the range
// [0x80, 0xFF] to save space.
extern const uint8_t kUtf8ValidatorTables[];
extern const size_t kUtf8ValidatorTablesSize;
// The offset of the INVALID state in kUtf8ValidatorTables. 129 == 0x81, the
// offset labelled "State 1" in the generated table.
enum {
I18N_UTF8_VALIDATOR_INVALID_INDEX = 129
};
} // namespace internal
} // namespace base
#endif // BASE_I18N_UTF8_VALIDATOR_TABLES_H_