OLD | NEW |
---|---|
1 // Copyright 2017 The Chromium Authors. All rights reserved. | 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/url_formatter/idn_spoof_checker.h" | 5 #include "components/url_formatter/idn_spoof_checker.h" |
6 | 6 |
7 #include "base/numerics/safe_conversions.h" | 7 #include "base/numerics/safe_conversions.h" |
8 #include "base/strings/string_split.h" | 8 #include "base/strings/string_split.h" |
9 #include "base/strings/string_util.h" | 9 #include "base/strings/string_util.h" |
10 #include "base/threading/thread_local_storage.h" | 10 #include "base/threading/thread_local_storage.h" |
(...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
225 // detection. ICU 58 does not detect MSC any more for a single input string. | 225 // detection. ICU 58 does not detect MSC any more for a single input string. |
226 // See http://bugs.icu-project.org/trac/ticket/12823 . | 226 // See http://bugs.icu-project.org/trac/ticket/12823 . |
227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. | 227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed. |
228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana | 228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana |
229 // Prolonged Sound) used out-of-context. | 229 // Prolonged Sound) used out-of-context. |
230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark) | 230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark) |
231 // unless they're preceded by a Katakana. | 231 // unless they're preceded by a Katakana. |
232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters | 232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters |
233 // (U+30D[8-A]) that look exactly like each other when they're used in a | 233 // (U+30D[8-A]) that look exactly like each other when they're used in a |
234 // label otherwise entirely in Katakana or Hiragana. | 234 // label otherwise entirely in Katakana or Hiragana. |
235 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small | 235 // - Disallow mixing of Latin and Armenian.
Peter Kasting
2017/08/29 04:08:30
Nit: Period at end (2 places)
| |
236 // Letter Co) to be next to Latin. | |
237 // - Disallow Latin 'o' and 'g' next to Armenian. | |
238 // - Disallow mixing of Latin and Canadian Syllabary. | 236 // - Disallow mixing of Latin and Canadian Syllabary. |
239 // - Disallow mixing of Latin and Tifinagh. | 237 // - Disallow mixing of Latin and Tifinagh. |
238 // - Disallow mixing of Latin and Miao. | |
240 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC | 239 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC |
241 // character. Other combining diacritical marks are not in the allowed | 240 // character. Other combining diacritical marks are not in the allowed |
242 // character set. | 241 // character set. |
243 dangerous_pattern = new icu::RegexMatcher( | 242 dangerous_pattern = new icu::RegexMatcher( |
244 icu::UnicodeString( | 243 icu::UnicodeString( |
245 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])" | 244 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])" |
246 R"([\u30ce\u30f3\u30bd\u30be])" | 245 R"([\u30ce\u30f3\u30bd\u30be])" |
247 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)" | 246 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)" |
248 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)" | 247 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)" |
249 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)" | 248 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)" |
250 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)" | 249 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)" |
251 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)" | 250 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)" |
252 R"([a-z]\u30fb|\u30fb[a-z]|)" | 251 R"([a-z]\u30fb|\u30fb[a-z]|)" |
253 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)" | 252 R"([\p{sc=armn}].*[a-z]|[a-z].*[\p{sc=armn}]|)" |
254 R"([a-z][\u0585\u0581]+[a-z]|)" | |
255 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)" | |
256 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)" | |
257 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)" | 253 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)" |
258 R"([\p{sc=tfng}].*[a-z]|[a-z].*[\p{sc=tfng}]|)" | 254 R"([\p{sc=tfng}].*[a-z]|[a-z].*[\p{sc=tfng}]|)" |
259 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])", | 255 R"([\p{sc=miao}].*[a-z]|[a-z].*[\p{sc=miao}]|)" |
260 -1, US_INV), | 256 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])"), |
Peter Kasting
2017/08/29 04:08:29
Were these just default values?
| |
261 0, status); | 257 0, status); |
262 tls_index.Set(dangerous_pattern); | 258 tls_index.Set(dangerous_pattern); |
263 } | 259 } |
264 dangerous_pattern->reset(label_string); | 260 dangerous_pattern->reset(label_string); |
265 return !dangerous_pattern->find(); | 261 return !dangerous_pattern->find(); |
266 } | 262 } |
267 | 263 |
268 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) { | 264 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) { |
269 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0); | 265 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0); |
270 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length); | 266 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length); |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
323 allowed_set.addAll(*recommended_set); | 319 allowed_set.addAll(*recommended_set); |
324 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status); | 320 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status); |
325 allowed_set.addAll(*inclusion_set); | 321 allowed_set.addAll(*inclusion_set); |
326 | 322 |
327 // Five aspirational scripts are taken from UTR 31 Table 6 at | 323 // Five aspirational scripts are taken from UTR 31 Table 6 at |
328 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts . | 324 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts . |
329 // Not all the characters of aspirational scripts are suitable for | 325 // Not all the characters of aspirational scripts are suitable for |
330 // identifiers. Therefore, only characters belonging to | 326 // identifiers. Therefore, only characters belonging to |
331 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational' | 327 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational' |
332 // section at | 328 // section at |
333 // http://www.unicode.org/Public/security/latest/xidmodifications.txt) are | 329 // http://www.unicode.org/Public/security/9.0.0/IdentifierType.txt) are |
334 // are added to the allowed set. The list has to be updated when a new | 330 // added to the allowed set. The list has to be updated when a new |
335 // version of Unicode is released. The current version is 9.0.0 and ICU 60 | 331 // version of Unicode is released. The current version is 9.0.0 and ICU 60 |
336 // will have Unicode 10.0 data. | 332 // will have Unicode 10.0 data. |
333 // Note that Mongolian is dropped because it's written vertically. | |
337 #if U_ICU_VERSION_MAJOR_NUM < 60 | 334 #if U_ICU_VERSION_MAJOR_NUM < 60 |
338 const icu::UnicodeSet aspirational_scripts( | 335 const icu::UnicodeSet aspirational_scripts( |
339 icu::UnicodeString( | 336 icu::UnicodeString( |
340 // Unified Canadian Syllabics | 337 // Unified Canadian Syllabics |
341 "[\\u1401-\\u166C\\u166F-\\u167F" | 338 "[\\u1401-\\u166C\\u166F-\\u167F" |
342 // Mongolian | |
343 "\\u1810-\\u1819\\u1820-\\u1877\\u1880-\\u18AA" | |
344 // Unified Canadian Syllabics | 339 // Unified Canadian Syllabics |
345 "\\u18B0-\\u18F5" | 340 "\\u18B0-\\u18F5" |
346 // Tifinagh | 341 // Tifinagh |
347 "\\u2D30-\\u2D67\\u2D7F" | 342 "\\u2D30-\\u2D67\\u2D7F" |
348 // Yi | 343 // Yi |
349 "\\uA000-\\uA48C" | 344 "\\uA000-\\uA48C" |
350 // Miao | 345 // Miao |
351 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E" | 346 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E" |
352 "\\U00016F8F-\\U00016F9F]", | 347 "\\U00016F8F-\\U00016F9F]", |
353 -1, US_INV), | 348 -1, US_INV), |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
385 allowed_set.remove(0x0F8Cu); | 380 allowed_set.remove(0x0F8Cu); |
386 allowed_set.remove(0x0F8Du); | 381 allowed_set.remove(0x0F8Du); |
387 allowed_set.remove(0x0F8Eu); | 382 allowed_set.remove(0x0F8Eu); |
388 allowed_set.remove(0x0F8Fu); | 383 allowed_set.remove(0x0F8Fu); |
389 #endif | 384 #endif |
390 | 385 |
391 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status); | 386 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status); |
392 } | 387 } |
393 | 388 |
394 } // namespace url_formatter | 389 } // namespace url_formatter |
OLD | NEW |