components/url_formatter/idn_spoof_checker.cc - Issue 2895103003: Drop Mongolian from the IDN script list and tighten up the policy on Armenian-Latin mixing

Side by Side Diff: components/url_formatter/idn_spoof_checker.cc

Issue 2895103003: Drop Mongolian from the IDN script list and tighten up the policy on Armenian-Latin mixing (Closed)

Patch Set: block Armenian + Latin mix Created 3 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2017 The Chromium Authors. All rights reserved.	1 // Copyright 2017 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/idn_spoof_checker.h"	5 #include "components/url_formatter/idn_spoof_checker.h"

6	6

7 #include "base/numerics/safe_conversions.h"	7 #include "base/numerics/safe_conversions.h"

8 #include "base/strings/string_split.h"	8 #include "base/strings/string_split.h"

9 #include "base/strings/string_util.h"	9 #include "base/strings/string_util.h"

10 #include "base/threading/thread_local_storage.h"	10 #include "base/threading/thread_local_storage.h"

(...skipping 214 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
225 // detection. ICU 58 does not detect MSC any more for a single input string.	225 // detection. ICU 58 does not detect MSC any more for a single input string.

226 // See http://bugs.icu-project.org/trac/ticket/12823 .	226 // See http://bugs.icu-project.org/trac/ticket/12823 .

227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.	227 // TODO(jshin): adjust the pattern once the above ICU bug is fixed.

228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana	228 // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana

229 // Prolonged Sound) used out-of-context.	229 // Prolonged Sound) used out-of-context.

230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)	230 // - Disallow U+30FD/E (Katakana iteration mark/voiced iteration mark)

231 // unless they're preceded by a Katakana.	231 // unless they're preceded by a Katakana.

232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters	232 // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters

233 // (U+30D[8-A]) that look exactly like each other when they're used in a	233 // (U+30D[8-A]) that look exactly like each other when they're used in a

234 // label otherwise entirely in Katakana or Hiragana.	234 // label otherwise entirely in Katakana or Hiragana.

235 // - Disallow U+0585 (Armenian Small Letter Oh) and U+0581 (Armenian Small	235 // - Disallow mixing of Latin and Armenian
	Peter Kasting 2017/08/29 04:08:30 Nit: Period at end (2 places)
236 // Letter Co) to be next to Latin.

237 // - Disallow Latin 'o' and 'g' next to Armenian.

238 // - Disallow mixing of Latin and Canadian Syllabary.	236 // - Disallow mixing of Latin and Canadian Syllabary.

239 // - Disallow mixing of Latin and Tifinagh.	237 // - Disallow mixing of Latin and Tifinagh.

	238 // - Disallow mixing of Latin and Miao

240 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC	239 // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC

241 // character. Other combining diacritical marks are not in the allowed	240 // character. Other combining diacritical marks are not in the allowed

242 // character set.	241 // character set.

243 dangerous_pattern = new icu::RegexMatcher(	242 dangerous_pattern = new icu::RegexMatcher(

244 icu::UnicodeString(	243 icu::UnicodeString(

245 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"	244 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"

246 R"([\u30ce\u30f3\u30bd\u30be])"	245 R"([\u30ce\u30f3\u30bd\u30be])"

247 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"	246 R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"

248 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"	247 R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"

249 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)"	248 R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)"

250 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)"	249 R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)"

251 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)"	250 R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)"

252 R"([a-z]\u30fb|\u30fb[a-z]|)"	251 R"([a-z]\u30fb|\u30fb[a-z]|)"

253 R"(^[\u0585\u0581]+[a-z]|[a-z][\u0585\u0581]+$|)"	252 R"([\p{sc=armn}].*[a-z]|[a-z].*[\p{sc=armn}]|)"

254 R"([a-z][\u0585\u0581]+[a-z]|)"

255 R"(^[og]+[\p{scx=armn}]|[\p{scx=armn}][og]+$|)"

256 R"([\p{scx=armn}][og]+[\p{scx=armn}]|)"

257 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)"	253 R"([\p{sc=cans}].*[a-z]|[a-z].*[\p{sc=cans}]|)"

258 R"([\p{sc=tfng}].*[a-z]|[a-z].*[\p{sc=tfng}]|)"	254 R"([\p{sc=tfng}].*[a-z]|[a-z].*[\p{sc=tfng}]|)"

259 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])",	255 R"([\p{sc=miao}].*[a-z]|[a-z].*[\p{sc=miao}]|)"

260 -1, US_INV),	256 R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339])"),
Peter Kasting 2017/08/29 04:08:29 Were these just default values?
261 0, status);	257 0, status);

262 tls_index.Set(dangerous_pattern);	258 tls_index.Set(dangerous_pattern);

263 }	259 }

264 dangerous_pattern->reset(label_string);	260 dangerous_pattern->reset(label_string);

265 return !dangerous_pattern->find();	261 return !dangerous_pattern->find();

266 }	262 }

267	263

268 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {	264 bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {

269 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);	265 size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);

270 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);	266 icu::UnicodeString ustr_host(FALSE, hostname.data(), hostname_length);

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
323 allowed_set.addAll(*recommended_set);	319 allowed_set.addAll(*recommended_set);

324 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status);	320 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status);

325 allowed_set.addAll(*inclusion_set);	321 allowed_set.addAll(*inclusion_set);

326	322

327 // Five aspirational scripts are taken from UTR 31 Table 6 at	323 // Five aspirational scripts are taken from UTR 31 Table 6 at

328 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts .	324 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts .

329 // Not all the characters of aspirational scripts are suitable for	325 // Not all the characters of aspirational scripts are suitable for

330 // identifiers. Therefore, only characters belonging to	326 // identifiers. Therefore, only characters belonging to

331 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational'	327 // [:Identifier_Type=Aspirational:] (listed in 'Status/Type=Aspirational'

332 // section at	328 // section at

333 // http://www.unicode.org/Public/security/latest/xidmodifications.txt) are	329 // http://www.unicode.org/Public/security/9.0.0/IdentifierType.txt) are

334 // are added to the allowed set. The list has to be updated when a new	330 // added to the allowed set. The list has to be updated when a new

335 // version of Unicode is released. The current version is 9.0.0 and ICU 60	331 // version of Unicode is released. The current version is 9.0.0 and ICU 60

336 // will have Unicode 10.0 data.	332 // will have Unicode 10.0 data.

	333 // Note that Mongolian is dropped because it's written vertically.

337 #if U_ICU_VERSION_MAJOR_NUM < 60	334 #if U_ICU_VERSION_MAJOR_NUM < 60

338 const icu::UnicodeSet aspirational_scripts(	335 const icu::UnicodeSet aspirational_scripts(

339 icu::UnicodeString(	336 icu::UnicodeString(

340 // Unified Canadian Syllabics	337 // Unified Canadian Syllabics

341 "[\\u1401-\\u166C\\u166F-\\u167F"	338 "[\\u1401-\\u166C\\u166F-\\u167F"

342 // Mongolian

343 "\\u1810-\\u1819\\u1820-\\u1877\\u1880-\\u18AA"

344 // Unified Canadian Syllabics	339 // Unified Canadian Syllabics

345 "\\u18B0-\\u18F5"	340 "\\u18B0-\\u18F5"

346 // Tifinagh	341 // Tifinagh

347 "\\u2D30-\\u2D67\\u2D7F"	342 "\\u2D30-\\u2D67\\u2D7F"

348 // Yi	343 // Yi

349 "\\uA000-\\uA48C"	344 "\\uA000-\\uA48C"

350 // Miao	345 // Miao

351 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E"	346 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7E"

352 "\\U00016F8F-\\U00016F9F]",	347 "\\U00016F8F-\\U00016F9F]",

353 -1, US_INV),	348 -1, US_INV),

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
385 allowed_set.remove(0x0F8Cu);	380 allowed_set.remove(0x0F8Cu);

386 allowed_set.remove(0x0F8Du);	381 allowed_set.remove(0x0F8Du);

387 allowed_set.remove(0x0F8Eu);	382 allowed_set.remove(0x0F8Eu);

388 allowed_set.remove(0x0F8Fu);	383 allowed_set.remove(0x0F8Fu);

389 #endif	384 #endif

390	385

391 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);	386 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);

392 }	387 }

393	388

394 } // namespace url_formatter	389 } // namespace url_formatter

OLD	NEW

« no previous file with comments | « no previous file | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »