OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
100 return 1; | 100 return 1; |
101 } else if (ch <= kMaxTwoByteChar) { | 101 } else if (ch <= kMaxTwoByteChar) { |
102 return 2; | 102 return 2; |
103 } else if (ch <= kMaxThreeByteChar) { | 103 } else if (ch <= kMaxThreeByteChar) { |
104 return 3; | 104 return 3; |
105 } | 105 } |
106 ASSERT(ch <= kMaxFourByteChar); | 106 ASSERT(ch <= kMaxFourByteChar); |
107 return 4; | 107 return 4; |
108 } | 108 } |
109 | 109 |
// Mask with the top bit (0x80) set in every byte of a machine word. And-ing a
// word of 1-byte string data with it yields zero exactly when every byte in
// the word is plain ASCII, i.e. below 0x80.
//
// Dividing the all-ones word by 0xFF gives 0x01 in every byte; multiplying by
// 0x80 moves that bit to the top of each byte. This produces 0x80808080 for a
// 32-bit word and 0x8080808080808080 for a 64-bit word.
static const uintptr_t kAsciiWordMask =
    (~static_cast<uintptr_t>(0) / 0xFF) * 0x80;
| 117 |
110 intptr_t Utf8::Length(const String& str) { | 118 intptr_t Utf8::Length(const String& str) { |
| 119 if (str.IsOneByteString() || str.IsExternalOneByteString()) { |
| 120 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8 |
| 121 // encodings and all >= 0x80 have two-byte encodings. To get the length, |
| 122 // start with the number of code points and add the number of high bits in |
| 123 // the bytes. |
| 124 uintptr_t char_length = str.Length(); |
| 125 uintptr_t length = char_length; |
| 126 const uintptr_t* data; |
| 127 NoSafepointScope no_safepoint; |
| 128 if (str.IsOneByteString()) { |
| 129 data = reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(str)); |
| 130 } else { |
| 131 data = reinterpret_cast<const uintptr_t*>( |
| 132 ExternalOneByteString::DataStart(str)); |
| 133 } |
| 134 uintptr_t i; |
| 135 for (i = 0; i + sizeof(uintptr_t) < char_length; i += sizeof(uintptr_t)) { |
| 136 uintptr_t chunk = *data++; |
| 137 chunk &= kAsciiWordMask; |
| 138 if (chunk != 0) { |
| 139 // Shuffle the bits until we have a count of bits in the low nibble. |
| 140 #if defined(ARCH_IS_64_BIT) |
| 141 chunk += chunk >> 32; |
| 142 #endif |
| 143 chunk += chunk >> 16; |
| 144 chunk += chunk >> 8; |
| 145 length += (chunk >> 7) & 0xf; |
| 146 } |
| 147 } |
| 148 // Take care of the tail of the string, the last length % wordsize chars. |
| 149 for (; i < char_length; i++) { |
| 150 if (str.CharAt(i) > kMaxOneByteChar) length++; |
| 151 } |
| 152 return length; |
| 153 } |
| 154 |
| 155 // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8 |
| 156 // encodings. |
111 intptr_t length = 0; | 157 intptr_t length = 0; |
112 String::CodePointIterator it(str); | 158 String::CodePointIterator it(str); |
113 while (it.Next()) { | 159 while (it.Next()) { |
114 int32_t ch = it.Current(); | 160 int32_t ch = it.Current(); |
115 length += Utf8::Length(ch); | 161 length += Utf8::Length(ch); |
116 } | 162 } |
117 return length; | 163 return length; |
118 } | 164 } |
119 | 165 |
120 intptr_t Utf8::Encode(int32_t ch, char* dst) { | 166 intptr_t Utf8::Encode(int32_t ch, char* dst) { |
(...skipping 15 matching lines...) Expand all Loading... |
136 } | 182 } |
137 ASSERT(ch <= kMaxFourByteChar); | 183 ASSERT(ch <= kMaxFourByteChar); |
138 dst[0] = 0xF0 | (ch >> 18); | 184 dst[0] = 0xF0 | (ch >> 18); |
139 dst[1] = 0x80 | ((ch >> 12) & kMask); | 185 dst[1] = 0x80 | ((ch >> 12) & kMask); |
140 dst[2] = 0x80 | ((ch >> 6) & kMask); | 186 dst[2] = 0x80 | ((ch >> 6) & kMask); |
141 dst[3] = 0x80 | (ch & kMask); | 187 dst[3] = 0x80 | (ch & kMask); |
142 return 4; | 188 return 4; |
143 } | 189 } |
144 | 190 |
145 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { | 191 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { |
| 192 uintptr_t array_len = len; |
146 intptr_t pos = 0; | 193 intptr_t pos = 0; |
147 String::CodePointIterator it(src); | 194 ASSERT(static_cast<intptr_t>(array_len) >= Length(src)); |
148 while (it.Next()) { | 195 if (src.IsOneByteString() || src.IsExternalOneByteString()) { |
149 int32_t ch = it.Current(); | 196 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8 |
150 intptr_t num_bytes = Utf8::Length(ch); | 197 // encodings and all >= 0x80 have two-byte encodings. |
151 if (pos + num_bytes > len) { | 198 const uintptr_t* data; |
152 break; | 199 NoSafepointScope scope; |
| 200 if (src.IsOneByteString()) { |
| 201 data = reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(src)); |
| 202 } else { |
| 203 data = reinterpret_cast<const uintptr_t*>( |
| 204 ExternalOneByteString::DataStart(src)); |
153 } | 205 } |
154 Utf8::Encode(ch, &dst[pos]); | 206 uintptr_t char_length = src.Length(); |
155 pos += num_bytes; | 207 uintptr_t pos = 0; |
| 208 ASSERT(kMaxOneByteChar + 1 == 0x80); |
| 209 for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) { |
| 210 // Read the input one word at a time and just write it verbatim if it is |
| 211 // plain ASCII, as determined by the mask. |
| 212 if (i + sizeof(uintptr_t) <= char_length && |
| 213 (*data & kAsciiWordMask) == 0 && |
| 214 pos + sizeof(uintptr_t) <= array_len) { |
| 215 StoreUnaligned(reinterpret_cast<uintptr_t*>(dst + pos), *data); |
| 216 pos += sizeof(uintptr_t); |
| 217 } else { |
| 218 // Process up to one word of input that contains non-ASCII Latin1 |
| 219 // characters. |
| 220 const uint8_t* p = reinterpret_cast<const uint8_t*>(data); |
| 221 const uint8_t* limit = |
| 222 Utils::Minimum(p + sizeof(uintptr_t), p + (char_length - i)); |
| 223 for (; p < limit; p++) { |
| 224 uint8_t c = *p; |
| 225 // These calls to Length and Encode get inlined and the cases for 3 |
| 226 // and 4 byte sequences are removed. |
| 227 intptr_t bytes = Length(c); |
| 228 if (pos + bytes > array_len) { |
| 229 return pos; |
| 230 } |
| 231 Encode(c, reinterpret_cast<char*>(dst) + pos); |
| 232 pos += bytes; |
| 233 } |
| 234 } |
| 235 data++; |
| 236 } |
| 237 } else { |
| 238 // For two-byte strings, which can contain 3 and 4-byte UTF-8 encodings, |
| 239 // which can result in surrogate pairs, use the more general code. |
| 240 String::CodePointIterator it(src); |
| 241 while (it.Next()) { |
| 242 int32_t ch = it.Current(); |
| 243 intptr_t num_bytes = Utf8::Length(ch); |
| 244 if (pos + num_bytes > len) { |
| 245 break; |
| 246 } |
| 247 Utf8::Encode(ch, &dst[pos]); |
| 248 pos += num_bytes; |
| 249 } |
156 } | 250 } |
157 return pos; | 251 return pos; |
158 } | 252 } |
159 | 253 |
160 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 254 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
161 intptr_t array_len, | 255 intptr_t array_len, |
162 int32_t* dst) { | 256 int32_t* dst) { |
163 uint32_t ch = utf8_array[0] & 0xFF; | 257 uint32_t ch = utf8_array[0] & 0xFF; |
164 intptr_t i = 1; | 258 intptr_t i = 1; |
165 if (ch >= 0x80) { | 259 if (ch >= 0x80) { |
(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
265 } | 359 } |
266 | 360 |
267 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 361 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
268 ASSERT(codepoint > Utf16::kMaxCodeUnit); | 362 ASSERT(codepoint > Utf16::kMaxCodeUnit); |
269 ASSERT(dst != NULL); | 363 ASSERT(dst != NULL); |
270 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 364 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
271 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 365 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
272 } | 366 } |
273 | 367 |
274 } // namespace dart | 368 } // namespace dart |
OLD | NEW |