Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(138)

Side by Side Diff: runtime/vm/unicode.cc

Issue 2952193002: VM: Speed up output of UTF8 for 1-byte strings.
Patch Set: Add test that would have caught bug pointed out by Slava Created 3 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « runtime/vm/symbols.cc ('k') | runtime/vm/unicode_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after
100 return 1; 100 return 1;
101 } else if (ch <= kMaxTwoByteChar) { 101 } else if (ch <= kMaxTwoByteChar) {
102 return 2; 102 return 2;
103 } else if (ch <= kMaxThreeByteChar) { 103 } else if (ch <= kMaxThreeByteChar) {
104 return 3; 104 return 3;
105 } 105 }
106 ASSERT(ch <= kMaxFourByteChar); 106 ASSERT(ch <= kMaxFourByteChar);
107 return 4; 107 return 4;
108 } 108 }
109 109
110 // A constant mask that can be 'and'ed with a word of data to determine if it
111 // is all ASCII (with no Latin1 characters).
112 #if defined(ARCH_IS_64_BIT)
113 static const uintptr_t kAsciiWordMask = DART_UINT64_C(0x8080808080808080);
114 #else
115 static const uintptr_t kAsciiWordMask = 0x80808080u;
116 #endif
117
110 intptr_t Utf8::Length(const String& str) { 118 intptr_t Utf8::Length(const String& str) {
119 if (str.IsOneByteString() || str.IsExternalOneByteString()) {
120 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8
121 // encodings and all >= 0x80 have two-byte encodings. To get the length,
122 // start with the number of code points and add the number of high bits in
123 // the bytes.
124 uintptr_t char_length = str.Length();
125 uintptr_t length = char_length;
126 const uintptr_t* data;
127 NoSafepointScope no_safepoint;
128 if (str.IsOneByteString()) {
129 data = reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(str));
130 } else {
131 data = reinterpret_cast<const uintptr_t*>(
132 ExternalOneByteString::DataStart(str));
133 }
134 uintptr_t i;
135 for (i = 0; i + sizeof(uintptr_t) < char_length; i += sizeof(uintptr_t)) {
136 uintptr_t chunk = *data++;
137 chunk &= kAsciiWordMask;
138 if (chunk != 0) {
139 // Shuffle the bits until we have a count of bits in the low nibble.
140 #if defined(ARCH_IS_64_BIT)
141 chunk += chunk >> 32;
142 #endif
143 chunk += chunk >> 16;
144 chunk += chunk >> 8;
145 length += (chunk >> 7) & 0xf;
146 }
147 }
148 // Take care of the tail of the string, the chars (at most one word) left over by the loop above.
149 for (; i < char_length; i++) {
150 if (str.CharAt(i) > kMaxOneByteChar) length++;
151 }
152 return length;
153 }
154
155 // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8
156 // encodings.
111 intptr_t length = 0; 157 intptr_t length = 0;
112 String::CodePointIterator it(str); 158 String::CodePointIterator it(str);
113 while (it.Next()) { 159 while (it.Next()) {
114 int32_t ch = it.Current(); 160 int32_t ch = it.Current();
115 length += Utf8::Length(ch); 161 length += Utf8::Length(ch);
116 } 162 }
117 return length; 163 return length;
118 } 164 }
119 165
120 intptr_t Utf8::Encode(int32_t ch, char* dst) { 166 intptr_t Utf8::Encode(int32_t ch, char* dst) {
(...skipping 15 matching lines...) Expand all
136 } 182 }
137 ASSERT(ch <= kMaxFourByteChar); 183 ASSERT(ch <= kMaxFourByteChar);
138 dst[0] = 0xF0 | (ch >> 18); 184 dst[0] = 0xF0 | (ch >> 18);
139 dst[1] = 0x80 | ((ch >> 12) & kMask); 185 dst[1] = 0x80 | ((ch >> 12) & kMask);
140 dst[2] = 0x80 | ((ch >> 6) & kMask); 186 dst[2] = 0x80 | ((ch >> 6) & kMask);
141 dst[3] = 0x80 | (ch & kMask); 187 dst[3] = 0x80 | (ch & kMask);
142 return 4; 188 return 4;
143 } 189 }
144 190
145 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { 191 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {
192 uintptr_t array_len = len;
146 intptr_t pos = 0; 193 intptr_t pos = 0;
147 String::CodePointIterator it(src); 194 ASSERT(static_cast<intptr_t>(array_len) >= Length(src));
148 while (it.Next()) { 195 if (src.IsOneByteString() || src.IsExternalOneByteString()) {
149 int32_t ch = it.Current(); 196 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8
150 intptr_t num_bytes = Utf8::Length(ch); 197 // encodings and all >= 0x80 have two-byte encodings.
151 if (pos + num_bytes > len) { 198 const uintptr_t* data;
152 break; 199 NoSafepointScope scope;
200 if (src.IsOneByteString()) {
201 data = reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(src));
202 } else {
203 data = reinterpret_cast<const uintptr_t*>(
204 ExternalOneByteString::DataStart(src));
153 } 205 }
154 Utf8::Encode(ch, &dst[pos]); 206 uintptr_t char_length = src.Length();
155 pos += num_bytes; 207 uintptr_t pos = 0;
208 ASSERT(kMaxOneByteChar + 1 == 0x80);
209 for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) {
210 // Read the input one word at a time and just write it verbatim if it is
211 // plain ASCII, as determined by the mask.
212 if (i + sizeof(uintptr_t) <= char_length &&
213 (*data & kAsciiWordMask) == 0 &&
214 pos + sizeof(uintptr_t) <= array_len) {
215 StoreUnaligned(reinterpret_cast<uintptr_t*>(dst + pos), *data);
216 pos += sizeof(uintptr_t);
217 } else {
218 // Process up to one word of input that contains non-ASCII Latin1
219 // characters.
220 const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
221 const uint8_t* limit =
222 Utils::Minimum(p + sizeof(uintptr_t), p + (char_length - i));
223 for (; p < limit; p++) {
224 uint8_t c = *p;
225 // These calls to Length and Encode get inlined and the cases for 3
226 // and 4 byte sequences are removed.
227 intptr_t bytes = Length(c);
228 if (pos + bytes > array_len) {
229 return pos;
230 }
231 Encode(c, reinterpret_cast<char*>(dst) + pos);
232 pos += bytes;
233 }
234 }
235 data++;
236 }
237 } else {
238 // For two-byte strings, which can need 3-byte UTF-8 encodings and, for
239 // surrogate pairs, 4-byte encodings, use the more general code.
240 String::CodePointIterator it(src);
241 while (it.Next()) {
242 int32_t ch = it.Current();
243 intptr_t num_bytes = Utf8::Length(ch);
244 if (pos + num_bytes > len) {
245 break;
246 }
247 Utf8::Encode(ch, &dst[pos]);
248 pos += num_bytes;
249 }
156 } 250 }
157 return pos; 251 return pos;
158 } 252 }
159 253
160 intptr_t Utf8::Decode(const uint8_t* utf8_array, 254 intptr_t Utf8::Decode(const uint8_t* utf8_array,
161 intptr_t array_len, 255 intptr_t array_len,
162 int32_t* dst) { 256 int32_t* dst) {
163 uint32_t ch = utf8_array[0] & 0xFF; 257 uint32_t ch = utf8_array[0] & 0xFF;
164 intptr_t i = 1; 258 intptr_t i = 1;
165 if (ch >= 0x80) { 259 if (ch >= 0x80) {
(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after
265 } 359 }
266 360
267 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { 361 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {
268 ASSERT(codepoint > Utf16::kMaxCodeUnit); 362 ASSERT(codepoint > Utf16::kMaxCodeUnit);
269 ASSERT(dst != NULL); 363 ASSERT(dst != NULL);
270 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); 364 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));
271 dst[1] = (0xDC00 + (codepoint & 0x3FF)); 365 dst[1] = (0xDC00 + (codepoint & 0x3FF));
272 } 366 }
273 367
274 } // namespace dart 368 } // namespace dart
OLDNEW
« no previous file with comments | « runtime/vm/symbols.cc ('k') | runtime/vm/unicode_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698