runtime/vm/unicode.cc - Issue 2952193002: VM: Speed up output of UTF8 for 1-byte strings.

Side by Side Diff: runtime/vm/unicode.cc

Issue 2952193002: VM: Speed up output of UTF8 for 1-byte strings.

Patch Set: Add test that would have caught bug pointed out by Slava Created 3 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/unicode.h"	5 #include "vm/unicode.h"

6	6

7 #include "vm/allocation.h"	7 #include "vm/allocation.h"

8 #include "vm/globals.h"	8 #include "vm/globals.h"

9 #include "vm/object.h"	9 #include "vm/object.h"

10	10

(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
100 return 1;	100 return 1;

101 } else if (ch <= kMaxTwoByteChar) {	101 } else if (ch <= kMaxTwoByteChar) {

102 return 2;	102 return 2;

103 } else if (ch <= kMaxThreeByteChar) {	103 } else if (ch <= kMaxThreeByteChar) {

104 return 3;	104 return 3;

105 }	105 }

106 ASSERT(ch <= kMaxFourByteChar);	106 ASSERT(ch <= kMaxFourByteChar);

107 return 4;	107 return 4;

108 }	108 }

109	109

	110 // A constant mask that can be 'and'ed with a word of data to determine if it

	111 // is all ASCII (with no Latin1 characters).

	112 #if defined(ARCH_IS_64_BIT)

	113 static const uintptr_t kAsciiWordMask = DART_UINT64_C(0x8080808080808080);

	114 #else

	115 static const uintptr_t kAsciiWordMask = 0x80808080u;

	116 #endif

	117

110 intptr_t Utf8::Length(const String& str) {	118 intptr_t Utf8::Length(const String& str) {

	119 if (str.IsOneByteString() || str.IsExternalOneByteString()) {

	120 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8

	121 // encodings and all >= 0x80 have two-byte encodings. To get the length,

	122 // start with the number of code points and add the number of high bits in

	123 // the bytes.

	124 uintptr_t char_length = str.Length();

	125 uintptr_t length = char_length;

	126 const uintptr_t* data;

	127 NoSafepointScope no_safepoint;

	128 if (str.IsOneByteString()) {

	129 data = reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(str));

	130 } else {

	131 data = reinterpret_cast<const uintptr_t*>(

	132 ExternalOneByteString::DataStart(str));

	133 }

	134 uintptr_t i;

	135 for (i = 0; i + sizeof(uintptr_t) < char_length; i += sizeof(uintptr_t)) {

	136 uintptr_t chunk = *data++;

	137 chunk &= kAsciiWordMask;

	138 if (chunk != 0) {

	139 // Shuffle the bits until we have a count of bits in the low nibble.

	140 #if defined(ARCH_IS_64_BIT)

	141 chunk += chunk >> 32;

	142 #endif

	143 chunk += chunk >> 16;

	144 chunk += chunk >> 8;

	145 length += (chunk >> 7) & 0xf;

	146 }

	147 }

	148 // Take care of the tail of the string, the last length % wordsize chars.

	149 for (; i < char_length; i++) {

	150 if (str.CharAt(i) > kMaxOneByteChar) length++;

	151 }

	152 return length;

	153 }

	154

	155 // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8

	156 // encodings.

111 intptr_t length = 0;	157 intptr_t length = 0;

112 String::CodePointIterator it(str);	158 String::CodePointIterator it(str);

113 while (it.Next()) {	159 while (it.Next()) {

114 int32_t ch = it.Current();	160 int32_t ch = it.Current();

115 length += Utf8::Length(ch);	161 length += Utf8::Length(ch);

116 }	162 }

117 return length;	163 return length;

118 }	164 }

119	165

120 intptr_t Utf8::Encode(int32_t ch, char* dst) {	166 intptr_t Utf8::Encode(int32_t ch, char* dst) {

(...skipping 15 matching lines...) Expand all Loading...
136 }	182 }

137 ASSERT(ch <= kMaxFourByteChar);	183 ASSERT(ch <= kMaxFourByteChar);

138 dst[0] = 0xF0 | (ch >> 18);	184 dst[0] = 0xF0 | (ch >> 18);

139 dst[1] = 0x80 | ((ch >> 12) & kMask);	185 dst[1] = 0x80 | ((ch >> 12) & kMask);

140 dst[2] = 0x80 | ((ch >> 6) & kMask);	186 dst[2] = 0x80 | ((ch >> 6) & kMask);

141 dst[3] = 0x80 | (ch & kMask);	187 dst[3] = 0x80 | (ch & kMask);

142 return 4;	188 return 4;

143 }	189 }

144	190

145 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {	191 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {

	192 uintptr_t array_len = len;

146 intptr_t pos = 0;	193 intptr_t pos = 0;

147 String::CodePointIterator it(src);	194 ASSERT(static_cast<intptr_t>(array_len) >= Length(src));

148 while (it.Next()) {	195 if (src.IsOneByteString() || src.IsExternalOneByteString()) {

149 int32_t ch = it.Current();	196 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8

150 intptr_t num_bytes = Utf8::Length(ch);	197 // encodings and all >= 0x80 have two-byte encodings.

151 if (pos + num_bytes > len) {	198 const uintptr_t* data;

152 break;	199 NoSafepointScope scope;

	200 if (src.IsOneByteString()) {

	201 data = reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(src));

	202 } else {

	203 data = reinterpret_cast<const uintptr_t*>(

	204 ExternalOneByteString::DataStart(src));

153 }	205 }

154 Utf8::Encode(ch, &dst[pos]);	206 uintptr_t char_length = src.Length();

155 pos += num_bytes;	207 uintptr_t pos = 0;

	208 ASSERT(kMaxOneByteChar + 1 == 0x80);

	209 for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) {

	210 // Read the input one word at a time and just write it verbatim if it is

	211 // plain ASCII, as determined by the mask.

	212 if (i + sizeof(uintptr_t) <= char_length &&

	213 (*data & kAsciiWordMask) == 0 &&

	214 pos + sizeof(uintptr_t) <= array_len) {

	215 StoreUnaligned(reinterpret_cast<uintptr_t*>(dst + pos), *data);

	216 pos += sizeof(uintptr_t);

	217 } else {

	218 // Process up to one word of input that contains non-ASCII Latin1

	219 // characters.

	220 const uint8_t* p = reinterpret_cast<const uint8_t*>(data);

	221 const uint8_t* limit =

	222 Utils::Minimum(p + sizeof(uintptr_t), p + (char_length - i));

	223 for (; p < limit; p++) {

	224 uint8_t c = *p;

	225 // These calls to Length and Encode get inlined and the cases for 3

	226 // and 4 byte sequences are removed.

	227 intptr_t bytes = Length(c);

	228 if (pos + bytes > array_len) {

	229 return pos;

	230 }

	231 Encode(c, reinterpret_cast<char*>(dst) + pos);

	232 pos += bytes;

	233 }

	234 }

	235 data++;

	236 }

	237 } else {

	238 // For two-byte strings, which can contain 3 and 4-byte UTF-8 encodings,

	239 // which can result in surrogate pairs, use the more general code.

	240 String::CodePointIterator it(src);

	241 while (it.Next()) {

	242 int32_t ch = it.Current();

	243 intptr_t num_bytes = Utf8::Length(ch);

	244 if (pos + num_bytes > len) {

	245 break;

	246 }

	247 Utf8::Encode(ch, &dst[pos]);

	248 pos += num_bytes;

	249 }

156 }	250 }

157 return pos;	251 return pos;

158 }	252 }

159	253

160 intptr_t Utf8::Decode(const uint8_t* utf8_array,	254 intptr_t Utf8::Decode(const uint8_t* utf8_array,

161 intptr_t array_len,	255 intptr_t array_len,

162 int32_t* dst) {	256 int32_t* dst) {

163 uint32_t ch = utf8_array[0] & 0xFF;	257 uint32_t ch = utf8_array[0] & 0xFF;

164 intptr_t i = 1;	258 intptr_t i = 1;

165 if (ch >= 0x80) {	259 if (ch >= 0x80) {

(...skipping 99 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
265 }	359 }

266	360

267 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {	361 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {

268 ASSERT(codepoint > Utf16::kMaxCodeUnit);	362 ASSERT(codepoint > Utf16::kMaxCodeUnit);

269 ASSERT(dst != NULL);	363 ASSERT(dst != NULL);

270 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));	364 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));

271 dst[1] = (0xDC00 + (codepoint & 0x3FF));	365 dst[1] = (0xDC00 + (codepoint & 0x3FF));

272 }	366 }

273	367

274 } // namespace dart	368 } // namespace dart

OLD	NEW

« no previous file with comments | « runtime/vm/symbols.cc ('k') | runtime/vm/unicode_test.cc » ('j') | no next file with comments »