Skip to content

Commit d8f44bc

Browse files
committed
experiment: avoid string creation from char[] for tags and attribute names
1 parent 47b97d7 commit d8f44bc

4 files changed

Lines changed: 605 additions & 8 deletions

File tree

src/main/java/org/htmlunit/cyberneko/HTMLScanner.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.htmlunit.cyberneko.HTMLElements.Element;
2929
import org.htmlunit.cyberneko.io.PlaybackInputStream;
3030
import org.htmlunit.cyberneko.util.MiniStack;
31+
import org.htmlunit.cyberneko.util.StringCache;
3132
import org.htmlunit.cyberneko.xerces.util.EncodingTranslator;
3233
import org.htmlunit.cyberneko.xerces.util.NamespaceSupport;
3334
import org.htmlunit.cyberneko.xerces.util.StandardEncodingTranslator;
@@ -523,6 +524,8 @@ public class HTMLScanner implements XMLDocumentSource, XMLLocator, HTMLComponent
523524
/** Reusable parser for numeric character references (&#x...; and &#...;) */
524525
private final HTMLUnicodeEntitiesParser fUnicodeEntitiesParser = new HTMLUnicodeEntitiesParser();
525526

527+
final StringCache fStringCache = new StringCache();
528+
526529
final HTMLConfiguration htmlConfiguration_;
527530

528531
/**
@@ -1349,7 +1352,7 @@ else if (NAMES_LOWERCASE == mode && !Character.isLowerCase(c)) {
13491352
}
13501353

13511354
final int length = fCurrentEntity.offset_ - offset;
1352-
final String name = length > 0 ? new String(fCurrentEntity.buffer_, offset, length) : null;
1355+
final String name = length > 0 ? fStringCache.get(fCurrentEntity.buffer_, offset, length) : null;
13531356
if (DEBUG_BUFFER) {
13541357
fCurrentEntity.debugBufferIfNeeded(")scanName: ", " -> \"" + name + '"');
13551358
}
@@ -1419,7 +1422,7 @@ else if (NAMES_LOWERCASE == fNamesElems && !Character.isLowerCase(c)) {
14191422
}
14201423

14211424
final int length = fCurrentEntity.offset_ - offset;
1422-
final String name = length > 0 ? new String(fCurrentEntity.buffer_, offset, length) : null;
1425+
final String name = length > 0 ? fStringCache.get(fCurrentEntity.buffer_, offset, length) : null;
14231426
if (DEBUG_BUFFER) {
14241427
fCurrentEntity.debugBufferIfNeeded(")scanName: ", " -> \"" + name + '"');
14251428
}
@@ -1837,7 +1840,7 @@ int read() throws IOException {
18371840
* @return the read string (length may be smaller if EOF is encountered)
18381841
* @throws IOException in case of io problems
18391842
*/
1840-
String nextContent(final int len) throws IOException {
1843+
String nextContent(final StringCache strCache, final int len) throws IOException {
18411844
final int originalOffset = offset_;
18421845
final int originalColumnNumber = getColumnNumber();
18431846
final int originalCharacterOffset = getCharacterOffset();
@@ -1864,7 +1867,7 @@ String nextContent(final int len) throws IOException {
18641867
columnNumber_ = originalColumnNumber;
18651868
characterOffset_ = originalCharacterOffset;
18661869

1867-
return new String(buff, 0, nbRead);
1870+
return strCache.get(buff, 0, nbRead);
18681871
}
18691872

18701873
// Reads a single character, preserving the old buffer content
@@ -2419,7 +2422,7 @@ private void scanUntilEndTag(final String tagNameWithLeadingSlash) throws IOExce
24192422
break;
24202423
}
24212424
if (c == '<') {
2422-
final String next = fCurrentEntity.nextContent(lengthToScan) + " ";
2425+
final String next = fCurrentEntity.nextContent(fStringCache, lengthToScan) + " ";
24232426
if (next.length() >= lengthToScan
24242427
&& tagNameWithLeadingSlash.equalsIgnoreCase(
24252428
next.substring(0, tagNameWithLeadingSlash.length()))
@@ -3636,7 +3639,7 @@ public int scan(final boolean complete) throws IOException {
36363639
state = ScanScriptState.ESCAPED;
36373640
}
36383641
else if (c == '<') {
3639-
final String next = fCurrentEntity.nextContent(8) + " ";
3642+
final String next = fCurrentEntity.nextContent(fStringCache, 8) + " ";
36403643
if (next.length() >= 8 && "/script".equalsIgnoreCase(next.substring(0, 7))
36413644
&& ('>' == next.charAt(7) || Character.isWhitespace(next.charAt(7)))) {
36423645
fCurrentEntity.rewind();
@@ -3655,7 +3658,7 @@ else if (fScanScriptContent.endsWith("--!")) {
36553658
}
36563659
}
36573660
else if (c == '<') {
3658-
final String next = fCurrentEntity.nextContent(8) + " ";
3661+
final String next = fCurrentEntity.nextContent(fStringCache, 8) + " ";
36593662
if (next.length() >= 8 && "/script".equalsIgnoreCase(next.substring(0, 7))
36603663
&& ('>' == next.charAt(7) || Character.isWhitespace(next.charAt(7)))) {
36613664
fCurrentEntity.rewind();
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/*
2+
* Copyright (c) 2017-2026 Ronald Brill
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
* https://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
package org.htmlunit.cyberneko.util;
16+
17+
import java.util.Arrays;
18+
import java.util.HashMap;
19+
20+
/**
 * A bounded cache that interns strings from char[] buffer regions.
 * <p>
 * On cache hits, the same {@code String} instance is returned,
 * avoiding repeated allocation for frequently occurring names
 * (e.g., HTML tag names and attribute names).
 *
 * <p>The lookup key points directly into the caller's buffer
 * (zero-copy), and only when a new entry is added does the key
 * data get copied into an independent array.
 *
 * <p>Because the cached regions come from parsed (potentially untrusted)
 * input, the cache is limited to {@link #MAX_ENTRIES} entries and only
 * regions of at most {@link #MAX_KEY_LENGTH} characters are cached.
 * Requests outside these limits still return a correct {@code String};
 * it is just not retained.
 *
 * <p>This class is not thread-safe: {@link #get} mutates a shared lookup
 * key and an unsynchronized {@link HashMap}.
 *
 * @author Ronald Brill
 * @since 5.0.0
 */
public class StringCache {
    /**
     * Upper bound for the number of cached strings. Once reached, new
     * strings are still returned but no longer stored, so the memory held
     * by the cache cannot grow without limit on documents that contain
     * many distinct names.
     */
    static final int MAX_ENTRIES = 2048;

    /**
     * Longest region (in chars) that is considered for caching. Longer
     * regions bypass the cache entirely, which also avoids hashing them.
     */
    static final int MAX_KEY_LENGTH = 64;

    // HTML has ~100 distinct tag names + ~50 common attribute names
    // At 0.75 load factor, capacity 256 avoids some rehash
    private final HashMap<CharBufferKey, String> cache_ = new HashMap<>(256);

    // Reused for every lookup so that a cache hit allocates nothing.
    private final CharBufferKey lookupKey_ = new CharBufferKey();

    /**
     * Returns a {@code String} for the given char buffer region, reusing a
     * previously cached instance when one exists.
     * If no cached entry exists, a new {@code String} is created and returned;
     * it is also cached unless the region is longer than
     * {@link #MAX_KEY_LENGTH} or the cache already holds
     * {@link #MAX_ENTRIES} entries.
     *
     * @param ch the character array (may be a shared/reused buffer)
     * @param offset the start offset of the name in {@code ch}
     * @param length the number of characters
     * @return a string equal to {@code new String(ch, offset, length)}
     */
    public String get(final char[] ch, final int offset, final int length) {
        if (length > MAX_KEY_LENGTH) {
            // Over-long regions are unlikely to repeat; skip hashing and caching.
            return new String(ch, offset, length);
        }

        lookupKey_.update(ch, offset, length);
        String val = cache_.get(lookupKey_);

        if (val == null) {
            val = new String(ch, offset, length);
            // Only store while below the cap; existing entries keep being served.
            if (cache_.size() < MAX_ENTRIES) {
                cache_.put(lookupKey_.detach(), val);
            }
        }

        return val;
    }

    /**
     * A lightweight key that wraps a region of a {@code char[]} for use
     * as a {@link HashMap} lookup key. The {@link #update} method points
     * the key at a caller-owned buffer (zero-copy); {@link #detach}
     * creates an independent copy suitable for long-term storage in the map.
     */
    static final class CharBufferKey {
        private char[] data_;
        private int offset_;
        private int length_;
        private int hash_;

        /**
         * Points this key at a region of an external char array and
         * precomputes its hash.
         * No copy is made; the caller must not mutate the region
         * while this key is used for a lookup.
         *
         * @param ch the character array
         * @param offset the start offset
         * @param length the number of characters
         */
        void update(final char[] ch, final int offset, final int length) {
            data_ = ch;
            offset_ = offset;
            length_ = length;

            // 31 * h + c, written with a shift
            int h = 0;
            for (int i = offset; i < offset + length; i++) {
                h = ((h << 5) - h) + ch[i];
            }
            hash_ = h;
        }

        /**
         * Creates an independent copy of this key whose data is
         * not shared with any external buffer. The copy is suitable
         * for storing as a long-lived map key.
         *
         * @return a detached copy of this key
         */
        CharBufferKey detach() {
            final CharBufferKey detached = new CharBufferKey();
            detached.data_ = new char[length_];
            System.arraycopy(data_, offset_, detached.data_, 0, length_);
            detached.offset_ = 0;
            detached.length_ = length_;
            detached.hash_ = hash_;
            return detached;
        }

        /**
         * @return the hash computed by the last {@link #update} (or copied by {@link #detach})
         */
        @Override
        public int hashCode() {
            return hash_;
        }

        /**
         * Two keys are equal when their regions have the same length and
         * the same characters, regardless of the backing array or offset.
         *
         * @param o the object to compare with
         * @return {@code true} if {@code o} is a key over an equal char region
         */
        @Override
        public boolean equals(final Object o) {
            if (o instanceof CharBufferKey ob) {
                if (ob.length_ != length_) {
                    return false;
                }
                // mismatch returns a negative value when both ranges are identical
                return Arrays.mismatch(
                        data_, offset_, offset_ + length_,
                        ob.data_, ob.offset_, ob.offset_ + ob.length_) < 0;
            }
            return false;
        }

        /**
         * @return the characters of the wrapped region as a new {@code String}
         */
        @Override
        public String toString() {
            return new String(data_, offset_, length_);
        }
    }
}

src/test/java/org/htmlunit/cyberneko/HTMLScannerTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class MyContentScanner extends HTMLScanner.ContentScanner {
209209
@Override
210210
protected int scanComment() throws IOException {
211211
// bug was here: calling nextContent() at the end of the buffer/input
212-
fCurrentEntity.nextContent(30);
212+
fCurrentEntity.nextContent(fStringCache, 30);
213213
return super.scanComment();
214214
}
215215
}

0 commit comments

Comments
 (0)