@@ -77,7 +77,7 @@ pub fn read_i32_code(code: &Vec<u8>, pc: u16, offset: u16) -> i32 {
77
77
( ( byte1 << 24 ) | ( byte2 << 16 ) | ( byte3 << 8 ) | byte4) as i32
78
78
}
79
79
80
- pub fn get_java_bytes_as_string_value ( bytes : & [ VmPrimitive ] ) -> String {
80
+ fn get_java_bytes_utf16_string_value ( bytes : & [ VmPrimitive ] ) -> String {
81
81
let element_values: Vec < u16 > = bytes. iter ( )
82
82
. tuples ( )
83
83
. map ( |( h, l) | match ( h, l) {
@@ -88,64 +88,76 @@ pub fn get_java_bytes_as_string_value(bytes: &[VmPrimitive]) -> String {
88
88
89
89
String :: from_utf16_lossy ( element_values. as_slice ( ) )
90
90
}
91
+ fn get_java_bytes_latin1_string_value ( bytes : & [ VmPrimitive ] ) -> String {
92
+ let element_values: Vec < u8 > = bytes. iter ( )
93
+ . map ( |h| match h {
94
+ VmPrimitive :: Byte ( ref b) => * b,
95
+ p => panic ! ( "Unexpected primitives: {:?}" , p) ,
96
+ } )
97
+ . collect ( ) ;
98
+
99
+ String :: from_utf8 ( element_values)
100
+ . expect ( "Failed to convert Latin-1 bytes to String" )
101
+ }
91
102
92
103
pub fn get_java_string_value ( string_instance : & VmInstance ) -> String {
104
+ let is_latin1 = if let & VmPrimitive :: Byte ( ref coder_value) = string_instance. fields . get ( "coder" ) . unwrap ( ) {
105
+ * coder_value == 0
106
+ } else {
107
+ panic ! ( "Unexpected coder field type in string instance: {:?}" , string_instance. fields. get( "coder" ) ) ;
108
+ } ;
109
+
93
110
match string_instance. fields . get ( "value" ) . unwrap ( ) {
94
111
& VmPrimitive :: Arrayref ( ref rc_value_array) => {
95
- get_java_bytes_as_string_value ( & * rc_value_array. borrow ( ) . elements )
112
+ if is_latin1 {
113
+ get_java_bytes_latin1_string_value ( & * rc_value_array. borrow ( ) . elements )
114
+ } else {
115
+ // Handle UTF-16 case
116
+ get_java_bytes_utf16_string_value ( & * rc_value_array. borrow ( ) . elements )
117
+ }
96
118
}
97
119
p => panic ! ( "Unexpected primitive: {:?}" , p) ,
98
120
}
99
121
}
100
122
101
123
pub fn create_java_string ( vm_thread : & mut VmThread , string : String ) -> Rc < RefCell < VmInstance > > {
102
- // Java 9+ uses compact strings: a byte[] `value` field and a `coder` field.
103
- // `coder == 0` → UTF‑8, `coder == 1` → UTF‑16 (big‑endian).
104
- // This implementation stores the string as UTF‑16 bytes (big‑endian) and sets coder = 1.
105
- // This matches the expectations of `get_java_string_value`, which decodes UTF‑16.
106
-
107
- trace ! ( "Creating Java String: {}" , string) ;
108
-
109
- // Encode as UTF‑16BE bytes (big‑endian)
110
- let utf16_iter = string. encode_utf16 ( ) ;
111
- let mut bytes: Vec < u8 > = Vec :: with_capacity ( utf16_iter. clone ( ) . count ( ) * 2 ) ;
112
- for code_unit in utf16_iter {
113
- bytes. push ( ( code_unit >> 8 ) as u8 ) ; // high byte
114
- bytes. push ( ( code_unit & 0xFF ) as u8 ) ; // low byte
115
- }
116
- let count = bytes. len ( ) ;
124
+ let ( bytes, is_latin1) = if string. is_ascii ( ) {
125
+ // Encoding as individual bytes is sufficient for ASCII strings
126
+ ( string. into_bytes ( ) , true )
127
+ } else {
128
+ // Non-ASCII strings need to be encoded as UTF-16
129
+ let utf16_iter = string. encode_utf16 ( ) ;
130
+ let mut bytes: Vec < u8 > = Vec :: with_capacity ( utf16_iter. clone ( ) . count ( ) * 2 ) ;
131
+ for code_unit in utf16_iter {
132
+ bytes. push ( ( code_unit & 0xFF ) as u8 ) ; // low byte
133
+ bytes. push ( ( code_unit >> 8 ) as u8 ) ; // high byte
134
+ }
135
+
136
+ ( bytes, false )
137
+ } ;
117
138
118
139
// Allocate a byte[] array (atype 8 = byte) with the exact length
119
- let mut array = VmArray :: new_primitive ( count , 8 ) ;
140
+ let mut array = VmArray :: new_primitive ( bytes . len ( ) , 8 ) ;
120
141
for ( i, b) in bytes. iter ( ) . enumerate ( ) {
121
142
array. elements [ i] = VmPrimitive :: Byte ( * b) ;
122
143
}
123
144
let rc_array = Rc :: new ( RefCell :: new ( array) ) ;
124
-
125
- // Load java/lang/String class (triggers <clinit> if not already done)
126
- let jvm_class = vm_thread. load_and_clinit_class ( & "java/lang/String" . to_string ( ) ) ;
127
-
145
+
128
146
// Create a new instance of java/lang/String
147
+ let jvm_class = vm_thread. load_and_clinit_class ( & "java/lang/String" . to_string ( ) ) ;
129
148
let mut instance = VmInstance :: new ( vm_thread, & jvm_class) ;
130
149
131
150
// Set the `value` field to the byte[] we just created
132
151
instance
133
152
. fields
134
- . insert ( "value" . to_string ( ) , VmPrimitive :: Arrayref ( rc_array) ) ;
153
+ . insert ( "value" . to_string ( ) , VmPrimitive :: Arrayref ( rc_array. clone ( ) ) ) ;
135
154
136
- // Set the `coder` field to 1 (UTF‑16). Use 0 for UTF‑8.
155
+ // Set the `coder` field to 1 for UTF‑16 or 0 for ASCII/Latin-1
156
+ let coder_value = if is_latin1 { 0 } else { 1 } ;
137
157
instance
138
158
. fields
139
- . insert ( "coder" . to_string ( ) , VmPrimitive :: Byte ( 1 ) ) ;
140
-
141
- // Some JVM implementations also have a cached `hash` field; initialise it to 0.
142
- // This is optional but mirrors the reference implementation.
143
- if instance. fields . contains_key ( "hash" ) {
144
- instance
145
- . fields
146
- . insert ( "hash" . to_string ( ) , VmPrimitive :: Int ( 0 ) ) ;
147
- }
148
-
159
+ . insert ( "coder" . to_string ( ) , VmPrimitive :: Byte ( coder_value) ) ;
160
+
149
161
Rc :: new ( RefCell :: new ( instance) )
150
162
}
151
163
0 commit comments