Implementing a UTF‑8 to UTF‑16LE Conversion Function in Delphi
To convert a UTF‑8 byte array into a Delphi String (UTF‑16LE),
you can use TEncoding.UTF8.GetString(Bytes) from the
System.SysUtils unit.
However, if even a single invalid byte exists in the array, the function
refuses to decode anything and raises an error.
Since I often need a result that converts as many valid parts as possible,
I decided to implement my own conversion routine.
About Unicode
Unicode code points range from U+0000 to U+10FFFF,
so any code point fits in a 21‑bit value.
The top 5 bits represent the "plane" (0–16),
the next 8 bits represent the "row",
and the lowest 8 bits represent the "cell".
UMamUnicode.pas File
unit UMamUnicode;

// Lenient UTF-8 -> UTF-16LE conversion helpers.
//
// Invalid input is skipped one byte at a time instead of raising an error,
// so as much of the input as possible is converted.
//
// The intermediate "U32" format used by this unit is NOT 4-byte UTF-32:
// every code point is stored as 3 bytes, most significant byte first:
//   [plane (0..$10)] [row] [cell]

interface

uses
  System.SysUtils;

function MamU8ToU16(b: TBytes): TBytes;  // UTF-8 -> UTF-16LE conversion
function MamU32ToU16(b: TBytes): TBytes; // packed code points (3 bytes each) -> UTF-16LE conversion
function MamU8ToU32(b: TBytes): TBytes;  // UTF-8 -> packed code points (3 bytes each) conversion

implementation

// Converts a UTF-8 byte array into a UTF-16LE byte array.
//   b      : UTF-8 encoded bytes (may contain invalid sequences)
//   Result : UTF-16LE bytes of every sequence that could be decoded
function MamU8ToU16(b: TBytes): TBytes;
begin
  Result := MamU32ToU16(MamU8ToU32(b));
end;

// Converts packed code points (3 bytes each: plane, row, cell) to UTF-16LE.
//   b      : packed code points; trailing bytes that do not form a complete
//            3-byte group are ignored. The array is not modified.
//   Result : UTF-16LE bytes. Groups whose plane byte is greater than $10
//            (beyond U+10FFFF) are skipped.
//
// Example: the character U+20BB7 is stored as
//   b[0] = $02, b[1] = $0B, b[2] = $B7
// A plane byte of 1 or more means the character needs a surrogate pair:
//   1. Subtract 1 from the plane (this is "code point - $10000").
//   2. Split the remaining 20 bits into two 10-bit halves:
//
//        plane-1   b[1]        b[2]
//        0001      0000 1011   1011 0111
//
//        w1 = 1101 10 + upper 10 bits = $D842 -> bytes 42 D8 (little endian)
//        w2 = 1101 11 + lower 10 bits = $DFB7 -> bytes B7 DF (little endian)
function MamU32ToU16(b: TBytes): TBytes;
var
  i, bl, rl: Integer;
  plane: Byte;
begin
  bl := Length(b);
  // Worst case: every 3-byte group becomes a 4-byte surrogate pair.
  // Allocate once and trim at the end instead of growing per character.
  SetLength(Result, (bl div 3) * 4);
  rl := 0;
  i := 0;
  while (i + 2) < bl do
  begin
    if b[i] = 0 then
    begin
      // Plane 0: a single UTF-16 code unit, low byte first.
      Result[rl] := b[i + 2];
      Result[rl + 1] := b[i + 1];
      Inc(rl, 2);
    end
    else if b[i] <= $10 then
    begin
      // Planes 1..16: surrogate pair.
      // Work on a local copy: dynamic arrays are shared references, so
      // writing to b[i] would change the caller's array.
      plane := b[i] - 1;
      // High surrogate, low byte then high byte.
      Result[rl] := ((plane and $03) shl 6) or (b[i + 1] shr 2);
      Result[rl + 1] := $D8 or (plane shr 2);
      // Low surrogate, low byte then high byte.
      Result[rl + 2] := b[i + 2];
      Result[rl + 3] := $DC or (b[i + 1] and $03);
      Inc(rl, 4);
    end;
    // Plane bytes above $10 are not valid Unicode and produce no output.
    Inc(i, 3);
  end;
  SetLength(Result, rl);
end;

// Decodes UTF-8 into packed code points (3 bytes each: plane, row, cell).
//   b      : UTF-8 encoded bytes
//   Result : 3 bytes per decoded character. Bytes that do not start a
//            well-formed sequence are skipped one at a time.
//
// Example: the UTF-8 bytes F0 A0 AE B7 decode to U+20BB7 and are returned as
//   Result[0] = $02, Result[1] = $0B, Result[2] = $B7
//
// 21-bit code point layout:  wwwww xxxxxyyy yzzzzzzz
//
//   Code point range   1st byte  2nd byte  3rd byte  4th byte
//       0 ~     7F     0zzzzzzz                                1-byte UTF-8
//      80 ~    7FF     110yyyyz  10zzzzzz                      2-byte UTF-8
//     800 ~   FFFF     1110xxxx  10xyyyyz  10zzzzzz            3-byte UTF-8
//   10000 ~ 10FFFF     11110www  10wwxxxx  10xyyyyz  10zzzzzz  4-byte UTF-8
//
// Rejected as invalid:
//   - overlong forms (lead C0/C1, E0 followed by 80..9F, F0 followed by 80..8F)
//   - encoded surrogates U+D800..U+DFFF (ED followed by A0..BF)
//   - values above U+10FFFF (F4 followed by 90..BF, lead bytes F5 and above)
//   - missing or malformed continuation bytes
function MamU8ToU32(b: TBytes): TBytes;
var
  i, bl, rl: Integer;

  // True when index idx is inside the input and holds a continuation
  // byte (10xxxxxx).
  function IsCont(idx: Integer): Boolean;
  begin
    Result := (idx < bl) and ((b[idx] and $C0) = $80);
  end;

begin
  bl := Length(b);
  // Worst case: every input byte is ASCII and expands to 3 output bytes.
  // Allocate once and trim at the end instead of growing per character.
  SetLength(Result, bl * 3);
  rl := 0;
  i := 0;
  while i < bl do
  begin
    if b[i] < $80 then
    begin
      // 1-byte UTF-8
      Result[rl] := 0;
      Result[rl + 1] := 0;
      Result[rl + 2] := b[i];
      Inc(rl, 3);
      Inc(i);
    end
    else if (b[i] >= $C2) and (b[i] < $E0) and IsCont(i + 1) then
    begin
      // 2-byte UTF-8 (a lead byte of C2 or more already excludes overlong forms)
      Result[rl] := 0;
      Result[rl + 1] := (b[i] and $1C) shr 2;
      Result[rl + 2] := ((b[i] and $03) shl 6) or (b[i + 1] and $3F);
      Inc(rl, 3);
      Inc(i, 2);
    end
    else if (b[i] >= $E0) and (b[i] < $F0)
      and IsCont(i + 1) and IsCont(i + 2)
      and ((b[i] <> $E0) or (b[i + 1] >= $A0))  // not overlong
      and ((b[i] <> $ED) or (b[i + 1] < $A0))   // not a surrogate
    then
    begin
      // 3-byte UTF-8
      Result[rl] := 0;
      Result[rl + 1] := ((b[i] and $0F) shl 4) or ((b[i + 1] and $3C) shr 2);
      Result[rl + 2] := ((b[i + 1] and $03) shl 6) or (b[i + 2] and $3F);
      Inc(rl, 3);
      Inc(i, 3);
    end
    else if (b[i] >= $F0) and (b[i] <= $F4)
      and IsCont(i + 1) and IsCont(i + 2) and IsCont(i + 3)
      and ((b[i] <> $F0) or (b[i + 1] >= $90))  // not overlong
      and ((b[i] <> $F4) or (b[i + 1] < $90))   // not above U+10FFFF
    then
    begin
      // 4-byte UTF-8
      // Plane = 3 bits from the lead byte (shifted left by 2) plus the
      // top 2 payload bits of the second byte.
      Result[rl] := ((b[i] and $07) shl 2) or ((b[i + 1] and $30) shr 4);
      Result[rl + 1] := ((b[i + 1] and $0F) shl 4) or ((b[i + 2] and $3C) shr 2);
      Result[rl + 2] := ((b[i + 2] and $03) shl 6) or (b[i + 3] and $3F);
      Inc(rl, 3);
      Inc(i, 4);
    end
    else
    begin
      // Skip one byte of an invalid sequence and try again at the next byte.
      Inc(i);
    end;
  end;
  SetLength(Result, rl);
end;

end.
Usage Example
unit Unit1;

// Demo form: pressing Button1 downloads a web page, converts its bytes with
// MamU8ToU16 and appends the resulting text to Memo1.

interface

uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants,
  System.Classes, Vcl.Graphics, Vcl.Controls, Vcl.Forms, Vcl.Dialogs,
  Vcl.StdCtrls, System.Net.URLClient, System.Net.HttpClient,
  System.Net.HttpClientComponent, UMamUnicode;

type
  TForm1 = class(TForm)
    Button1: TButton;
    Memo1: TMemo;
    NetHTTPClient1: TNetHTTPClient;
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.dfm}

// Fetches the page body as raw bytes, converts it to UTF-16LE bytes and adds
// the decoded string to Memo1. The HTTP client is created locally and freed
// even when the request raises.
procedure TForm1.Button1Click(Sender: TObject);
var
  Client: TNetHTTPClient;
  Response: IHTTPResponse;
  Body: TStream;
  Utf8Bytes, Utf16Bytes: TBytes;
begin
  Client := TNetHTTPClient.Create(nil);
  try
    Response := Client.Get('https://mam-mam.net/');

    // Copy the whole response body into a byte array.
    Body := Response.ContentStream;
    SetLength(Utf8Bytes, Body.Size);
    Body.Position := 0;
    Body.Read(Utf8Bytes, Length(Utf8Bytes));

    // UTF-8 byte array -> UTF-16LE byte array
    Utf16Bytes := MamU8ToU16(Utf8Bytes);

    // UTF-16LE byte array -> Delphi String
    Memo1.Lines.Add(TEncoding.Unicode.GetString(Utf16Bytes));
  finally
    Client.Free;
  end;
end;

end.
