Implementing a UTF‑8 to UTF‑16LE Conversion Function in Delphi

Japanese

Implementing a UTF‑8 to UTF‑16LE Conversion Function in Delphi

To convert a UTF‑8 byte array into a Delphi String (UTF‑16LE), you can use TEncoding.UTF8.GetString(Bytes) from the System.SysUtils unit. However, if even a single invalid byte exists in the array, the function refuses to decode anything and raises an error.
Since I often need a result that converts as many valid parts as possible, I decided to implement my own conversion routine.

About Unicode

Unicode code points range from U+0000 to U+10FFFF, so every code point fits in a 21‑bit value.
Within those 21 bits, the top 5 bits represent the "plane" (0–16), the next 8 bits represent the "row", and the final 8 bits represent the "cell".

UMamUnicode.pas File

unit UMamUnicode;

interface

uses System.SysUtils;

// The intermediate "U32" format used by these routines is NOT 4-byte UTF-32:
// each code point occupies exactly 3 bytes, most significant first
// (plane, row, cell).
function MamU8ToU16(b: TBytes): TBytes;  // UTF-8 bytes -> UTF-16LE bytes (via the 3-byte code point format)
function MamU32ToU16(b: TBytes): TBytes; // 3-byte code points (plane, row, cell) -> UTF-16LE bytes
function MamU8ToU32(b: TBytes): TBytes;  // UTF-8 bytes -> 3-byte code points (plane, row, cell)

implementation

// Converts a UTF-8 byte array to a UTF-16LE byte array.
// The work is done in two stages: the UTF-8 input is first decoded into
// 3-byte code points by MamU8ToU32, and those code points are then encoded
// as UTF-16LE code units by MamU32ToU16.
function MamU8ToU16(b:TBytes):TBytes;
begin
  Result := MamU32ToU16(MamU8ToU32(b));
end;

// Encodes a sequence of 3-byte code points as UTF-16LE.
//
// Input:  b holds code points, 3 bytes each, most significant byte first:
//           b[i]   = plane (0..$10)
//           b[i+1] = row
//           b[i+2] = cell
//         Trailing bytes that do not form a complete triple are ignored.
// Output: UTF-16LE bytes (low byte of each 16-bit code unit first).
//
// Plane 0 is written as a single code unit (row, cell).
// Planes 1..$10 are written as a surrogate pair:
//   1. Subtract 1 from the plane, giving a 4-bit value "pppp".
//   2. Split the 20 bits  pppp rrrrrrrr cccccccc  into two 10-bit halves:
//        w1 = 110110 pppp rrrrrr      (high surrogate, $D800..$DBFF)
//        w2 = 110111 rr cccccccc      (low surrogate,  $DC00..$DFFF)
//
// Example: U+20BB7 arrives as 02 0B B7.
//   plane - 1 = 1  ->  0001 00001011 10110111
//   w1 = 1101 1000 0100 0010 = $D842  -> bytes 42 D8
//   w2 = 1101 1111 1011 0111 = $DFB7  -> bytes B7 DF
//
// Triples whose plane byte is above $10 are outside the Unicode range and
// are skipped. The input array is never modified.
function MamU32ToU16(b:TBytes):TBytes;
var
  i, bl, rl: Integer;
  plane, hi: Integer;
begin
  bl := Length(b);
  // Worst case: every triple becomes a 4-byte surrogate pair.
  // Allocate once and trim at the end instead of growing per character.
  SetLength(Result, (bl div 3) * 4);
  rl := 0;
  i := 0;
  while (i + 2) < bl do
  begin
    plane := b[i];
    if plane = 0 then
    begin
      // Basic Multilingual Plane: one code unit, little-endian.
      Result[rl]     := b[i + 2];
      Result[rl + 1] := b[i + 1];
      Inc(rl, 2);
    end
    else if plane <= $10 then
    begin
      // Supplementary plane: surrogate pair.
      // Work on a local copy; a dynamic array parameter shares storage with
      // the caller's array, so writing to b[i] would corrupt the caller's data.
      hi := plane - 1;
      // High surrogate (w1), low byte then high byte.
      Result[rl]     := ((hi and $03) shl 6) or (b[i + 1] shr 2);
      Result[rl + 1] := $D8 or (hi shr 2);
      // Low surrogate (w2), low byte then high byte.
      Result[rl + 2] := b[i + 2];
      Result[rl + 3] := $DC or (b[i + 1] and $03);
      Inc(rl, 4);
    end;
    // plane > $10: not a Unicode code point, nothing is emitted.
    Inc(i, 3);
  end;
  SetLength(Result, rl);
end;

// Decodes UTF-8 into a sequence of 3-byte code points, skipping invalid bytes.
//
// Input:  b holds UTF-8 encoded text (possibly containing invalid bytes).
// Output: one 3-byte group per decoded code point, most significant first:
//           result[n]   = plane (0..$10)
//           result[n+1] = row
//           result[n+2] = cell
//
// Example: "𠮷" arrives as F0 A0 AE B7 and is returned as 02 0B B7 (U+20BB7).
//
// Bit layout (p = plane, r = row, c = cell):
//
//   Code point range    1st byte  2nd byte  3rd byte  4th byte
//   U+0000  ..U+007F    0ccccccc
//   U+0080  ..U+07FF    110rrrcc  10cccccc
//   U+0800  ..U+FFFF    1110rrrr  10rrrrcc  10cccccc
//   U+10000 ..U+10FFFF  11110ppp  10pprrrr  10rrrrcc  10cccccc
//
// A sequence is rejected when:
//   - a continuation byte is missing or is not of the form 10xxxxxx,
//   - it is an overlong encoding (the value would fit in a shorter form),
//   - a 4-byte sequence decodes to a plane above $10 (beyond U+10FFFF).
// On rejection only the lead byte is skipped; decoding resumes at the next
// byte, so as much valid text as possible is recovered.
//
// Note: 3-byte sequences that encode U+D800..U+DFFF are not rejected and are
// passed through as-is.
function MamU8ToU32(b:TBytes):TBytes;
var
  i, bl, rl: Integer;
  plane: Integer;
begin
  bl := Length(b);
  // Worst case is pure ASCII, where each input byte yields 3 output bytes.
  // Allocate once and trim at the end instead of growing per character.
  SetLength(Result, bl * 3);
  rl := 0;
  i := 0;
  while i < bl do
  begin
    if b[i] < $80 then
    begin
      // 1-byte UTF-8
      Result[rl]     := 0;
      Result[rl + 1] := 0;
      Result[rl + 2] := b[i];
      Inc(rl, 3);
      Inc(i);
    end
    else if (b[i] >= $C2) and (b[i] < $E0) and
            ((i + 1) < bl) and
            ((b[i + 1] and $C0) = $80) then
    begin
      // 2-byte UTF-8. A lead byte >= $C2 already excludes the overlong
      // forms $C0/$C1, so no further overlong test is required.
      Result[rl]     := 0;
      Result[rl + 1] := (b[i] and $1C) shr 2;
      Result[rl + 2] := ((b[i] and $03) shl 6) or (b[i + 1] and $3F);
      Inc(rl, 3);
      Inc(i, 2);
    end
    else if (b[i] >= $E0) and (b[i] < $F0) and
            ((i + 2) < bl) and
            ((b[i + 1] and $C0) = $80) and
            ((b[i + 2] and $C0) = $80) and
            (((b[i] and $0F) <> 0) or ((b[i + 1] and $20) <> 0)) then
    begin
      // 3-byte UTF-8. The last condition rejects overlong forms
      // (values below U+0800).
      Result[rl]     := 0;
      Result[rl + 1] := ((b[i] and $0F) shl 4) or ((b[i + 1] and $3C) shr 2);
      Result[rl + 2] := ((b[i + 1] and $03) shl 6) or (b[i + 2] and $3F);
      Inc(rl, 3);
      Inc(i, 3);
    end
    else if (b[i] >= $F0) and (b[i] < $F8) and
            ((i + 3) < bl) and
            ((b[i + 1] and $C0) = $80) and
            ((b[i + 2] and $C0) = $80) and
            ((b[i + 3] and $C0) = $80) then
    begin
      // 4-byte UTF-8. The plane is the 3 payload bits of the lead byte
      // followed by the top 2 payload bits of the second byte.
      plane := ((b[i] and $07) shl 2) or ((b[i + 1] and $30) shr 4);
      if (plane >= 1) and (plane <= $10) then
      begin
        Result[rl]     := plane;
        Result[rl + 1] := ((b[i + 1] and $0F) shl 4) or ((b[i + 2] and $3C) shr 2);
        Result[rl + 2] := ((b[i + 2] and $03) shl 6) or (b[i + 3] and $3F);
        Inc(rl, 3);
        Inc(i, 4);
      end
      else
      begin
        // plane = 0 is an overlong encoding; plane > $10 is beyond U+10FFFF.
        Inc(i);
      end;
    end
    else
    begin
      // Stray continuation byte, invalid lead byte, truncated or malformed
      // sequence: skip this byte only.
      Inc(i);
    end;
  end;
  SetLength(Result, rl);
end;

end.

Usage Example

unit Unit1;

interface

uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants,
  System.Classes, Vcl.Graphics, Vcl.Controls, Vcl.Forms, Vcl.Dialogs,
  Vcl.StdCtrls, System.Net.URLClient, System.Net.HttpClient,
  System.Net.HttpClientComponent,
  UMamUnicode;

type
  // Demo form for the conversion routines: a button, a memo that receives
  // the converted text, and an HTTP client component.
  TForm1 = class(TForm)
    Button1: TButton;
    Memo1: TMemo;                    // output area; Button1Click appends the decoded text here
    NetHTTPClient1: TNetHTTPClient;  // NOTE(review): Button1Click creates its own client and does not use this component
    // Click handler: downloads a page, converts it from UTF-8 to UTF-16LE
    // and appends the result to Memo1.
    procedure Button1Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.dfm}

// Downloads https://mam-mam.net/, treats the response body as UTF-8,
// converts it to UTF-16LE with MamU8ToU16 and appends the resulting text
// to Memo1. The HTTP client is created locally and freed when done.
procedure TForm1.Button1Click(Sender: TObject);
var
  Client: TNetHTTPClient;
  Response: IHTTPResponse;
  Utf8Bytes, Utf16Bytes: TBytes;
begin
  Client := TNetHTTPClient.Create(nil);
  try
    Response := Client.Get('https://mam-mam.net/');

    // Copy the whole response body into a byte array.
    SetLength(Utf8Bytes, Response.ContentStream.Size);
    Response.ContentStream.Position := 0;
    Response.ContentStream.Read(Utf8Bytes, Length(Utf8Bytes));

    // UTF-8 byte array -> UTF-16LE byte array
    Utf16Bytes := MamU8ToU16(Utf8Bytes);

    // UTF-16LE byte array -> Delphi String
    Memo1.Lines.Add(TEncoding.Unicode.GetString(Utf16Bytes));
  finally
    Client.Free;
  end;
end;

end.