summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/libraries.adb9
-rw-r--r--src/vhdl/scanner.adb49
-rw-r--r--src/vhdl/scanner.ads5
3 files changed, 56 insertions, 7 deletions
diff --git a/src/libraries.adb b/src/libraries.adb
index 9852c52..0cca4d0 100644
--- a/src/libraries.adb
+++ b/src/libraries.adb
@@ -1353,8 +1353,15 @@ package body Libraries is
Res : Iir_Design_File;
begin
Scanner.Set_File (File);
- Res := Parse.Parse_Design_File;
+ if Scanner.Detect_Encoding_Errors then
+ -- Don't even try to parse such a file. The BOM will be interpreted
+ -- as an identifier, which is not valid at the beginning of a file.
+ Res := Null_Iir;
+ else
+ Res := Parse.Parse_Design_File;
+ end if;
Scanner.Close_File;
+
if Res /= Null_Iir then
Set_Parent (Res, Work_Library);
Set_Design_File_Filename (Res, Files_Map.Get_File_Name (File));
diff --git a/src/vhdl/scanner.adb b/src/vhdl/scanner.adb
index f18723d..26dff5e 100644
--- a/src/vhdl/scanner.adb
+++ b/src/vhdl/scanner.adb
@@ -268,12 +268,8 @@ package body Scanner is
is
N_Source: File_Buffer_Acc;
begin
- if Current_Context.Source /= null then
- raise Internal_Error;
- end if;
- if Source_File = No_Source_File_Entry then
- raise Internal_Error;
- end if;
+ pragma Assert (Current_Context.Source = null);
+ pragma Assert (Source_File /= No_Source_File_Entry);
N_Source := Get_File_Source (Source_File);
Current_Context := (Source => N_Source,
Source_File => Source_File,
@@ -293,6 +289,47 @@ package body Scanner is
Current_Token := Tok_Invalid;
end Set_File;
+ function Detect_Encoding_Errors return Boolean
+ is
+    --  Byte at the current scan position; it selects which BOM (if any)
+    --  the following bytes are compared against.
+    First : constant Character := Source (Pos);
+ begin
+    case First is
+       when ' ' .. Character'Val (126) =>
+          --  Printable 7-bit ASCII: cannot be the start of any BOM
+          --  checked here, so there is nothing more to look at.
+          return False;
+
+       when Character'Val (16#ef#) =>
+          --  UTF-8 BOM is the byte sequence EF BB BF.
+          if Source (Pos + 1) = Character'Val (16#bb#)
+            and then Source (Pos + 2) = Character'Val (16#bf#)
+          then
+             Error_Msg_Scan
+               ("source encoding must be latin-1 (UTF-8 BOM detected)");
+             return True;
+          end if;
+
+       when Character'Val (16#fe#) =>
+          --  UTF-16 big-endian BOM is the byte sequence FE FF.
+          if Source (Pos + 1) = Character'Val (16#ff#) then
+             Error_Msg_Scan
+               ("source encoding must be latin-1 (UTF-16 BE BOM detected)");
+             return True;
+          end if;
+
+       when Character'Val (16#ff#) =>
+          --  UTF-16 little-endian BOM is the byte sequence FF FE.
+          if Source (Pos + 1) = Character'Val (16#fe#) then
+             Error_Msg_Scan
+               ("source encoding must be latin-1 (UTF-16 LE BOM detected)");
+             return True;
+          end if;
+
+       when others =>
+          null;
+    end case;
+
+    --  Unusual first byte but no recognised BOM: report nothing here and
+    --  let the scanner/parser diagnose the content.
+    return False;
+ end Detect_Encoding_Errors;
+
procedure Set_Current_Position (Position: Source_Ptr)
is
Loc : Location_Type;
diff --git a/src/vhdl/scanner.ads b/src/vhdl/scanner.ads
index 3edc9c0..6a5e1cf 100644
--- a/src/vhdl/scanner.ads
+++ b/src/vhdl/scanner.ads
@@ -62,6 +62,11 @@ package Scanner is
-- Initialize the scanner with file SOURCE_FILE.
procedure Set_File (Source_File : Source_File_Entry);
+ -- This function can be called just after Set_File to detect UTF BOM
+ -- patterns. It reports an error and returns True if a BOM is present.
+ -- It silently returns False if no BOM is detected.
+ function Detect_Encoding_Errors return Boolean;
+
procedure Set_Current_Position (Position: Source_Ptr);
-- Finalize the scanner.