diff options
-rw-r--r-- | src/libraries.adb | 9 | ||||
-rw-r--r-- | src/vhdl/scanner.adb | 49 | ||||
-rw-r--r-- | src/vhdl/scanner.ads | 5 |
3 files changed, 56 insertions, 7 deletions
diff --git a/src/libraries.adb b/src/libraries.adb index 9852c52..0cca4d0 100644 --- a/src/libraries.adb +++ b/src/libraries.adb @@ -1353,8 +1353,15 @@ package body Libraries is Res : Iir_Design_File; begin Scanner.Set_File (File); - Res := Parse.Parse_Design_File; + if Scanner.Detect_Encoding_Errors then + -- Don't even try to parse such a file. The BOM will be interpreted + -- as an identifier, which is not valid at the beginning of a file. + Res := Null_Iir; + else + Res := Parse.Parse_Design_File; + end if; Scanner.Close_File; + if Res /= Null_Iir then Set_Parent (Res, Work_Library); Set_Design_File_Filename (Res, Files_Map.Get_File_Name (File)); diff --git a/src/vhdl/scanner.adb b/src/vhdl/scanner.adb index f18723d..26dff5e 100644 --- a/src/vhdl/scanner.adb +++ b/src/vhdl/scanner.adb @@ -268,12 +268,8 @@ package body Scanner is is N_Source: File_Buffer_Acc; begin - if Current_Context.Source /= null then - raise Internal_Error; - end if; - if Source_File = No_Source_File_Entry then - raise Internal_Error; - end if; + pragma Assert (Current_Context.Source = null); + pragma Assert (Source_File /= No_Source_File_Entry); N_Source := Get_File_Source (Source_File); Current_Context := (Source => N_Source, Source_File => Source_File, @@ -293,6 +289,47 @@ package body Scanner is Current_Token := Tok_Invalid; end Set_File; + function Detect_Encoding_Errors return Boolean + is + C : constant Character := Source (Pos); + begin + -- No need to check further if first character is plain ASCII-7 + if C >= ' ' and C < Character'Val (127) then + return False; + end if; + + -- UTF-8 BOM is EF BB BF + if Source (Pos + 0) = Character'Val (16#ef#) + and then Source (Pos + 1) = Character'Val (16#bb#) + and then Source (Pos + 2) = Character'Val (16#bf#) + then + Error_Msg_Scan + ("source encoding must be latin-1 (UTF-8 BOM detected)"); + return True; + end if; + + -- UTF-16 BE BOM is FE FF + if Source (Pos + 0) = Character'Val (16#fe#) + and then Source (Pos + 1) = Character'Val (16#ff#) + then + Error_Msg_Scan + ("source encoding must be latin-1 (UTF-16 BE BOM detected)"); + return True; + end if; + + -- UTF-16 LE BOM is FF FE + if Source (Pos + 0) = Character'Val (16#ff#) + and then Source (Pos + 1) = Character'Val (16#fe#) + then + Error_Msg_Scan + ("source encoding must be latin-1 (UTF-16 LE BOM detected)"); + return True; + end if; + + -- Certainly weird, but scanner/parser will catch it. + return False; + end Detect_Encoding_Errors; + procedure Set_Current_Position (Position: Source_Ptr) is Loc : Location_Type; diff --git a/src/vhdl/scanner.ads b/src/vhdl/scanner.ads index 3edc9c0..6a5e1cf 100644 --- a/src/vhdl/scanner.ads +++ b/src/vhdl/scanner.ads @@ -62,6 +62,11 @@ package Scanner is -- Initialize the scanner with file SOURCE_FILE. procedure Set_File (Source_File : Source_File_Entry); + -- This function can be called just after Set_File to detect UTF BOM + -- patterns. It reports an error if a BOM is present and return True. + -- Silently return False if no error detected. + function Detect_Encoding_Errors return Boolean; + procedure Set_Current_Position (Position: Source_Ptr); -- Finalize the scanner. |