{-# OPTIONS  #-}
-----------------------------------------------------------------------------
-- |
-- Module      : Language.Python.Common.LexerUtils 
-- Copyright   : (c) 2009 Bernie Pope 
-- License     : BSD-style
-- Maintainer  : bjpop@csse.unimelb.edu.au
-- Stability   : experimental
-- Portability : ghc
--
-- Various utilities to support the Python lexer. 
-----------------------------------------------------------------------------

module Language.Python.Common.LexerUtils where

import Control.Monad (liftM)
import Data.List (foldl')
import Data.Word (Word8)
import Language.Python.Common.Token as Token
import Language.Python.Common.ParserMonad hiding (location)
import Language.Python.Common.SrcLocation 
import Codec.Binary.UTF8.String as UTF8 (encode)

type Byte = Word8

-- Beginning of. BOF = beginning of file, BOL = beginning of line
data BO = BOF | BOL

-- Functions for building tokens 

type StartCode = Int
type Action = SrcSpan -> Int -> String -> P Token 

lineJoin :: Action
lineJoin :: Action
lineJoin span :: SrcSpan
span _len :: Int
_len _str :: String
_str = 
   Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> StateT ParseState (Either ParseError) Token)
-> Token -> StateT ParseState (Either ParseError) Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> Token
LineJoinToken (SrcSpan -> Token) -> SrcSpan -> Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> SrcSpan
spanStartPoint SrcSpan
span

endOfLine :: P Token -> Action
endOfLine :: StateT ParseState (Either ParseError) Token -> Action
endOfLine lexToken :: StateT ParseState (Either ParseError) Token
lexToken span :: SrcSpan
span _len :: Int
_len _str :: String
_str = do
   SrcSpan -> P ()
setLastEOL (SrcSpan -> P ()) -> SrcSpan -> P ()
forall a b. (a -> b) -> a -> b
$ SrcSpan -> SrcSpan
spanStartPoint SrcSpan
span
   StateT ParseState (Either ParseError) Token
lexToken

bolEndOfLine :: P Token -> Int -> Action 
bolEndOfLine :: StateT ParseState (Either ParseError) Token -> Int -> Action
bolEndOfLine lexToken :: StateT ParseState (Either ParseError) Token
lexToken bol :: Int
bol span :: SrcSpan
span len :: Int
len inp :: String
inp = do
   Int -> P ()
pushStartCode Int
bol 
   StateT ParseState (Either ParseError) Token -> Action
endOfLine StateT ParseState (Either ParseError) Token
lexToken SrcSpan
span Int
len String
inp

dedentation :: P Token -> Action
dedentation :: StateT ParseState (Either ParseError) Token -> Action
dedentation lexToken :: StateT ParseState (Either ParseError) Token
lexToken span :: SrcSpan
span _len :: Int
_len _str :: String
_str = do
   Int
topIndent <- P Int
getIndent
   -- case compare (endCol span) topIndent of
   case Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (SrcSpan -> Int
startCol SrcSpan
span) Int
topIndent of
      EQ -> do P ()
popStartCode
               StateT ParseState (Either ParseError) Token
lexToken 
      LT -> do P ()
popIndent
               Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return Token
dedentToken 
      GT -> SrcSpan -> String -> StateT ParseState (Either ParseError) Token
forall a b. Span a => a -> String -> P b
spanError SrcSpan
span "indentation error"

indentation :: P Token -> Int -> BO -> Action 
-- Check if we are at the EOF. If yes, we may need to generate a newline,
-- in case we came here from BOL (but not BOF).
indentation :: StateT ParseState (Either ParseError) Token -> Int -> BO -> Action
indentation lexToken :: StateT ParseState (Either ParseError) Token
lexToken _dedentCode :: Int
_dedentCode bo :: BO
bo _loc :: SrcSpan
_loc _len :: Int
_len [] = do
   P ()
popStartCode
   case BO
bo of
      BOF -> StateT ParseState (Either ParseError) Token
lexToken
      BOL -> StateT ParseState (Either ParseError) Token
newlineToken
indentation lexToken :: StateT ParseState (Either ParseError) Token
lexToken dedentCode :: Int
dedentCode bo :: BO
bo span :: SrcSpan
span _len :: Int
_len _str :: String
_str = do
   P ()
popStartCode
   Int
parenDepth <- P Int
getParenStackDepth
   if Int
parenDepth Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0
      then StateT ParseState (Either ParseError) Token
lexToken
      else do 
         Int
topIndent <- P Int
getIndent
         -- case compare (endCol span) topIndent of
         case Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (SrcSpan -> Int
startCol SrcSpan
span) Int
topIndent of
            EQ -> case BO
bo of
                     BOF -> StateT ParseState (Either ParseError) Token
lexToken
                     BOL -> StateT ParseState (Either ParseError) Token
newlineToken   
            LT -> do Int -> P ()
pushStartCode Int
dedentCode 
                     StateT ParseState (Either ParseError) Token
newlineToken 
            -- GT -> do pushIndent (endCol span)
            GT -> do Int -> P ()
pushIndent (SrcSpan -> Int
startCol SrcSpan
span)
                     Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return Token
indentToken
   where
   indentToken :: Token
indentToken = SrcSpan -> Token
IndentToken SrcSpan
span

symbolToken :: (SrcSpan -> Token) -> Action 
symbolToken :: (SrcSpan -> Token) -> Action
symbolToken mkToken :: SrcSpan -> Token
mkToken location :: SrcSpan
location _ _ = Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (SrcSpan -> Token
mkToken SrcSpan
location)

token :: (SrcSpan -> String -> a -> Token) -> (String -> a) -> Action 
token :: (SrcSpan -> String -> a -> Token) -> (String -> a) -> Action
token mkToken :: SrcSpan -> String -> a -> Token
mkToken read :: String -> a
read location :: SrcSpan
location len :: Int
len str :: String
str 
   = Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> StateT ParseState (Either ParseError) Token)
-> Token -> StateT ParseState (Either ParseError) Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> String -> a -> Token
mkToken SrcSpan
location String
literal (String -> a
read String
literal) 
   where
   literal :: String
literal = Int -> String -> String
forall a. Int -> [a] -> [a]
take Int
len String
str

-- special tokens for the end of file and end of line
endOfFileToken :: Token
endOfFileToken :: Token
endOfFileToken = SrcSpan -> Token
EOFToken SrcSpan
SpanEmpty
dedentToken :: Token
dedentToken = SrcSpan -> Token
DedentToken SrcSpan
SpanEmpty 

newlineToken :: P Token
newlineToken :: StateT ParseState (Either ParseError) Token
newlineToken = do
   SrcSpan
loc <- P SrcSpan
getLastEOL
   Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> StateT ParseState (Either ParseError) Token)
-> Token -> StateT ParseState (Either ParseError) Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> Token
NewlineToken SrcSpan
loc

-- Test if we are at the end of the line or file
atEOLorEOF :: a -> AlexInput -> Int -> AlexInput -> Bool
atEOLorEOF :: a -> AlexInput -> Int -> AlexInput -> Bool
atEOLorEOF _user :: a
_user _inputBeforeToken :: AlexInput
_inputBeforeToken _tokenLength :: Int
_tokenLength (_loc :: SrcLocation
_loc, _bs :: [Byte]
_bs, inputAfterToken :: String
inputAfterToken) 
   = String -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null String
inputAfterToken Bool -> Bool -> Bool
|| Char
nextChar Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '\n' Bool -> Bool -> Bool
|| Char
nextChar Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '\r'
   where
   nextChar :: Char
nextChar = String -> Char
forall a. [a] -> a
head String
inputAfterToken 

notEOF :: a -> AlexInput -> Int -> AlexInput -> Bool
notEOF :: a -> AlexInput -> Int -> AlexInput -> Bool
notEOF _user :: a
_user _inputBeforeToken :: AlexInput
_inputBeforeToken _tokenLength :: Int
_tokenLength (_loc :: SrcLocation
_loc, _bs :: [Byte]
_bs, inputAfterToken :: String
inputAfterToken) 
   = Bool -> Bool
not (String -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null String
inputAfterToken)

delUnderscores :: String -> String
delUnderscores :: String -> String
delUnderscores []       = []
delUnderscores ('_':xs :: String
xs) = String -> String
delUnderscores String
xs
delUnderscores (x :: Char
x  :xs :: String
xs) = Char
x Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
delUnderscores String
xs

readBinary :: String -> Integer
readBinary :: String -> Integer
readBinary 
   = String -> Integer
toBinary (String -> Integer) -> (String -> String) -> String -> Integer
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> String -> String
forall a. Int -> [a] -> [a]
drop 2 
   where
   toBinary :: String -> Integer
toBinary = (Integer -> Char -> Integer) -> Integer -> String -> Integer
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' Integer -> Char -> Integer
forall a. Num a => a -> Char -> a
acc 0
   acc :: a -> Char -> a
acc b :: a
b '0' = 2 a -> a -> a
forall a. Num a => a -> a -> a
* a
b
   acc b :: a
b '1' = 2 a -> a -> a
forall a. Num a => a -> a -> a
* a
b a -> a -> a
forall a. Num a => a -> a -> a
+ 1
   acc _ _ = String -> a
forall a. HasCallStack => String -> a
error "Lexer ensures all digits passed to readBinary are 0 or 1."

readFloat :: String -> Double
readFloat :: String -> Double
readFloat str :: String
str@('.':cs :: String
cs) = String -> Double
forall a. Read a => String -> a
read ('0'Char -> String -> String
forall a. a -> [a] -> [a]
:String -> String
readFloatRest String
str)
readFloat str :: String
str = String -> Double
forall a. Read a => String -> a
read (String -> String
readFloatRest String
str)
readFloatRest :: String -> String
readFloatRest :: String -> String
readFloatRest [] = []
readFloatRest ['.'] = ".0"
readFloatRest (c :: Char
c:cs :: String
cs) = Char
c Char -> String -> String
forall a. a -> [a] -> [a]
: String -> String
readFloatRest String
cs

mkString :: (SrcSpan -> String -> Token) -> Action
mkString :: (SrcSpan -> String -> Token) -> Action
mkString toToken :: SrcSpan -> String -> Token
toToken loc :: SrcSpan
loc len :: Int
len str :: String
str = do
   Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> StateT ParseState (Either ParseError) Token)
-> Token -> StateT ParseState (Either ParseError) Token
forall a b. (a -> b) -> a -> b
$ SrcSpan -> String -> Token
toToken SrcSpan
loc (Int -> String -> String
forall a. Int -> [a] -> [a]
take Int
len String
str)

stringToken :: SrcSpan -> String -> Token
stringToken :: SrcSpan -> String -> Token
stringToken = SrcSpan -> String -> Token
StringToken

rawStringToken :: SrcSpan -> String -> Token
rawStringToken :: SrcSpan -> String -> Token
rawStringToken = SrcSpan -> String -> Token
StringToken

byteStringToken :: SrcSpan -> String -> Token
byteStringToken :: SrcSpan -> String -> Token
byteStringToken = SrcSpan -> String -> Token
ByteStringToken

formatStringToken :: SrcSpan -> String -> Token
formatStringToken :: SrcSpan -> String -> Token
formatStringToken = SrcSpan -> String -> Token
StringToken

formatRawStringToken :: SrcSpan -> String -> Token
formatRawStringToken :: SrcSpan -> String -> Token
formatRawStringToken = SrcSpan -> String -> Token
StringToken

unicodeStringToken :: SrcSpan -> String -> Token
unicodeStringToken :: SrcSpan -> String -> Token
unicodeStringToken = SrcSpan -> String -> Token
UnicodeStringToken

rawByteStringToken :: SrcSpan -> String -> Token
rawByteStringToken :: SrcSpan -> String -> Token
rawByteStringToken = SrcSpan -> String -> Token
ByteStringToken

openParen :: (SrcSpan -> Token) -> Action
openParen :: (SrcSpan -> Token) -> Action
openParen mkToken :: SrcSpan -> Token
mkToken loc :: SrcSpan
loc _len :: Int
_len _str :: String
_str = do
   let token :: Token
token = SrcSpan -> Token
mkToken SrcSpan
loc
   Token -> P ()
pushParen Token
token 
   Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return Token
token 

closeParen :: (SrcSpan -> Token) -> Action
closeParen :: (SrcSpan -> Token) -> Action
closeParen mkToken :: SrcSpan -> Token
mkToken loc :: SrcSpan
loc _len :: Int
_len _str :: String
_str = do
  let token :: Token
token = SrcSpan -> Token
mkToken SrcSpan
loc
  Maybe Token
topParen <- P (Maybe Token)
getParen
  case Maybe Token
topParen of
     Nothing -> SrcSpan -> String -> StateT ParseState (Either ParseError) Token
forall a b. Span a => a -> String -> P b
spanError SrcSpan
loc String
err1 
     Just open :: Token
open -> if Token -> Token -> Bool
matchParen Token
open Token
token 
                    then P ()
popParen P ()
-> StateT ParseState (Either ParseError) Token
-> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Token -> StateT ParseState (Either ParseError) Token
forall (m :: * -> *) a. Monad m => a -> m a
return Token
token
                    else SrcSpan -> String -> StateT ParseState (Either ParseError) Token
forall a b. Span a => a -> String -> P b
spanError SrcSpan
loc String
err2
   where
   -- XXX fix these error messages
   err1 :: String
err1 = "Lexical error ! unmatched closing paren"
   err2 :: String
err2 = "Lexical error ! unmatched closing paren"

matchParen :: Token -> Token -> Bool
matchParen :: Token -> Token -> Bool
matchParen (LeftRoundBracketToken {}) (RightRoundBracketToken {}) = Bool
True
matchParen (LeftBraceToken {}) (RightBraceToken {}) = Bool
True
matchParen (LeftSquareBracketToken {}) (RightSquareBracketToken {}) = Bool
True
matchParen _ _ = Bool
False

-- -----------------------------------------------------------------------------
-- Functionality required by Alex 

type AlexInput = (SrcLocation,  -- current src location
                 [Byte],        -- byte buffer for next character
                 String)        -- input string

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar _ = String -> Char
forall a. HasCallStack => String -> a
error "alexInputPrevChar not used"

-- byte buffer should be empty here
alexGetChar :: AlexInput -> Maybe (Char, AlexInput)
alexGetChar :: AlexInput -> Maybe (Char, AlexInput)
alexGetChar (loc :: SrcLocation
loc, [], input :: String
input) 
   | String -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null String
input  = Maybe (Char, AlexInput)
forall a. Maybe a
Nothing
   | Bool
otherwise = SrcLocation -> Maybe (Char, AlexInput) -> Maybe (Char, AlexInput)
forall a b. a -> b -> b
seq SrcLocation
nextLoc ((Char, AlexInput) -> Maybe (Char, AlexInput)
forall a. a -> Maybe a
Just (Char
nextChar, (SrcLocation
nextLoc, [], String
rest)))
   where
   nextChar :: Char
nextChar = String -> Char
forall a. [a] -> a
head String
input
   rest :: String
rest = String -> String
forall a. [a] -> [a]
tail String
input 
   nextLoc :: SrcLocation
nextLoc = Char -> SrcLocation -> SrcLocation
moveChar Char
nextChar SrcLocation
loc
alexGetChar (loc :: SrcLocation
loc, _:_, _) = String -> Maybe (Char, AlexInput)
forall a. HasCallStack => String -> a
error "alexGetChar called with non-empty byte buffer"

-- mapFst :: (a -> b) -> (a, c) -> (b, c)
-- mapFst f (a, c) = (f a, c)

alexGetByte :: AlexInput -> Maybe (Byte, AlexInput)
-- alexGetByte = fmap (mapFst (fromIntegral . ord)) . alexGetChar
alexGetByte :: AlexInput -> Maybe (Byte, AlexInput)
alexGetByte (loc :: SrcLocation
loc, b :: Byte
b:bs :: [Byte]
bs, input :: String
input) = (Byte, AlexInput) -> Maybe (Byte, AlexInput)
forall a. a -> Maybe a
Just (Byte
b, (SrcLocation
loc, [Byte]
bs, String
input))
alexGetByte (loc :: SrcLocation
loc, [], []) = Maybe (Byte, AlexInput)
forall a. Maybe a
Nothing
alexGetByte (loc :: SrcLocation
loc, [], nextChar :: Char
nextChar:rest :: String
rest) =
   SrcLocation -> Maybe (Byte, AlexInput) -> Maybe (Byte, AlexInput)
forall a b. a -> b -> b
seq SrcLocation
nextLoc ((Byte, AlexInput) -> Maybe (Byte, AlexInput)
forall a. a -> Maybe a
Just (Byte
byte, (SrcLocation
nextLoc, [Byte]
restBytes, String
rest)))
   where
   nextLoc :: SrcLocation
nextLoc = Char -> SrcLocation -> SrcLocation
moveChar Char
nextChar SrcLocation
loc
   byte :: Byte
byte:restBytes :: [Byte]
restBytes = String -> [Byte]
UTF8.encode [Char
nextChar]

moveChar :: Char -> SrcLocation -> SrcLocation 
moveChar :: Char -> SrcLocation -> SrcLocation
moveChar '\n' = Int -> SrcLocation -> SrcLocation
incLine 1 
moveChar '\t' = SrcLocation -> SrcLocation
incTab 
moveChar '\r' = SrcLocation -> SrcLocation
forall a. a -> a
id 
moveChar _    = Int -> SrcLocation -> SrcLocation
incColumn 1 

lexicalError :: P a
lexicalError :: P a
lexicalError = do
  SrcLocation
location <- P SrcLocation
getLocation
  Char
c <- (String -> Char)
-> StateT ParseState (Either ParseError) String
-> StateT ParseState (Either ParseError) Char
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM String -> Char
forall a. [a] -> a
head StateT ParseState (Either ParseError) String
getInput
  ParseError -> P a
forall a. ParseError -> P a
throwError (ParseError -> P a) -> ParseError -> P a
forall a b. (a -> b) -> a -> b
$ Char -> SrcLocation -> ParseError
UnexpectedChar Char
c SrcLocation
location

readOctNoO :: String -> Integer
readOctNoO :: String -> Integer
readOctNoO (zero :: Char
zero:rest :: String
rest) = String -> Integer
forall a. Read a => String -> a
read (Char
zeroChar -> String -> String
forall a. a -> [a] -> [a]
:'O'Char -> String -> String
forall a. a -> [a] -> [a]
:String
rest)
readOctNoO [] = String -> Integer
forall a. HasCallStack => String -> a
error "Lexer ensures readOctNoO is never called on an empty string"