In my previous post we’ve parsed an exception verifying whether it’s correct or not syntactically. But we still have to evaluate it. To be able to do that we’ll have to build an abstract syntax tree. This can be done by modifying the previous code and inserting semantic action. That means we do something more when we match productions.

An abstract syntax tree is a binary tree. The inner nodes will represent operators and leafs will be numerical values.

Here is how a node in the AST will look:

AST node

It is defined like this:

enum ASTNodeType
{
   Undefined,
   OperatorPlus,
   OperatorMinus,
   OperatorMul,
   OperatorDiv,
   UnaryMinus,
   NumberValue
};

class ASTNode
{
public:
   ASTNodeType Type;
   double      Value;
   ASTNode*    Left;
   ASTNode*    Right;

   ASTNode()
   {
      Type = Undefined;
      Value = 0;
      Left = NULL;
      Right = NULL;
   }

   ~ASTNode()
   {
      delete Left;
      delete Right;
   }
};

For the expression 1+2*3, the AST will be:

AST example

We’ll build this tree by inserting semantic actions and adding nodes according to the following rules:

AST semantic rules

You’ll probably notice that based on these rules the AST shown above will be modified a little bit, with some additional nodes for operators + and *, having on the left a leaf node with the neutral element for the operation (zero for + and 1 for *), and on the right a node corresponding to a TERM or FACTOR. This won’t affect the evaluation.

The Parser class will change so that the functions corresponding to the non-terminal symbols EXP, EXP1, TERM, TERM1 and FACTOR will return an ASTNode* instead of void. That is the node created as a semantic action.

class Parser
{
   Token m_crtToken;
   const char* m_Text;
   size_t m_Index;

private:

   ASTNode* Expression()
   {
      ASTNode* tnode = Term();
      ASTNode* e1node = Expression1();

      return CreateNode(OperatorPlus, tnode, e1node);
   }

   ASTNode* Expression1()
   {
      ASTNode* tnode;
      ASTNode* e1node;

      switch(m_crtToken.Type)
      {
      case Plus:
         GetNextToken();
         tnode = Term();
         e1node = Expression1();

         return CreateNode(OperatorPlus, e1node, tnode);

      case Minus:
         GetNextToken();
         tnode = Term();
         e1node = Expression1();

         return CreateNode(OperatorMinus, e1node, tnode);
      }

      return CreateNodeNumber(0);
   }

   ASTNode* Term()
   {
      ASTNode* fnode = Factor();
      ASTNode* t1node = Term1();

      return CreateNode(OperatorMul, fnode, t1node);
   }

   ASTNode* Term1()
   {
      ASTNode* fnode;
      ASTNode* t1node;

      switch(m_crtToken.Type)
      {
      case Mul:
         GetNextToken();
         fnode = Factor();
         t1node = Term1();
         return CreateNode(OperatorMul, t1node, fnode);

      case Div:
         GetNextToken();
         fnode = Factor();
         t1node = Term1();
         return CreateNode(OperatorDiv, t1node, fnode);
      }

      return CreateNodeNumber(1);
   }

   ASTNode* Factor()
   {
      ASTNode* node;
      switch(m_crtToken.Type)
      {
      case OpenParenthesis:
         GetNextToken();
         node = Expression();
         Match(')');
         return node;

      case Minus:
         GetNextToken();
		 node = Factor();
         return CreateUnaryNode(node);

      case Number:
         {
            double value = m_crtToken.Value;
            GetNextToken();
            return CreateNodeNumber(value);
         }

      default:
         {
            std::stringstream sstr;
            sstr << "Unexpected token '" << m_crtToken.Symbol << "' at position " << m_Index;
            throw ParserException(sstr.str(), m_Index);
         }
      }
   }

   ASTNode* CreateNode(ASTNodeType type, ASTNode* left, ASTNode* right)
   {
      ASTNode* node = new ASTNode;
      node->Type = type;
      node->Left = left;
      node->Right = right;

      return node;
   }

   ASTNode* CreateUnaryNode(ASTNode* left)
   {
      ASTNode* node = new ASTNode;
      node->Type = UnaryMinus;
      node->Left = left;
      node->Right = NULL;

      return node;
   }

   ASTNode* CreateNodeNumber(double value)
   {
      ASTNode* node = new ASTNode;
      node->Type = NumberValue;
      node->Value = value;

      return node;
   }

   void Match(char expected)
   {
      if(m_Text[m_Index-1] == expected)
         GetNextToken();
      else
      {
         std::stringstream sstr;
         sstr << "Expected token '" << expected << "' at position " << m_Index;
         throw ParserException(sstr.str(), m_Index);
      }
   }

   void SkipWhitespaces()
   {
      while(isspace(m_Text[m_Index])) m_Index++;
   }

   void GetNextToken()
   {
      SkipWhitespaces();

	  m_crtToken.Value = 0;
	  m_crtToken.Symbol = 0;

      if(m_Text[m_Index] == 0)
      {
         m_crtToken.Type = EndOfText;
         return;
      }

      if(isdigit(m_Text[m_Index]))
      {
         m_crtToken.Type = Number;
		 m_crtToken.Value = GetNumber();
         return;
      }

      m_crtToken.Type = Error;

      switch(m_Text[m_Index])
      {
      case '+': m_crtToken.Type = Plus; break;
      case '-': m_crtToken.Type = Minus; break;
      case '*': m_crtToken.Type = Mul; break;
      case '/': m_crtToken.Type = Div; break;
      case '(': m_crtToken.Type = OpenParenthesis; break;
      case ')': m_crtToken.Type = ClosedParenthesis; break;
      }

      if(m_crtToken.Type != Error)
	  {
         m_crtToken.Symbol = m_Text[m_Index];
         m_Index++;
	  }
      else
      {
         std::stringstream sstr;
         sstr << "Unexpected token '" << m_Text[m_Index] << "' at position " << m_Index;
         throw ParserException(sstr.str(), m_Index);
      }
   }

   double GetNumber()
   {
      SkipWhitespaces();

      int index = m_Index;
      while(isdigit(m_Text[m_Index])) m_Index++;
      if(m_Text[m_Index] == '.') m_Index++;
      while(isdigit(m_Text[m_Index])) m_Index++;

      if(m_Index - index == 0)
         throw ParserException("Number expected but not found!", m_Index);

      char buffer[32] = {0};
      memcpy(buffer, &m_Text[index], m_Index - index);

      return atof(buffer);
   }

public:
   ASTNode* Parse(const char* text)
   {
      m_Text = text;
      m_Index = 0;
      GetNextToken();

      return Expression();
   }
};

Now the Parse() method will return the created abstract syntax tree. We will see how to evaluate the expression by traversing this tree in the next post.

, , , , Hits for this post: 18964 .

In my previous post I have provided some background theory for evaluating expressions with abstract syntax trees. As I was mentioning, the first step towards this goal is to parse the expression, make sure it is correct syntactically. This is what I’ll show you in this post.

Having the grammar defined, we’ll make one function for each non-terminal symbol (EXP, EXP1, TERM, TERM1, FACTOR).

Simply put the code will look like this:

   void Expression()
   {
      Term();
      Expression1();
   }

   void Expression1()
   {
      switch(current_token)
      {
      case '+':
         GetNextToken();
         Term();
         Expression1();
         break;

      case '-':
         GetNextToken();
         Term();
         Expression1();
         break;
      }
   }

However, I want to make it a little bit more organized, so the first thing to do will be defining a Token structure that will indicate the type of last extracted token and if the case its value (for numbers). A token is basically a symbol extracted (one at a time) from the input text. The possible tokens will be the arithmetical operators (‘+’, ‘-’, ‘/’, ‘*’), the parentheses (‘(‘ and ‘)’), numbers and the end of the text.

Here is how I defined the token type and the token:

enum TokenType
{
   Error,
   Plus,
   Minus,
   Mul,
   Div,
   EndOfText,
   OpenParenthesis,
   ClosedParenthesis,
   Number
};

struct Token
{
   TokenType	Type;
   double		Value;
   char		Symbol;

   Token():Type(Error), Value(0), Symbol(0)
   {}
};

To be able to do the parsing, we’ll need some helper functions:

  • SkipWhitespaces(), skips all whitespaces between two tokens:
       void SkipWhitespaces()
       {
          while(isspace(m_Text[m_Index])) m_Index++;
       }
    
  • GetNextToken(), extracts the next token from the text; if an illegal token appears it throws an exception
       void GetNextToken()
       {
          // ignore white spaces
          SkipWhitespaces();
    
          m_crtToken.Value = 0;
          m_crtToken.Symbol = 0;
    
          // test for the end of text
          if(m_Text[m_Index] == 0)
          {
             m_crtToken.Type = EndOfText;
             return;
          }
    
          // if the current character is a digit read a number
          if(isdigit(m_Text[m_Index]))
          {
             m_crtToken.Type = Number;
             m_crtToken.Value = GetNumber();
             return;
          }
    
          m_crtToken.Type = Error;
    
          // check if the current character is an operator or parentheses
          switch(m_Text[m_Index])
          {
          case '+': m_crtToken.Type = Plus; break;
          case '-': m_crtToken.Type = Minus; break;
          case '*': m_crtToken.Type = Mul; break;
          case '/': m_crtToken.Type = Div; break;
          case '(': m_crtToken.Type = OpenParenthesis; break;
          case ')': m_crtToken.Type = ClosedParenthesis; break;
          }
    
          if(m_crtToken.Type != Error)
          {
             m_crtToken.Symbol = m_Text[m_Index];
             m_Index++;
          }
          else
          {
             std::stringstream sstr;
             sstr << "Unexpected token '" << m_Text[m_Index] << "' at position " << m_Index;
             throw ParserException(sstr.str(), m_Index);
          }
       }
    
  • GetNumber() extracts a number from the input text from the current position; the purpose of this tutorial is didactical, so this function is quite simple: it reads integers and doubles with '.' As the decimal point; it doesn't read numbers in a format like 123.3E+2.
       double GetNumber()
       {
          SkipWhitespaces();
    
          int index = m_Index;
          while(isdigit(m_Text[m_Index])) m_Index++;
          if(m_Text[m_Index] == '.') m_Index++;
          while(isdigit(m_Text[m_Index])) m_Index++;
    
          if(m_Index - index == 0)
             throw ParserException("Number expected but not found!", m_Index);
    
          char buffer[32] = {0};
          memcpy(buffer, &m_Text[index], m_Index - index);
    
          return atof(buffer);
       }
    

With these defined, we can build the parser for the specified grammar.

class Parser
{
   Token m_crtToken;
   const char* m_Text;
   size_t m_Index;

private:

   void Expression()
   {
      Term();
      Expression1();
   }

   void Expression1()
   {
      switch(m_crtToken.Type)
      {
      case Plus:
         GetNextToken();
         Term();
         Expression1();
         break;

      case Minus:
         GetNextToken();
         Term();
         Expression1();
         break;
      }
   }

   void Term()
   {
      Factor();
      Term1();
   }

   void Term1()
   {
      switch(m_crtToken.Type)
      {
      case Mul:
         GetNextToken();
         Factor();
         Term1();
         break;

      case Div:
         GetNextToken();
         Factor();
         Term1();
         break;
      }
   }

   void Factor()
   {
      switch(m_crtToken.Type)
      {
      case OpenParenthesis:
         GetNextToken();
         Expression();
         Match(')');
         break;

      case Minus:
         GetNextToken();
         Factor();
         break;

      case Number:
         GetNextToken();
         break;

      default:
         {
            std::stringstream sstr;
            sstr << "Unexpected token '" << m_crtToken.Symbol << "' at position " << m_Index;
            throw ParserException(sstr.str(), m_Index);
         }
      }
   }

   void Match(char expected)
   {
      if(m_Text[m_Index-1] == expected)
         GetNextToken();
      else
      {
         std::stringstream sstr;
         sstr << "Expected token '" << expected << "' at position " << m_Index;
         throw ParserException(sstr.str(), m_Index);
      }
   }

   void SkipWhitespaces()
   {
      while(isspace(m_Text[m_Index])) m_Index++;
   }

   void GetNextToken()
   {
      // ignore white spaces
      SkipWhitespaces();

      m_crtToken.Value = 0;
      m_crtToken.Symbol = 0;

      // test for the end of text
      if(m_Text[m_Index] == 0)
      {
         m_crtToken.Type = EndOfText;
         return;
      }

      // if the current character is a digit read a number
      if(isdigit(m_Text[m_Index]))
      {
         m_crtToken.Type = Number;
         m_crtToken.Value = GetNumber();
         return;
      }

      m_crtToken.Type = Error;

      // check if the current character is an operator or parentheses
      switch(m_Text[m_Index])
      {
      case '+': m_crtToken.Type = Plus; break;
      case '-': m_crtToken.Type = Minus; break;
      case '*': m_crtToken.Type = Mul; break;
      case '/': m_crtToken.Type = Div; break;
      case '(': m_crtToken.Type = OpenParenthesis; break;
      case ')': m_crtToken.Type = ClosedParenthesis; break;
      }

      if(m_crtToken.Type != Error)
      {
         m_crtToken.Symbol = m_Text[m_Index];
         m_Index++;
      }
      else
      {
         std::stringstream sstr;
         sstr << "Unexpected token '" << m_Text[m_Index] << "' at position " << m_Index;
         throw ParserException(sstr.str(), m_Index);
      }
   }

   double GetNumber()
   {
      SkipWhitespaces();

      int index = m_Index;
      while(isdigit(m_Text[m_Index])) m_Index++;
      if(m_Text[m_Index] == '.') m_Index++;
      while(isdigit(m_Text[m_Index])) m_Index++;

      if(m_Index - index == 0)
         throw ParserException("Number expected but not found!", m_Index);

      char buffer[32] = {0};
      memcpy(buffer, &m_Text[index], m_Index - index);

      return atof(buffer);
   }

public:
   void Parse(const char* text)
   {
      m_Text = text;
      m_Index = 0;
      GetNextToken();

      Expression();
   }
};

The exception class is defined like this:

class ParserException : public std::exception
{
   int m_Pos;

public:
   ParserException(const std::string& message, int pos):
      std::exception(message.c_str()),
      m_Pos(pos)
   {
   }
};

As you can see, the code for the grammar production is quite simple and straight forward. Now, let's put it to the test.

void Test(const char* text)
{
   Parser parser;
   try
   {
      parser.Parse(text);
      std::cout << """ << text << ""t OK" << std::endl;
   }
   catch(ParserException& ex)
   {
      std::cout << """ << text << ""t " << ex.what() << std::endl;
   }
}

int main()
{
   Test("1+2+3+4");
   Test("1*2*3*4");
   Test("1-2-3-4");
   Test("1/2/3/4");
   Test("1*2+3*4");
   Test("1+2*3+4");
   Test("(1+2)*(3+4)");
   Test("1+(2*3)*(4+5)");
   Test("1+(2*3)/4+5");
   Test("5/(4+3)/2");
   Test("1 + 2.5");
   Test("125");
   Test("-1");
   Test("-1+(-2)");
   Test("-1+(-2.0)");

   Test("   1*2,5");
   Test("   1*2.5e2");
   Test("M1 + 2.5");
   Test("1 + 2&5");
   Test("1 * 2.5.6");
   Test("1 ** 2.5");
   Test("*1 / 2.5");

   return 0;
}

The output for this testing program is:

"1+2+3+4"        OK
"1*2*3*4"        OK
"1-2-3-4"        OK
"1/2/3/4"        OK
"1*2+3*4"        OK
"1+2*3+4"        OK
"(1+2)*(3+4)"    OK
"1+(2*3)*(4+5)"  OK
"1+(2*3)/4+5"    OK
"5/(4+3)/2"      OK
"1 + 2.5"        OK
"125"    OK
"-1"     OK
"-1+(-2)"        OK
"-1+(-2.0)"      OK
"   1*2,5"       Unexpected token ',' at position 6
"   1*2.5e2"     Unexpected token 'e' at position 8
"M1 + 2.5"       Unexpected token 'M' at position 0
"1 + 2&5"        Unexpected token '&' at position 5
"1 * 2.5.6"      Unexpected token '.' at position 7
"1 ** 2.5"       Unexpected token '*' at position 4
"*1 / 2.5"       Unexpected token '*' at position 1

Which is exactly what we expected: it validates correct expressions and throws an exception when the exception is incorrect.

In the next post I'll show how to modify this code to build an abstract syntax tree.

, , , , , , , Hits for this post: 21126 .