c++ - lexical analyzer using c++

Question

I have written a lexical analyzer that reads through a file and displays, for each token, the token type, the lexeme, and whether it carries a value, valuer, or literal (int, float, or string literal).

I am pretty sure the code for the analyzer works, but I can't work out how to write displayToken so that it shows the output in the following format:

Token Lexeme Value/valuer/literal

numt 1234 value

I have this code for the lexical.cpp file:

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <string>
#include <cstring>

#include "lexical.h" 


int value ; // integer value of a numeric token (assigned with atoi in NumToken)
float valuer ; // real value of a numeric token (assigned with atof in NumToken)
char ch ; // most recently read input character (filled by fin.get)
string lexeme ; // text of the token being scanned
SYMBOL Token ; // kind of the current token
string Literal ; // characters of the current string literal (written by ProcessLiteralToken)

string reswords[15] ; // reserved-word spellings, indexed by the SYMBOL values begint..endt

using namespace std;


// Constructor: makes the scanner ready to use.
//
// Two pieces of set-up have to happen here because nothing else can do them:
//   * InitResWords() is private, so only the class itself can fill the
//     reserved-word table.
//   * The scanner reads from its private member `fin`; a stream opened by a
//     caller is a different object and is never consulted.
// The input is read from "test.txt" in the current working directory. If the
// file cannot be opened the stream is left in a failed state.
Lexical::Lexical()
{
    InitResWords() ;
    fin.open("test.txt") ;
}

// Destructor: intentionally empty, there is nothing to release by hand.
Lexical::~Lexical() {}

void Lexical::GetNextToken()
{
           // getting the lexeme *******************
           // **************************************

            value = 0 ;
            valuer = 0.0 ;

            int i = 1 ; // line tracker 

            while(!(fin.eof()))
            {
                lexeme[i] = ch ;
                fin.get(ch);
            }

            if(fin.peek() == '\n')
            {
                i++ ; // increment the line !
            }
            if(!fin.eof())
            {
                ProcessToken() ;
            }
            else
            {
                fin.close() ;
                Token = eofilet ; // set to end of file 
            }





    }







//rules not implemented yet
void Lexical::displayToken() 
{
    cout << "Token" << "           " << "Lexeme" << "           " << " value/valuer/Literal " << endl ;

    cout << Token <<   "           " << lexeme <<   "           " <<  value ;

}



// Fills the reserved-word table.
//
// reswords[] is indexed by the SYMBOL value of each reserved word, so a match
// at index j can be turned into a token with a plain cast.
//
// Plain assignment is used on purpose: std::string::copy(dest, n, pos) copies
// characters OUT of the string into `dest`, so it can never fill the table.
void Lexical::InitResWords()
{
    reswords[begint]     = "BEGIN" ;
    reswords[programt]   = "PROGRAM" ;
    reswords[constt]     = "CONST" ;
    reswords[vart]       = "VAR" ;
    reswords[proceduret] = "PROCEDURE" ;
    reswords[ift]        = "IF" ;
    reswords[whilet]     = "WHILE" ;
    reswords[thent]      = "THEN" ;
    reswords[elset]      = "ELSE" ;
    reswords[realt]      = "REAL" ;
    reswords[integert]   = "INTEGER" ;
    reswords[booleant]   = "BOOLEAN" ;
    reswords[chart]      = "CHAR" ;
    reswords[arrayt]     = "ARRAY" ;
    reswords[endt]       = "END" ;
}

void Lexical::ProcessToken()
{
    lexeme.at(0) = ch ; // 1 character at a time 
    fin.get(ch) ;

    if((lexeme.at(0) >= 'A' && lexeme.at(0) <= 'Z') || (lexeme.at(0) >= 'a' && lexeme.at(0) <= 'z')) // if alphabets
    {
        int counter = 0 ;
        //match word token
            if(!isdigit(lexeme.at(0)) && !isalpha(lexeme.at(0)) && lexeme.at(0) != '_')
            {
            //*********** Working with reserved words !!! **************************************************
                        int j = 0 ;
                        bool flag = false ;
                        while(j < endt)
                            {
                                if(lexeme.compare(reswords[j]) == 0)
                                {
                                    Token = (SYMBOL)j;
                                    displayToken();
                                    flag = true ;
                                }
                            }       
            //**********************************************************************************************

            // if not a token , then we are alrea
                        Token = idt ;// then an identifier token
                        displayToken();
                        return ;
            }// if ends

            lexeme.at(counter) = ch ;// keep proceeding 
            fin.get(ch) ;
    }
    else if (lexeme.at(0) >= '0' && lexeme.at(0) <= '9') // if numbers
    {
        NumToken() ; 
    }
    else if (lexeme.at(0) == '\"') // for string literal
    {
        ProcessLiteralToken();
    }
    else if (lexeme.at(0) == '/') // entering comment section
      {
                if (ch == '/' || ch == '*')
                {
                       // MatchComment();
                        if (ch == '/') // start of a comment maybe ?
                            {
                            //Line comment
                              while(ch != '\n')
                                fin.get(ch);
                            }
                        else if(ch == '*') // end of a comment ?
                            {
                                  while(true) 
                                {
                                  fin.get(ch);
                                  if (ch == '*') 
                                  {
                                       char peek_value = fin.peek();
                                       if (peek_value == '/') 
                                           {
                                               fin.get(ch);
                                             //  fin.get(ch);
                                               return;
                                           }
                                      else 
                                          continue;
                                  }
                               } // while ends
                            }
                       else
                          {
                            cout << "ERROR !!!" ;
                          }

                      GetNextToken();
                      } // comment analyzer then moves to next token !
                else 
                {
                  OpToken();
                }
      }
  else if ((lexeme.at(0) == '<') || (lexeme.at(0) == '>') || (lexeme.at(0) == '='))
      {
        if (ch == '=')
          {
             lexeme.at(1) = ch;
             Token = relop ;
             fin.get(ch);
          }
        else
          OpToken() ; // process the final token
      }
  else if ((lexeme.at(0)) == ':')
      {
          if (ch == '=')
          {
             lexeme.at(1) = ch;
             Token = relop ;
             fin.get(ch);
          }
     }
  else
     OpToken();
}
// Sets Token for an operator or punctuation lexeme.
//
// Recognised: + - OR (addop), * / DIV MOD AND (mulop), < > = (relop),
// ( ) { } [ ] , ; . ~ and # (nott).
//
// The word operators are compared against the whole lexeme. A multi-character
// constant such as 'OR' is an int with an implementation-defined value and can
// never equal a single char, so those comparisons must be string comparisons.
//
// Anything else is an error: Token becomes unknownt, a message is printed and
// the program terminates with a failure status.
void Lexical::OpToken()
{
    if (lexeme == "OR")
    {
        Token = addop ;
        return ;
    }
    if (lexeme == "DIV" || lexeme == "MOD" || lexeme == "AND")
    {
        Token = mulop ;
        return ;
    }

    // An empty lexeme has no first character; route it to the error case
    // instead of letting at(0) throw.
    const char first = lexeme.empty() ? '\0' : lexeme[0] ;

    switch (first)
    {
        case '+':
        case '-':
            Token = addop ;
            break ;

        case '*':
        case '/':
            Token = mulop ;
            break ;

        case '<':
        case '>':
        case '=':
            Token = relop ;
            break ;

        case '(': Token = lparen ;    break ;
        case ')': Token = rparen ;    break ;
        case '{': Token = clbrat ;    break ;
        case '}': Token = crbrat ;    break ;
        case ',': Token = comma ;     break ;
        case ';': Token = semicolon ; break ;
        case '.': Token = period ;    break ;
        case '~': Token = tildat ;    break ;
        case '[': Token = lbrat ;     break ;
        case ']': Token = rbrat ;     break ;
        case '#': Token = nott ;      break ;

        default:
            Token = unknownt ;
            cout << "Error !!" ;
            exit(EXIT_FAILURE) ; // an error must not report success to the shell
    }
}

void Lexical::NumToken()
{
 int i = 0;
  while (isdigit(ch)) 
  {
    //Update Lexeme and keep going
    lexeme.at(i++) = ch;
    fin.get(ch);
  }

  if (ch == '.') // checking for float 
  {
    //if ch is a period, then we might be dealing with a float. We need to ensure that the next character is a digit
    lexeme.at(i++) = ch;
    fin.get(ch);
    if (isdigit(ch)) 
    {
      //ch is a digit, so we are good to go.
      while (isdigit(ch)) 
      {
        lexeme.at(i++) = ch;
        fin.get(ch);
      }
      //If we are here, then we have a float and we have just encountered a new token
      Token = numt;
      valuer = atof(lexeme.c_str());
      return;
    }
    //If we are here, then we have a period but no digit after it--an error
    Token = unknownt ;
    cout << "Error." << endl;
    exit(0);
  }
  else {
    //If we are here, then it means that the next char is not a period.... so we have a NUMT int token
    Token = numt;
    value = atoi(lexeme.c_str());
    return;
  }

}

void Lexical::ProcessLiteralToken()
{

  int i = 0;
  while (ch != '\"') 
  {
    if (ch == '\n') 
    {
      Token = unknownt ;
      cout << "Error!" << endl ;
      exit(0);
    }
    Literal.at(i++) = ch ; // advance !
    fin.get(ch);
  }
  //Literal[i++] = ch;
  Token = literalt;
  fin.get(ch);
  return;




}

My lexical.h (header file) is this:

#ifndef _LEXICAL_H
#define _LEXICAL_H

#include <iostream>
#include <fstream>


using namespace std ; 


            // Token kinds produced by the scanner. The numeric values are the
            // default ones (0, 1, 2, ...) and must stay in this order.
            enum SYMBOL
            {
                // Reserved words: these values double as indices into reswords[15].
                begint,
                programt,
                constt,
                vart,
                proceduret,
                ift,
                whilet,
                thent,
                elset,
                realt,
                integert,
                booleant,
                chart,
                arrayt,
                endt,

                // Word-operator kinds.
                divt,
                modt,
                andt,
                nott,
                ort,

                // Operator classes and punctuation.
                addop,
                mulop,
                assignop,
                lparen,
                rparen,
                comma,
                semicolon,
                period,

                // Numbers, identifiers, string literals and special markers.
                numt,
                idt,
                literalt,
                unknownt,
                eofilet,

                // Relational operators and bracket-like punctuation.
                relop,
                clbrat,
                crbrat,
                tildat,
                lbrat,
                rbrat,
                colon
            };

            // Scanner state shared between the lexer and its callers.
            //extern int size = 15 ;
            extern int value ; // integer value of a numeric token
            extern float valuer ; // real (floating-point) value of a numeric token
            extern char ch ; // most recently read input character
            extern string lexeme ; // text of the token being scanned
            extern SYMBOL Token ; // kind of the current token
            extern string Literal ; // characters of the current string literal
            extern string reswords[15] ; // reserved-word spellings, one per reserved-word SYMBOL

// Scanner that reads characters from its own input stream and reports tokens
// through the shared globals declared above (Token, lexeme, value, ...).
class Lexical
{
public:
    Lexical() ;
    ~Lexical() ;

    // Scans the next token; Token is set to eofilet at end of input.
    void GetNextToken() ;

    // Prints the current token to standard output.
    void displayToken() ;

private:
    // Sets up the reserved-word table (reswords).
    void InitResWords() ;

    // Classifies the token starting at the current character and dispatches
    // to the helpers below.
    void ProcessToken() ;

    // Handles operator and punctuation tokens.
    void OpToken() ;

    // Handles numeric tokens.
    void NumToken() ;

    // Handles string-literal tokens.
    void ProcessLiteralToken() ;

    ifstream fin ; // input stream the scanner reads from
} ;


#endif // !_lexical_H

The code is given in case anyone wants to go through it, for any function.

I just cannot figure out how to display my tokens correctly (displayToken). Can someone help me with the function so that, for every token it parses, it displays the token in this format: Token ----- Lexeme ------- Value/valuer/literal, for example: numt ----- 1234 ------ value

I just want to write the displayToken function. How would you display the lexeme, the token, and whether it is a value, valuer, or literal after each token is processed?

This is my driver file:

#include "lexical.h"

#include <iostream>
#include <fstream>

using namespace std;

// Driver: scans the input one token at a time and prints every token,
// including the final end-of-file token.
int main()
{
   cout << "creating the constructor" << endl << endl ;
   Lexical myLex ;

   // No stream is opened here. The lexer reads from its own private stream,
   // so a local ifstream would be a separate object that the lexer never
   // sees: opening and closing one here has no effect on the scan.

   while (Token != eofilet)
   {
      myLex.GetNextToken();
      myLex.displayToken();
   }

   cout << endl << "success" << endl ;
   //system("pause");
   return 0;

}

score 2 · Accepted Answer

你对 '\n' 调用了 peek，但我看不出你在哪里消费（读取）了那个字符。

实现简单词法分析器/解析器的最简单方法是 Wirth 在 Pascal 示例中的做法（不记得在哪里，抱歉）：

你总是有一个字符前瞻，你在一开始就读一个字符作为前瞻（在你的构造函数中）
每次您开始处理一个标记时，您都会查看前瞻字符，并根据它决定接下来会出现什么（例如，数字 --> 数值、字母 -->（暂定）标识符，……）。要跳过空白，您只需不断读取下一个字符，直到它不是 ' '（或 '\t'）为止；要统计行数，则在遇到 '\n' 时将行计数加一，然后读取下一个字符。
每当您需要一个字符时，就消费当前的前瞻字符，并读取下一个字符作为新的前瞻。

解析器（递归下降）反过来有一个用于前瞻标记的变量，并按照类似的方式工作。

这样，您就不必使用 peek，也不会弄不清应该在什么时候读取字符。

c++ - lexical analyzer using c++

1 回答 1

Related

Reference