3

我正在将我的一个 C++ 项目(一个简单的 DSL)转换为 rust 作为学习 rust 的练习,但我在嵌套结构和所有权方面遇到了麻烦。我很难转换一些东西,比如:

// Data attached to an include-file token: an encoding flag and the file name.
struct FileData {
    // Encoding flag for the included file (true is presumably "UTF-8").
    bool is_utf8;
    // Name of the included file.
    std::string file_name;
};

class Token {
public:
    enum TokenType {
        REGULAR,
        INCLUDE_FILE,

    }

    Token() {
        _type = REGULAR;
    }

    Type get_type() const { return _type; }

    void beginIncludeFile() {
        _type = INCLUDE_FILE;
        _include_data = std::unique_ptr<FileData>(new FileData);
    }

    bool is_utf8() const {
        assert(get_type() == INCLUDE_FILE);
        return _include_data->is_utf8; 
    }

    void set_utf8(bool value) { 
        assert(get_type() == INCLUDE_FILE);
        _include_data->is_utf8 = value; 
    }

    const std::string& get_file_name() const { 
        assert(get_type() == INCLUDE_FILE);
        return _include_data->file_name; 
    }

    void setFileNameToEmpty() {
        assert(get_type() == INCLUDE_FILE);
        _include_data->file_name = "";
    }

    void appendToFileName(char c) { 
        assert(get_type() == INCLUDE_FILE);
        _include_data->file_name += c;
    }

    FileData* releaseFileData() { return _include_data.release(); }
private:
    std::unique_ptr<FileData> _include_data;
    TokenType _type;
};

我为此编写的 Rust 代码是:

use std::str;

/// Data carried by an include-file token.
pub struct FileData {
    // Encoding flag; the `FileData()` constructor function sets it to `true`.
    is_utf8 : bool,
    // Owned string holding the file name.
    file_name : ~str
}

/// Builds a `FileData` with `is_utf8` set to `true` and an empty `file_name`.
pub fn FileData() -> FileData {
    FileData { is_utf8 : true, file_name : ~"" }
}

/// The kinds of token: a regular token or an include-file token.
enum TokenType {
    REGULAR,
    INCLUDE_FILE
}

/// A lexer token: its kind plus an optional, heap-allocated `FileData`.
pub struct Token {
    // `None` until `beginIncludeFile` stores a `FileData` here.
    priv _include_data : Option<~FileData>,
    // Current kind of the token.
    priv _type : TokenType
}

/// Builds a `REGULAR` token that carries no `FileData`.
pub fn Token() -> Token {
    Token {
        _include_data: None,
        _type : REGULAR
    }
}

impl Token {
    /// Returns the token's current kind.
    pub fn get_type(&self) -> TokenType {
        self._type
    } 

    /// Marks the token as `INCLUDE_FILE` and stores a freshly boxed `FileData`.
    pub fn beginIncludeFile(&mut self) {
        self._type = INCLUDE_FILE;
        self._include_data = Some(~FileData());
    }

    /// Returns the `is_utf8` flag of the stored `FileData`.
    /// Fails with "No FileData" when the token has none.
    pub fn is_utf8(&self) -> bool {
        match self._include_data {
            Some(ref data) => data.is_utf8,
            _ => fail!("No FileData")
        }
    }

    /// Sets the `is_utf8` flag of the stored `FileData`.
    // NOTE(review): unlike the other accessors there is no explicit
    // "No FileData" failure here; presumably `mutate` does nothing when
    // `_include_data` is `None` -- confirm.
    pub fn set_utf8(&mut self, value : bool) {
        self._include_data.mutate(|mut data| {
            data.is_utf8 = value;
            data
        });
    }

    // Returns a read-only reference to the file name (a borrow, not a copy).
    // Fails with "No FileData" when the token has none.
    pub fn get_file_name(&self) -> &~str {
        match self._include_data {
            Some(ref data) => &data.file_name,
            _ => fail!("No FileData")
        }
    }

    /// Replaces the file name with an empty string.
    /// Fails with "No FileData" when the token has none.
    // NOTE(review): `data` is bound with `ref` but assigned through; this
    // likely needs `ref mut` -- confirm.
    pub fn setFileNameToEmpty(&mut self) {
        match self._include_data {
            Some(ref data) => data.file_name = ~"",
            _ => fail!("No FileData")
        }
        return;
    }

    /// Appends `c` to the file name.
    /// Fails with "No FileData" when the token has none.
    // NOTE(review): same `ref` versus `ref mut` concern as in
    // `setFileNameToEmpty` -- confirm.
    pub fn appendToFileName(&mut self, c : char) {
        match self._include_data {
            Some(ref data) => data.file_name.push_char(c),
            _ => fail!("No FileData")
        }
        return;
    }

    /// Intended to hand the boxed `FileData` to the caller.
    /// Fails with "No FileData" when the token has none.
    // NOTE(review): `*data` tries to move the box out from behind a `ref`
    // borrow, which looks like one of the ownership problems the question
    // describes -- confirm.
    pub fn getIncludeData(&mut self) -> ~FileData {
        match self._include_data {
            Some(ref data) => *data,
            _ => fail!("No FileData")
        }
    }
}

/// States of the include-directive lexer driven by `main`.
enum LexState {
    INITIAL,
    EXPECT_COLON,
    EXPECT_ENCODING,
    EXPECT_QUOTE,
    IN_FILENAME_STRING,
    EXPECT_SEMI
}

/// Equality for `LexState`, so states can be compared with `==` in match guards.
impl Eq for LexState {
    // Two states are equal when their values cast to `int` are equal.
    fn eq(&self, other: &LexState) -> bool {
        return (*self as int) == (*other as int);
    }
    // Negation of `eq`.
    fn ne(&self, other: &LexState) -> bool {
        !self.eq(other)
    }
}

/// Lexes a hard-coded include directive one character at a time, tracking
/// progress in `state` and filling in the token `t`.
fn main() {
    let mut t = ~Token();
    let input = ~"include:utf8 \"file_path/file.foo\";";
    let iter = input.iter();
    // Characters accumulated since the buffer was last cleared.
    let mut buf : ~str = ~"";

    let mut state : LexState = INITIAL;

    // Helper: clears `buf`, then runs `action`.
    let buf_action = |action : &fn()| {
        buf = ~"";
        action();
    };

    while true {
        // Append the next character to `buf`, or stop once the input is exhausted.
        let c = iter.next();
        match c {
            None => break,
            Some(_c) => buf.push_char(_c)
        }

        // Arms are tried top to bottom; the guards select the arm that
        // applies to the current state.
        match buf {
            // Initial state
            ~"include" if state == INITIAL => buf_action(|| { 
                t.beginIncludeFile();
                state = EXPECT_COLON;
            }),

            // Expecting either an encoding, or the start of the file name
            ~":" if state == EXPECT_COLON => buf_action(|| { state = EXPECT_ENCODING; }),
            _   if state == EXPECT_COLON => state = EXPECT_QUOTE, // match WS

            // utf8 is the only encoding accepted at the moment
            ~"utf8" if state == EXPECT_ENCODING => buf_action(|| {
                t.set_utf8(true);
                state = EXPECT_QUOTE;
            }),
            _ if state == EXPECT_ENCODING => t.set_utf8(false),

            // Looking for string start
            ~"\"" if state == EXPECT_QUOTE => buf_action(||{ state = IN_FILENAME_STRING; }),
            _ if state == EXPECT_QUOTE => (), // ignore other chars

            // Reading filename
            ~"\"" if state == IN_FILENAME_STRING => buf_action(|| {
                state = EXPECT_SEMI;
            }),
            _ if state == IN_FILENAME_STRING => t.appendToFileName(c.unwrap()),

            // End of lex
            // NOTE(review): the input ends with `;` and the failure message
            // says "semi", so this pattern looks like it should be `~";"`
            // rather than `~":"` -- confirm.
            ~":" if state == EXPECT_SEMI => break,
            _   if state == EXPECT_SEMI => fail!("Expected semi"),

            // No arm above applied to this buffer/state combination.
            _ => fail!("Unexpected character: " + str::from_char(c.unwrap()))

        }
    }
    return;
}

这种代码的惯用 Rust 写法是什么?

4

1 回答 1

5

Rust is sufficiently different to C++ that a straight line by line translation will give non-idiomatic code. This isn't really a full answer, just a collection of bits and pieces:


When returning information from inside a structure, writing the function as fn foo<'a>(&'a self) -> &'a SomeInformation is the normal way (with str's and []'s treated specially): so

/// Returns the file name as a string slice borrowed from `self` for lifetime `'a`.
/// Fails with "No FileData" when the token has none.
pub fn get_file_name<'a>(&'a self) -> &'a str {
    match self._include_data {
        Some(ref data) => &data.file_name,
        _ => fail!("No FileData")
    }
}

/// Returns a reference to the stored `FileData`, borrowed from `self` for
/// lifetime `'a`; ownership stays with the token.
/// Fails with "No FileData" when the token has none.
pub fn getIncludeData<'a>(&'a self) -> &'a FileData {
    match self._include_data {
        // `&*data` reborrows the contents of the box as a plain reference.
        Some(ref data) => &*data,
        _ => fail!("No FileData")
    }
}

The 'a marker is a named lifetime, which connects how long the return value is valid for with the period that the self object is valid for; this means that dangling pointers are impossible (ignoring compiler bugs).

A collection of things with the match:

  • match expressions are checked for completeness, so flipping it around (matching on state rather than buf) is type-safer.

  • a match has a return value, so you can set the state "magically".

  • the buf_action function is peculiar (I assume that it normally does more?), it could either be changed so that buf_action(foo) is written as clear_buf(); foo, or, at the very least, should return the value of the inner closure, so

    // Clears `buf`, then runs `f` and evaluates to whatever `f` returns.
    let buf_action = |f| { buf = ~""; f() } // note the lack of semicolon after f
    
  • There is a special sugar for calling functions where the last argument is a function: do buf_action { some; actions(); here; }. (When the closure has arguments, do f |a,b,c| { x; y; z }.)

    // Advance the lexer by matching on the current state. Each arm evaluates
    // to the next state, which is assigned back to `state`.
    state = match state {
        // Initial state
        INITIAL if "include" == buf => do buf_action {
            t.beginIncludeFile();
            EXPECT_COLON
        },

        // Expecting either an encoding, or the start of the file name
        EXPECT_COLON => if ":" == buf {
            buf_action(|| EXPECT_ENCODING)
        } else {
            EXPECT_QUOTE
        },

        // utf8 is the only encoding accepted at the moment
        EXPECT_ENCODING => match buf {
            ~"utf8" => do buf_action { t.set_utf8(true); EXPECT_QUOTE },
            _ => { t.set_utf8(false); EXPECT_ENCODING } // this is probably incorrect?
        },

        // Looking for string start
        EXPECT_QUOTE => if "\"" == buf {
            buf_action(|| IN_FILENAME_STRING)
        } else {
            EXPECT_QUOTE // ignore other chars
        },

        // Reading filename
        IN_FILENAME_STRING => if "\"" == buf {
            buf_action(|| EXPECT_SEMI)
        } else {
            t.appendToFileName(c.unwrap());
            IN_FILENAME_STRING
        },

        // End of lex
        // NOTE(review): the input ends with `;` and the failure message says
        // "semi", so this comparison looks like it should be against ";"
        // rather than ":" -- kept as in the question; confirm.
        EXPECT_SEMI => if ":" == buf {break} else {fail!("Expected semi")},

        // `c` is an Option here (see the `c.unwrap()` above), so unwrap it
        // before formatting it with %c.
        _ => fail!("Unexpected character: %c", c.unwrap())
    };

Also, while true should be loop; but in fact, the loop should be written:

// Runs the lexer step once per character of `input`. Here `c` is pushed
// straight onto `buf`, i.e. it is the character itself rather than an Option.
for input.iter().advance |c| {
    buf.push_char(c);
    state = match state { ... }
}

Minor points:

  • Option<~FileData>, let mut t = ~Token(); → Option<FileData>, let mut t = Token();. These allocations are unnecessary.

  • lowercase_with_underscores seems to be Rust naming convention.

  • The Eq impl you have can be automatically created by the compiler via #[deriving(Eq)] enum LexState { ... }. (Described in more detail in the tutorial and manual.)

  • It is idiomatic to avoid allocations where possible, and this would include using slices (s.slice(byte_start, byte_end)) into input rather than pushing the characters onto buf; i.e. recording a start index for the current token and "clearing" the buffer by setting this index to the current index; however, this might be a little tricky to implement.

于 2013-07-15T07:19:12.537 回答