1

我是 Go 新手,正在尝试利用 Go 中的并发性来构建一个基本的抓取工具,以从 URL 中提取标题、元描述和元关键字。

我能够以并发方式将结果打印到终端,但无法弄清楚如何将输出写入 CSV。我已经尝试了许多我可以在 Go 知识有限的情况下想到的变体,并且许多变体最终破坏了并发性 - 所以有点失去理智。

我的代码和 URL 输入文件如下 - 提前感谢您的任何提示!

// file name: metascraper.go
package main

import (
    // import standard libraries
    "encoding/csv"
    "fmt"
    "io"
    "log"
    "os"
    "time"
    // import third party libraries
    "github.com/PuerkitoBio/goquery"
)

func csvParsing() {
    file, err := os.Open("data/sample.csv")
    checkError("Cannot open file ", err)

    if err != nil {
        // err is printable
        // elements passed are separated by space automatically
        fmt.Println("Error:", err)
        return
    }

    // automatically call Close() at the end of current method
    defer file.Close()
    //
    reader := csv.NewReader(file)
    // options are available at:
    // http://golang.org/src/pkg/encoding/csv/reader.go?s=3213:3671#L94
    reader.Comma = ';'
    lineCount := 0

    fileWrite, err := os.Create("data/result.csv")
    checkError("Cannot create file", err)
    defer fileWrite.Close()

    writer := csv.NewWriter(fileWrite)
    defer writer.Flush()

    for {
        // read just one record
        record, err := reader.Read()
        // end-of-file is fitted into err
        if err == io.EOF {
            break
        } else if err != nil {
            fmt.Println("Error:", err)
            return
        }

        go func(url string) {
            // fmt.Println(msg)
            doc, err := goquery.NewDocument(url)
            if err != nil {
                checkError("No URL", err)
            }

            metaDescription := make(chan string, 1)
            pageTitle := make(chan string, 1)

            go func() {
                // time.Sleep(time.Second * 2)
                // use CSS selector found with the browser inspector
                // for each, use index and item
                pageTitle <- doc.Find("title").Contents().Text()

                doc.Find("meta").Each(func(index int, item *goquery.Selection) {
                    if item.AttrOr("name", "") == "description" {
                        metaDescription <- item.AttrOr("content", "")
                    }
                })
            }()
            select {
            case res := <-metaDescription:
                resTitle := <-pageTitle
                fmt.Println(res)
                fmt.Println(resTitle)

                // Have been trying to output to CSV here but it's not working

                // writer.Write([]string{url, resTitle, res})
                // err := writer.WriteString(`res`)
                // checkError("Cannot write to file", err)

            case <-time.After(time.Second * 2):
                fmt.Println("timeout 2")
            }

        }(record[0])

        fmt.Println()

        lineCount++
    }
}

func main() {

    csvParsing()

    //Code is to make sure there is a pause before program finishes so we can see output
    var input string
    fmt.Scanln(&input)
}

func checkError(message string, err error) {
    if err != nil {
        log.Fatal(message, err)
    }
}

带有 URL 的 data/sample.csv 输入文件:

    http://jonathanmh.com
    http://keshavmalani.com
    http://google.com
    http://bing.com
    http://facebook.com
4

1 回答 1

0

在您提供的代码中,您注释了以下代码:

// Have been trying to output to CSV here but it's not working
err = writer.Write([]string{url, resTitle, res})
checkError("Cannot write to file", err)

这段代码是正确的,除了你有一个问题。在函数的前面,您有以下代码:

fileWrite, err := os.Create("data/result.csv")
checkError("Cannot create file", err)
defer fileWrite.Close()

csvParsing()一旦您的func 退出,此代码将导致 fileWriter 关闭。因为您已经使用 defer 关闭了 fileWriter,所以您无法在并发函数中写入它。

解决方案: 您需要defer fileWrite.Close() 在并发 func或类似的内部使用,以便在写入之前不要关闭 fileWriter。

于 2017-09-02T07:17:02.893 回答