xml - 比较游标与管道 xml 解析

Question

下面是我正在解析的一段 xml——这是我第一次使用 xml、soap 和 conduit。

<?xml version  = "1.0" 
      encoding = "utf-8"?>
<soap:Envelope xmlns:soap = "http://schemas.xmlsoap.org/soap/envelope/" 
               xmlns:xsi  = "http://www.w3.org/2001/XMLSchema-instance" 
               xmlns:xsd  = "http://www.w3.org/2001/XMLSchema">
<soap:Body>
<GetListItemsResponse xmlns = "http://schemas.microsoft.com/sharepoint/soap/">
<GetListItemsResult>
<listitems xmlns:s  = 'uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882'
           xmlns:dt = 'uuid:C2F41010-65B3-11d1-A29F-00AA00C14882'
           xmlns:rs = 'urn:schemas-microsoft-com:rowset'
           xmlns:z  = '#RowsetSchema'>
<rs:data ItemCount = "290">
<z:row ows_Date              = '2020-10-20 00:00:00' 
       ows_Document          = 'https://www.oregon.gov/oha/PH/DISEASESCONDITIONS/DISEASESAZ/Emerging%20Respitory%20Infections/Oregon-COVID-19-Update-10-20-2020-FINAL.pdf, Oregon COVID-19 Daily Update 10.20.2020' 
       ows_Category          = 'Daily Update' 
       ows_MetaInfo          = '294;#' 
       ows__ModerationStatus = '0' 
       ows__Level            = '1' 
       ows_ID                = '294' 
       ows_UniqueId          = '294;#{C51D9DDB-9A9C-4C56-B030-236D6A0980D2}' 
       ows_owshiddenversion  = '1' 
       ows_FSObjType         = '294;#0' 
       ows_Created           = '2020-10-20 12:16:49' 
       ows_PermMask          = '0x1000030041' 
       ows_Modified          = '2020-10-20 12:16:49' 
       ows_FileRef           = '294;#oha/ERD/Lists/COVID19 Updates/294_.000' />
</rs:data>
</listitems>
</GetListItemsResult>
</GetListItemsResponse>
</soap:Body>
</soap:Envelope>

我只想保留 ows_Category 为 Weekly Report 且 ows_Document 不包含 Spanish 的记录。我的 cursor 版本很容易就跑通了。conduit 版本要复杂得多，但我最终借助这个问题的答案弄清楚了。

尽管这两种方法现在都有效，但我有几个问题。

conduit 方法中是否有与 lax 等效的、可以忽略命名空间的做法？
cursor 版本的函数里为什么需要 concat？从类型来看，我们从根节点开始，生成并维护一个待考察的相关节点列表，对它们进行 filter、map 等操作。多出来的那一层嵌套是什么，为什么会有？
conduit 版本需要辅助函数 f（到处调用 force）和 ns（给所有名称加上命名空间）——它们似乎必不可少，以至于我认为库本身应该把它们作为工具函数提供，因为每个人都会一直需要它们。还是说我的做法本身就有问题？
我遇到的最大问题是：结果发现 GetListItemsResult 和 listitems 也需要使用 gliRspNS 命名空间，尽管从 xml 上看它似乎只应该适用于 GetListItemsResponse。我只是侥幸猜对才过了这一关。命名空间是否本来就会向下继承，直到像这样被覆盖为止？
关于requireAttrRaw：
- 如果由我们负责校验 Name，难道不需要知道命名空间吗？
- 为什么 requireAttrRaw 传给我们的是 [Content]，而不是两个 Maybe Content（一个对应 ContentText，另一个对应 ContentEntity）？
- 我们应该如何处理ContentEntity“For pass-through parsing”？

{-# LANGUAGE OverloadedStrings #-}

import           Conduit
import           Control.Applicative
import           Control.Arrow
import           Control.Exception
import           Control.Monad
import qualified Data.ByteString.Lazy.Char8 as L8
import           Data.Foldable
import qualified Data.Map.Strict            as M
import           Data.String
import qualified Data.Text                  as T
import           Data.Time.Calendar
import           Data.Time.Format
import           Data.XML.Types
import qualified Text.XML                   as X
import           Text.XML.Cursor            hiding (force)
import           Text.XML.Stream.Parse

-- | One document entry extracted from a @row@ element of the list response.
data Doc = Doc
  { url  :: String -- ^ Part of the @ows_Document@ value before its first comma.
  , name :: String -- ^ Remainder of the @ows_Document@ value after the comma and the character following it.
  , date :: Day    -- ^ Calendar day parsed from the leading (pre-space) part of @ows_Date@.
  } deriving (Show)

-- | Read @oha.xml@ and extract the matching documents twice: once with the
-- DOM-style cursor API ('go') and once with the streaming conduit API
-- ('parseDocs').  For each approach every 'Doc' is printed, followed by the
-- number of documents found.
--
-- A row matches when its @ows_Category@ is exactly @Weekly Report@ and its
-- @ows_Document@ does not contain @Spanish@.
main :: IO ()
main = do
  r <- L8.readFile "oha.xml"

  let go :: Cursor -> [Doc]
      -- '&|' maps the final lambda over every selected cursor, and that lambda
      -- itself produces a list (the list applicative over the values returned
      -- by 'attribute'), so the axis yields a list of lists; 'concat' flattens it.
      go c = concat $
        c $// laxElement  "row"
          >=> attributeIs "ows_Category" "Weekly Report"
          >=> checkElement (maybe False (not . T.isInfixOf "Spanish") . M.lookup "ows_Document" . X.elementAttributes)
          &|  \x -> doc <$> attribute "ows_Document" x <*> attribute "ows_Date" x

      -- Build a 'Doc' from the raw @ows_Document@ value (split at its first
      -- comma into url and name) and the raw @ows_Date@ value (only the part
      -- before the first space is parsed).
      -- NOTE: 'parseTimeOrError' raises an error on a malformed date.
      doc :: T.Text -> T.Text -> Doc
      doc x = Doc u v . parseTimeOrError True defaultTimeLocale "%Y-%-m-%-d" . takeWhile (/= ' ') . T.unpack
        where (u,v) = second (drop 2) . break (== ',') $ T.unpack x

      -- Two equivalent attribute parsers returning (ows_Document, ows_Date);
      -- only 'parseAttributes' is used by 'parseDocs'.
      parseAttributes, parseAttributes' :: AttrParser (T.Text, T.Text)

      -- Monadic variant: read the attributes first, then check the conditions.
      parseAttributes' = do
        docAttr  <- requireAttr "ows_Document"
        catAttr  <- requireAttr "ows_Category"
        dateAttr <- requireAttr "ows_Date"
        ignoreAttrs
        guard $ not (T.isInfixOf "Spanish" docAttr) && catAttr == "Weekly Report"
        return (docAttr, dateAttr)

      -- since the attribute values don't interact, we can parse in Applicative rather than Monad
      parseAttributes = (,) <$> requireAttrRaw' "ows_Document" (not . T.isInfixOf "Spanish")
                            <*> requireAttr     "ows_Date"
                            <*  requireAttrRaw' "ows_Category" ("Weekly Report" ==)
                            <*  ignoreAttrs
        where
          -- Require an attribute named @n@ whose complete value satisfies @p@
          -- and return that value.  The attribute 'Name' already carries its
          -- namespace, so comparing against @fromString n@ (a name without a
          -- namespace for a bare string) is enough for unprefixed attributes.
          --
          -- The value arrives as a list of chunks because text and entity
          -- references may be interleaved.  All chunks are joined before the
          -- predicate runs, so a value split into several chunks is tested and
          -- returned as a whole, and an entity chunk no longer triggers a
          -- pattern-match failure.
          requireAttrRaw' n p =
            requireAttrRaw ("required attr value failed condition: " <> n) $ \(n', cs) -> do
              guard (n' == fromString n)
              let v = T.concat (renderContent <$> cs)
              guard (p v)
              pure v

          -- Unresolved entities are kept in their source form @&name;@.
          -- NOTE(review): believed to match how 'requireAttr' flattens
          -- content internally -- confirm against the xml-conduit sources.
          renderContent (ContentText t)   = t
          renderContent (ContentEntity e) = "&" <> e <> ";"

      -- Build a namespaced name in Clark notation: @{namespace}local@.
      ns n = fromString . (("{" <> n <> "}") <>)

      -- Apply tag parser @g@ to the element @s@ in namespace @n@ and fail
      -- with "<s> required" when that element is absent.
      f g n s = force (s <> " required") . g (ns n s)

      -- Streaming version: descend through the SOAP envelope to the rowset
      -- and collect one 'Doc' per @row@ whose attributes satisfy
      -- 'parseAttributes'.
      parseDocs :: (MonadThrow m, MonadIO m) => ConduitT Event o m [Doc]
      parseDocs = f tagNoAttr      soapNS   "Envelope"
                . f tagNoAttr      soapNS   "Body"
                . f tagNoAttr      gliRspNS "GetListItemsResponse"
                -- The default namespace declared via @xmlns=@ on
                -- GetListItemsResponse applies to its unprefixed descendants
                -- as well, so the next two elements live in gliRspNS too.
                . f tagNoAttr      gliRspNS "GetListItemsResult"
                . f tagNoAttr      gliRspNS "listitems"
                . f tagIgnoreAttrs rsNS     "data"
                . many' . tag' (ns zNS      "row")
                               parseAttributes $ return . uncurry doc

      soapNS   = "http://schemas.xmlsoap.org/soap/envelope/"
      gliRspNS = "http://schemas.microsoft.com/sharepoint/soap/"
      rsNS     = "urn:schemas-microsoft-com:rowset"
      zNS      = "#RowsetSchema"

      -- Print every element, then the number of elements.
      disp = (print . length) <=< traverse print

  -- Cursor approach: rethrow a parse failure, otherwise display the results.
  (throwIO ||| disp . go . fromDocument) $ X.parseLBS X.def r
  -- Conduit approach: stream the events straight into 'parseDocs'.
  (            disp =<<) . runConduit    $   parseLBS   def r .| parseDocs

最后，我通常是通过 Network.HTTP.Simple.httpLBS 获取 xml，而不是从文件读取。应该有办法把 conduit 解析器接到 Network.HTTP.Client.Conduit.httpLBS 上，让它直接在流上运行，对吗？

xml - 比较游标与管道 xml 解析

0 回答 0

Related

Reference