下面是我正在解析的一段 XML——这是我第一次使用 XML、SOAP 和 conduit。
<?xml version = "1.0"
encoding = "utf-8"?>
<soap:Envelope xmlns:soap = "http://schemas.xmlsoap.org/soap/envelope/"
xmlns:xsi = "http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsd = "http://www.w3.org/2001/XMLSchema">
<soap:Body>
<GetListItemsResponse xmlns = "http://schemas.microsoft.com/sharepoint/soap/">
<GetListItemsResult>
<listitems xmlns:s = 'uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882'
xmlns:dt = 'uuid:C2F41010-65B3-11d1-A29F-00AA00C14882'
xmlns:rs = 'urn:schemas-microsoft-com:rowset'
xmlns:z = '#RowsetSchema'>
<rs:data ItemCount = "290">
<z:row ows_Date = '2020-10-20 00:00:00'
ows_Document = 'https://www.oregon.gov/oha/PH/DISEASESCONDITIONS/DISEASESAZ/Emerging%20Respitory%20Infections/Oregon-COVID-19-Update-10-20-2020-FINAL.pdf, Oregon COVID-19 Daily Update 10.20.2020'
ows_Category = 'Daily Update'
ows_MetaInfo = '294;#'
ows__ModerationStatus = '0'
ows__Level = '1'
ows_ID = '294'
ows_UniqueId = '294;#{C51D9DDB-9A9C-4C56-B030-236D6A0980D2}'
ows_owshiddenversion = '1'
ows_FSObjType = '294;#0'
ows_Created = '2020-10-20 12:16:49'
ows_PermMask = '0x1000030041'
ows_Modified = '2020-10-20 12:16:49'
ows_FileRef = '294;#oha/ERD/Lists/COVID19 Updates/294_.000' />
</rs:data>
</listitems>
</GetListItemsResult>
</GetListItemsResponse>
</soap:Body>
</soap:Envelope>
我只想保留 `ows_Category` 为 `Weekly Report` 且 `ows_Document` 不包含 `Spanish` 的记录。我的 `cursor` 版本很容易就能工作。`conduit` 版本要复杂得多,但借助这个问题的答案,我最终还是弄明白了。
尽管这两种方法现在都有效,但我还有几个问题。
- `conduit` 方法是否有与 `lax` 等效的、可以忽略命名空间的方式?
- cursor 版本的函数为什么需要 `concat`?从类型来看,我们从根节点开始,生成并维护一个待考虑的相关节点列表,对它们进行 `filter`、`map` 等等。多出来的那一层嵌套是什么,为什么会有?
- `conduit` 版本需要辅助函数 `f`(到处调用 `force`)和 `ns`(给所有名称加上命名空间)——它们看起来必不可少,我本以为库会把它们作为工具函数提供,因为每个人都会一直用到。还是说我做了什么傻事?
- 我最头疼的问题是:结果我需要在 `GetListItemsResult` 和 `listitems` 上也使用 `gliRspNS` 命名空间,尽管从 XML 来看,它似乎只应该适用于 `GetListItemsResponse`。我只是侥幸猜对才解决了这个问题。命名空间是否本来就会像这样向下继承,直到被覆盖为止?
- 关于 `requireAttrRaw`:
  - 如果由我们负责验证,难道我们不应该知道 `Name` 的命名空间吗?
  - 为什么 `requireAttrRaw` 传给我们的是 `[Content]`,而不是两个 `Maybe Content`(`ContentText` 和 `ContentEntity` 各一个)?
  - 对于“For pass-through parsing”的 `ContentEntity`,我们应该如何处理?
{-# LANGUAGE OverloadedStrings #-}
import Conduit
import Control.Applicative
import Control.Arrow
import Control.Exception
import Control.Monad
import qualified Data.ByteString.Lazy.Char8 as L8
import Data.Foldable
import qualified Data.Map.Strict as M
import Data.String
import qualified Data.Text as T
import Data.Time.Calendar
import Data.Time.Format
import Data.XML.Types
import qualified Text.XML as X
import Text.XML.Cursor hiding (force)
import Text.XML.Stream.Parse
-- | One report document extracted from a @z:row@ element of the feed.
data Doc = Doc
  { url  :: String
    -- ^ Link target: the part of @ows_Document@ before the first comma.
  , name :: String
    -- ^ Display title: the part of @ows_Document@ after the comma and
    -- the following space.
  , date :: Day
    -- ^ Calendar day parsed from the date portion of @ows_Date@.
  }
  deriving Show
-- | Read @oha.xml@ and list every \"Weekly Report\" row whose document field
-- does not mention \"Spanish\", once with the DOM cursor API and once with the
-- streaming (conduit) parser.  Each run prints the matching 'Doc's followed by
-- their count.
main :: IO ()
main = do
  r <- L8.readFile "oha.xml"
  let -- Cursor (DOM) version.  'laxElement' matches on the local name only,
      -- so no namespace has to be spelled out here.
      go :: Cursor -> [Doc]
      go c = concat $
        -- The mapped function builds its result from two 'attribute' lookups
        -- combined in the list applicative, so it yields a list per row and
        -- '&|' produces a list of lists; 'concat' flattens that extra layer.
        c $// laxElement "row"
          >=> attributeIs "ows_Category" "Weekly Report"
          >=> checkElement (maybe False (not . T.isInfixOf "Spanish") . M.lookup "ows_Document" . X.elementAttributes)
          &| \x -> doc <$> attribute "ows_Document" x <*> attribute "ows_Date" x

      -- Build a 'Doc' from the raw @ows_Document@ and @ows_Date@ values.
      -- @ows_Document@ is split at the first comma into URL and title (the
      -- comma and the character after it are dropped); only the text before
      -- the first space of @ows_Date@ is parsed.
      -- NOTE(review): 'parseTimeOrError' calls 'error' on a malformed date.
      doc :: T.Text -> T.Text -> Doc
      doc x =
        Doc u v
          . parseTimeOrError True defaultTimeLocale "%Y-%-m-%-d"
          . takeWhile (/= ' ')
          . T.unpack
        where
          (u, v) = second (drop 2) . break (== ',') $ T.unpack x

      parseAttributes, parseAttributes' :: AttrParser (T.Text, T.Text)

      -- Monadic variant (kept for comparison, not used by 'parseDocs'): read
      -- the three attributes, then reject the row if the filter fails.
      -- Binders are named so they do not shadow 'doc' or the 'date' field.
      parseAttributes' = do
        docAttr <- requireAttr "ows_Document"
        cat <- requireAttr "ows_Category"
        dateAttr <- requireAttr "ows_Date"
        ignoreAttrs
        guard $ not (T.isInfixOf "Spanish" docAttr) && cat == "Weekly Report"
        return (docAttr, dateAttr)

      -- Since the attribute values don't interact, we can parse in
      -- Applicative rather than Monad.
      parseAttributes =
        (,) <$> requireAttrRaw' "ows_Document" (not . T.isInfixOf "Spanish")
            <*> requireAttr "ows_Date"
            <*  requireAttrRaw' "ows_Category" ("Weekly Report" ==)
            <*  ignoreAttrs
        where
          -- Require attribute @n@ whose complete value satisfies @p@.
          -- The value arrives as a list of chunks; they are joined before the
          -- predicate runs so that the test sees the whole value and an
          -- entity chunk cannot trigger a pattern-match failure.
          -- The names compared here carry no namespace because the rows use
          -- unprefixed attributes.
          requireAttrRaw' n p =
            requireAttrRaw ("required attr value failed condition: " <> n) $ \(n', cs) ->
              let v = T.concat (map contentText cs)
              in  v <$ guard (n' == fromString n && p v)

          -- Render one chunk as text; an unresolved entity is kept in its
          -- @&name;@ form.
          contentText (ContentText t) = t
          contentText (ContentEntity e) = "&" <> e <> ";"

      -- Build a qualified name in Clark notation: @{namespace}local@.
      ns n = fromString . (("{" <> n <> "}") <>)

      -- Require element @s@ in namespace @n@, matched with tag parser @g@;
      -- a missing element fails with the message @"<s> required"@.
      f g n s = force (s <> " required") . g (ns n s)

      -- Streaming version: descend through the SOAP envelope to the rows.
      parseDocs :: (MonadThrow m, MonadIO m) => ConduitT Event o m [Doc]
      parseDocs =
          f tagNoAttr soapNS "Envelope"
        . f tagNoAttr soapNS "Body"
        . f tagNoAttr gliRspNS "GetListItemsResponse"
          -- The @xmlns=@ default namespace declared on GetListItemsResponse
          -- is inherited by unprefixed descendants, hence gliRspNS below.
        . f tagNoAttr gliRspNS "GetListItemsResult"
        . f tagNoAttr gliRspNS "listitems"
        . f tagIgnoreAttrs rsNS "data"
        . many'
        . tag' (ns zNS "row") parseAttributes
        $ return . uncurry doc

      -- Namespace URIs as declared in the response document.
      soapNS = "http://schemas.xmlsoap.org/soap/envelope/"
      gliRspNS = "http://schemas.microsoft.com/sharepoint/soap/"
      rsNS = "urn:schemas-microsoft-com:rowset"
      zNS = "#RowsetSchema"

      -- Print every element, then the number of elements.
      disp = (print . length) <=< traverse print

  -- DOM run: rethrow a parse failure, otherwise display the cursor results.
  (throwIO ||| disp . go . fromDocument) $ X.parseLBS X.def r
  -- Streaming run over the same bytes.
  (disp =<<) . runConduit $ parseLBS def r .| parseDocs
最后,我通常是通过 `Network.HTTP.Simple.httpLBS` 获取 XML,而不是从文件中读取。应该有办法把 `conduit` 解析器接到 `Network.HTTP.Client.Conduit.httpLBS` 上,让它直接在流上运行,对吗?