我希望以与 pandas.DataFrame.Merge 类似的方式基于每个帧中的特定列合并两个 Deedle (F#) 帧。完美的例子是包含数据列和 (city, state) 列以及包含以下列的信息框架:(city, state);纬度;长。如果我想将 lat long 列添加到我的主框架中,我将合并 (city, state) 列上的两个框架。
这是一个例子:
let primaryFrame =
[(0, "Job Name", box "Job 1")
(0, "City, State", box "Reno, NV")
(1, "Job Name", box "Job 2")
(1, "City, State", box "Portland, OR")
(2, "Job Name", box "Job 3")
(2, "City, State", box "Portland, OR")
(3, "Job Name", box "Job 4")
(3, "City, State", box "Sacramento, CA")] |> Frame.ofValues
let infoFrame =
[(0, "City, State", box "Reno, NV")
(0, "Lat", box "Reno_NV_Lat")
(0, "Long", box "Reno_NV_Long")
(1, "City, State", box "Portland, OR")
(1, "Lat", box "Portland_OR_Lat")
(1, "Long", box "Portland_OR_Long")] |> Frame.ofValues
// see code for merge_on below.
let mergedFrame = primaryFrame
|> merge_On infoFrame "City, State" null
这将导致“mergedFrame”看起来像这样:
> mergedFrame.Format();;
val it : string =
" Job Name City, State Lat Long
0 -> Job 1 Reno, NV Reno_NV_Lat Reno_NV_Long
1 -> Job 2 Portland, OR Portland_OR_Lat Portland_OR_Long
2 -> Job 3 Portland, OR Portland_OR_Lat Portland_OR_Long
3 -> Job 4 Sacramento, CA <missing> <missing>
我想出了一种方法(上面示例中使用的 'merge_on' 函数),但作为一个不熟悉 F# 的销售工程师,我想有一种更惯用/更有效的方法来做到这一点。以下是我执行此操作的函数以及“removeDuplicateRows”,它可以满足您的期望并且是“merge_on”函数所需要的;如果您也想评论更好的方法,请这样做。
let removeDuplicateRows column (frame : Frame<'a, 'b>) =
let nonDupKeys = frame.GroupRowsBy(column).RowKeys
|> Seq.distinctBy (fun (a, b) -> a)
|> Seq.map (fun (a, b) -> b)
frame.Rows.[nonDupKeys]
let merge_On (infoFrame : Frame<'c, 'b>) mergeOnCol missingReplacement
(primaryFrame : Frame<'a,'b>) =
let frame = primaryFrame.Clone()
let infoFrame = infoFrame
|> removeDuplicateRows mergeOnCol
|> Frame.indexRows mergeOnCol
let initialSeries = frame.GetColumn(mergeOnCol)
let infoFrameRows = infoFrame.RowKeys
for colKey in infoFrame.ColumnKeys do
let newSeries =
[for v in initialSeries.ValuesAll do
if Seq.contains v infoFrameRows then
let key = infoFrame.GetRow(v)
yield key.[colKey]
else
yield box missingReplacement ]
frame.AddColumn(colKey, newSeries)
frame
谢谢你的帮助!
更新:
将 Frame.indexRowsString 切换为 Frame.indexRows 以处理“mergOnCol”中的类型不是字符串的情况。
按照 Tomas 的建议摆脱 infoFrame.Clone()