我正在使用 CMSSW_11_1_4 环境(Python 3.8.4、uproot 3.11.3、awkward 0.12.20),尝试打开一些非常大的 .root 文件(3 个文件,每个 9 GB)并对事件进行循环。在遍历第一个文件(前 100k 个事件)时,代码似乎按预期工作:处理第一个事件需要很长时间,但随后其余事件处理得很快。然而,当处理到第 100k 个事件左右、进入第二个文件时,代码崩溃并打印出一条 AssertionError 消息。
下面是可以重现该问题的代码(需要具有 xrootd 访问权限)。
import uproot
import uproot_methods
import numpy as np
import matplotlib.pyplot as plt
# Track-multiplicity study over three remote QCD ntuples (uproot3 / awkward0).
#
# The script opens the datasets as lazy arrays, loops over events one at a
# time, counts the tracks passing the selection in each event with HT >= 1200,
# and finally plots the cross-section-weighted multiplicity distribution.

# Get the file and import using uproot
base = 'root://cmseos.fnal.gov//store/user/kdipetri/SUEP/Production_v0.2/2018/NTUP/'
datasets = [
    base + 'Autumn18.QCD_HT1000to1500_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
    base + 'Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
    base + 'Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root',
]

# Load on lazy arrays
mycache = uproot.ArrayCache("1 GB")
events = uproot.lazyarrays(
    datasets,
    'TreeMaker2/PreSelection',
    [
        'HT',
        'CrossSection',
        'Tracks.fCoordinates.fX',
        'Tracks.fCoordinates.fY',
        'Tracks.fCoordinates.fZ',
        'Tracks_fromPV0',
        'Tracks_matchedToPFCandidate',
    ],
    cache=mycache,
)

# Look each branch up once instead of once per event: the lookup itself is
# loop-invariant, only the per-event indexing below has to happen in the loop.
ht = events['HT']
trk_x = events['Tracks.fCoordinates.fX']
trk_y = events['Tracks.fCoordinates.fY']
trk_z = events['Tracks.fCoordinates.fZ']
trk_from_pv0 = events['Tracks_fromPV0']
trk_matched = events['Tracks_matchedToPFCandidate']

# The event count is also loop-invariant; it sizes the output and the progress.
n_events = trk_x.size
trk_multiplicity = np.zeros(n_events)

for ievt in range(n_events):
    if ievt%1000 == 0:
        print("Processing event %d. Progress: %.2f%%"%(ievt,100*ievt/n_events))

    # Skip low-HT events; their multiplicity entry stays 0 and is masked out
    # before plotting.
    if ht[ievt] < 1200:
        continue

    tracks_x = trk_x[ievt]
    tracks_y = trk_y[ievt]
    tracks_z = trk_z[ievt]
    # 0.13957 is presumably the charged-pion mass in GeV -- TODO confirm.
    tracks_E = np.sqrt(tracks_x**2+tracks_y**2+tracks_z**2+0.13957**2)
    tracks = uproot_methods.TLorentzVectorArray.from_cartesian(tracks_x,
                                                               tracks_y,
                                                               tracks_z,
                                                               tracks_E)
    tracks_fromPV0 = trk_from_pv0[ievt]
    tracks_matchedToPFCandidate = trk_matched[ievt]
    tracks = tracks[(tracks.pt > 1.) & (tracks.eta < 2.5) & (tracks_fromPV0 >= 2) &
                    (tracks_matchedToPFCandidate > 0)]
    trk_multiplicity[ievt] = tracks.size

# Plot results
fig = plt.figure(figsize=(8,8))
ax = plt.gca()
# Build the HT selection once and apply it to both the weights and the values
# so the two arrays are guaranteed to stay aligned.
ht_mask = ht > 1200
CrossSection = events['CrossSection'][ht_mask]
trk_multiplicity = trk_multiplicity[ht_mask]
ax.hist(trk_multiplicity, bins=100, density=True, weights=CrossSection, histtype='step')
plt.show()
我确信这是某种内存问题,因为我曾用这种方式成功处理过多个文件(不过那些文件小得多,并且存储在本地)。我该如何解决这个问题?是否需要以不同的方式处理缓存?
编辑(2020 年 10 月 28 日):我尝试切换到 uproot4(0.0.27)/awkward1(0.3.1),但没有取得多大成功。我将相同的代码转换为以下代码:
import uproot4 as uproot
import uproot_methods
import awkward1 as ak
import numpy as np
import matplotlib.pyplot as plt
# Track-multiplicity study over three remote QCD ntuples (uproot4 / awkward1).
#
# The script opens the datasets as one lazy array, loops over events one at a
# time, counts the tracks passing the selection in each event with HT >= 1200,
# and finally plots the cross-section-weighted multiplicity distribution.

# Get the file and import using uproot
base = 'root://cmseos.fnal.gov//store/user/kdipetri/SUEP/Production_v0.2/2018/NTUP/'
# Maps each file URL to the path of the tree to read inside that file.
datasets = {
    base + 'Autumn18.QCD_HT1000to1500_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
    base + 'Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
    base + 'Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
}

# Load on lazy arrays
events = uproot.lazy(datasets)

# Look each branch up once instead of once per event: the lookup itself is
# loop-invariant, only the per-event indexing below has to happen in the loop.
ht = events['HT']
trk_x = events['Tracks.fCoordinates.fX']
trk_y = events['Tracks.fCoordinates.fY']
trk_z = events['Tracks.fCoordinates.fZ']
trk_from_pv0 = events['Tracks_fromPV0']
trk_matched = events['Tracks_matchedToPFCandidate']

# The event count is also loop-invariant; it sizes the output and the progress.
n_events = len(trk_x)
trk_multiplicity = np.zeros(n_events)

for ievt in range(n_events):
    if ievt%1000 == 0:
        print("Processing event %d. Progress: %.2f%%"%(ievt,100*ievt/n_events))

    # Skip low-HT events; their multiplicity entry stays 0 and is masked out
    # before plotting.
    if ht[ievt] < 1200:
        continue

    tracks_x = trk_x[ievt]
    tracks_y = trk_y[ievt]
    tracks_z = trk_z[ievt]
    # 0.13957 is presumably the charged-pion mass in GeV -- TODO confirm.
    tracks_E = np.sqrt(tracks_x**2+tracks_y**2+tracks_z**2+0.13957**2)
    # uproot_methods is fed awkward0 arrays here, hence the explicit
    # awkward1 -> awkward0 conversions.
    tracks = uproot_methods.TLorentzVectorArray.from_cartesian(
        ak.to_awkward0(tracks_x),
        ak.to_awkward0(tracks_y),
        ak.to_awkward0(tracks_z),
        ak.to_awkward0(tracks_E),
    )
    tracks_fromPV0 = trk_from_pv0[ievt]
    tracks_matchedToPFCandidate = trk_matched[ievt]
    tracks = tracks[(tracks.pt > 1.) & (tracks.eta < 2.5) & (ak.to_awkward0(tracks_fromPV0) >= 2) &
                    (ak.to_awkward0(tracks_matchedToPFCandidate) > 0)]
    trk_multiplicity[ievt] = tracks.size

# Plot results
fig = plt.figure(figsize=(8,8))
ax = plt.gca()
# Build the HT selection once and apply it to both the weights and the values
# so the two arrays are guaranteed to stay aligned.
ht_mask = ht > 1200
CrossSection = events['CrossSection'][ht_mask]
trk_multiplicity = trk_multiplicity[ht_mask]
ax.hist(trk_multiplicity, bins=100, density=True, weights=CrossSection, histtype='step')
plt.show()
这一次代码立即开始处理事件,但速度慢了很多(可能是因为缓存更小?)。同样,在到达第二个文件时,代码崩溃并显示以下消息:
Traceback (most recent call last):
File "scripts/plotEventShapes_lazy.py", line 43, in <module>
tracks_x = events['Tracks.fCoordinates.fX'][ievt]
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/awkward1/highlevel.py", line 974, in __getitem__
self._layout[where], self._behavior, cache=self._cache
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/awkward1/partition.py", line 366, in __getitem__
return PartitionedArray.from_ext(self._ext.getitem_at(where))
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/behaviors/TBranch.py", line 2017, in array
_ranges_or_baskets_to_arrays(
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/behaviors/TBranch.py", line 3264, in _ranges_or_baskets_to_arrays
uproot4.source.futures.delayed_raise(*obj)
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/source/futures.py", line 46, in delayed_raise
raise exception_value.with_traceback(traceback)
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/behaviors/TBranch.py", line 3189, in chunk_to_basket
basket = uproot4.models.TBasket.Model_TBasket.read(
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/model.py", line 730, in read
self.read_members(chunk, cursor, context, file)
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/models/TBasket.py", line 230, in read_members
) = cursor.fields(chunk, _tbasket_format2, context)
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/source/cursor.py", line 195, in fields
return format.unpack(chunk.get(start, stop, self, context))
File "/uscms/home/chpapage/.local/lib/python3.8/site-packages/uproot4/source/chunk.py", line 370, in get
raise uproot4.deserialization.DeserializationError(
uproot4.deserialization.DeserializationError: while reading
TBasket version None as uproot4.models.TBasket.Model_TBasket (? bytes)
fNbytes: -1607368158
fObjlen: -1243566277
fDatime: 2634931141
fKeylen: -27664
fCycle: 21409
attempting to get bytes 483015:483033
outside expected range 510698:538758 for this Chunk
in file root://cmseos.fnal.gov//store/user/kdipetri/SUEP/Production_v0.2/2018/NTUP/Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root
最后补充一点:我用小得多的文件(信号文件,每个约 1 GB)运行了相同的代码,不过这次是通过 xrootd 访问的,循环顺利完成,没有出现任何问题。那么,这会不会是由文件过大引起的缓存问题?