我为这个算法编写了一个新的容器类,根据需要量身定制。这也让我有机会围绕我的程序调整其他代码,同时提高了一点速度。
这比使用 STL 向量的旧实现要快得多,但在其他方面基本相同。但是虽然它更快,但它仍然不够快......不幸的是。
分析不再揭示真正的瓶颈是什么。MSVC 分析器似乎有时会将“责任”归咎于错误的调用(假设相同的运行分配了大不相同的运行时间),并且大多数调用被合并成一个大裂缝。
查看生成的代码的反汇编表明生成的代码中有大量的跳转,我认为这可能是现在缓慢的主要原因。
class SpanBuffer {
private:
int *data;
size_t allocated_size;
size_t count;
inline void EnsureSpace()
{
if (count == allocated_size)
Reserve(count*2);
}
public:
struct Span {
int start, end;
};
public:
SpanBuffer()
: data(0)
, allocated_size(24)
, count(0)
{
data = new int[allocated_size];
}
SpanBuffer(const SpanBuffer &src)
: data(0)
, allocated_size(src.allocated_size)
, count(src.count)
{
data = new int[allocated_size];
memcpy(data, src.data, sizeof(int)*count);
}
~SpanBuffer()
{
delete [] data;
}
inline void AddIntersection(int x)
{
EnsureSpace();
data[count++] = x;
}
inline void AddSpan(int s, int e)
{
assert((count & 1) == 0);
assert(s >= 0);
assert(e >= 0);
EnsureSpace();
data[count] = s;
data[count+1] = e;
count += 2;
}
inline void Clear()
{
count = 0;
}
inline size_t GetCount() const
{
return count;
}
inline int GetIntersection(size_t i) const
{
return data[i];
}
inline const Span * GetSpanIteratorBegin() const
{
assert((count & 1) == 0);
return reinterpret_cast<const Span *>(data);
}
inline Span * GetSpanIteratorBegin()
{
assert((count & 1) == 0);
return reinterpret_cast<Span *>(data);
}
inline const Span * GetSpanIteratorEnd() const
{
assert((count & 1) == 0);
return reinterpret_cast<const Span *>(data+count);
}
inline Span * GetSpanIteratorEnd()
{
assert((count & 1) == 0);
return reinterpret_cast<Span *>(data+count);
}
inline void MergeOrAddSpan(int s, int e)
{
assert((count & 1) == 0);
assert(s >= 0);
assert(e >= 0);
if (count == 0)
{
AddSpan(s, e);
return;
}
int *lastspan = data + count-2;
if (s > lastspan[1])
{
AddSpan(s, e);
}
else
{
if (s < lastspan[0])
lastspan[0] = s;
if (e > lastspan[1])
lastspan[1] = e;
}
}
inline void Reserve(size_t minsize)
{
if (minsize <= allocated_size)
return;
int *newdata = new int[minsize];
memcpy(newdata, data, sizeof(int)*count);
delete [] data;
data = newdata;
allocated_size = minsize;
}
inline void SortIntersections()
{
assert((count & 1) == 0);
std::sort(data, data+count, std::less<int>());
assert((count & 1) == 0);
}
inline void Swap(SpanBuffer &other)
{
std::swap(data, other.data);
std::swap(allocated_size, other.allocated_size);
std::swap(count, other.count);
}
};
struct ShapeWidener {
// How much to widen in the X direction
int widen_by;
// Half of width difference of src and dst (width of the border being produced)
int xofs;
// Temporary storage for OverlayScanline, so it doesn't need to reallocate for each call
SpanBuffer buffer;
inline void OverlayScanline(const SpanBuffer &src, SpanBuffer &dst);
ShapeWidener(int _xofs) : xofs(_xofs) { }
};
inline void ShapeWidener::OverlayScanline(const SpanBuffer &src, SpanBuffer &dst)
{
if (src.GetCount() == 0) return;
if (src.GetCount() + dst.GetCount() == 0) return;
assert((src.GetCount() & 1) == 0);
assert((dst.GetCount() & 1) == 0);
assert(buffer.GetCount() == 0);
dst.Swap(buffer);
const int widen_s = xofs - widen_by;
const int widen_e = xofs + widen_by;
size_t resta = src.GetCount()/2;
size_t restb = buffer.GetCount()/2;
const SpanBuffer::Span *spa = src.GetSpanIteratorBegin();
const SpanBuffer::Span *spb = buffer.GetSpanIteratorBegin();
while (resta > 0 || restb > 0)
{
if (restb == 0)
{
dst.MergeOrAddSpan(spa->start+widen_s, spa->end+widen_e);
--resta, ++spa;
}
else if (resta == 0)
{
dst.MergeOrAddSpan(spb->start, spb->end);
--restb, ++spb;
}
else if (spa->start < spb->start)
{
dst.MergeOrAddSpan(spa->start+widen_s, spa->end+widen_e);
--resta, ++spa;
}
else
{
dst.MergeOrAddSpan(spb->start, spb->end);
--restb, ++spb;
}
}
buffer.Clear();
}