python - 循环回到 csv 中的下一列

Question

我正在尝试让脚本对 csv 文件的每一列依次运行。我已经知道如何告诉 Python 要对哪一列运行脚本，但我希望它先分析第一列、输出结果，然后移动到第二列，如此继续遍历整个文件。我想要的是类似“if …… goto ……”这样的命令。我已经找到了用简单的单行代码（one-liner）实现这一点的方法，但我的脚本要大得多。任何帮助都将不胜感激，因为我确信自己只是遗漏了某些东西。比如，我能否循环回到定义数据的位置（h=data），并让它选择下一列？下面是我的脚本。

import numpy as np
import matplotlib.pyplot as plt
from pylab import * 
import pylab
from scipy import linalg
import sys
import scipy.interpolate as interpolate
import scipy.optimize as optimize

# Name of the data file including the directory; must be a .csv file.
a = raw_input("Data file name? ")

# Read every line as a list of comma-separated strings. The with-block
# closes the file even if reading fails part-way through.
data = []
with open(a, 'r') as datafile:
    for row in datafile:
        data.append(row.strip().split(','))

# Single-argument print(...) is used throughout: with several arguments the
# Python 2 print statement would output a tuple instead of the message.
print('Data points= %d' % len(data))
print(data)

# Remove the header line if present. Strings are compared with ==, not
# `is`: identity of equal strings is an interpreter implementation detail.
c = raw_input("Is there a header row? y/n?")
if c.strip().lower() == 'y' and data:
    del data[0]
print('Raw data= %s' % (data,))

# To analyse one column instead of the whole file, ask for its 1-based
# number and pick that field out of every row, e.g.:
#     b = int(raw_input("What column to analyze?"))
#     h = [[rowa[b - 1] for rowa in data]]
h = data  # all columns

# Flatten the rows into a single list of cells, then convert to sorted
# floats. `a` is deliberately rebound from the file name to the prepared
# values, because the analysis further down reads them from `a`.
g = [cell for rowa in h for cell in rowa]
a = sorted(float(cell) for cell in g)
print('Organized data= %s' % (a,))

def GRLC(values):
    '''
    Calculate the Gini index, Gini coefficient, Robin Hood index and the
    points of the Lorenz curve, based on the instructions given in
    www.peterrosenmai.com/lorenz-curve-graphing-tool-and-gini-coefficient-calculator

    The Lorenz curve is returned as two parallel lists of percentages,
    [[x0, x1, ...], [y0, y1, ...]], with len(values) + 1 points each.

    @param values: Non-empty list of numbers (not modified)
    @return: [Gini index, Gini coefficient, Robin Hood index, [Lorenz curve]]
    @raise AssertionError: if values is empty
    @raise ZeroDivisionError: if the values sum to zero
    '''

    n = len(values)
    # Raised explicitly rather than with an assert statement so the check
    # is not stripped when Python runs with -O; the exception type is the
    # same one the assert produced.
    if n == 0:
        raise AssertionError('Empty list of values')
    sortedValues = sorted(values)  # Sort smallest to largest

    # Cumulative totals: cumm[k] is the sum of the k smallest values.
    # A running total keeps this linear instead of re-summing a slice
    # for every k.
    cumm = [0]
    runningTotal = 0
    for value in sortedValues:
        runningTotal += value
        cumm.append(runningTotal)

    # float() forces true division under Python 2 for integer input.
    total = float(cumm[n])

    # Calculate Lorenz points
    LorenzPoints = [[], []]
    sumYs = 0           # Sum of all y values
    robinHoodIdx = -1   # Robin Hood index: max over points of (x_i - y_i)
    for k in range(n + 1):
        x = 100.0 * k / n               # share of the population, in percent
        y = 100.0 * (cumm[k] / total)   # share of the total, in percent
        LorenzPoints[0].append(x)
        LorenzPoints[1].append(y)
        sumYs += y
        gap = x - y
        if gap > robinHoodIdx:
            robinHoodIdx = gap

    giniIdx = 100 + (100 - 2 * sumYs) / n  # Gini index

    return [giniIdx, giniIdx / 100, robinHoodIdx, LorenzPoints]

# Run the analysis on the prepared values and report the three scalar
# indices, one per line as "<label> <value>". zip() stops after the three
# labels, so the Lorenz curve points held in result[3] are not printed.
result = GRLC(a)
for label, value in zip(('Gini Index', 'Gini Coefficient', 'Robin Hood Index'), result):
    print(' '.join((label, str(value))))

score 0 · Accepted Answer

我没有理会 GRLC 函数的内容，只解决了循环问题。试试下面的代码。它使用 `while True:` 无限循环（您可以通过结束程序来中断；在 Windows 中为 Ctrl+C，具体取决于操作系统）。数据只需从 csv 文件加载一次，之后每次循环时重新构建相关变量即可。如果您有任何问题，请提出。另外，我没有测试这段代码，因为我没有安装所有 NumPy 相关的包 :)

import numpy as np
import matplotlib.pyplot as plt
from pylab import * 
import pylab
from scipy import linalg
import sys
import scipy.interpolate as interpolate
import scipy.optimize as optimize

def GRLC(values):
    '''
    Calculate the Gini index, Gini coefficient, Robin Hood index and the
    points of the Lorenz curve, based on the instructions given in
    www.peterrosenmai.com/lorenz-curve-graphing-tool-and-gini-coefficient-calculator

    The Lorenz curve is returned as two parallel lists of percentages,
    [[x0, x1, ...], [y0, y1, ...]], with len(values) + 1 points each.

    @param values: Non-empty list of numbers (not modified)
    @return: [Gini index, Gini coefficient, Robin Hood index, [Lorenz curve]]
    @raise AssertionError: if values is empty
    @raise ZeroDivisionError: if the values sum to zero
    '''

    n = len(values)
    # Raised explicitly rather than with an assert statement so the check
    # is not stripped when Python runs with -O; the exception type is the
    # same one the assert produced.
    if n == 0:
        raise AssertionError('Empty list of values')
    sortedValues = sorted(values)  # Sort smallest to largest

    # Cumulative totals: cumm[k] is the sum of the k smallest values.
    # A running total keeps this linear instead of re-summing a slice
    # for every k.
    cumm = [0]
    runningTotal = 0
    for value in sortedValues:
        runningTotal += value
        cumm.append(runningTotal)

    # float() forces true division under Python 2 for integer input.
    total = float(cumm[n])

    # Calculate Lorenz points
    LorenzPoints = [[], []]
    sumYs = 0           # Sum of all y values
    robinHoodIdx = -1   # Robin Hood index: max over points of (x_i - y_i)
    for k in range(n + 1):
        x = 100.0 * k / n               # share of the population, in percent
        y = 100.0 * (cumm[k] / total)   # share of the total, in percent
        LorenzPoints[0].append(x)
        LorenzPoints[1].append(y)
        sumYs += y
        gap = x - y
        if gap > robinHoodIdx:
            robinHoodIdx = gap

    giniIdx = 100 + (100 - 2 * sumYs) / n  # Gini index

    return [giniIdx, giniIdx / 100, robinHoodIdx, LorenzPoints]

# Name of the data file including the directory; must be a .csv file.
a = raw_input("Data file name? ")

# Open and organise the csv file: every line becomes a list of
# comma-separated strings. The with-block closes the file even if reading
# fails part-way through.
data = []
with open(a.strip(), 'r') as datafile:
    for row in datafile:
        data.append(row.strip().split(','))

# Remove the header line if present; an empty file has no header to drop.
c = raw_input("Is there a header row? y/n?")
if c.strip().lower() == 'y' and data:
    del data[0]

# Keep asking which column to analyse until the program is interrupted
# (e.g. Ctrl+C). The csv data was loaded once, above; only the per-column
# values are rebuilt on each pass.
while True:
    # Columns are numbered from 1 for the user; list indexes are 0-based.
    b = raw_input("What column to analyze?")

    try:
        b = int(b.strip())
    except ValueError:
        print('Please enter a whole column number.')
        continue

    # Reject 0 and negative numbers explicitly: b - 1 would otherwise be a
    # negative index and silently pick a column counted from the end.
    if b < 1:
        print('Column numbers start at 1.')
        continue

    try:
        g = [rowa[b - 1] for rowa in data]
    except IndexError:
        print('Column %d does not exist in every row.' % b)
        continue

    # Prepare the column for the calculations: numeric and sorted.
    try:
        a = sorted(float(cell) for cell in g)
    except ValueError:
        print('Column %d contains non-numeric data.' % b)
        continue

    if not a:
        # GRLC rejects empty input, and asking again cannot help.
        print('No data rows to analyze.')
        break

    # Single-argument print(...) is used so the Python 2 print statement
    # outputs the message rather than a tuple.
    print('Organized data= %s' % (a,))

    result = GRLC(a)
    print('Gini Index %s' % result[0])
    print('Gini Coefficient %s' % result[1])
    print('Robin Hood Index %s' % result[2])

python - 循环回到 csv 中的下一列

1 回答 1

Related

Reference