0

Data which I am scraping using the beautiful soup contains one category of device name, device names contains Colors mentioned in them eg. Lumia 800 Black. I want to create a new column which contains this color.

I want to search the device name for any color against a list of colors & if color is present in that device name I want to remove that color from device name and put it in new column named Color.

I am using below referred code to accomplish this, I am creating a function named color and trying to search the device name string for presence of color and if present I am trying to feed that color to new variable named color_column. But my output csv is not returning any values at all. It is empty.

Please check the referred code below:

# -*- coding: cp1252 -*-
import csv
import urllib2
import sys
import urllib
import time
import mechanize
import cookielib
from bs4 import BeautifulSoup
from itertools import islice
colors = ["Black","Gray"]


def color(arg):
    for colors_1 in colors:
        if arg.find(colors_1) == -1:
            return color_column == ""
        return color_column == colors_1


url = 'http://www.t-mobile.com/shop/phones/default.aspx?shape=smartphones'
user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1;Trident/5.0)'
values = {
'Phones':'MBBDevice',
'__ASYNCPOST':'true',
'__EVENTARGUMENT':'',
'__EVENTTARGET':'pgrTop$lnkPageShowAll',
'__LASTFOCUS':'',
'__VIEWSTATE':'/wEPDwULLTE1NTE5NDk1ODIPFgIeEEN1cnJlbnRQYWdlSW5kZXgCARYCAgEPZBYCAgEPZBYCAgEPZBYCZg9kFgICAQ9kFhgCCg9kFgJmD2QWAmYPZBYCZg8UKwACZDKJBAABAAAA/////wEAAAAAAAAADAIAAABfVE1vYmlsZS5XZWIuVE1vYmlsZURvdENvbS5VSS5XZWJDb250cm9scywgVmVyc2lvbj0xLjAuMC4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAAEFUTW9iaWxlLldlYi5UTW9iaWxlRG90Q29tLlVJLldlYkNvbnRyb2xzLkJyZWFkQ3J1bWJJdGVtQ29sbGVjdGlvbgEAAAATQ29sbGVjdGlvbkJhc2UrbGlzdAMcU3lzdGVtLkNvbGxlY3Rpb25zLkFycmF5TGlzdAIAAAAJAwAAAAQDAAAAHFN5c3RlbS5Db2xsZWN0aW9ucy5BcnJheUxpc3QDAAAABl9pdGVtcwVfc2l6ZQhfdmVyc2lvbgUAAAgICQQAAAACAAAABQAAABAEAAAABAAAAAkFAAAACQYAAAANAgUFAAAAN1RNb2JpbGUuV2ViLlRNb2JpbGVEb3RDb20uVUkuV2ViQ29udHJvbHMuQnJlYWRDcnVtYkl0ZW0DAAAABV90ZXh0BF91cmwJX3Nob3dMaW5rAQEAAQIAAAAGBwAAAARIb21lBggAAAAAAQEGAAAABQAAAAYJAAAAGVNtYXJ0cGhvbmVzICYgQ2VsbCBQaG9uZXMGCgAAAAtzaG9wL3Bob25lcwELZAIMD2QWAgIDDxYCHgxIdG1sT3ZlcnJpZGUFkwI8aW1nIHN0eWxlPSJGTE9BVDogcmlnaHQ7IENVUlNPUjogcG9pbnRlciEgaW1wb3J0YW50IiBvbmNsaWNrPSJqYXZhc2NyaXB0OnBvcFVwKCAnL3RlbXBsYXRlcy9wb3B1cC5hc3B4P1BBc3NldD1TaHBfUGhuX3NoaXBwaW5nRGV0YWlscycsICczNDAnLCAnNTY4JywgJzQ1JywgJzMwJywgJzAnLCAnMCcsICcxJyApIiBhbHQ9IkZyZWUgU2hpcHBpbmcgb24gYWxsIGNlbGwgcGhvbmVzIGFuZCBkZXZpY2VzLiIgc3JjPSIuLi9pbWFnZXMvZnJlZV9zaGlwcGluZy1iYW5uZXIuZ2lmIiAvPmQCDg8PFgIeB1Zpc2libGVoZGQCGA9kFgJmD2QWAmYPZBYCZg9kFggCAQ9kFgICAQ8QDxYEHgdDaGVja2VkaB4HRW5hYmxlZGgWAh4LbWFrZWVuYWJsZWQFBWZhbHNlZGRkAgUPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIHD2QWAgIBDxAPZBYCHwUFBHRydWVkZGQCCQ9kFgICAQ8QD2QWAh8FBQR0cnVlZGRkAhoPZBYCZg9kFgJmD2QWAmYPZBYEAgMPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIFD2QWAgIBDxAPFgIeBFRleHQF2AU8dGFibGUgaGVpZ2h0PSIxNSIgY2VsbHNwYWNpbmc9IjAiIGNlbGxwYWRkaW5nPSIwIiB3aWR0aD0iNzciIGJvcmRlcj0iMCI+CiAgICAgIDx0Ym9keT4KICAgICAgICA8dHI+CiAgICAgICAgICA8dGQgY2xhc3M9InJlZnVyYmlzaGVkIj5SZWZ1cmJpc2hlZDwvdGQ+CgogICAgICAgICAgPHRkIGNsYXNzPSJyZWZ1cmJpc2hlZCI+CiAgICAgICAgICAgIDxkaXYgb25tb3VzZW92ZXI9ImphdmFzY3JpcHQ6ZGlzcENPQkRlc2MoKTsiIHN0eWxlPSJGTE9BVDogbGVmdCIgb25tb3VzZW91dD0iamF2YXNjcmlwdDpoaWRlQ09CRGVzYygpOyIgcnVuYXQ9InNlcnZlciI+CiAgICAgICAgICAgICAgPGltZyBzcmM9Ii9pbWFnZXMvaWNvbl9oZWxwLmdpZiIgLz4gPGRpdiBjbGFzcz0idG9vbHRpcCIgaWQ9ImRpdkNPQkRlc2NyaXB0aW9uIiBzdHlsZT0iRElTUExBWTogbm9uZSI+CiAgICAgIDxkaXYgY2xhc3M9InRvb2x0aXAtYnRtLWJrZyI+CiAgICAgICAgPGRpdiBjbGFzcz0idG9vbHRpcC1jb250YWluZXIiPgogICAgICAgICAgR2V0IGEgZ3JlYXQgdmFsdWUgb24gYSBsaWtlLW5ldyBwaG9uZQogICAgICAgICAgPGJyIC8+CiAgICAgICAgICAgd2l0aCBhIDkwLWRheSB3YXJyYW50eS4KICAgICAgICA8L2Rpdj4KICAgICAgPC9kaXY+CiAgICA8L2Rpdj4KICAgICAgICAgICAgPC9kaXY+CiAgICAgICAgICA8L3RkPgogICAgICAgIDwvdHI+CiAgICAgIDwvdGJvZHk+CiAgICA8L3RhYmxlPhYCHwUFBHRydWVkZGQCIA8WAh4Fc3R5bGUFDmRpc3BsYXk6YmxvY2s7FgJmD2QWAmYPZBYCZg9kFgYCAw9kFgICAQ8QD2QWAh8FBQR0cnVlZGRkAgUPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIHD2QWAgIBDxAPZBYCHwUFBHRydWVkZGQCKg9kFgJmD2QWAmYPZBYEZg8PFgIfAmcWAh4HT25DbGljawUKQ2xlYXJJRFMoKWQCAQ8PZBYCHwgFCkNsZWFySURTKClkAi4PZBYCZg9kFgJmD2QWAgIKD2QWCAIBDw8WAh8CaGRkAgMPFgIeCl9QYWdlQ291bnQCBBYGAgIPFgIfAmhkAgcPD2QWAh8HBQxkaXNwbGF5Om5vbmVkAggPDxYCHwJnZGQCBw8WAh8JAgQWBgICDxYCHwJoZAIIDw9kFgIfBwUMZGlzcGxheTpub25lZAIJDw8WAh8CZ2RkAgsPFgIfAmhkAjAPFgIeE0Ntc0NvbGxlY3Rpb25TdHJpbmdlZAI0D2QWAmYPZBYCZg9kFgQCAQ8WAh4MQ21zQXNzZXROYW1lBRVUb3V0X0ZBUV9EZXZBbGxQaG9uZXNkAgQPFgIfCgUPdG91dF9odG1sX2xvZ2luZAI2D2QWBGYPZBYCZg9kFgJmDxYCHwJoZAIBD2QWAmYPZBYCZg8WAh8LBRJzaHBfcGhuX2xlZ2FsTm90ZXNkAjgPDxYCHhxUaXRsZXBvcHVwUGxhbkNoYW5nZVJlcXVpcmVkZWQWBAIPDxYCHwJoZAITDxYCHwJoZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WNAUJTUJCRGV2aWNlBQ1QcmVQYWlkUGhvbmVzBQ1QcmVQYWlkUGhvbmVzBSFyZXBQcmljZVJhbmdlJGN0bDAwJGNoa1ByaWNlUmFuZ2UFDmNoa05ld0Fycml2YWxzBQ9jaGtXZWJPbmx5RGVhbHMFEmNoa1dlYk9ubHlQcm9kdWN0cwUPY2hrTmV3Q29uZGl0aW9uBQZjaGtDT0IFFnJlcFR5cGVzJGN0bDAwJGNoa1R5cGUFFnJlcFR5cGVzJGN0bDAyJGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA0JGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA1JGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA2JGNoa1R5cGUFDGNoa0FuZHJvaWRPUwUPY2hrQmxhY2tCZXJyeU9TBQhjaGtXaW5PUwUgcmVwRmVhdHVyZUZpbHRlciRjdGwwMCRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDEkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDAyJGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwMyRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDQkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDA1JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwNiRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDckY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDA4JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwOSRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTAkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDExJGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwxMiRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTMkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDE0JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwxNSRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTYkY2hrRmlsdGVyBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDAwJGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDEkY2hrTWFudWZhY3R1cmVycwUncmVwTWFudWZhY3R1cmVycyRjdGwwMiRjaGtNYW51ZmFjdHVyZXJzBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDA0JGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDUkY2hrTWFudWZhY3R1cmVycwUncmVwTWFudWZhY3R1cmVycyRjdGwwNiRjaGtNYW51ZmFjdHVyZXJzBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDA3JGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDgkY2hrTWFudWZhY3R1cmVycwUabXJwUGhvbmVzJGN0bDAwJGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwwMiRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMDQkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDA2JGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwwOCRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMTAkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDEyJGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwxNCRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMTYkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDE4JGNoa0NvbXBhcmVnDy0KUN8keEvS5/wEmJXssTUSNw==',
'ctl09':'ctl13|pgrTop$lnkPageShowAll',
'ddlSort':'0',
'hdnBlackBerryID':'3c2c3562-aa1c-4fe4-a0ca-da5dd8e4bd84',
'hdnCapCode':'',
'hdnDeviceId':'',
'hdnFeature':'',
'hdnFeatureNames':'',
'hdnFilter':'',
'hdnIsPricingOptionLockedB':'false',
'hdnLocationParameter':'',
'hdnManufacturer':'',
'hdnManufacturerID':'',
'hdnManufacturerNames':'',
'hdnOtherFilters':'',
'hdnPageIndex':'',
'hdnPriceRange':'',
'hdnPriceRangeText':'',
'hdnProductType':'GSM',
'hdnSelectedDeviceId':'',
'hdnSelections':'',
'hdnSortFilter':'0',
'hdnTitle':'',
'hdnType':'smp,',
'hdnTypeNames':'Smartphone|',
'popupPlanChangeRequired$hdnDeviceID':'',
'popupPlanChangeRequired$hdnFamilyID':'',
'popupPlanChangeRequired$hiddenImagePath':'',
'repTypes$ctl05$chkType':'on',
'txtSelectedDevices':'0',
'txtSelectedFeatures':'0'}
headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
page = response.read()
soup = BeautifulSoup(page)

with open('tmob_colortest.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    items = soup.findAll('div', {"class": "phonename"}, text = colors)
    prices = soup.findAll('p', {"class": "totalitemprice"})
    for item, price in zip(items, prices):
        textcontent = u' '.join(islice(item.stripped_strings, 0, 2, 1))
        textcontent2 = u' '.join(price.stripped_strings)
        name_1 = unicode(textcontent).encode('utf8').replace('Nexus 4','LG Nexus 4').replace(' T-Mobile Refurbished Device','').replace('™','').replace('®','').replace(' ›','').replace("NEW! ","").replace(" Web-only offer -- now thru Thu 1/3/13","").replace(" Web-only offer!","").strip()
        oem = list(name_1)
        pos = oem.index(' ')
        if name_1.find('Refurbished')== -1:
            name= name_1
            refur = "N"
        else:
            name = name_1.replace("Refurbished","").replace(" -","")
            refur = "Y"        
        spamwriter.writerow(["US", "T-Mobile",
                             name[0:pos],name,refur,color_column,
                             "24 Months","$",unicode(textcontent2).encode('utf8').replace("FREE","0").replace('$','')])

Please help me to solve this issue and pardon my ignorance as I am new to coding.

4

1 回答 1

0

你从来没有真正使用过你的功能,所以color_column永远不会被填满。

您要做的是让您的函数将更改后的产品名称和检测到的颜色作为两个单独的值返回:

def handle_color(arg):
    for col in colors:
        if col.lower() not in arg.lower():
            continue
        # color found, remove it from arg (case insensitively)
        start = arg.lower().index(col.lower())
        arg = arg[:start] + arg[start + len(col):]
        return arg, col
    # No matching color found, return arg unchanged and an empty value for the color
    return arg, ''

现在您所要做的就是调用此函数并将它的返回值解压缩为您的 CSV 的两个变量:

    name, color_column = handle_color(name)

并且color_column将是一个空值或匹配的颜色(现在从 中删除name)。

于 2013-01-09T12:00:45.780 回答