该页面的 XML 严重不合规范,以至于所有常规方法(先用 lxml.etree 的 parse() 解析,再用 xpath 等进行查询)都惨遭失败。所以看起来你最好的选择是:
>>> import re
>>> import urllib
>>> import pprint
>>> s = urllib.urlopen("http://www.rob389.com/dp/tr/11/9789754681383").read()
>>> magic = re.compile(r'tOBJ.DATA\[0\].([A-Z0-9_]+)="([^"]+)"')
>>> my_dict = dict(magic.findall(s))
>>> pprint.pprint(my_dict)
{'DISC_PERC': '15.0000000000',
'EXCHANGE_RT': '2.2815',
'LNK_PREFIX': 'uykusuz-bir-gece-jill-murphy',
'LST_PRICE': '7.500000000000000',
'LST_YAX02_CODE': 'YTL',
'MMG00_CODE': '11',
'MMG00_TITLE': 'Kitap',
'MMM00_DESC': '...<br />Cad\xfdlar Okulu M\xfcd\xfcr\xfc, \\',
'MMM00_DESC250': '...<br />Cad\xfdlar Okulu M\xfcd\xfcr\xfc, \\',
'MMM00_DISC_PERC_SAL': '25',
'MMM00_HEIGHT': '19.6',
'MMM00_ITEM_CODE': '9789751028440',
'MMM00_ORG_TITLE': '026512',
'MMM00_SRC_CODE': '9789754681383',
'MMM00_TITLE': 'Uykusuz Bir Gece',
'MMM00_TYPE': 'M',
'MMM00_WEIGHT': '0',
'MMM00_WIDTH': '13.6',
'MMM00_ZHEIGHT': '1',
'MMS03_PRICE_1': '7.500000000000000',
'MMS03_PRICE_2': '0.000000000000000',
'MMS03_PRICE_3': '7.500000000000000',
'MMS03_YAX02_CODE_1': 'YTL',
'MMS03_YAX02_CODE_2': 'YTL',
'MMS03_YAX02_CODE_3': 'YTL',
'NWS01_DESC': "<BR>New Orleans'da do\xf0an Lillian Hellman'\xfdn ilkgen\xe7li\xf0i daha sonra oyunlar\xfdnda \xfcst\xfc kapal\xfd olarak yer bulacak olan tuhaf ve h\xfdrsl\xfd akrabalar aras\xfdnda ge\xe7ti. New Orleans ve New York aras\xfdnda mekik dokuyarak ge\xe7en y\xfdllarda etraf\xfdndaki farkl\xfd k\xfclt\xfcrleri g\xf6zlemleme \xfeans\xfd buldu. Liseyi bitirdikten sonra Columbia ve New York \xdcniversitesi'ne devam ettiyse de, e\xf0itimini yar\xfdda b\xfdrakarak bir yay\xfdnevinde \xe7al\xfd\xfemaya ba\xfelad\xfd. 1920'lerin bohem hayat\xfdna g\xf6z k\xfdrpt\xfd\xf0\xfd bu d\xf6nemde tan\xfd\xfet\xfd\xf0\xfd gen\xe7 yazar Arthur Kober ile evlenerek Hollywood'a ta\xfe\xfdnd\xfd. <BR><BR>1930'lar\xfdn ba\xfe\xfdnda MGM'de d\xfczeltmenlik yapt\xfd. Hevesli bir solcu oldu\xf0u bu y\xfdllarda, i\xfe arkada\xfelar\xfdn\xfd sendikala\xfemalar\xfd i\xe7in<A class=A2 href=\\",
'NWS01_DESC400': '<A class=A3 href=\\',
'NWS01_ID': '588',
'NWS01_IMAGE': '/UD_OBJS/IMAGES/NWS/HSTTR/Hellman_L_231204_365_1.jpg',
'ON_ESHOP': 'T',
'PEP01_ID': '229016',
'PEP01_NAME': 'Jill Murphy',
'PRD_FNM01_ID': '23462',
'PRD_FNM01_TITLE': 'Mandolin',
'PRD_FNM01_TRD_TITLE': 'Say Yay\xfdnlar\xfd',
'PUR_VAT_VALUE': '8',
'SAL_PRICE': '6.3750000000',
'SAL_VAT_VALUE': '8',
'SAL_YAX02_CODE': 'YTL',
'UD_10': '~410~|',
'UD_10_VAL': 'T\xfcrk\xe7e',
'UD_11': '~1000~|~803.2~|',
'UD_11_VAL': '\xc7ocuk,\xd6yk\xfc',
'UD_12': '~1000.4080~|',
'UD_12_VAL': '\xc7ocuk | 07-12 Ya\xfe | Edebiyat',
'UD_15': '978-975-468-138-3',
'UD_15_VAL': '978-975-468-138-3',
'UD_16': '~PB~|',
'UD_16_VAL': 'Karton Kapak',
'UD_19': '01/01/2010',
'UD_19_VAL': '01/01/2004',
'UD_2': 'The Worst Witch Strikes Again',
'UD_20': '92',
'UD_20_VAL': '92',
'UD_21': '52',
'UD_21_VAL': '52',
'UD_22': '3',
'UD_22_VAL': '3',
'UD_23': '1',
'UD_23_VAL': '1',
'UD_24': '~HM1~|',
'UD_24_VAL': '1. Hamur',
'UD_26': '7-12',
'UD_26_VAL': '07-12',
'UD_2_VAL': 'The Worst Witch Strikes Again',
'UD_3': '~229016~|',
'UD_30': '1',
'UD_30_VAL': '1',
'UD_31': '1',
'UD_31_VAL': '1',
'UD_34': '~1~|',
'UD_34_VAL': '1-3 G\xfcn',
'UD_36': '1',
'UD_36_VAL': '1',
'UD_39': 'VAR',
'UD_39_VAL': 'Var',
'UD_3_VAL': 'Jill Murphy',
'UD_42': '~410~|',
'UD_42_VAL': 'T\xfcrk\xe7e',
'UD_6': '~239986~|',
'UD_6_VAL': 'Seza Sunar',
'YAX02_CODE': 'EUR'}
>>>