我尝试用下面的代码从 kaggle.com 获取 HTTP 响应。Kaggle 返回的响应是 HTML 格式,我想把它转换成 JSON 格式,以便进一步处理。
import requests
import json

# Fetch the public Kaggle profile page of a single user and report
# the HTTP status code and the content type of the response.
username = 'vgtgayan'
base_url = 'https://www.kaggle.com/'
url = base_url+str(username)
r = requests.get(url)
print(r.status_code)
# Headers live directly on the Response object; `r.r.headers` would raise
# AttributeError because Response has no attribute named `r`.
print(r.headers["content-type"])
输出:
200
text/html; charset=utf-8
上面的代码是下面所有尝试共用的。
尝试1:
# Attempt 1: decode the response body as JSON. The body here is HTML
# (content-type text/html), so decoding fails with JSONDecodeError.
r.json()
错误:
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
尝试2:
import xmltodict
import xmltojson
# Attempt 2: feed the raw response text to an XML-to-JSON converter.
# The text is an HTML page, and the underlying XML parser rejects it
# as not well-formed (ExpatError).
json_ = xmltojson.parse(r.text)
错误:
ExpatError: not well-formed (invalid token): line 22, column 70
尝试 3:
import xml.etree.ElementTree
# Attempt 3: parse a file with ElementTree, serialize the root element
# and hand the result to xmltodict.
# NOTE(review): `path` is not defined in any snippet shown here, and
# `ElementTree` is referenced without the `xml.etree.` prefix; presumably
# the real script defined/imported both -- confirm.
# NOTE(review): the name `dict` shadows the built-in type.
dict = xmltodict.parse(ElementTree.tostring(ElementTree.parse(path).getroot()))
错误:
ParseError: not well-formed (invalid token):
参考: xml.parsers.expat.ExpatError: not well-formed (invalid token)
上述每一次尝试都以错误告终。请帮助我完成这项任务。
下面是我得到的 HTML 响应:
<!DOCTYPE html>
<html lang="en">
<head>
<title>V.G.T. Gayan | Novice | Kaggle</title>
<meta charset="utf-8" />
<meta name="robots" content="index, follow" />
<meta name="description" content="Kaggle profile for V.G.T. Gayan" />
<meta name="turbolinks-cache-control" content="no-cache" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=5.0, minimum-scale=1.0">
<meta name="theme-color" content="#008ABC" />
<script nonce="VmwzDvxUO596o90cb7qq6Q==" type="text/javascript">
window["pageRequestStartTime"] = 1641792835393;
window["pageRequestEndTime"] = 1641792835435;
window["initialPageLoadStartTime"] = new Date().getTime();
</script>
<link rel="preconnect" href="https://www.google-analytics.com" crossorigin="anonymous" /><link rel="preconnect" href="https://stats.g.doubleclick.net" /><link rel="preconnect" href="https://storage.googleapis.com" /><link rel="preconnect" href="https://apis.google.com" />
<link href="/static/images/favicon.ico" rel="shortcut icon" type="image/x-icon" />
<link rel="manifest" href="/static/json/manifest.json" crossorigin="use-credentials">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link href="https://fonts.googleapis.com/icon?family=Google+Material+Icons"
rel="preload" as="style" />
<link href="https://fonts.googleapis.com/css?family=Inter:400,400i,500,500i,600,600i,700,700i"
rel="preload" as="style" />
<link href="https://fonts.googleapis.com/icon?family=Google+Material+Icons"
rel="stylesheet" media="print" id="async-google-font-1" />
<link href="https://fonts.googleapis.com/css?family=Inter:400,400i,500,500i,600,600i,700,700i"
rel="stylesheet" media="print" id="async-google-font-2" />
<script nonce="VmwzDvxUO596o90cb7qq6Q==" type="text/javascript">
const styleSheetIds = ["async-google-font-1", "async-google-font-2"];
styleSheetIds.forEach(function (id) {
document.getElementById(id).addEventListener("load", function() {
this.media = "all";
});
});
</script>
<link rel="stylesheet" type="text/css" href="/static/assets/vendor.css?v=a39c9d14b7e6072d0f7a" />
<link rel="stylesheet" type="text/css" href="/static/assets/app.css?v=453de5392c911bfbccf0" />
<script nonce="VmwzDvxUO596o90cb7qq6Q==">
try{(function(a,s,y,n,c,h,i,d,e){d=s.createElement("style");
d.appendChild(s.createTextNode(""));s.head.appendChild(d);d=d.sheet;
y=y.map(x => d.insertRule(x + "{ opacity: 0 !important }"));
h.start=1*new Date;h.end=i=function(){y.forEach(x => x<d.cssRules.length ? d.deleteRule(x) : {})};
(a[n]=a[n]||[]).hide=h;setTimeout(function(){i();h.end=null},c);h.timeout=c;
})(window,document,['.site-header-react__nav'],'dataLayer',2000,{'GTM-52LNT9S':true});}catch(ex){}
</script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==">
window.dataLayer = window.dataLayer || [];
function gtag() { dataLayer.push(arguments); }
gtag('js', new Date());
gtag('config', 'UA-12629138-1', {
'optimize_id': 'GTM-52LNT9S',
'displayFeaturesTask': null,
'send_page_view': false,
'content_group1': 'Users'
});
</script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==" async src="https://www.googletagmanager.com/gtag/js?id=UA-12629138-1"></script>
<meta property="og:url" content="https://www.kaggle.com/vgtgayan/home" />
<meta property="og:title" content="V.G.T. Gayan | Novice" />
<meta property="og:description" content="Kaggle profile for V.G.T. Gayan" />
<meta property="og:type" content="profile" />
<meta property="og:username" content="vgtgayan" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:image" content="https://www.kaggle.com/static/images/tiers/Novice@192.png" />
<meta name="twitter:image:alt" content="Novice" />
<meta name="twitter:site" content="@Kaggle" />
<script nonce="VmwzDvxUO596o90cb7qq6Q==" type="text/javascript">
var Kaggle = window.Kaggle || {};
Kaggle.Current = {
antiForgeryToken: 'CfDJ8LdUzqlsSWBPr4Ce3rb9VL9za2FFB-rFm9iuPAm8PKLq9TqJiAHT4UmlnyzfLtHBrGjL5o2brrzKiVOqcKYybNKzrGn4e1wSl09TIhcFoh_3ivZ3ndFbLCUM9zaxUWdhs8JzSLYm5l9RM1a-bbh55aw',
isAnonymous: true,
analyticsToken: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2NDE3OTM3MzUsIlVzZXJJZCI6MH0.vEUaJcUG88jwDqImiQoQtQU8-jui4Gdp6xihpLlls0U',
analyticsTokenExpiry: 15,
enableRapidash: true,
}
Kaggle.Current.log = function(){};
Kaggle.Current.warn = function(){};
var decodeUserDisplayName = function () {
var escapedUserDisplayName = Kaggle.Current.userDisplayNameEscaped || "";
try {
var textVersion = new DOMParser().parseFromString(escapedUserDisplayName, "text/html").documentElement.textContent;
if (textVersion) {
return textVersion;
}
} catch(ex) {}
return escapedUserDisplayName;
}
Kaggle.Current.userDisplayName = decodeUserDisplayName();
</script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==" type="text/javascript">
var Kaggle = window.Kaggle || {};
Kaggle.PageMessages = [];
</script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==">window['useKaggleAnalytics'] = true;</script>
<script id="gapi-target" nonce="VmwzDvxUO596o90cb7qq6Q==" src="https://apis.google.com/js/api.js" defer
async></script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==" src="/static/assets/runtime.js?v=f84cf2a36564689cb2b3" data-turbolinks-track="reload"></script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==" src="/static/assets/vendor.js?v=26a3487f6f4d622267d2" data-turbolinks-track="reload"></script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==" src="/static/assets/app.js?v=36d0e4e3f41dd27e5515" data-turbolinks-track="reload"></script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==" type="text/javascript">
window.kaggleStackdriverConfig = {
key: 'AIzaSyA4eNqUdRRskJsCZWVz-qL655Xa5JEMreE',
projectId: 'kaggle-161607',
service: 'web-fe',
version: 'ci',
userId: '0'
}
</script>
</head>
<body data-turbolinks="false">
<main>
<div id="site-container"></div>
<div data-component-name="NavigationContainer" style="display: flex; flex-direction: column; flex: 1 0 auto;"></div><script class="kaggle-component" nonce="VmwzDvxUO596o90cb7qq6Q==">var Kaggle=window.Kaggle||{};Kaggle.State=Kaggle.State||[];Kaggle.State.push({"navigationType":"BOTH_NAV"});performance && performance.mark && performance.mark("NavigationContainer.componentCouldBootstrap");</script>
<div id="site-body" class="hide">
<div data-component-name="ProfileContainerReact" style="display: flex; flex-direction: column; flex: 1 0 auto;"></div><script class="kaggle-component" nonce="VmwzDvxUO596o90cb7qq6Q==">var Kaggle=window.Kaggle||{};Kaggle.State=Kaggle.State||[];Kaggle.State.push({"userId":8257495,"displayName":"V.G.T. Gayan","country":"Sri Lanka","region":"Western Province","city":"Colombo","gitHubUserName":null,"twitterUserName":null,"linkedInUrl":null,"websiteUrl":null,"occupation":"Research \u0026 Development Engineer","organization":"Synopsys Inc","bio":null,"userLastActive":"2022-01-10T03:58:03.07Z","userJoinDate":"2021-09-01T11:53:55.643Z","performanceTier":"novice","performanceTierCategory":"competitions","activePaneTier":"novice","activePaneCategory":"unspecified","userUrl":"/vgtgayan","userAvatarUrl":"https://storage.googleapis.com/kaggle-avatars/images/default-thumb.png","email":null,"canEdit":false,"canCreateDatasets":true,"userName":"vgtgayan","activePane":"home","totalDatasets":0,"totalOrganizations":0,"competitionsSummary":{"tier":"novice","totalResults":0,"rankPercentage":0.9691485,"rankOutOf":173152,"rankCurrent":null,"rankHighest":null,"totalGoldMedals":0,"totalSilverMedals":0,"totalBronzeMedals":0,"highlights":[],"summaryType":"competitions"},"scriptsSummary":{"tier":"novice","totalResults":1,"rankPercentage":0.1549871,"rankOutOf":203004,"rankCurrent":null,"rankHighest":null,"totalGoldMedals":0,"totalSilverMedals":0,"totalBronzeMedals":0,"highlights":[],"summaryType":"notebooks"},"discussionsSummary":{"tier":"novice","totalResults":0,"rankPercentage":0.022529345,"rankOutOf":267429,"rankCurrent":null,"rankHighest":null,"totalGoldMedals":0,"totalSilverMedals":0,"totalBronzeMedals":0,"highlights":[],"summaryType":"discussion"},"datasetsSummary":{"tier":"novice","totalResults":0,"rankPercentage":0.13677031,"rankOutOf":54098,"rankCurrent":null,"rankHighest":null,"totalGoldMedals":0,"totalSilverMedals":0,"totalBronzeMedals":0,"highlights":[],"summaryType":"datasets"},"pageMessages":null,"followers":{"type":"following","count":0,"list":[],"containsSelf":false,"maxCountReached":false},"following":{"type":"following","count":0,"list":[],"containsSelf":false,"maxCountReached":false},"canSeeFollowers":false,"canSeeCallToAction":true,"canSeeNotifications":true,"canSeeAtMentions":true,"totalScripts":1,"isAdmin":false,"isEditing":false,"userAllowsUserMessages":true,"@wf": "Users.Models.ProfileDtoWireFormat"});performance && performance.mark && performance.mark("ProfileContainerReact.componentCouldBootstrap");</script>
<script nonce="VmwzDvxUO596o90cb7qq6Q==" type="text/x-mathjax-config">
MathJax.Hub.Config({
"HTML-CSS": {
preferredFont: "TeX",
availableFonts: ["STIX", "TeX"],
linebreaks: {
automatic: true
},
EqnChunk: (MathJax.Hub.Browser.isMobile ? 10 : 50)
},
tex2jax: {
inlineMath: [["\\(", "\\)"], ["\\\\(", "\\\\)"]],
displayMath: [["$$", "$$"], ["\\[", "\\]"]],
processEscapes: true,
ignoreClass: "tex2jax_ignore|dno"
},
TeX: {
noUndefined: {
attributes: {
mathcolor: "red",
mathbackground: "#FFEEEE",
mathsize: "90%"
}
}
},
Macros: {
href: "{}"
},
skipStartupTypeset: true,
messageStyle: "none",
extensions: [],
});
</script>
<script type="text/javascript" nonce="VmwzDvxUO596o90cb7qq6Q==">
window.addEventListener("DOMContentLoaded", () => {
const head = document.getElementsByTagName("head")[0];
const useProdHosts = ["www.kaggle.com", "admin.kaggle.com"];
const subdomain = useProdHosts.includes(window.location.hostname) ? "www" : "staging";
const lib = document.createElement("script");
lib.type = "text/javascript";
lib.src = `https://${subdomain}.kaggleusercontent.com/static/mathjax/2.7.9/MathJax.js?config=TeX-AMS-MML_HTMLorMML`;
head.appendChild(lib);
});
</script>
</div>
</main>
</body>
</html>