2

我有一个字符串形式的页面的 html 源代码:

<html>
    <head>
          <link rel="stylesheet" type="text/css" href="/css/all.css" /> 
    </head>
    <body>
        <a href="/test.aspx">Test</a>
        <a href="http://mysite.com">Test</a>
        <img src="/images/test.jpg"/>
        <img src="http://mysite.com/images/test.jpg"/>
    </body>
</html>

我想将所有相对路径转换为绝对路径。我希望输出是:

<html>
    <head>
          <link rel="stylesheet" type="text/css" href="http://mysite.com/css/all.css" /> 
    </head>
    <body>
        <a href="http://mysite.com/test.aspx">Test</a>
        <a href="http://mysite.com">Test</a>
        <img src="http://mysite.com/images/test.jpg"/>
        <img src="http://mysite.com/images/test.jpg"/>
    </body>
</html>

注意:我只希望将该字符串中的相对路径转换为绝对路径。不应该改动字符串中已经存在的绝对路径,它们已经是绝对的,保持原样即可。这可以通过正则表达式或其他方式完成吗?

4

6 回答 6

20

不要尝试使用正则表达式解析 HTML,原因参见 https://stackoverflow.com/a/1732454/932418 和 https://stackoverflow.com/a/1758162/932418

改用HtmlAgilityPack之类的 html 解析器

// Sample page mixing root-relative URLs with URLs that are already absolute.
string html = 
@"<html>
    <head>
            <link rel=""stylesheet"" type=""text/css"" href=""/css/all.css"" /> 
    </head>
    <body>
        <a href=""/test.aspx"">Test</a>
        <a href=""http://example.com"">Test</a>
        <img src=""/images/test.jpg""/>
        <img src=""http://example.com/images/test.jpg""/>
    </body>
</html>";

string baseUrl= "http://example.com";
Uri baseUri = new Uri(baseUrl);
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);

// Element name paired with the attribute that carries its URL.
// <link href> is included so stylesheet references are converted as well.
string[][] urlAttributes =
{
    new[] { "img", "src" },
    new[] { "a", "href" },
    new[] { "link", "href" }
};
char[] pathDelimiters = { '/', '?', '#' };

foreach (string[] pair in urlAttributes)
{
    foreach (var node in doc.DocumentNode.Descendants(pair[0]))
    {
        // Elements without the attribute (e.g. <a name="top">) are skipped instead of throwing.
        var attribute = node.Attributes[pair[1]];
        if (attribute == null || string.IsNullOrEmpty(attribute.Value))
        {
            continue;
        }

        // A colon that appears before any '/', '?' or '#' means the value starts with a
        // scheme (http:, mailto:, ...). Such values are already absolute and are left
        // exactly as written.
        string value = attribute.Value;
        int colon = value.IndexOf(':');
        int delimiter = value.IndexOfAny(pathDelimiters);
        if (colon > 0 && (delimiter < 0 || colon < delimiter))
        {
            continue;
        }

        try
        {
            attribute.Value = new Uri(baseUri, value).AbsoluteUri;
        }
        catch (UriFormatException)
        {
            // The value cannot be combined with the base URL; keep the original text.
        }
    }
}

// Serialize the modified document back to a string; the writer is disposed when done.
string newHtml;
using (StringWriter writer = new StringWriter())
{
    doc.Save(writer);
    newHtml = writer.ToString();
}
于 2012-09-03T21:40:45.230 回答
4

添加

<base href="http://mysite.com/images/" />

到页面的 <head> 部分

于 2012-09-03T20:29:33.117 回答
0

为此使用正则表达式。这是一个简短的例子

// Demo entry point: rewrites every relative src/href value in a sample HTML string
// to an absolute URL under a fixed site prefix and prints the rewritten HTML.
// Values that already start with a scheme (e.g. http:), protocol-relative values
// (//host/...), fragment-only values (#...) and empty values are left untouched.
static void Main(string[] args)
    {
        string input = "<html>\n<head>\n<link rel=\"stylesheet\" type=\"text/css\" href=\"/css/all.css\" /> \n</head>\n<body>\n<a href=\"/test.aspx\">Test</a>\n<a href=\"http://mysite.com\">Test</a>\n<img src=\"/images/test.jpg\"/>\n<img src=\"http://mysite.com/images/test.jpg\"/>\n</body>\n</html>";
        string prefix = @"http://mysite.com";

        // Group 1: attribute name, group 2: opening quote, group 3: the quoted value.
        // The back-reference \2 stops the value at the matching closing quote, so text
        // later on the same line can no longer influence the match.
        string pattern = "\\b(src|href)\\s*=\\s*([\"'])(.*?)\\2";
        var reg = new Regex(pattern, RegexOptions.IgnoreCase);

        // Matches values that are already absolute: "scheme:" or protocol-relative "//".
        var absolute = new Regex("^(?:[a-z][a-z0-9+.\\-]*:|//)", RegexOptions.IgnoreCase);

        var result = reg.Replace(input, m =>
        {
            Group value = m.Groups[3];
            string url = value.Value;
            if (url.Length == 0 || url[0] == '#' || absolute.IsMatch(url))
            {
                return m.Value;
            }

            // Keep everything up to the value exactly as written (attribute name,
            // spacing, quote style) and join prefix and path with exactly one slash.
            string head = m.Value.Substring(0, value.Index - m.Index);
            string combined = prefix.TrimEnd('/') + "/" + url.TrimStart('/', '\\');
            return head + combined + m.Groups[2].Value;
        });

        System.Console.WriteLine(result);
    }

结果是

<html>
<head>
<link rel="stylesheet" type="text/css" href="http://mysite.com/css/all.css" /> 
</head>
<body>
<a href="http://mysite.com/test.aspx">Test</a>
<a href="http://mysite.com">Test</a>
<img src="http://mysite.com/images/test.jpg"/>
<img src="http://mysite.com/images/test.jpg"/>
</body>
</html>
于 2012-09-03T21:35:12.353 回答
0

看看这个,它可以帮助你。

它采用以下格式:http(s)://domain(:port)/AppPath

// Concatenates the current request's URL scheme, "://", its authority, and the application path.
HttpContext.Current.Request.Url.Scheme + "://" + HttpContext.Current.Request.Url.Authority + HttpContext.Current.Request.ApplicationPath;

或者你可以使用:

// Passes a relative URL to Page.ResolveUrl.
// NOTE(review): confirm the returned URL includes scheme and host if a fully absolute URL is required.
Page.ResolveUrl("img/youFile");
于 2012-09-03T20:33:10.120 回答
0

看看这个函数:

' Rewrites every double-quoted href="..." and src="..." value in html to an absolute URL
' resolved against PageURL, and returns the rewritten HTML.
'   html    - HTML source to process; Nothing or an empty string is returned unchanged.
'   PageURL - absolute URL of the page the HTML came from; used as the resolution base.
' Attribute names are matched case-insensitively and kept exactly as written; only the
' quoted value is replaced. Values that already start with a scheme (http:, mailto:, ...)
' are left untouched, as are values that cannot be combined with PageURL.
' Throws whatever New Uri(PageURL) throws when a relative value is found and PageURL is invalid.
Private Function ConvertALLrelativeLinksToAbsoluteUri(ByVal html As String, ByVal PageURL As String) As String
    If String.IsNullOrEmpty(html) Then
        Return html
    End If

    ' One pass over both attributes; group 1 is the value between the quotes.
    Dim XpUrlAttr As New Regex("(?:href|src)=""(.*?)""", RegexOptions.IgnoreCase)
    ' A leading "scheme:" marks a value that is already absolute.
    Dim XpScheme As New Regex("^[a-z][a-z0-9+.\-]*:", RegexOptions.IgnoreCase)

    Dim MainURL As Uri = Nothing
    Dim NewSTR As New System.Text.StringBuilder(html.Length)
    Dim CopiedUpTo As Integer = 0

    For Each Found As Match In XpUrlAttr.Matches(html)
        Dim UrlGroup As Group = Found.Groups(1)
        Dim Oldurl As String = UrlGroup.Value

        If XpScheme.IsMatch(Oldurl) Then
            Continue For
        End If

        ' Built on first use so that HTML without relative links never touches PageURL.
        If MainURL Is Nothing Then
            MainURL = New Uri(PageURL)
        End If

        Dim NEWURL As Uri = Nothing
        Try
            NEWURL = New Uri(MainURL, Oldurl)
        Catch ex As UriFormatException
            ' Unresolvable value: keep the original text.
            Continue For
        End Try

        ' Replace by position, so identical values elsewhere in the document are not
        ' affected by this substitution.
        NewSTR.Append(html, CopiedUpTo, UrlGroup.Index - CopiedUpTo)
        NewSTR.Append(NEWURL.AbsoluteUri)
        CopiedUpTo = UrlGroup.Index + UrlGroup.Length
    Next

    NewSTR.Append(html, CopiedUpTo, html.Length - CopiedUpTo)
    Return NewSTR.ToString()
End Function
于 2014-02-20T19:55:05.617 回答
0

这对我很有用。我在电子邮件模板上使用它。我在每个链接的开头使用 MVC/Razor“~/”。

' Parse HTML and make relative links absolute with p_basePath.
'   MailBodyHTML - HTML to process; Nothing or an empty string is returned unchanged.
' Returns the HTML with every matched relative link to a .gif/.jpg/.jpeg/.png/.css/.js
' file rewritten as p_basePath followed by the link path without its leading slash.
' Links containing "://", "mailto:" or "javascript:" and protocol-relative links
' ("//host/...") are left untouched.
' NOTE(review): p_basePath is defined outside this function and is presumably expected
' to end with "/" - confirm against its declaration.
Public Function ParseHTMLLinks(ByVal MailBodyHTML As String) As String
    If String.IsNullOrEmpty(MailBodyHTML) Then
        Return MailBodyHTML
    End If

    ' One delimiter ( = or " ), an optional leading slash, then the path (group 1)
    ' ending in one of the listed extensions, then a closing quote or whitespace.
    Dim srcPattern As String = "[=""]\/?([^""\s]*(\.gif|\.jpg|\.jpeg|\.png|\.css|\.js))[""\s]"
    Dim linkRegex As New Regex(srcPattern, RegexOptions.IgnoreCase)

    Dim strHTMLBody As New System.Text.StringBuilder(MailBodyHTML.Length)
    Dim copiedUpTo As Integer = 0

    Try
        For Each found As Match In linkRegex.Matches(MailBodyHTML)
            Dim matchText As String = found.Value

            ' filter out absolute links
            If matchText.IndexOf("://", StringComparison.Ordinal) >= 0 OrElse _
               matchText.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) >= 0 OrElse _
               matchText.IndexOf("javascript:", StringComparison.OrdinalIgnoreCase) >= 0 Then
                Continue For
            End If

            Dim pathGroup As Group = found.Groups(1)

            ' The optional slash was consumed and the path still starts with "/":
            ' the link is protocol-relative ("//host/...") and is not site-relative.
            If pathGroup.Value.StartsWith("/", StringComparison.Ordinal) Then
                Continue For
            End If

            ' Replace by position: everything after the one-character delimiter (the
            ' optional slash plus the path) becomes p_basePath & path. A document-wide
            ' string replace would prefix the same path again on its second occurrence
            ' and would also alter absolute URLs that contain the same path.
            Dim replaceFrom As Integer = found.Index + 1
            strHTMLBody.Append(MailBodyHTML, copiedUpTo, replaceFrom - copiedUpTo)
            strHTMLBody.Append(p_basePath).Append(pathGroup.Value)
            copiedUpTo = pathGroup.Index + pathGroup.Length
        Next

    Catch e As WebException
        'Add errors to List(Of WebException), if any.
        ErrorCodes.Add(e)
    End Try

    ' Copy whatever follows the last rewritten link.
    strHTMLBody.Append(MailBodyHTML, copiedUpTo, MailBodyHTML.Length - copiedUpTo)
    Return strHTMLBody.ToString()
End Function
于 2017-06-09T07:23:03.693 回答