Регулярные выражения - некорректно парсится HTML-страница - C#
Формулировка задачи:
Здравствуйте, битый день пытаюсь понять, почему не работает выражение. Есть HTML-файл, нужно вытащить текст из body.
Сам файл:
// The asker's pattern, kept as posted. [^<>]+? only consumes characters that are
// neither '<' nor '>', so the group cannot cross any tag nested inside <body>;
// the trailing ">*>" means "zero or more '>' followed by one '>'".
pattern = "(<body>[^<>]+?).</body>*>";
<html xmlns:user="http://mycompany.com/mynamespace" xmlns:MSHelp="http://msdn.microsoft.com/mshelp" xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<head>
<META http-equiv="Content-Type" content="text/html; charset=utf-8">
<link href="http://yastatic.net/highlightjs/8.2/styles/solarized_light.min.css" rel="stylesheet"><script src="http://yastatic.net/highlightjs/8.2/highlight.min.js"></script><script type="text/javascript">hljs.initHighlightingOnLoad();</script><link rel="stylesheet" type="text/css" href="how_to.css">
<body>
<p>текст</p>
<div class="......">
Operating system: Windows XP (both 32-bit and 64-bit editions), Microsoft Windows Server 2003 (both 32-bit and 64-bit editions), Microsoft Windows 2000
</div>
<div class="aqWhatitdoes">
The script below invokes the Open File dialog.
</div>
<div class="aqCode">
<div class="aqCodeSegment">
<p class="aqCodeHeader" style="background-image: url(.....gif);"><span class="aqLangID" name="aqLangID" id="aqLangID">VBScript</span><div id="ID0E2_divcodeseg">
<pre><code class="vbs">Sub Test
Dim i, openDialog, files
'Microsoft Common Dialog Control
Set openDialog = CreateObject("MSComDlg.CommonDialog")
With openDialog
.DialogTitle = "Open my files..."
.InitDir = "C:\WINDOWS\"
.Filter = "All files (*.*)|*.*|Programs (*.com,*.exe)|*.com;"&_
"*.exe|Text (*.txt,*.log)|*.txt;*.log"
.FilterIndex = 2
.Flags = 2621952
.MaxFileSize =32000
.ShowOpen
Filename = .Filename
End With
If (Len(openDialog.FileName)= 0) Then
Log.Error("Files were not selected")
Exit Sub
End If
files = Split(openDialog.Filename, vbNullChar)
If 0 = UBound(files) Then
Log.Message("The " & files(0) & " file was selected")
Else
For i = 1 To UBound(files)
Log.Message("The " & files(0) & "\" & files(i) & " file was selected")
Next
End If
End Sub</code></pre>
</div>
</p>
</div>
<div class="aqCodeSegment">
<p class="aqCodeHeader" style="background-image: url(/images/support/kb/data/2009/11/30/codeHeader.gif);"><span class="aqLangID" name="aqLangID" id="aqLangID">JScript</span><div id="ID0E5_divcodeseg">
<pre><code class="jsp">function Test()
{
// Microsoft Common Dialog
var openDialog = new ActiveXObject("MSComDlg.CommonDialog");
openDialog.DialogTitle = "Open my files...";
openDialog.InitDir = "C:\\WINDOWS\\";
openDialog.Filter = "All files (*.*)|*.*|Programs (*.com,*.exe)|" +
"*.com;*.exe|Text (*.txt,*.log)|*.txt;*.log";
openDialog.FilterIndex = 2;
openDialog.Flags = 2621952;
openDialog.MaxFileSize = 32000;
openDialog.ShowOpen();
if (0 == openDialog.FileName.length) {
Log.Error("Files were not selected");
return;
}
var files = openDialog.Filename.split("\0");
if(1 == files.length) {
Log.Message("The " + files[0] + " file was selected");
}
else {
for(var i = 1; i < files.length; i++) {
Log.Message("The " + files[0] + "\\" + files[i] + " file was selected");
}
}
}</code></pre>
</div>
</p>
</div>
</div>
</body>
</head>
</html>
То есть, соответственно, такой же паттерн не работает.
Решение задачи: «Регулярные выражения - некорректно парсится HTML-страница»
textual
Листинг программы
// Downloads the page at URL and extracts everything between the opening <body> tag
// and the last </body>. `result` holds the extracted markup, or "Не найдено" when
// the page has no body element.
string URL = "http://site.ru";
string HTML;

// WebClient is IDisposable; release it as soon as the download completes.
using (var webClient = new System.Net.WebClient())
{
    HTML = webClient.DownloadString(URL);
}

// RegexOptions.Singleline makes '.' match line breaks as well; without it "(.*)"
// stops at the first newline and a multi-line body can never match.
// IgnoreCase accepts <BODY>, and [^>]* tolerates attributes on the opening tag.
Match match = Regex.Match(
    HTML,
    "<body[^>]*>(.*)</body>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase);

string result = match.Success ? match.Groups[1].Value : "Не найдено";