# filter-weblog
#
# Reads web server log lines from the pipeline and removes the requests for
# "valid URLs", so that only suspicious lines remain on the output.
#
# Example input line:
# 85.25.110.111 - - [01/Feb/2013:00:01:19 +0100] "GET /admin/test.htm HTTP/1.1" 200 41052 "-" "Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; Win 9x4.90)"
#
# Based on Apache "Combined Log"
#   %h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"
#   %h  = IP address of the client (remote host) which made the request
#   %l  = RFC 1413 identity of the client
#   %u  = Userid of the person requesting the document
#   %t  = Time that the server finished processing the request
#   %r  = Request line from the client in double quotes
#   %>s = Status code that the server sends back to the client
#   %b  = Size of the object returned to the client
#   Referer
#   UserAgent
#
# Output (pipeline): the original line with a suffix appended
#   " ValidURIerror" - URL exists in the web directory but status was >= 400
#   " InvalidURI"    - URL does not exist in the web directory
#   " nomatch"       - line could not be parsed (regex miss or malformed request line)
# Lines from known bots and successful requests for known URLs are dropped.
#
# Usage:
#   get-content access*.log | .\filter-weblog.ps1 | out-file -FilePath .\badrequest.log -Encoding ascii

param (
    [string]$sitemapfile = ".\sitemap.xml",   # filename of the sitemap XML with valid files
    [string]$webdir = "C:\\Meine Websites\www.msxfaq.de\",   # local copy of the web site; its files define the valid URLs
    [int]$URLcolumn = 5,   # which column of the log contains the request URI (currently not used by the script)
    [string]$logregex = '^(.+)\s(.+?)\s(.+?)\s(\[.+?\])\s"(.+?)"\s(\d+)\s(.+)\s"(.+?)"\s"(.+?)"'
)

begin {
    Write-Host 'filter-weblog: ============== started ============== '

    # The sitemap is loaded but not evaluated at the moment (the code that
    # used it is disabled below). Guard the load so a missing file does not
    # raise an error for data nobody consumes.
    write-host "Loading XMLfile " $sitemapfile
    if (Test-Path -LiteralPath $sitemapfile) {
        [xml]$sitemap = Get-Content $sitemapfile
    } else {
        Write-Warning ("Sitemap file not found: " + $sitemapfile)
    }

    # Set of valid request URIs (lower case, leading "/").
    [hashtable]$sitemapURLs = @{}

    # Disabled: learn valid URLs from the sitemap instead of the web directory.
    #foreach ($URL in $sitemap.URLset.URL) {
    #    [string]$shortURL = ("GET /"+(($URL.loc.split("/"))[3..99] -join "/")).tolower()
    #    $sitemapURLs.add($shortURL,$true)
    #}

    write-host ("Adding URLs from WebDir:" + $webdir)

    # Normalise the web root once. A literal String.Replace($webdir,"") is case
    # sensitive and breaks on doubled or missing trailing backslashes, so the
    # relative URL is derived from the resolved directory path instead.
    $webrootList = $webdir
    $webrootItem = Get-Item -LiteralPath $webdir -ErrorAction SilentlyContinue
    if ($webrootItem) {
        $webrootList = [System.IO.Path]::GetFullPath($webrootItem.FullName)
    }
    $webrootPrefix = $webrootList.TrimEnd('\', '/')

    Get-ChildItem -LiteralPath $webrootList -Recurse -File | ForEach-Object {
        $fullName = $_.FullName
        if ($fullName.StartsWith($webrootPrefix, [System.StringComparison]::OrdinalIgnoreCase)) {
            $relPath = $fullName.Substring($webrootPrefix.Length)
        } else {
            $relPath = $fullName
        }
        $relURL = "/" + $relPath.Replace("\", "/").TrimStart("/").ToLower()
        # Indexed assignment instead of .Add(): a repeated key must not throw.
        $sitemapURLs[$relURL] = $true
    }

    # Disabled: additional default URLs in the old "GET /..." key format.
    #$sitemapURLs.add("GET /robots.txt".tolower(),$true)
    #$sitemapURLs.add("GET /sitemap.xml".tolower(),$true)
    #$sitemapURLs.add("GET /msxfaq2012.css".tolower(),$true)
    #$sitemapURLs.add("GET /newsletter/msxfaq.xml".tolower(),$true)
    $sitemapURLs["/".tolower()] = $true   # the site root is always valid
    #$sitemapURLs.keys | export-clixml .\sitemapURLs.xml

    write-host "Total Valid URLs learned" $sitemapURLs.count

    # User-Agent wildcard patterns (-like, case-insensitive) of crawlers and
    # monitoring services whose requests are skipped.
    $botPatterns = @(
        "*Googlebot*",
        "*MJ12bot*",
        "*Baiduspider*",
        "*Ezooms*",
        "*msnbot*",
        "*bingbot*",
        "*linkdex.com*",
        "*YandexBot*",
        "*TurnitinBot*",
        "*free monitoring service; http://mon.itor.us*"
    )

    [long]$ValidURLsOK = 0
    [long]$ValidURLsBOT = 0
    [long]$ValidURLsERROR = 0
    [long]$InValidURLs = 0
    [long]$NoMatchLine = 0
    [long]$totallines = 0
    [long]$totalbytes = 0
}

process {
    $pipedata = $_
    $totallines += 1
    $totalbytes += $pipedata.length
    if ($totallines % 1000 -eq 0) {
        write-host "Lines: $totallines Bytes $totalbytes"
    }

    # Parse the line. The request line ("METHOD URI PROTOCOL") must contain at
    # least a method and a URI; otherwise (e.g. a bare "-") there is no URI to
    # check and the line is reported as "nomatch" instead of being classified
    # with a stale URI from a previous line.
    $requestParts = $null
    if ($pipedata -match $logregex) {
        $requestParts = $matches[5].Split(" ")
        [int]$requeststatus = [int]($matches[6])
        [string]$Useragent = $matches[9]
    }

    if (($null -eq $requestParts) -or ($requestParts.Count -lt 2)) {
        $NoMatchLine += 1
        $pipedata + " nomatch"
    }
    else {
        [string]$requesturi = $requestParts[1].ToLower()
        write-debug ("W3SRVLog RequestURI" + $requesturi + " Status:" + $requeststatus)

        $isBot = $false
        foreach ($pattern in $botPatterns) {
            if ($Useragent -like $pattern) {
                $isBot = $true
                break
            }
        }
        $isKnownURL = $sitemapURLs.ContainsKey($requesturi)

        if ($isBot) {
            # Requests from known bots are counted but never reported.
            $ValidURLsBOT += 1
        }
        elseif ($isKnownURL -and ($requeststatus -lt 400)) {
            # Valid and successful request: nothing to report.
            $ValidURLsOK += 1
        }
        elseif ($isKnownURL) {
            # Valid URL, but the server answered with status 400 or higher.
            $ValidURLsERROR += 1
            $pipedata + " ValidURIerror"
        }
        else {
            # URL is not part of the web directory.
            $InValidURLs += 1
            $pipedata + " InvalidURI"
        }
    }
}

end {
    Write-Host "filter-weblog: ============== ended ============== "
    Write-host "totallines :" $totallines
    Write-host "totalMB :" ($totalbytes/1mb)
    Write-host "ValidURLsOK :" $ValidURLsOK
    Write-host "ValidURLsBOT :" $ValidURLsBOT
    Write-host "ValidURLsERROR :" $ValidURLsERROR
    Write-host "InValidURLs :" $InValidURLs
    Write-host "NoMatchLine :" $NoMatchLine
}