-
-
Save hreeder/f1ffe1408d296ce0591d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""Print selected fields from every nginx access-log line found in INPUT_DIR.

Each file in INPUT_DIR is read line by line (files ending in ".gz" are
decompressed transparently).  Lines that match ``lineformat`` are printed as
space-separated fields in this order: ip, date/time, url, bytes sent,
referrer, user agent, status code, HTTP method.  Non-matching lines are
silently skipped.

Requires Python 3: logs are read as text so the ``str`` pattern can be applied
to them (binary reads yield ``bytes`` and make ``re`` raise ``TypeError``).
"""
import gzip
import os
import sys
import re

INPUT_DIR = "nginx-logs"

# NOTE: the pattern only accepts GET/POST requests over HTTP/1.1 with an empty
# remote user ("- -"); anything else does not match and is skipped.
lineformat = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (["](?P<refferer>(\-)|(.+))["]) (["](?P<useragent>.+)["])""", re.IGNORECASE)

# Index of the unnamed (GET|POST) group inside ``lineformat``.
_METHOD_GROUP = 6


def parse_line(line):
    """Extract the access-log fields from a single log line.

    Args:
        line: One line of log text (``str``).

    Returns:
        A tuple ``(ip, datetimestring, url, bytessent, referrer, useragent,
        status, method)`` of strings, or ``None`` when the line does not
        match ``lineformat``.  ``url`` keeps the trailing space that separates
        it from the protocol, because the pattern places that space inside
        the ``url`` group.
    """
    match = lineformat.search(line)
    if match is None:
        return None
    fields = match.groupdict()
    return (
        fields["ipaddress"],
        fields["dateandtime"],
        fields["url"],
        fields["bytessent"],
        fields["refferer"],  # spelling follows the group name in the pattern
        fields["useragent"],
        fields["statuscode"],
        match.group(_METHOD_GROUP),
    )


def open_log(path):
    """Open a plain or gzip-compressed log file for reading as text.

    Undecodable bytes are replaced rather than raising, since access logs can
    contain arbitrary client-supplied bytes.
    """
    if path.endswith(".gz"):
        # gzip.open defaults to binary mode; "rt" yields str lines.
        return gzip.open(path, "rt", encoding="utf-8", errors="replace")
    return open(path, encoding="utf-8", errors="replace")


def main(input_dir=INPUT_DIR):
    """Parse every regular file in *input_dir* and print the matched fields."""
    for name in os.listdir(input_dir):
        path = os.path.join(input_dir, name)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as logs; skip them.
            continue
        # The context manager closes the file even if parsing raises.
        with open_log(path) as logfile:
            # Iterate lazily instead of loading the whole log into memory.
            for line in logfile:
                record = parse_line(line)
                if record is not None:
                    print(*record)


if __name__ == "__main__":
    main()
Thanks!
I had to work with the date, hence
https://gist.github.com/ninadpchaudhari/12017b75a6a205b12b360b1aa75d08b3 << This is a fork in which I modify this script to normalize the datetime string and parse it into Python objects.
Since HTTP 2.0 is getting more popular, you can fix it with something like this:
lineformat = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/[1-2]\.[0-9]")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""", re.IGNORECASE)
This adds support for extracting the HTTP method, and thus for HEAD requests too. The second dash in the log line is a placeholder that appears when remote_user is empty.
nginx log format
log_format vhosts '$host $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"';
python:
lineformat = re.compile( r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - (?P<remoteuser>.+) \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(?P<method>.+) )(?P<url>.+)(http\/[1-2]\.[0-9]")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (["](?P<refferer>(\-)|(.+))["]) (["](?P<useragent>.+)["])""", re.IGNORECASE)
Need help... Using Python 3... Getting "cannot use a string pattern for a bytes object" from the re.search() function.
You need to import the "re" package first and then use re.search
import re
data = re.search(lineformat, l)
You need to import the "re" package first and then use re.search
import re
data = re.search(lineformat, l)
I deleted my question... Was able to get that out of the way... There was an html file in myfolder which caused it... Currently, my output display is "None"
Can you assist me ?? Want to add upstream_response_time too..
My current format is ((( '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';)))
how about this one?
lineformat = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""", re.IGNORECASE)
Much better. This one doesn't mess up the user agent and referrer.
Need help... Using Python 3... Getting "cannot use a string pattern for a bytes object" from the re.search() function.
You need to import the "re" package first and then use re.search
import re
data = re.search(lineformat, l)
use l.decode() for python3
data = re.search(lineformat, l.decode())
lineformat = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST|HEAD|PUT|DELETE) )(?P<url>.+)(http\/(1\.1|2\.0)")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""", re.IGNORECASE)
Building on the version from @mohit6522, I have also added:
- more HTTP Verbs
- HTTP 2.0
It should increase match rate.
lineformat = re.compile(
r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST|HEAD|PUT|DELETE) )(?P<url>.+)(http\/(1\.1|2\.0)")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""",
re.IGNORECASE,
)
Added ipv6 matching.
check this one!
log_format = r'(?P<remote_addr>\d+.\d+.\d+.\d+)\s+\S+\s+\S+\s+[(?P[^\]]+)]\s+"(?P[^"]+)"\s+(?P\d+)\s+(?P<bytes_sent>\d+)\s+"(?P[^"]+)+"\s+"(?P<user_agent>(?!http)[^"]*)"'
check my project on github https://github.com/ksn-developer/logbrain.git
how about this one?