Last active
August 29, 2015 14:01
-
-
Save markharwood/e6c137c7673186217b74 to your computer and use it in GitHub Desktop.
Scripts for analyzing Elasticsearch log files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"title": "eslogs", | |
"services": { | |
"query": { | |
"list": { | |
"0": { | |
"query": "", | |
"alias": "", | |
"color": "#7EB26D", | |
"id": 0, | |
"pin": false, | |
"type": "lucene", | |
"enable": true | |
} | |
}, | |
"ids": [ | |
0 | |
] | |
}, | |
"filter": { | |
"list": { | |
"0": { | |
"type": "terms", | |
"field": "tags", | |
"value": "Timeout", | |
"mandate": "mustNot", | |
"active": false, | |
"alias": "", | |
"id": 0 | |
}, | |
"1": { | |
"type": "time", | |
"from": "2014-05-30T00:10:38.000Z", | |
"to": "2014-05-30T18:58:40.000Z", | |
"field": "date", | |
"mandate": "must", | |
"active": true, | |
"alias": "", | |
"id": 1 | |
}, | |
"2": { | |
"type": "time", | |
"from": "2014-05-30T18:22:00.335Z", | |
"to": "2014-05-30T18:58:40.000Z", | |
"field": "date", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 2 | |
}, | |
"3": { | |
"type": "time", | |
"from": "2014-05-30T18:42:15.649Z", | |
"to": "2014-05-30T18:50:19.576Z", | |
"field": "date", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 3 | |
}, | |
"4": { | |
"type": "terms", | |
"field": "node", | |
"value": "1a", | |
"mandate": "must", | |
"active": true, | |
"alias": "", | |
"id": 4 | |
}, | |
"5": { | |
"type": "terms", | |
"field": "tags", | |
"value": "ClusterMembership", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 5 | |
}, | |
"6": { | |
"type": "terms", | |
"field": "tags", | |
"value": "ClusterMembership", | |
"mandate": "must", | |
"active": true, | |
"alias": "", | |
"id": 6 | |
}, | |
"7": { | |
"type": "terms", | |
"field": "source", | |
"value": "action.admin.indices.stats", | |
"mandate": "mustNot", | |
"active": false, | |
"alias": "", | |
"id": 7 | |
}, | |
"9": { | |
"type": "terms", | |
"field": "disconnectedNode", | |
"value": "listening-elastic-prod3-1e-data2-1", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 9 | |
}, | |
"13": { | |
"type": "terms", | |
"field": "tags", | |
"value": "NodeNotConnected", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 13 | |
}, | |
"15": { | |
"type": "terms", | |
"field": "node", | |
"value": "1e", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 15 | |
}, | |
"17": { | |
"type": "terms", | |
"field": "tags", | |
"value": "MasterChange", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 17 | |
}, | |
"18": { | |
"type": "time", | |
"from": "2014-05-30T18:28:10.756Z", | |
"to": "2014-05-30T18:58:40.000Z", | |
"field": "date", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 18 | |
}, | |
"19": { | |
"type": "time", | |
"from": "2014-05-30T18:44:09.993Z", | |
"to": "2014-05-30T18:51:09.381Z", | |
"field": "date", | |
"mandate": "must", | |
"active": false, | |
"alias": "", | |
"id": 19 | |
} | |
}, | |
"ids": [ | |
0, | |
1, | |
2, | |
3, | |
4, | |
5, | |
6, | |
7, | |
8, | |
9, | |
14 | |
] | |
} | |
}, | |
"rows": [ | |
{ | |
"title": "Level", | |
"height": "150px", | |
"editable": true, | |
"collapse": false, | |
"collapsable": true, | |
"panels": [ | |
{ | |
"span": 4, | |
"editable": true, | |
"type": "histogram", | |
"loadingEditor": false, | |
"mode": "count", | |
"time_field": "date", | |
"value_field": null, | |
"x-axis": true, | |
"y-axis": true, | |
"scale": 1, | |
"y_format": "none", | |
"grid": { | |
"max": null, | |
"min": 0 | |
}, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"annotate": { | |
"enable": true, | |
"query": "tags:OOM OR tags:EngineFailure OR tags:MasterConnectivity OR tags:MasterChange OR tags:ClusterMembership", | |
"size": 50000, | |
"field": "tags", | |
"sort": [ | |
"date", | |
"desc" | |
] | |
}, | |
"auto_int": false, | |
"resolution": 100, | |
"interval": "1m", | |
"intervals": [ | |
"auto", | |
"1s", | |
"1m", | |
"5m", | |
"10m", | |
"30m", | |
"1h", | |
"3h", | |
"12h", | |
"1d", | |
"1w", | |
"1y" | |
], | |
"lines": false, | |
"fill": 0, | |
"linewidth": 3, | |
"points": false, | |
"pointradius": 5, | |
"bars": true, | |
"stack": true, | |
"spyable": true, | |
"zoomlinks": true, | |
"options": true, | |
"legend": true, | |
"show_query": true, | |
"interactive": true, | |
"legend_counts": true, | |
"timezone": "utc", | |
"percentage": false, | |
"zerofill": true, | |
"derivative": false, | |
"tooltip": { | |
"value_type": "cumulative", | |
"query_as_alias": true | |
} | |
}, | |
{ | |
"error": false, | |
"span": 2, | |
"editable": true, | |
"type": "terms", | |
"loadingEditor": false, | |
"field": "tags", | |
"exclude": [], | |
"missing": false, | |
"other": false, | |
"size": 12, | |
"order": "count", | |
"style": { | |
"font-size": "10pt" | |
}, | |
"donut": false, | |
"tilt": false, | |
"labels": true, | |
"arrangement": "horizontal", | |
"chart": "bar", | |
"counter_pos": "above", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"tmode": "terms", | |
"tstat": "total", | |
"valuefield": "", | |
"title": "Key events" | |
}, | |
{ | |
"error": false, | |
"span": 2, | |
"editable": true, | |
"type": "terms", | |
"loadingEditor": false, | |
"field": "level", | |
"exclude": [], | |
"missing": false, | |
"other": false, | |
"size": 10, | |
"order": "count", | |
"style": { | |
"font-size": "10pt" | |
}, | |
"donut": false, | |
"tilt": false, | |
"labels": true, | |
"arrangement": "horizontal", | |
"chart": "bar", | |
"counter_pos": "above", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"tmode": "terms", | |
"tstat": "total", | |
"valuefield": "", | |
"title": "Level" | |
}, | |
{ | |
"error": false, | |
"span": 4, | |
"editable": true, | |
"type": "terms", | |
"loadingEditor": false, | |
"field": "source", | |
"exclude": [], | |
"missing": false, | |
"other": false, | |
"size": 22, | |
"order": "count", | |
"style": { | |
"font-size": "10pt" | |
}, | |
"donut": false, | |
"tilt": false, | |
"labels": true, | |
"arrangement": "horizontal", | |
"chart": "bar", | |
"counter_pos": "above", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"tmode": "terms", | |
"tstat": "total", | |
"valuefield": "", | |
"title": "Source" | |
} | |
], | |
"notice": false | |
}, | |
{ | |
"title": "", | |
"height": "250px", | |
"editable": true, | |
"collapse": false, | |
"collapsable": true, | |
"panels": [ | |
{ | |
"error": false, | |
"span": 12, | |
"editable": true, | |
"type": "table", | |
"loadingEditor": false, | |
"size": 1000, | |
"pages": 5, | |
"offset": 0, | |
"sort": [ | |
"date", | |
"asc" | |
], | |
"overflow": "height", | |
"fields": [ | |
"date", | |
"msg", | |
"tags", | |
"source", | |
"level", | |
"lateResponseTimeMillis" | |
], | |
"highlight": [], | |
"sortable": true, | |
"header": true, | |
"paging": true, | |
"field_list": false, | |
"all_fields": false, | |
"trimFactor": 300, | |
"localTime": false, | |
"timeField": "@timestamp", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"style": { | |
"font-size": "9pt" | |
}, | |
"normTimes": true | |
} | |
], | |
"notice": false | |
}, | |
{ | |
"title": "", | |
"height": "150px", | |
"editable": true, | |
"collapse": false, | |
"collapsable": true, | |
"panels": [ | |
{ | |
"error": false, | |
"span": 4, | |
"editable": true, | |
"type": "terms", | |
"loadingEditor": false, | |
"field": "node", | |
"exclude": [], | |
"missing": false, | |
"other": false, | |
"size": 15, | |
"order": "term", | |
"style": { | |
"font-size": "10pt" | |
}, | |
"donut": false, | |
"tilt": false, | |
"labels": true, | |
"arrangement": "horizontal", | |
"chart": "bar", | |
"counter_pos": "none", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"tmode": "terms", | |
"tstat": "total", | |
"valuefield": "", | |
"title": "Reporting Node" | |
}, | |
{ | |
"error": false, | |
"span": 4, | |
"editable": true, | |
"type": "terms", | |
"loadingEditor": false, | |
"field": "timedoutNode", | |
"exclude": [], | |
"missing": false, | |
"other": false, | |
"size": 10, | |
"order": "count", | |
"style": { | |
"font-size": "10pt" | |
}, | |
"donut": false, | |
"tilt": false, | |
"labels": true, | |
"arrangement": "horizontal", | |
"chart": "bar", | |
"counter_pos": "above", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"tmode": "terms", | |
"tstat": "total", | |
"valuefield": "", | |
"title": "Nodes timing out on responses" | |
}, | |
{ | |
"error": false, | |
"span": 4, | |
"editable": true, | |
"type": "terms", | |
"loadingEditor": false, | |
"field": "disconnectedNode", | |
"exclude": [], | |
"missing": false, | |
"other": false, | |
"size": 10, | |
"order": "count", | |
"style": { | |
"font-size": "10pt" | |
}, | |
"donut": false, | |
"tilt": false, | |
"labels": true, | |
"arrangement": "horizontal", | |
"chart": "bar", | |
"counter_pos": "above", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"tmode": "terms", | |
"tstat": "total", | |
"valuefield": "", | |
"title": "Nodes reported disconnected" | |
} | |
], | |
"notice": false | |
}, | |
{ | |
"title": "", | |
"height": "150px", | |
"editable": true, | |
"collapse": false, | |
"collapsable": true, | |
"panels": [ | |
{ | |
"error": false, | |
"span": 4, | |
"editable": true, | |
"type": "terms", | |
"loadingEditor": false, | |
"field": "membershipChangedNode", | |
"exclude": [], | |
"missing": false, | |
"other": false, | |
"size": 10, | |
"order": "count", | |
"style": { | |
"font-size": "10pt" | |
}, | |
"donut": false, | |
"tilt": false, | |
"labels": true, | |
"arrangement": "horizontal", | |
"chart": "bar", | |
"counter_pos": "above", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"tmode": "terms", | |
"tstat": "total", | |
"valuefield": "", | |
"title": "Nodes changing cluster membership" | |
}, | |
{ | |
"error": false, | |
"span": 4, | |
"editable": true, | |
"type": "terms", | |
"loadingEditor": false, | |
"field": "lateRespondingNode", | |
"exclude": [], | |
"missing": false, | |
"other": false, | |
"size": 10, | |
"order": "count", | |
"style": { | |
"font-size": "10pt" | |
}, | |
"donut": false, | |
"tilt": false, | |
"labels": true, | |
"arrangement": "horizontal", | |
"chart": "bar", | |
"counter_pos": "above", | |
"spyable": true, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"tmode": "terms", | |
"tstat": "total", | |
"valuefield": "", | |
"title": "Late responding nodes" | |
}, | |
{ | |
"error": false, | |
"span": 4, | |
"editable": true, | |
"type": "stats", | |
"loadingEditor": false, | |
"queries": { | |
"mode": "all", | |
"ids": [ | |
0 | |
] | |
}, | |
"style": { | |
"font-size": "24pt" | |
}, | |
"format": "number", | |
"mode": "max", | |
"display_breakdown": "no", | |
"sort_field": "label", | |
"sort_reverse": true, | |
"label_name": "lateRespondingNode", | |
"value_name": "lateResponseTimeMillis", | |
"spyable": true, | |
"field": "lateResponseTimeMillis", | |
"title": "Max overdue response time ms" | |
} | |
], | |
"notice": false | |
} | |
], | |
"editable": true, | |
"failover": false, | |
"index": { | |
"interval": "none", | |
"pattern": "[logstash-]YYYY.MM.DD", | |
"default": "eslogs", | |
"warm_fields": true | |
}, | |
"style": "dark", | |
"panel_hints": true, | |
"pulldowns": [ | |
{ | |
"type": "query", | |
"collapse": false, | |
"notice": false, | |
"enable": true, | |
"query": "*", | |
"pinned": true, | |
"history": [ | |
"", | |
"2878416", | |
"\"Received response for a request\"", | |
"2702083", | |
"ping*", | |
"time*", | |
"time", | |
"timed", | |
"timeout", | |
"\"listening-elastic-prod3-1e-data2-1\"" | |
], | |
"remember": 10 | |
}, | |
{ | |
"type": "filtering", | |
"collapse": false, | |
"notice": true, | |
"enable": true | |
} | |
], | |
"nav": [ | |
{ | |
"type": "timepicker", | |
"collapse": false, | |
"notice": false, | |
"enable": true, | |
"status": "Stable", | |
"time_options": [ | |
"5m", | |
"15m", | |
"1h", | |
"6h", | |
"12h", | |
"24h", | |
"2d", | |
"7d", | |
"30d" | |
], | |
"refresh_intervals": [ | |
"5s", | |
"10s", | |
"30s", | |
"1m", | |
"5m", | |
"15m", | |
"30m", | |
"1h", | |
"2h", | |
"1d" | |
], | |
"timefield": "@timestamp", | |
"now": false | |
} | |
], | |
"loader": { | |
"save_gist": false, | |
"save_elasticsearch": true, | |
"save_local": true, | |
"save_default": true, | |
"save_temp": true, | |
"save_temp_ttl_enable": true, | |
"save_temp_ttl": "30d", | |
"load_gist": false, | |
"load_elasticsearch": true, | |
"load_elasticsearch_size": 20, | |
"load_local": false, | |
"hide": false | |
}, | |
"refresh": false | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ================ Enrich log files and convert to Json | |
echo "Converting log files to JSON with enriched tags and node names" | |
python parseLog.py "node1.log.2014-05-12" "node1.json" "node1" | |
python parseLog.py "node2.log.2014-05-12" "node2.json" "node2" | |
#... | |
# ================ Create a fresh index | |
echo "Indexing..." | |
es_host="http://localhost:9200" | |
curl -X DELETE "$es_host/eslogs" | |
curl -XPOST "$es_host/eslogs" -d ' | |
{ | |
"settings" : { | |
"number_of_shards" : 1, | |
"number_of_replicas" : 0 | |
}, | |
"mappings":{ | |
"eslog": { | |
"properties": { | |
"logfile": { | |
"type": "string", "index":"not_analyzed" | |
}, | |
"level": { | |
"type": "string", "index":"not_analyzed" | |
}, | |
"tags": { | |
"type": "string", "index":"not_analyzed" | |
}, | |
"source": { | |
"type": "string", "index":"not_analyzed" | |
}, | |
"msg": { | |
"type": "string", "index":"analyzed" | |
}, | |
"disconnectedNode": { | |
"type": "string", "index":"not_analyzed" | |
}, | |
"membershipChangedNode": { | |
"type": "string", "index":"not_analyzed" | |
}, | |
"lateResponseTimeMillis": { | |
"type": "long" | |
}, | |
"lateRespondingNode": { | |
"type": "string", "index":"not_analyzed" | |
}, | |
"timedoutNode": { | |
"type": "string", "index":"not_analyzed" | |
}, | |
"date" : { | |
"type" : "date", | |
"format" : "yyyy-MM-dd HH:mm:ss,SSS" | |
} | |
} | |
} | |
} | |
} | |
' | |
curl -XPOST "$es_host/eslogs/_refresh" | |
stream2es stdin --target $es_host/eslogs/eslog < node1.json | |
stream2es stdin --target $es_host/eslogs/eslog < node2.json | |
#... | |
curl -XPOST "$es_host/eslogs/_refresh" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import StringIO | |
import json | |
import argparse | |
parser = argparse.ArgumentParser() | |
parser.add_argument("inputLogFileName", help="inputLogFileName") | |
parser.add_argument("outputJsonFileName", help="Name of the output json file for use with Stream2ES") | |
parser.add_argument("node", help="The name or number of the node that produced the log file") | |
args = parser.parse_args() | |
print "Processing ",args.inputLogFileName | |
rows=open(args.inputLogFileName, 'rb') | |
outfile=open(args.outputJsonFileName, 'w') | |
lc=0 | |
doc={} | |
msg="" | |
date="" | |
level="" | |
tags=[] | |
source="" | |
#node="" | |
timedoutTargetNodePattern=re.compile(r'.*ReceiveTimeoutTransportException: \[([^\]]*)\].*', re.MULTILINE | re.DOTALL ) | |
disconnectedTargetNodePattern=re.compile(r'.*NodeDisconnectedException: \[([^\]]*)\].*', re.MULTILINE | re.DOTALL ) | |
notConnectedTargetNodePattern=re.compile(r'.*NodeNotConnectedException: \[([^\]]*)\].*', re.MULTILINE | re.DOTALL ) | |
clusterAddedNodePattern=re.compile(r'.*added {\[([^\]]*)\].*', re.MULTILINE | re.DOTALL ) | |
clusterRemovedNodePattern=re.compile(r'.*removed {\[([^\]]*)\].*', re.MULTILINE | re.DOTALL ) | |
lateRespondingNodePattern=re.compile(r'.*Received response for a request that has timed out, sent \[([0-9]*)ms\] ago.*node \[\[([^\]]*)\].*', re.MULTILINE | re.DOTALL ) | |
for row in rows: | |
if row[0]=='[': | |
if lc>1: | |
doc={} | |
doc["date"]=date | |
doc["level"]=level | |
doc["source"]=source | |
doc["node"]=args.node | |
doc["msg"]=msg | |
if "timed out" in msg: | |
tags.append("Timeout") | |
m=timedoutTargetNodePattern.match(msg) | |
if m: | |
doc["timedoutNode"]=m.group(1) | |
else: | |
m=lateRespondingNodePattern.match(msg) | |
if m: | |
doc["lateResponseTimeMillis"]=int(m.group(1)) | |
doc["lateRespondingNode"]=m.group(2) | |
if "should not happen" in msg: | |
tags.append("ShouldNotHappen") | |
if "Failed to send release search context" in msg: | |
tags.append("ReleaseFailure") | |
if "OutOfMemoryError" in msg: | |
tags.append("OOM") | |
if "Maximum number of bytes allocated exceeded" in msg: | |
tags.append("CircuitBreaker") | |
if "failed engine" in msg: | |
tags.append("EngineFailure") | |
if "failed to merge" in msg: | |
tags.append("MergeFailure") | |
if "failed to flush" in msg: | |
tags.append("FlushFailure") | |
if "SearchParseException" in msg: | |
tags.append("InvalidQuery") | |
if "Recovery failed" in msg: | |
tags.append("RecoveryFailure") | |
if "NodeNotConnectedException" in msg: | |
tags.append("NodeNotConnected") | |
m=notConnectedTargetNodePattern.match(msg) | |
if m: | |
doc["disconnectedNode"]=m.group(1) | |
if "NodeDisconnectedException" in msg: | |
tags.append("NodeNotConnected") | |
m=disconnectedTargetNodePattern.match(msg) | |
if m: | |
doc["disconnectedNode"]=m.group(1) | |
if source.startswith("node"): | |
tags.append("NodeLifeCycle") | |
if "cluster.service" in source: | |
if "master {new" in msg: | |
tags.append("MasterChange") | |
if "discovery.zen" in source: | |
# pretty much any zen discovery message looks interesting | |
# if "master_left" in msg: | |
if not "received cluster state" in msg: | |
tags.append("MasterChange") | |
if "cluster.service" in source: | |
if "added" in msg: | |
tags.append("ClusterMembership") | |
m=clusterAddedNodePattern.match(msg) | |
if m: | |
doc["membershipChangedNode"]=m.group(1) | |
if "cluster.service" in source: | |
if "removed" in msg: | |
tags.append("ClusterMembership") | |
m=clusterRemovedNodePattern.match(msg) | |
if m: | |
doc["membershipChangedNode"]=m.group(1) | |
if "index.shard.service" in source: | |
if "suspect illegal state" in msg: | |
tags.append("SuspectChange") | |
if "MasterFaultDetection" in msg: | |
tags.append("MasterConnectivity") | |
doc["tags"]=tags | |
json.dump(doc, outfile) | |
outfile.write(os.linesep) | |
tags=[] | |
lc=lc+1 | |
pos=0 | |
pos2=row.index("]",pos) | |
date=row[pos+1:pos2] | |
pos=row.index("[",pos2) | |
pos2=row.index("]",pos) | |
level=row[pos+1:pos2] | |
#Not all log entries have source and node e.g. [2014-05-12 16:44:54,918][WARN ][netty.channel.socket.nio.AbstractNioSelector] Unexpected exception in the selector loop. | |
pos=row.index("[",pos2) | |
pos2=row.index("]",pos) | |
source=row[pos+1:pos2].strip() | |
msg=row[pos2+1:] | |
else: | |
msg=msg+"\n"+row | |
rows.close | |
print lc, "rows processed" | |
outfile.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"aggs" : { | |
"nodes":{ | |
"terms":{"field":"node", "size":20}, | |
aggs:{ | |
"UncommonlyCommonErrors":{"significant_terms":{"field":"tags"}, | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment