一、CloudWatch服务安装
Amazon Linux 2系统安装Agent。
Bash
#!/bin/bash
# Install the Amazon CloudWatch agent on Amazon Linux 2, write its
# configuration (two log files plus CPU/disk/diskio/mem/netstat/statsd/swap
# metrics) and start the service.

# Install the agent package published in the public S3 bucket.
sudo rpm -ivh https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm

# Write the agent configuration. Plain `tee` (not `tee -a`) is used on purpose:
# appending to an existing file would leave two concatenated JSON documents,
# which the agent cannot parse. The heredoc delimiter is quoted so that the
# ${aws:...} placeholders are written literally instead of being expanded by
# the shell.
sudo tee /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json <<-'EOF'
{
  "logs": {
    "logs_collected": {
      "files": {
        "collect_list": [
          {
            "file_path": "/logArchive/hcaextension/info*.log",
            "log_group_name": "RGC-Prod-3in1oven",
            "log_stream_name": "info.logs"
          },
          {
            "file_path": "/logArchive/hcaextension/http*.log",
            "log_group_name": "RGC-Prod-3in1oven",
            "log_stream_name": "http.logs"
          }
        ]
      }
    }
  },
  "metrics": {
    "aggregation_dimensions": [
      [
        "InstanceId"
      ]
    ],
    "append_dimensions": {
      "AutoScalingGroupName": "${aws:AutoScalingGroupName}",
      "ImageId": "${aws:ImageId}",
      "InstanceId": "${aws:InstanceId}",
      "InstanceType": "${aws:InstanceType}"
    },
    "metrics_collected": {
      "cpu": {
        "measurement": [
          "cpu_usage_idle",
          "cpu_usage_iowait",
          "cpu_usage_user",
          "cpu_usage_system"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "*"
        ],
        "totalcpu": false
      },
      "disk": {
        "measurement": [
          "used_percent"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "/"
        ]
      },
      "diskio": {
        "measurement": [
          "io_time",
          "write_bytes",
          "read_bytes",
          "writes",
          "reads"
        ],
        "metrics_collection_interval": 180,
        "resources": [
          "/"
        ]
      },
      "mem": {
        "measurement": [
          "mem_used_percent"
        ],
        "metrics_collection_interval": 180
      },
      "netstat": {
        "measurement": [
          "tcp_established",
          "tcp_time_wait"
        ],
        "metrics_collection_interval": 180
      },
      "statsd": {
        "metrics_aggregation_interval": 60,
        "metrics_collection_interval": 180,
        "service_address": ":8125"
      },
      "swap": {
        "measurement": [
          "swap_used_percent"
        ],
        "metrics_collection_interval": 180
      }
    }
  }
}
EOF

# Load the configuration into the agent (-m ec2: running on EC2, -s: start the
# agent after loading), then make sure the service is restarted with the new
# configuration and enabled at boot.
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json
sudo systemctl restart amazon-cloudwatch-agent.service
sudo systemctl enable amazon-cloudwatch-agent.service
二、AWS-CLI批量下发监控
前提条件:本机安装awscli工具。
需要修改的是区域信息、ip_list、实例id、sns_arn信息。
通过脚本自动在CloudWatch上为EC2实例批量添加监控告警配置。
Python
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import json
import subprocess
# 1. Configure the AWS CLI command prefix and the target region(s).
# "AWSCLI" is the quoted path to aws.exe followed by "--output json";
# "AWSREGION" is the list of regions to process.
Contants = {
"AWSCLI": '"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe" --output json',
"AWSREGION": ['eu-central-1'] # Frankfurt (the original note said Singapore, which is ap-southeast-1)
}
# Auto-vivifying dictionary
class CreateDict(dict):
    """Dictionary that creates nested ``CreateDict`` values on first access.

    Looking up a key that is not present stores a new, empty instance of the
    same class under that key and returns it, so chained assignments such as
    ``d['a']['b'] = 1`` work without preparing the intermediate levels.
    """

    def __getitem__(self, item):
        if item not in self:
            # First access: create and store an empty child of the same type.
            self[item] = type(self)()
        return dict.__getitem__(self, item)
#########################################################################################################
# Alarm definitions
# CPUUtilization: evaluated over 3 one-minute periods; alarms when the average
# is >= 80% in all 3 of them.
def getCPUUtilizationComm(name, action, instance_id, cli=None):
    """Build the ``put-metric-alarm`` command for the AWS/EC2 CPUUtilization metric.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: EC2 instance ID used as the ``InstanceId`` dimension.
        cli: AWS CLI command prefix. Defaults to ``Contants['AWSCLI']``.

    Returns:
        The complete command line as a single-line string.
    """
    if cli is None:
        cli = Contants['AWSCLI']
    metric = 'CPUUtilization'
    print("#####开始配置 %s#####" % metric)
    template = (
        '{cli} cloudwatch put-metric-alarm '
        '--alarm-name "AWS_EC2_{name}_{metric}" '
        '--alarm-description "aws ec2 {metric}" '
        '--metric-name {metric} '
        '--namespace AWS/EC2 '
        '--statistic Average '
        '--period 60 '
        '--threshold 80 '
        '--evaluation-periods 3 '
        '--datapoints-to-alarm 3 '
        '--comparison-operator GreaterThanOrEqualToThreshold '
        '--treat-missing-data notBreaching '
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        '--unit Percent '
        '--dimensions "Name=InstanceId,Value={id}"'
    )
    # The keyword must be ``action`` to match the {action} placeholders; the
    # previous keyword name left them unfilled and raised KeyError.
    return template.format(cli=cli, name=name, action=action, id=instance_id, metric=metric)
# mem_used_percent: evaluated over 3 one-minute periods; alarms when the
# average is >= 80% in all 3 of them.
def getmem_used_percentComm(name, action, instance_id, instancetype, imageid, cli=None):
    """Build the ``put-metric-alarm`` command for the CWAgent mem_used_percent metric.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: Value of the ``InstanceId`` dimension.
        instancetype: Value of the ``InstanceType`` dimension.
        imageid: Value of the ``ImageId`` dimension.
        cli: AWS CLI command prefix. Defaults to ``Contants['AWSCLI']``.

    Returns:
        The complete command line as a single-line string.
    """
    # NOTE(review): the dimension set must match what the agent publishes; an
    # instance in an Auto Scaling group presumably also carries
    # AutoScalingGroupName - confirm in the CloudWatch console.
    if cli is None:
        cli = Contants['AWSCLI']
    metric = 'mem_used_percent'
    print("#####开始配置 %s#####" % metric)
    template = (
        '{cli} cloudwatch put-metric-alarm '
        '--alarm-name "AWS_EC2_{name}_{metric}" '
        '--alarm-description "aws ec2 {metric}" '
        '--metric-name {metric} '
        '--namespace CWAgent '
        '--statistic Average '
        '--period 60 '
        '--threshold 80 '
        '--evaluation-periods 3 '
        '--datapoints-to-alarm 3 '
        '--comparison-operator GreaterThanOrEqualToThreshold '
        '--treat-missing-data missing '
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        '--dimensions Name=InstanceId,Value={id} Name=ImageId,Value={imageid} '
        'Name=InstanceType,Value={instancetype}'
    )
    # The keyword must be ``action`` to match the {action} placeholders; the
    # previous keyword name left them unfilled and raised KeyError.
    return template.format(cli=cli, name=name, action=action, id=instance_id,
                           metric=metric, instancetype=instancetype, imageid=imageid)
# disk_used_percent: evaluated over 3 one-minute periods; alarms when the
# average is >= 80% in all 3 of them.
def getdisk_used_percentComm(name, action, instance_id, instancetype, imageid,
                             device='nvme0n1p1', fstype='ext4', path='/', cli=None):
    """Build the ``put-metric-alarm`` command for the CWAgent disk_used_percent metric.

    The ``device`` and ``fstype`` dimension values cannot be derived by this
    script; look them up on the metric in the CloudWatch console and pass them
    in when they differ from the defaults.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: Value of the ``InstanceId`` dimension.
        instancetype: Value of the ``InstanceType`` dimension.
        imageid: Value of the ``ImageId`` dimension.
        device: Value of the ``device`` dimension.
        fstype: Value of the ``fstype`` dimension.
        path: Value of the ``path`` dimension (mount point).
        cli: AWS CLI command prefix. Defaults to ``Contants['AWSCLI']``.

    Returns:
        The complete command line as a single-line string.
    """
    if cli is None:
        cli = Contants['AWSCLI']
    metric = 'disk_used_percent'
    print("#####开始配置 %s#####" % metric)
    # --dimensions is given once, with the full dimension set; the earlier
    # version also passed a second, path-only --dimensions option.
    template = (
        '{cli} cloudwatch put-metric-alarm '
        '--alarm-name "AWS_EC2_{name}_{metric}" '
        '--alarm-description "aws ec2 {metric}" '
        '--metric-name {metric} '
        '--namespace CWAgent '
        '--statistic Average '
        '--period 60 '
        '--threshold 80 '
        '--evaluation-periods 3 '
        '--datapoints-to-alarm 3 '
        '--comparison-operator GreaterThanOrEqualToThreshold '
        '--treat-missing-data missing '
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        '--dimensions Name=InstanceId,Value={id} Name=ImageId,Value={imageid} '
        'Name=InstanceType,Value={instancetype} Name=device,Value={device} '
        'Name=fstype,Value={fstype} "Name=path,Value={path}"'
    )
    # The keyword must be ``action`` to match the {action} placeholders; the
    # previous keyword name left them unfilled and raised KeyError.
    return template.format(cli=cli, name=name, action=action, id=instance_id,
                           metric=metric, instancetype=instancetype, imageid=imageid,
                           device=device, fstype=fstype, path=path)
# NetworkIn: evaluated over 3 one-minute periods; alarms when the average is
# >= 5000000 (noted as "5m" by the author) in all 3 of them.
def getNetworkInComm(name, action, instance_id, cli=None):
    """Build the ``put-metric-alarm`` command for the AWS/EC2 NetworkIn metric.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: EC2 instance ID used as the ``InstanceId`` dimension.
        cli: AWS CLI command prefix. Defaults to ``Contants['AWSCLI']``.

    Returns:
        The complete command line as a single-line string.
    """
    if cli is None:
        cli = Contants['AWSCLI']
    metric = 'NetworkIn'
    print("#####开始配置 %s#####" % metric)
    template = (
        '{cli} cloudwatch put-metric-alarm '
        '--alarm-name "AWS_EC2_{name}_{metric}" '
        '--alarm-description "aws ec2 {metric}" '
        '--metric-name {metric} '
        '--namespace AWS/EC2 '
        '--statistic Average '
        '--period 60 '
        '--threshold 5000000 '
        '--evaluation-periods 3 '
        '--datapoints-to-alarm 3 '
        '--comparison-operator GreaterThanOrEqualToThreshold '
        '--treat-missing-data notBreaching '
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        # {id}, not %s: str.format leaves a %s placeholder in the output as-is.
        '--dimensions "Name=InstanceId,Value={id}"'
    )
    # The keyword must be ``action`` to match the {action} placeholders; the
    # previous keyword name left them unfilled and raised KeyError.
    return template.format(cli=cli, name=name, action=action, id=instance_id, metric=metric)
# NetworkOut: evaluated over 3 one-minute periods; alarms when the average is
# >= 5000000 (noted as "5m" by the author) in all 3 of them.
def getNetworkOutComm(name, action, instance_id, cli=None):
    """Build the ``put-metric-alarm`` command for the AWS/EC2 NetworkOut metric.

    Args:
        name: Instance display name, embedded in the alarm name.
        action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
        instance_id: EC2 instance ID used as the ``InstanceId`` dimension.
        cli: AWS CLI command prefix. Defaults to ``Contants['AWSCLI']``.

    Returns:
        The complete command line as a single-line string.
    """
    if cli is None:
        cli = Contants['AWSCLI']
    metric = 'NetworkOut'
    print("#####开始配置 %s#####" % metric)
    template = (
        '{cli} cloudwatch put-metric-alarm '
        '--alarm-name "AWS_EC2_{name}_{metric}" '
        '--alarm-description "aws ec2 {metric}" '
        '--metric-name {metric} '
        '--namespace AWS/EC2 '
        '--statistic Average '
        '--period 60 '
        '--threshold 5000000 '
        '--evaluation-periods 3 '
        '--datapoints-to-alarm 3 '
        '--comparison-operator GreaterThanOrEqualToThreshold '
        '--treat-missing-data notBreaching '
        '--alarm-actions "{action}" '
        '--ok-actions "{action}" '
        '--dimensions "Name=InstanceId,Value={id}"'
    )
    # The keyword must be ``action`` to match the {action} placeholders; the
    # previous keyword name left them unfilled and raised KeyError.
    return template.format(cli=cli, name=name, action=action, id=instance_id, metric=metric)
# Run a shell command
def execCommand(comm):
    """Echo *comm*, run it through the shell and return its captured output.

    The exit status is printed, not returned. If anything raises, the
    exception is printed and ``None`` is returned.
    """
    try:
        print(comm)
        outcome = subprocess.getstatusoutput(comm)
        print(outcome[0])
        return outcome[1]
    except Exception as err:
        print(err)
    return None
# Collect basic information about the EC2 instances returned by
# ``describe-instances`` and keep only the requested ones.
def getAll(get_server_id_list, all_data=None):
    """Return id/imageid/instancetype/name records for the requested instances.

    Args:
        get_server_id_list: Instance IDs to keep.
        all_data: Optional, already parsed ``describe-instances`` response.
            When omitted, the CLI is invoked through ``execCommand`` using
            ``Contants['AWSCLI']``.

    Returns:
        A list of dicts with the keys ``id``, ``imageid``, ``instancetype``
        and ``name``. ``name`` is the value of the ``Name`` tag, or the
        instance ID when that tag is missing or empty.
    """
    if all_data is None:
        comm1 = "%s ec2 describe-instances" % Contants['AWSCLI']
        all_data = json.loads(execCommand(comm1))
    wanted_ids = set(get_server_id_list)
    instance_list_modify = []
    for reservation in all_data['Reservations']:
        for instance in reservation['Instances']:
            # A fresh dict per instance: a reservation can hold several
            # instances, and sharing one dict made them overwrite each other.
            data = {
                'id': instance['InstanceId'],
                'imageid': instance['ImageId'],
                'instancetype': instance['InstanceType'],
                'name': instance['InstanceId'],
            }
            # Untagged instances have no "Tags" key at all.
            for tag in instance.get('Tags', []):
                if tag['Key'] == 'Name' and tag['Value']:
                    data['name'] = tag['Value']
                    break
            print(data)
            if data['id'] in wanted_ids:
                instance_list_modify.append(data)
    print(instance_list_modify)
    return instance_list_modify
# Create the alarms
def add_alert(data, action):
    """Create the CPU alarm for every instance record in *data*.

    Each record must provide the keys ``id``, ``name``, ``imageid`` and
    ``instancetype``. *action* is the ARN passed on to the alarm command.
    """
    for record in data:
        instance_id, name = record['id'], record['name']
        imageid, instancetype = record['imageid'], record['instancetype']
        print(instance_id, name, imageid, instancetype)
        execCommand(getCPUUtilizationComm(name, action, instance_id))
        # The remaining alarms are disabled; uncomment the ones that are needed.
        # execCommand(getNetworkInComm(name, action, instance_id))
        # execCommand(getNetworkOutComm(name, action, instance_id))
        # execCommand(getmem_used_percentComm(name, action, instance_id, instancetype, imageid))
        # execCommand(getdisk_used_percentComm(name, action, instance_id, instancetype, imageid))
def get_server_info(instance_list):
    """Look up each instance ID and return the records ``add_alert`` expects.

    The CLI is invoked through ``Contants['AWSCLI']`` so that the configured
    executable path and any ``--region`` appended to it are honoured.

    Args:
        instance_list: EC2 instance IDs to describe, one CLI call per ID.

    Returns:
        A list of dicts with the keys ``id``, ``imageid``, ``instancetype``
        and ``name``. ``name`` is the value of the ``Name`` tag, or the
        instance ID when that tag is missing or empty.

    Raises:
        subprocess.CalledProcessError: The CLI exited with a non-zero status.
    """
    server_info = []
    for wanted_id in instance_list:
        command = "{0} ec2 describe-instances --instance-ids {1}".format(
            Contants['AWSCLI'], wanted_id)
        # Show the command that is about to run.
        print(command)
        # Capture stdout only, so CLI warnings on stderr cannot corrupt the JSON.
        raw = subprocess.check_output(command, shell=True, universal_newlines=True)
        described = json.loads(raw)
        for reservation in described["Reservations"]:
            for instance in reservation["Instances"]:
                if instance["InstanceId"] != wanted_id:
                    continue
                server_dict = {
                    'id': instance["InstanceId"],
                    'imageid': instance["ImageId"],
                    'instancetype': instance["InstanceType"],
                    # Fallback so later code can always read 'name'.
                    'name': instance["InstanceId"],
                }
                # Untagged instances have no "Tags" key at all.
                for tag_item in instance.get("Tags", []):
                    if tag_item["Key"] == "Name" and tag_item["Value"]:
                        server_dict['name'] = tag_item["Value"]
                        break
                print(server_dict)
                server_info.append(server_dict)
    return server_info
if __name__ == '__main__':
    # 2. Configure the SNS topic ARN that the alarms notify.
    sns_arn = "arn:aws:sns:eu-central-1:643xxxxx:xxxx-CloudWatch-Lambda-DingTalk"
    # Despite its name, ip_list holds EC2 instance IDs.
    ip_list = ["i-010bxxxx","i-00xxxxx"]
    base_cli = Contants['AWSCLI']
    for region in Contants['AWSREGION']:
        print('[Region] ', region)
        # Point the shared CLI prefix at the region being processed.
        Contants['AWSCLI'] = '{0} --region {1}'.format(base_cli, region)
        add_alert(get_server_info(ip_list), sns_arn)
三、Amazon SNS创建主题
创建sns主题关联LAMBDA 钉钉程序。
四、Lambda钉钉函数通知脚本
上传如下脚本,通过CloudWatch调试EC2设定的规则来触发告警测试。
Python
# _*_coding:utf-8_*_
# python 3.8
# Creation time: 2021/11/18
import time
import hmac
import hashlib
import base64
import urllib.parse
import json
import os
import requests
import datetime
def _pick_instance_id(dimensions):
    """Return the value to display as instance ID from the alarm's dimensions."""
    for dimension in dimensions:
        if dimension.get('name') == 'InstanceId':
            return dimension.get('value', '')
    # No InstanceId dimension: use the first value that is not a mount path.
    for dimension in dimensions:
        value = dimension.get('value', '')
        if not value.startswith('/'):
            return value
    return ''


def lambda_handler(event, context):
    """Forward a CloudWatch alarm delivered through SNS to a DingTalk robot.

    The access token and signing secret are read from the environment
    variables ``DINGTALK_TOKEN`` and ``DINGTALK_SECRET``; when those are not
    set, the values embedded below are used.

    Args:
        event: SNS event; ``event['Records'][0]['Sns']['Message']`` must hold
            the alarm notification as a JSON string.
        context: Lambda context object (unused).

    Returns:
        The body of the DingTalk response, decoded as UTF-8 text.
    """
    headers = {'Content-Type': 'application/json;charset=utf-8'}
    token = os.environ.get('DINGTALK_TOKEN', 'ca5533c8cb976c21')
    secret = os.environ.get('DINGTALK_SECRET', 'SEC8d1a31ec5e8e91')

    # Sign "<timestamp>\n<secret>" with HMAC-SHA256 keyed by the secret.
    timestamp = str(round(time.time() * 1000))
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    hmac_code = hmac.new(secret.encode('utf-8'), string_to_sign.encode('utf-8'),
                         digestmod=hashlib.sha256).digest()
    sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
    # The query parameter must be spelled "&timestamp="; the previous text had
    # been mangled into a multiplication sign, so the signature never matched.
    api_url = "https://oapi.dingtalk.com/robot/send?access_token={}&timestamp={}&sign={}".format(token, timestamp, sign)

    message = event['Records'][0]['Sns']
    subject = message.get('Subject') or ''
    sns_message = json.loads(message['Message'])

    # Render the time in UTC+8 regardless of the runtime's local time zone.
    utc_plus_8 = datetime.timezone(datetime.timedelta(hours=8))
    current_time = datetime.datetime.now(utc_plus_8).strftime('%Y-%m-%d %H:%M:%S')

    if "ALARM" in subject:
        title = '![1.png](https://xxx.oss-cn.aliyuncs.com/dingding-image/1.png)'
    elif "OK" in subject:
        title = '![2.png](https://xxx.oss-cn-shanghai.aliyuncs.com/dingding-image/2.png)'
    else:
        title = '![3.png](https://xxx.oss-cn-shanghai.aliyuncs.com/dingding-image/3.png)'

    trigger = sns_message['Trigger']
    _value = _pick_instance_id(trigger.get('Dimensions') or [])

    content = "### {title}".format(title=title) + \
        "\n> #### **时间**: " + current_time + \
        "\n> #### **状态**: " + sns_message['OldStateValue'] + " => " + sns_message['NewStateValue'] + \
        "\n> #### **告警名称**: " + sns_message['AlarmName'] + \
        "\n> #### **账户ID**: " + sns_message['AWSAccountId'] + \
        "\n> #### **AWS区域**: " + sns_message['Region'] + \
        "\n> #### **描述**: " + (sns_message.get('AlarmDescription') or '') + \
        "\n> #### **产品资源**: " + trigger['Namespace'] + \
        "\n> #### **实例ID**: " + _value + \
        "\n> #### **指标名称**: " + trigger['MetricName'] + \
        "\n> #### **报警详情**: " + sns_message['NewStateReason']

    msg = {
        "msgtype": "markdown",
        "markdown": {
            "title": title,
            "text": content
        },
        "at": {
            # Sent as a JSON boolean rather than the string "true".
            "isAtAll": True
        }
    }
    # A timeout keeps a stalled webhook from consuming the whole invocation.
    request = requests.post(url=api_url, data=json.dumps(msg), headers=headers,
                            timeout=10).content.decode("utf8")
    return request
AWS子账户权限调试工具。
https://policysim.aws.amazon.com/