ICMP是layer 4协议,用户可以使用icmp来得到自己想要的东西。大名鼎鼎的ping就是利用ICMP的api写成的。
这个协议主要用来传递网络层(L3)的错误信息与控制信息。
ICMPv4
ICMPv4报文主要分为两类: 错误报文和信息报文。著名的ping(来自于iputils包)就是打开raw socket并发送一个ICMP_ECHO报文,对端收到后回应一个ICMP_ECHOREPLY报文。
icmpv4初始化
首先,在boot阶段会调用 net/ipv4/af_inet.c 中的 inet_init() 方法,此方法调用 icmp_init() 方法(位于 net/ipv4/icmp.c),然后就会调用 icmp_sk_init(),为每个CPU创建一个用于发送ICMP报文的内核socket。
在 net/ipv4/af_inet.c 中有各种协议的注册。
/*
 * inet_init() - one-time (__init) setup of the IPv4 protocol family.
 *
 * Registers the TCP, UDP and raw proto objects, registers the inet socket
 * family, adds the base L4 protocol handlers (ICMP, UDP, TCP and, when
 * CONFIG_IP_MULTICAST is set, IGMP) and registers the protosw entries used
 * by inet_create.
 *
 * NOTE: this listing is an excerpt. The remainder of the function,
 * including the error-unwind labels targeted by the gotos below and the
 * final return, is not shown here.
 */
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
/*
 * Register the proto objects one by one. A non-zero return aborts the
 * init; each failure jumps to a label named after the protos that were
 * already registered (labels not visible in this excerpt).
 */
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
/*
 * Tell SOCKET that we are alive...
 * (the return value of sock_register() is deliberately discarded)
 */
(void)sock_register(&inet_family_ops);
#ifdef CONFIG_SYSCTL
ip_static_sysctl_init();
#endif
/*
 * Add all the base protocols.
 * A failure here is only logged with pr_crit(); initialization
 * continues with the next protocol.
 */
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
/* First initialize every list head in inetsw[0..SOCK_MAX-1] ... */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
/* ... then register each of the INETSW_ARRAY_LEN entries of inetsw_array. */
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
/*
 * Set the ARP module up
 * (the code that follows is omitted from this excerpt)
 */
------->-------
接下来看看 icmp_protocol 的结构
/*
 * Protocol descriptor for ICMPv4. inet_init() passes it to
 * inet_add_protocol() together with IPPROTO_ICMP.
 */
static const struct net_protocol icmp_protocol = {
	/*
	 * Non-zero: ip_local_deliver_finish() skips xfrm4_policy_check()
	 * before invoking the handler.
	 */
	.no_policy	= 1,
	/* Presumably marks the protocol as network-namespace aware - confirm. */
	.netns_ok	= 1,
	/* Invoked as ipprot->handler(skb) for locally delivered packets. */
	.handler	= icmp_rcv,
	/* Error callback; its call site is not part of these notes. */
	.err_handler	= icmp_err,
};
handler
首先看看handler: 当接收到外部发来的包时,只要IP头部的协议字段(protocol)等于IPPROTO_ICMP(0x1),icmp_rcv就会被调用。
no_policy
当设置为1时,意味着不需要执行IPsec策略检查(policy check)。
netns_ok
设置为1时,表示该协议支持网络命名空间(network namespace)。
icmp_sk_init() 位于net/ipv4/icmp.c 中
/*
 * icmp_sk_init() - set up the ICMP state of one network namespace.
 * @net: the namespace being initialized.
 *
 * Allocates a per-CPU array of socket pointers in net->ipv4.icmp_sk,
 * creates one raw IPPROTO_ICMP control socket for every possible CPU and
 * then fills in the default values of the ICMP sysctl fields.
 *
 * Returns 0 on success and -ENOMEM when the per-CPU allocation fails.
 *
 * NOTE: this listing is an excerpt. The "fail" label used below and the
 * closing brace of the function are not shown here.
 */
static int __net_init icmp_sk_init(struct net *net)
{
int i, err;
net->ipv4.icmp_sk = alloc_percpu(struct sock *);
if (!net->ipv4.icmp_sk)
return -ENOMEM;
for_each_possible_cpu(i) {
struct sock *sk;
/* One raw ICMP socket per CPU, created inside this namespace. */
err = inet_ctl_sock_create(&sk, PF_INET,
SOCK_RAW, IPPROTO_ICMP, net);
if (err < 0)
goto fail;
*per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
/* Enough space for 2 64K ICMP packets, including
 * sk_buff/skb_shared_info struct overhead.
 */
sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
/*
 * Speedup sock_wfree()
 */
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
/* NOTE(review): presumably turns path-MTU discovery off for this socket - confirm. */
inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
}
/* Control parameters for ECHO replies. */
net->ipv4.sysctl_icmp_echo_ignore_all = 0;
net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
/* Control parameter - ignore bogus broadcast responses? */
net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
/*
 * Configurable global rate limit.
 *
 * ratelimit defines tokens/packet consumed for dst->rate_token
 * bucket ratemask defines which icmp types are ratelimited by
 * setting its bit position.
 *
 * default:
 * dest unreachable (3), source quench (4),
 * time exceeded (11), parameter problem (12)
 *
 * 0x1818 has exactly bits 3, 4, 11 and 12 set, matching the list above.
 */
net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
net->ipv4.sysctl_icmp_ratemask = 0x1818;
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
return 0;
ICMPv4 Header
主要参考下面的这幅图片就可以了:
前64 bits属于ICMP头部(header),后面的Payload包括原始包的IPv4头部和它的负载。
头部的定义在include/uapi/linux/icmp.h
/*
 * ICMPv4 header.
 *
 * Three fixed fields (type, code, checksum) are followed by a union whose
 * largest member is reserved[4]; which union member is meaningful depends
 * on the message.
 */
struct icmphdr {
/* ICMP message type; icmp_pointers[] is indexed by this value. */
__u8 type;
/* Sub-code qualifying the type. */
__u8 code;
/* Message checksum. */
__sum16 checksum;
union {
/* Echo messages: ping_rcv() reads un.echo.id and un.echo.sequence. */
struct {
__be16 id;
__be16 sequence;
} echo;
/* NOTE(review): presumably the gateway address carried by redirect messages - confirm. */
__be32 gateway;
/* NOTE(review): presumably the MTU reported by "fragmentation needed" errors - confirm. */
struct {
__be16 __unused;
__be16 mtu;
} frag;
/* Raw view of the same four bytes. */
__u8 reserved[4];
} un;
};
通过这个结构体,可以得出这样一个结论: 在网络协议中,每个字段所占的位数可以通过它的类型的大小推断出来(例如 __u8 占8 bits, __be16 占16 bits, __be32 占32 bits)。
然后ICMPv4模块定义了一个由icmp_control对象组成的数组,命名为 icmp_pointers,这个数组以ICMPv4消息的类型作为索引。
该定义位于 net/ipv4/icmp.c
/*
 * ICMP control array. This specifies what to do with each ICMP.
 *
 * Each entry pairs the handler for one ICMP type with a flag telling
 * whether that type is an error message.
 */
struct icmp_control {
/* Handler for the message type; receives the skb and returns a bool. */
bool (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};
/*
 * Declaration of the table: NR_ICMP_TYPES + 1 entries, i.e. valid indices
 * 0..NR_ICMP_TYPES. The initializer is not shown here.
 */
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
其中, error为1代表错误消息(例如ICMP_DEST_UNREACH), 为0代表信息消息(例如ICMP_ECHO)。 同一个handler有时会被赋给不止一种消息类型。
ping_rcv()用来接收一个ping的回应(ICMP_ECHOREPLY)。 位于net/ipv4/ping.c
/*
 * ping_rcv - deliver a received ICMP packet to the matching ping socket.
 * @skb: the received buffer; assumed to be validated by icmp_rcv already.
 *
 * Looks up the socket whose id matches un.echo.id of the ICMP header and,
 * if one exists, queues a clone of @skb on it.
 *
 * Returns true when a matching socket was found (even if cloning the skb
 * failed), false when no socket matched.
 */
bool ping_rcv(struct sk_buff *skb)
{
	struct icmphdr *hdr = icmp_hdr(skb);
	struct net *ns = dev_net(skb->dev);
	struct sk_buff *copy;
	struct sock *owner;

	/* We assume the packet has already been checked by icmp_rcv */
	pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
		 skb, ntohs(hdr->un.echo.id), ntohs(hdr->un.echo.sequence));

	/* Push ICMP header back */
	skb_push(skb, skb->data - (u8 *)hdr);

	owner = ping_lookup(ns, skb, ntohs(hdr->un.echo.id));
	if (!owner) {
		pr_debug("no socket, dropping\n");
		return false;
	}

	/* The socket gets its own clone; the caller keeps @skb. */
	copy = skb_clone(skb, GFP_ATOMIC);
	pr_debug("rcv on socket %p\n", owner);
	if (copy)
		ping_queue_rcv_skb(owner, copy);

	/* Presumably drops the reference obtained by ping_lookup() - confirm. */
	sock_put(owner);
	return true;
}
EXPORT_SYMBOL_GPL(ping_rcv);
在3.0以前的内核版本中,要发送ping,需要在用户空间程序中创建一个raw socket;当接收到ping的回应(ICMP_ECHOREPLY消息)时,由发送该ping的那个raw socket来处理它。为了详细理解这个过程,先看一个函数。
在 net/ipv4/ip_input.c 中。
/*
 * ip_local_deliver_finish() - hand a locally destined IPv4 packet to its
 * upper-layer protocol.
 * @net: namespace whose MIB counters are updated.
 * @sk:  not referenced in this function body.
 * @skb: the packet.
 *
 * Pulls the network header off @skb, offers the packet to raw sockets via
 * raw_local_deliver(), then calls the handler registered in inet_protos[]
 * for the IP protocol number. Always returns 0.
 */
static int ip_local_deliver_finish(struct net *net,
struct sock *sk, struct sk_buff *skb)
{
/* Strip the network header (skb_network_header_len() bytes) from the front. */
__skb_pull(skb, skb_network_header_len(skb));
/* inet_protos[] is read with rcu_dereference(), so stay in an RCU read section. */
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
const struct net_protocol *ipprot;
int raw;
/* Re-entered with a new protocol number when a handler returns a negative value. */
resubmit:
/* NOTE(review): non-zero presumably means a raw socket received the packet - confirm. */
raw = raw_local_deliver(skb, protocol);
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot) {
int ret;
/* Protocols with no_policy set (e.g. ICMP) bypass the inbound policy check. */
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
/* Policy check failed: drop the packet. */
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot->handler(skb);
if (ret < 0) {
/* A negative return asks for redelivery as protocol number -ret. */
protocol = -ret;
goto resubmit;
}
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
} else {
/* No handler is registered for this protocol number. */
if (!raw) {
/*
 * No raw socket either: if the policy check passes, count the
 * packet as unknown protocol and answer with ICMP destination
 * unreachable / protocol unreachable. The skb is freed either way.
 */
if (xfrm4_policy_check(NULL,
XFRM_POLICY_IN, skb)) {
__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
kfree_skb(skb);
} else {
/* raw_local_deliver() returned non-zero: count as delivered and release the skb. */
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
}
}
}
out:
rcu_read_unlock();
return 0;
}