1
|
<?php
|
2
|
/*
|
3
|
* parser_ipv6.inc
|
4
|
*
|
5
|
* Copyright (c) 2017-2019 Anders Lind (anders.lind@gmail.com)
|
6
|
* All rights reserved.
|
7
|
*
|
8
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
9
|
* you may not use this file except in compliance with the License.
|
10
|
* You may obtain a copy of the License at
|
11
|
*
|
12
|
* http://www.apache.org/licenses/LICENSE-2.0
|
13
|
*
|
14
|
* Unless required by applicable law or agreed to in writing, software
|
15
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
17
|
* See the License for the specific language governing permissions and
|
18
|
* limitations under the License.
|
19
|
*/
|
20
|
|
21
|
/*
|
22
|
* List of methods used:
|
23
|
* (?x) Free-spacing mode.
|
24
|
* Ability to use comments, place white space characters without impact,
|
25
|
* both newlines and white spaces are simply ignored unless escaped
|
26
|
* intentionally.
|
27
|
* Use "\ " to indicate a space e.g. like: Hey\ there.
|
28
|
* You might want to take a look:
|
29
|
* https://www.regular-expressions.info/freespacing.html
|
30
|
*
|
31
|
* # Comment under Free-spacing mode.
|
32
|
* If free-spacing mode is not on one can use (?#Some comment)
|
33
|
*
|
34
|
* (?(DEFINE) Subpattern.
|
35
|
* Defines a subpattern that we intend to use
|
36
|
*
|
37
|
* (?'hexncolon' A named group.
|
38
|
* Can be used to define the group name of a subpattern
|
39
|
* or simply to give a matching group a name that is more 'logic'
|
40
|
* to use than a numbered group that might even change if the
|
41
|
* regular expression is changed.
|
42
|
*
|
43
|
* (?&hextet) Reference to use.
|
44
|
* Use/reference to the named group, which might be a subpattern.
|
45
|
*
|
46
|
* (?= Positive lookahead.
|
47
|
* Used when we want to make sure something is in the horizon before
|
48
|
* we start to match!
|
49
|
*
|
50
|
* (?! Negative lookahead.
|
51
|
* Used when we want to make sure something is NOT in the horizon before
|
52
|
* we start to match!
|
53
|
*
|
54
|
* () Capturing group.
|
55
|
* Normally identified by a number that corresponds to when it shows up
|
56
|
* in the regular expression.
|
57
|
*
|
58
|
* (?: Non-capturing group.
|
59
|
* Identifies a Non-capturing group that is useful if you e.g. need to
|
60
|
* repeat a match e.g. of a compound expression ab\d, but without capturing
|
61
|
* it: (?:ab\d)
|
62
|
*
|
63
|
* (?> Atomic (non-capturing) group.
|
64
|
* When it has a match it throws away all backtracking info it might have
|
65
|
* meaning it won't try alternations if there e.g. is a |.
|
66
|
*
|
67
|
* \G We use \G once to alternate away from acceptable characters and instead
|
68
|
* match from the point where the last match ended. In our case below it is
|
69
|
* used to match at the start of the first line so we do not miss a match.
|
70
|
*
|
71
|
* For now everything runs stable.
|
72
|
*
|
73
|
* If we want, we could make the following changes / investigations in the future:
|
74
|
* * At 1. in expression (?>(?&nohexncolonndot)+|\G) experiment with:
|
75
|
* \G vs |^|\s+ vs |^
|
76
|
* , if we want to optimize on speed/results.
|
77
|
* Expression handles cases at start and following matches.
|
78
|
*
|
79
|
* * Make 2 versions:
|
80
|
* One for IPv6 only and another that resemble what we have today (IPv6+IPv4).
|
81
|
* In that way we would have two regexes that can be chosen from.
|
82
|
* That would include when to use hexncolonndot (ipv6+ipv4) vs hexncolon (ipv6)
|
83
|
*
|
84
|
* * Experiment to move check_noclosingsinglecolon to the start right inside of
|
85
|
* (?'MATCH'
|
86
|
* , to see if we receive a speed improvement (that is stable of course).
|
87
|
* My hunch is that it won't work stably and will likely require more steps in
|
88
|
* general; it also seems less useful for the +IPv4 cases, so maybe experiment as
|
89
|
* well right after: (?'IPV6'
|
90
|
*
|
91
|
* Main capturing groups:
|
92
|
* MATCH=We have a match
|
93
|
*
|
94
|
* Explanation to naming of the main groups below this section:
|
95
|
* C=double colon (::)
|
96
|
* L=Left
|
97
|
* M=Middle
|
98
|
* R=Right
|
99
|
* U=Unspecified address (:: alone)
|
100
|
* FULL=Full address not compressed with C/double colon
|
101
|
* 6=IPv6
|
102
|
* 4=IPv4
|
103
|
* , gives:
|
104
|
*
|
105
|
* IPV64
|
106
|
* 2. FULL64
|
107
|
* 3. CMR64
|
108
|
* 4. CLU64
|
109
|
*
|
110
|
* IPV6
|
111
|
* 5. FULL6
|
112
|
* 6. CMR6
|
113
|
* 7. CLU6
|
114
|
*/
|
115
|
|
116
|
/**
 * PCRE pattern (no delimiters, no modifiers) that locates IPv6 addresses,
 * with or without an embedded dotted-quad IPv4 tail, inside free text.
 *
 * The pattern starts with (?x), so whitespace and #-comments inside it are
 * ignored by PCRE; "(?x)" itself must stay the very first characters.
 * The hex classes only list lowercase a-f, so the pattern has to be applied
 * with the "i" modifier to accept uppercase digits. The wrapper in this file
 * encloses it in "/" delimiters, therefore no comment inside the pattern may
 * contain a slash.
 *
 * Named groups: MATCH (the address itself), IPV64 with FULL64, CMR64 and
 * CLU64, and IPV6 with FULL6, CMR6 and CLU6.
 */
const ipv6_regex = <<<'IPV6'
(?x)
# Definitions:
(?(DEFINE)(?'hex'[\da-f]))
(?(DEFINE)(?'hexncolon'[\da-f:]))
(?(DEFINE)(?'hexncolonndot'[\da-f:\.]))
(?(DEFINE)(?'hextet'(?&hex){1,4}))
# Decimal 0-255 without leading zeros: 250-255, 200-249, 100-199, 0-99.
# The longest alternatives come first so a three digit octet is never cut short.
(?(DEFINE)(?'octet'25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9]))
(?(DEFINE)(?'ipv4'(?>(?&octet)\.){3}(?&octet)))
(?(DEFINE)(?'unspecifiedaddr'::))
(?(DEFINE)(?'nohexncolon'[^\da-f:]))
(?(DEFINE)(?'nohexncolonndot'[^\da-f:\.]))
(?(DEFINE)(?'check_withatleastonedoublecolon'
    (?=.*(?&unspecifiedaddr))
))
(?(DEFINE)(?'check_withmostonedoublecolon'
    (?!(?&hexncolon)+(?&unspecifiedaddr)(?&hexncolon)+(?&unspecifiedaddr))
))
# 1.
(?(DEFINE)(?'check_noclosingsinglecolon'(?!.*\b:(?>(?&nohexncolon)|$))))
# Start matching:
(?>(?&nohexncolonndot)+|\G)
(?&check_withmostonedoublecolon)
(?'MATCH'
    (?'IPV64'
        (?'FULL64'
            # 2.
            (?>(?&hextet):){6}(?&ipv4)
        )
        |
        (?'CMR64'
            # 3.
            (?&check_withatleastonedoublecolon)(?>(?&hextet)(?>:{1,2})){1,5}(?&ipv4)
        )
        |
        (?'CLU64'
            # 4.
            (?&unspecifiedaddr)(?>(?&hextet):){0,5}(?&ipv4)
        )
    )
    (?!(?&hexncolonndot))
    |
    (?'IPV6'
        (?'FULL6'
            # 5.
            (?>(?&hextet)(?>:)){7}(?&hextet)
        )
        |
        (?'CMR6'
            # 6.
            (?&check_withatleastonedoublecolon)(?&check_noclosingsinglecolon)
            (?&hextet):{1,2}(?>(?&hextet)(?>:{1,2}|\b)){0,6}
        )
        |
        (?'CLU6'
            # 7.
            (?&check_noclosingsinglecolon)
            (?&unspecifiedaddr)(?>(?&hextet)(?>:|\b)){0,7}
        )
    )
    (?!(?&hexncolonndot))
)
IPV6;
|
179
|
|
180
|
/*
|
181
|
* Enumerated comments/documentation
|
182
|
*
|
183
|
* 1.
|
184
|
* check_noclosingsinglecolon checks with negative lookahead what we 'anti' match
|
185
|
* (remember we do not capture with negative lookahead).
|
186
|
* check_noclosingsinglecolon defines inside (the inner check) the opposite of
|
187
|
* what we match. Therefore 'anti' match.
|
188
|
* The inner check does the following. It is a match
|
189
|
* 1) that contains something after the (unwanted) :
|
190
|
* or
|
191
|
* 2) that is empty after the : (meaning the end of 'line'/input).
|
192
|
* Inner check: Everything (except newlines), followed by word boundary,
|
193
|
* colon, one character of everything BUT NOT (digit, a-f, colon)
|
194
|
* It means that AFTER the check what we MATCH
|
195
|
* will be the opposite of .*\b:[some character NOT in \da-f:]
|
196
|
* meaning something that does not have the same features.
|
197
|
* The reason why we have the character class in the end is that it refers to
|
198
|
* our characters that we use as building blocks in an ipv6 address and anything
|
199
|
* else can be considered as separators between addresses. So if these building
|
200
|
* block characters indeed show up the colon would not be the end of the address
|
201
|
* and not be a 'closing colon'.
|
202
|
* If a second colon shows up right after - well then it is potentially a
|
203
|
* shortening of an address meaning something else than a single colon.
|
204
|
* If in fact there is a 'separator' in the inner check then the check/result is
|
205
|
* accepted, but flipped around because of the negative lookahead! Meaning we
|
206
|
* do not match the result afterwards!
|
207
|
*
|
208
|
* 2.
|
209
|
* this must be the first of the 3 expressions - if this came e.g. after the next group
|
210
|
* it would not match in a line with multiple addresses!
|
211
|
* matches like: 1111:2222:3333:4444:5555:6666:222.111.233.231
|
212
|
*
|
213
|
* 3.
|
214
|
* matches like: beef:beef::beef:beef:123.123.123.255
|
215
|
*
|
216
|
* 4.
|
217
|
* matches like: ::beef:beef:231.132.213.0
|
218
|
*
|
219
|
* 5.
|
220
|
* this must be the first of the 3 expressions - if this came e.g. after the next group
|
221
|
* it would not match in a line with multiple addresses!
|
222
|
* matches like: 1111:2222:3333:4444:5555:6666:7777:8888
|
223
|
*
|
224
|
* 6.
|
225
|
* matches like: beef:beef::beef:beef
|
226
|
*
|
227
|
* 7.
|
228
|
* matches like: ::beef:beef
|
229
|
* if hexncolonndot is not used above when we start our hextet will typically
|
230
|
* match the last octet in an ipv4 address when an ipv6 address follows it e.g.:
|
231
|
* ::1234:1234:1234:1234:1234:1234:123.231.213.255 ::11
|
232
|
* 255 gets matched by us. Therefore it is important to use hexncolonndot at -START-
|
233
|
*
|
234
|
*/
|
235
|
|
236
|
/*
|
237
|
* Small wrapper function to match all the lease content.
|
238
|
*/
|
239
|
/**
 * Find every IPv6 address (optionally carrying an embedded IPv4 tail) in a text.
 *
 * Applies ipv6_regex case-insensitively to the whole input.
 *
 * @param string|null $content Text to scan, e.g. lease content.
 * @return array One entry per address found (PREG_SET_ORDER layout); each entry
 *               holds the numbered and named groups of that hit, with the
 *               address itself under the 'MATCH' key. Empty when nothing matched.
 */
function parse_all_ipv6_to_array($content) {
    // Nothing to scan: return the same empty result preg_match_all() would give,
    // without handing it null (deprecated since PHP 8.1).
    if ($content === null || $content === '') {
        return [];
    }

    // Pre-initialised so the function returns an array even if the call bails out early.
    $matches = [];
    preg_match_all('/' . ipv6_regex . '/i', $content, $matches, PREG_SET_ORDER);

    return $matches;
}
|