Code refactoring for bpa operator
[icn.git] / cmd / bpa-operator / vendor / github.com / PuerkitoBio / purell / purell.go
1 /*
2 Package purell offers URL normalization as described on the wikipedia page:
3 http://en.wikipedia.org/wiki/URL_normalization
4 */
5 package purell
6
7 import (
8         "bytes"
9         "fmt"
10         "net/url"
11         "regexp"
12         "sort"
13         "strconv"
14         "strings"
15
16         "github.com/PuerkitoBio/urlesc"
17         "golang.org/x/net/idna"
18         "golang.org/x/text/unicode/norm"
19         "golang.org/x/text/width"
20 )
21
// NormalizationFlags is a bit set that selects which normalizations are
// applied to a URL. Individual Flag* values are combined with bitwise OR.
type NormalizationFlags uint

// Individual normalization flags. Each flag occupies a single bit that is
// assigned by iota in declaration order, so entries must not be reordered.
const (
	// Safe normalizations.

	// FlagLowercaseScheme: HTTP://host -> http://host, applied by default in Go1.1
	FlagLowercaseScheme NormalizationFlags = 1 << iota
	// FlagLowercaseHost: http://HOST -> http://host
	FlagLowercaseHost
	// FlagUppercaseEscapes: http://host/t%ef -> http://host/t%EF
	FlagUppercaseEscapes
	// FlagDecodeUnnecessaryEscapes: http://host/t%41 -> http://host/tA
	FlagDecodeUnnecessaryEscapes
	// FlagEncodeNecessaryEscapes: http://host/!"#$ -> http://host/%21%22#$
	FlagEncodeNecessaryEscapes
	// FlagRemoveDefaultPort: http://host:80 -> http://host
	FlagRemoveDefaultPort
	// FlagRemoveEmptyQuerySeparator: http://host/path? -> http://host/path
	FlagRemoveEmptyQuerySeparator

	// Usually safe normalizations. Only one of the add/remove trailing slash
	// flags should be chosen.

	// FlagRemoveTrailingSlash: http://host/path/ -> http://host/path
	FlagRemoveTrailingSlash
	// FlagAddTrailingSlash: http://host/path -> http://host/path/
	FlagAddTrailingSlash
	// FlagRemoveDotSegments: http://host/path/./a/b/../c -> http://host/path/a/c
	FlagRemoveDotSegments

	// Unsafe normalizations. Only one of the add/remove WWW flags should be
	// chosen.

	// FlagRemoveDirectoryIndex: http://host/path/index.html -> http://host/path/
	FlagRemoveDirectoryIndex
	// FlagRemoveFragment: http://host/path#fragment -> http://host/path
	FlagRemoveFragment
	// FlagForceHTTP: https://host -> http://host
	FlagForceHTTP
	// FlagRemoveDuplicateSlashes: http://host/path//a///b -> http://host/path/a/b
	FlagRemoveDuplicateSlashes
	// FlagRemoveWWW: http://www.host/ -> http://host/
	FlagRemoveWWW
	// FlagAddWWW: http://host/ -> http://www.host/
	FlagAddWWW
	// FlagSortQuery: http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3
	FlagSortQuery

	// Normalizations not in the Wikipedia article, required to cover test
	// cases submitted by jehiah.

	// FlagDecodeDWORDHost: http://1113982867 -> http://66.102.7.147
	FlagDecodeDWORDHost
	// FlagDecodeOctalHost: http://0102.0146.07.0223 -> http://66.102.7.147
	FlagDecodeOctalHost
	// FlagDecodeHexHost: http://0x42660793 -> http://66.102.7.147
	FlagDecodeHexHost
	// FlagRemoveUnnecessaryHostDots: http://.host../path -> http://host/path
	FlagRemoveUnnecessaryHostDots
	// FlagRemoveEmptyPortSeparator: http://host:/path -> http://host/path
	FlagRemoveEmptyPortSeparator
)

// Convenience sets of flags.
//
// "Greedy" sets use the "remove trailing slash" and "remove www. prefix"
// flags, while "non-greedy" sets use the "add (or keep) the trailing slash"
// and "add www. prefix" flags.
const (
	// FlagsSafe is the set of safe normalizations.
	FlagsSafe NormalizationFlags = FlagLowercaseHost |
		FlagLowercaseScheme |
		FlagUppercaseEscapes |
		FlagDecodeUnnecessaryEscapes |
		FlagEncodeNecessaryEscapes |
		FlagRemoveDefaultPort |
		FlagRemoveEmptyQuerySeparator

	// FlagsUsuallySafeGreedy and FlagsUsuallySafeNonGreedy are the sets of
	// usually safe normalizations (they include FlagsSafe).
	FlagsUsuallySafeGreedy    NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
	FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments

	// FlagsUnsafeGreedy and FlagsUnsafeNonGreedy are the sets of unsafe
	// normalizations (they include the matching FlagsUsuallySafe set).
	FlagsUnsafeGreedy NormalizationFlags = FlagsUsuallySafeGreedy |
		FlagRemoveDirectoryIndex |
		FlagRemoveFragment |
		FlagForceHTTP |
		FlagRemoveDuplicateSlashes |
		FlagRemoveWWW |
		FlagSortQuery
	FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy |
		FlagRemoveDirectoryIndex |
		FlagRemoveFragment |
		FlagForceHTTP |
		FlagRemoveDuplicateSlashes |
		FlagAddWWW |
		FlagSortQuery

	// FlagsAllGreedy and FlagsAllNonGreedy are the sets of all available
	// flags.
	FlagsAllGreedy = FlagsUnsafeGreedy |
		FlagDecodeDWORDHost |
		FlagDecodeOctalHost |
		FlagDecodeHexHost |
		FlagRemoveUnnecessaryHostDots |
		FlagRemoveEmptyPortSeparator
	FlagsAllNonGreedy = FlagsUnsafeNonGreedy |
		FlagDecodeDWORDHost |
		FlagDecodeOctalHost |
		FlagDecodeHexHost |
		FlagRemoveUnnecessaryHostDots |
		FlagRemoveEmptyPortSeparator
)
76
// defaultHttpPort is the host suffix (colon included) that is redundant for
// the "http" scheme.
const defaultHttpPort = ":80"

// defaultHttpsPort is the host suffix (colon included) that is redundant for
// the "https" scheme.
const defaultHttpsPort = ":443"
81
// Regular expressions used by the normalization functions.
var (
	// rxPort matches a trailing ":<digits>", optionally followed by a slash.
	rxPort = regexp.MustCompile(`(:\d+)/?$`)

	// rxDirIndex matches a final "default.<ext>" or "index.<ext>" segment
	// whose extension is one to four word characters long.
	rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`)

	// rxDupSlashes matches a run of two or more slashes.
	rxDupSlashes = regexp.MustCompile(`/{2,}`)

	// rxDWORDHost matches an all-digit host, capturing the digits and any
	// trailing dots and/or ":<port>" suffix.
	rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)

	// rxOctalHost matches four dot-separated groups that each start with 0,
	// capturing the groups and any trailing dots and/or ":<port>" suffix.
	rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`)

	// rxHexHost matches a "0x"-prefixed hexadecimal host, capturing the hex
	// digits and any trailing dots and/or ":<port>" suffix.
	rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`)

	// rxHostDots splits a host into its name and an optional ":<port>" suffix.
	rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`)

	// rxEmptyPort matches one or more trailing colons.
	rxEmptyPort = regexp.MustCompile(`:+$`)
)
91
92 // The flags map further below associates each flag with its implementation function.
93 // FlagDecodeUnnecessaryEscapes has no entry, since it is handled automatically
94 // by parsing the string as a URL. The same goes for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.
95
96 // flagsOrder lists the flags in the order NormalizeURL applies them; a slice is used because map iteration order is undefined.
97 var flagsOrder = []NormalizationFlags{
98         FlagLowercaseScheme,
99         FlagLowercaseHost,
100         FlagRemoveDefaultPort,
101         FlagRemoveDirectoryIndex,
102         FlagRemoveDotSegments,
103         FlagRemoveFragment,
104         FlagForceHTTP, // Must come after FlagRemoveDefaultPort (because https=443/http=80)
105         FlagRemoveDuplicateSlashes,
106         FlagRemoveWWW,
107         FlagAddWWW,
108         FlagSortQuery,
109         FlagDecodeDWORDHost,
110         FlagDecodeOctalHost,
111         FlagDecodeHexHost,
112         FlagRemoveUnnecessaryHostDots,
113         FlagRemoveEmptyPortSeparator,
114         FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
115         FlagAddTrailingSlash,
116 }
117
118 // ... and then the map, where order is unimportant
119 var flags = map[NormalizationFlags]func(*url.URL){
120         FlagLowercaseScheme:           lowercaseScheme,
121         FlagLowercaseHost:             lowercaseHost,
122         FlagRemoveDefaultPort:         removeDefaultPort,
123         FlagRemoveDirectoryIndex:      removeDirectoryIndex,
124         FlagRemoveDotSegments:         removeDotSegments,
125         FlagRemoveFragment:            removeFragment,
126         FlagForceHTTP:                 forceHTTP,
127         FlagRemoveDuplicateSlashes:    removeDuplicateSlashes,
128         FlagRemoveWWW:                 removeWWW,
129         FlagAddWWW:                    addWWW,
130         FlagSortQuery:                 sortQuery,
131         FlagDecodeDWORDHost:           decodeDWORDHost,
132         FlagDecodeOctalHost:           decodeOctalHost,
133         FlagDecodeHexHost:             decodeHexHost,
134         FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
135         FlagRemoveEmptyPortSeparator:  removeEmptyPortSeparator,
136         FlagRemoveTrailingSlash:       removeTrailingSlash,
137         FlagAddTrailingSlash:          addTrailingSlash,
138 }
139
140 // MustNormalizeURLString returns the normalized string, and panics if an error occurs.
141 // It takes an URL string as input, as well as the normalization flags.
142 func MustNormalizeURLString(u string, f NormalizationFlags) string {
143         result, e := NormalizeURLString(u, f)
144         if e != nil {
145                 panic(e)
146         }
147         return result
148 }
149
150 // NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object.
151 // It takes an URL string as input, as well as the normalization flags.
152 func NormalizeURLString(u string, f NormalizationFlags) (string, error) {
153         parsed, err := url.Parse(u)
154         if err != nil {
155                 return "", err
156         }
157
158         if f&FlagLowercaseHost == FlagLowercaseHost {
159                 parsed.Host = strings.ToLower(parsed.Host)
160         }
161
162         // The idna package doesn't fully conform to RFC 5895
163         // (https://tools.ietf.org/html/rfc5895), so we do it here.
164         // Taken from Go 1.8 cycle source, courtesy of bradfitz.
165         // TODO: Remove when (if?) idna package conforms to RFC 5895.
166         parsed.Host = width.Fold.String(parsed.Host)
167         parsed.Host = norm.NFC.String(parsed.Host)
168         if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil {
169                 return "", err
170         }
171
172         return NormalizeURL(parsed, f), nil
173 }
174
175 // NormalizeURL returns the normalized string.
176 // It takes a parsed URL object as input, as well as the normalization flags.
177 func NormalizeURL(u *url.URL, f NormalizationFlags) string {
178         for _, k := range flagsOrder {
179                 if f&k == k {
180                         flags[k](u)
181                 }
182         }
183         return urlesc.Escape(u)
184 }
185
// lowercaseScheme lowercases the scheme of u: HTTP://host -> http://host
func lowercaseScheme(u *url.URL) {
	if u.Scheme == "" {
		return
	}
	u.Scheme = strings.ToLower(u.Scheme)
}
191
// lowercaseHost lowercases the host of u: http://HOST -> http://host
func lowercaseHost(u *url.URL) {
	if u.Host == "" {
		return
	}
	u.Host = strings.ToLower(u.Host)
}
197
198 func removeDefaultPort(u *url.URL) {
199         if len(u.Host) > 0 {
200                 scheme := strings.ToLower(u.Scheme)
201                 u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string {
202                         if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) {
203                                 return ""
204                         }
205                         return val
206                 })
207         }
208 }
209
// removeTrailingSlash drops one trailing slash from the path of u, or from its
// host when the path is empty: http://host/path/ -> http://host/path
func removeTrailingSlash(u *url.URL) {
	switch {
	case u.Path != "":
		u.Path = strings.TrimSuffix(u.Path, "/")
	case u.Host != "":
		u.Host = strings.TrimSuffix(u.Host, "/")
	}
}
221
// addTrailingSlash makes sure the path of u ends with a slash, or its host
// when the path is empty: http://host/path -> http://host/path/
func addTrailingSlash(u *url.URL) {
	switch {
	case u.Path != "":
		if !strings.HasSuffix(u.Path, "/") {
			u.Path += "/"
		}
	case u.Host != "":
		if !strings.HasSuffix(u.Host, "/") {
			u.Host += "/"
		}
	}
}
233
// removeDotSegments resolves "." and ".." segments in the path of u:
// http://host/path/./a/b/../c -> http://host/path/a/c
//
// An empty path is left untouched.
func removeDotSegments(u *url.URL) {
	if u.Path == "" {
		return
	}

	segments := strings.Split(u.Path, "/")
	kept := make([]string, 0, len(segments))
	for _, seg := range segments {
		switch seg {
		case ".":
			// A single dot refers to the current segment: drop it.
		case "..":
			// A double dot cancels the previously kept segment, if any.
			if n := len(kept); n > 0 {
				kept = kept[:n-1]
			}
		default:
			kept = append(kept, seg)
		}
	}

	cleaned := strings.Join(kept, "/")
	// Special case if host does not end with / and new path does not begin with /
	if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(cleaned, "/") {
		cleaned = "/" + cleaned
	}
	// Special case if the last segment was a dot, make sure the path ends with a slash
	last := segments[len(segments)-1]
	if (last == "." || last == "..") && !strings.HasSuffix(cleaned, "/") {
		cleaned += "/"
	}
	u.Path = cleaned
}
261
262 func removeDirectoryIndex(u *url.URL) {
263         if len(u.Path) > 0 {
264                 u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1")
265         }
266 }
267
// removeFragment clears the fragment of u:
// http://host/path#fragment -> http://host/path
268 func removeFragment(u *url.URL) {
269         u.Fragment = ""
270 }
271
// forceHTTP replaces an "https" scheme (compared case-insensitively) with
// "http": https://host -> http://host
func forceHTTP(u *url.URL) {
	if strings.ToLower(u.Scheme) != "https" {
		return
	}
	u.Scheme = "http"
}
277
278 func removeDuplicateSlashes(u *url.URL) {
279         if len(u.Path) > 0 {
280                 u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/")
281         }
282 }
283
// removeWWW strips a leading "www." (matched case-insensitively) from the host
// of u: http://www.host/ -> http://host/
func removeWWW(u *url.URL) {
	const prefix = "www."
	if u.Host == "" {
		return
	}
	if strings.HasPrefix(strings.ToLower(u.Host), prefix) {
		u.Host = u.Host[len(prefix):]
	}
}
289
// addWWW prefixes the host of u with "www." unless it already starts with it
// (matched case-insensitively): http://host/ -> http://www.host/
func addWWW(u *url.URL) {
	if u.Host == "" {
		return
	}
	if strings.HasPrefix(strings.ToLower(u.Host), "www.") {
		return
	}
	u.Host = "www." + u.Host
}
295
296 func sortQuery(u *url.URL) {
297         q := u.Query()
298
299         if len(q) > 0 {
300                 arKeys := make([]string, len(q))
301                 i := 0
302                 for k := range q {
303                         arKeys[i] = k
304                         i++
305                 }
306                 sort.Strings(arKeys)
307                 buf := new(bytes.Buffer)
308                 for _, k := range arKeys {
309                         sort.Strings(q[k])
310                         for _, v := range q[k] {
311                                 if buf.Len() > 0 {
312                                         buf.WriteRune('&')
313                                 }
314                                 buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v)))
315                         }
316                 }
317
318                 // Rebuild the raw query string
319                 u.RawQuery = buf.String()
320         }
321 }
322
323 func decodeDWORDHost(u *url.URL) {
324         if len(u.Host) > 0 {
325                 if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 {
326                         var parts [4]int64
327
328                         dword, _ := strconv.ParseInt(matches[1], 10, 0)
329                         for i, shift := range []uint{24, 16, 8, 0} {
330                                 parts[i] = dword >> shift & 0xFF
331                         }
332                         u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2])
333                 }
334         }
335 }
336
337 func decodeOctalHost(u *url.URL) {
338         if len(u.Host) > 0 {
339                 if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 {
340                         var parts [4]int64
341
342                         for i := 1; i <= 4; i++ {
343                                 parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0)
344                         }
345                         u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5])
346                 }
347         }
348 }
349
350 func decodeHexHost(u *url.URL) {
351         if len(u.Host) > 0 {
352                 if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 {
353                         // Conversion is safe because of regex validation
354                         parsed, _ := strconv.ParseInt(matches[1], 16, 0)
355                         // Set host as DWORD (base 10) encoded host
356                         u.Host = fmt.Sprintf("%d%s", parsed, matches[2])
357                         // The rest is the same as decoding a DWORD host
358                         decodeDWORDHost(u)
359                 }
360         }
361 }
362
363 func removeUnncessaryHostDots(u *url.URL) {
364         if len(u.Host) > 0 {
365                 if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 {
366                         // Trim the leading and trailing dots
367                         u.Host = strings.Trim(matches[1], ".")
368                         if len(matches) > 2 {
369                                 u.Host += matches[2]
370                         }
371                 }
372         }
373 }
374
375 func removeEmptyPortSeparator(u *url.URL) {
376         if len(u.Host) > 0 {
377                 u.Host = rxEmptyPort.ReplaceAllString(u.Host, "")
378         }
379 }