2 Package purell offers URL normalization as described on the Wikipedia page:
3 http://en.wikipedia.org/wiki/URL_normalization
16 "github.com/PuerkitoBio/urlesc"
17 "golang.org/x/net/idna"
18 "golang.org/x/text/unicode/norm"
19 "golang.org/x/text/width"
// NormalizationFlags is a set of normalization flags that determines how a
// URL will be normalized. Individual flags are single bits and may be combined
// with the | operator; convenience sets are provided below.
type NormalizationFlags uint

const (
	// Safe normalizations
	FlagLowercaseScheme           NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1
	FlagLowercaseHost                                            // http://HOST -> http://host
	FlagUppercaseEscapes                                         // http://host/t%ef -> http://host/t%EF
	FlagDecodeUnnecessaryEscapes                                 // http://host/t%41 -> http://host/tA
	FlagEncodeNecessaryEscapes                                   // http://host/!"#$ -> http://host/%21%22#$
	FlagRemoveDefaultPort                                        // http://host:80 -> http://host
	FlagRemoveEmptyQuerySeparator                                // http://host/path? -> http://host/path

	// Usually safe normalizations
	FlagRemoveTrailingSlash // http://host/path/ -> http://host/path
	FlagAddTrailingSlash    // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags)
	FlagRemoveDotSegments   // http://host/path/./a/b/../c -> http://host/path/a/c

	// Unsafe normalizations
	FlagRemoveDirectoryIndex   // http://host/path/index.html -> http://host/path/
	FlagRemoveFragment         // http://host/path#fragment -> http://host/path
	FlagForceHTTP              // https://host -> http://host
	FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b
	FlagRemoveWWW              // http://www.host/ -> http://host/
	FlagAddWWW                 // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags)
	FlagSortQuery              // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3

	// Normalizations not in the wikipedia article, required to cover tests cases
	// submitted by jehiah
	FlagDecodeDWORDHost           // http://1113982867 -> http://66.102.7.147
	FlagDecodeOctalHost           // http://0102.0146.07.0223 -> http://66.102.7.147
	FlagDecodeHexHost             // http://0x42660793 -> http://66.102.7.147
	FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path
	FlagRemoveEmptyPortSeparator  // http://host:/path -> http://host/path

	// Convenience set of safe normalizations
	FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator

	// For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
	// while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".

	// Convenience set of usually safe normalizations (includes FlagsSafe)
	FlagsUsuallySafeGreedy    NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
	FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments

	// Convenience set of unsafe normalizations (includes FlagsUsuallySafe)
	FlagsUnsafeGreedy    NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
	FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery

	// Convenience set of all available flags
	FlagsAllGreedy    = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
	FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
)
// Default port suffixes (including the leading colon) for the http and https
// schemes, as compared against the host's port part when removing default ports.
const (
	defaultHttpPort  = ":80"
	defaultHttpsPort = ":443"
)
// Regular expressions used by the normalizations. They are compiled once at
// package initialization.
var (
	rxPort       = regexp.MustCompile(`(:\d+)/?$`)                                              // trailing port, optionally followed by a slash
	rxDirIndex   = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`)                     // default.* / index.* as last path segment
	rxDupSlashes = regexp.MustCompile(`/{2,}`)                                                  // runs of two or more slashes
	rxDWORDHost  = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)                            // all-decimal host, then optional dots/port
	rxOctalHost  = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`) // four 0-prefixed groups, then optional dots/port
	rxHexHost    = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`)                 // 0x-prefixed hex host, then optional dots/port
	rxHostDots   = regexp.MustCompile(`^(.+?)(:\d+)?$`)                                         // host name and optional numeric port
	rxEmptyPort  = regexp.MustCompile(`:+$`)                                                    // trailing colons without a port number
)
92 // Map of flags to implementation function.
93 // FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
94 // by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.
96 // Since maps have undefined traversing order, make a slice of ordered keys
97 var flagsOrder = []NormalizationFlags{
100 FlagRemoveDefaultPort,
101 FlagRemoveDirectoryIndex,
102 FlagRemoveDotSegments,
104 FlagForceHTTP, // Must be after remove default port (because https=443/http=80)
105 FlagRemoveDuplicateSlashes,
112 FlagRemoveUnnecessaryHostDots,
113 FlagRemoveEmptyPortSeparator,
114 FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
115 FlagAddTrailingSlash,
118 // ... and then the map, where order is unimportant
119 var flags = map[NormalizationFlags]func(*url.URL){
120 FlagLowercaseScheme: lowercaseScheme,
121 FlagLowercaseHost: lowercaseHost,
122 FlagRemoveDefaultPort: removeDefaultPort,
123 FlagRemoveDirectoryIndex: removeDirectoryIndex,
124 FlagRemoveDotSegments: removeDotSegments,
125 FlagRemoveFragment: removeFragment,
126 FlagForceHTTP: forceHTTP,
127 FlagRemoveDuplicateSlashes: removeDuplicateSlashes,
128 FlagRemoveWWW: removeWWW,
130 FlagSortQuery: sortQuery,
131 FlagDecodeDWORDHost: decodeDWORDHost,
132 FlagDecodeOctalHost: decodeOctalHost,
133 FlagDecodeHexHost: decodeHexHost,
134 FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
135 FlagRemoveEmptyPortSeparator: removeEmptyPortSeparator,
136 FlagRemoveTrailingSlash: removeTrailingSlash,
137 FlagAddTrailingSlash: addTrailingSlash,
140 // MustNormalizeURLString returns the normalized string, and panics if an error occurs.
141 // It takes an URL string as input, as well as the normalization flags.
142 func MustNormalizeURLString(u string, f NormalizationFlags) string {
143 result, e := NormalizeURLString(u, f)
150 // NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object.
151 // It takes an URL string as input, as well as the normalization flags.
152 func NormalizeURLString(u string, f NormalizationFlags) (string, error) {
153 parsed, err := url.Parse(u)
158 if f&FlagLowercaseHost == FlagLowercaseHost {
159 parsed.Host = strings.ToLower(parsed.Host)
162 // The idna package doesn't fully conform to RFC 5895
163 // (https://tools.ietf.org/html/rfc5895), so we do it here.
164 // Taken from Go 1.8 cycle source, courtesy of bradfitz.
165 // TODO: Remove when (if?) idna package conforms to RFC 5895.
166 parsed.Host = width.Fold.String(parsed.Host)
167 parsed.Host = norm.NFC.String(parsed.Host)
168 if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil {
172 return NormalizeURL(parsed, f), nil
175 // NormalizeURL returns the normalized string.
176 // It takes a parsed URL object as input, as well as the normalization flags.
177 func NormalizeURL(u *url.URL, f NormalizationFlags) string {
178 for _, k := range flagsOrder {
183 return urlesc.Escape(u)
// lowercaseScheme converts the URL scheme to lower case in place
// (HTTP://host -> http://host). An empty scheme stays empty.
func lowercaseScheme(u *url.URL) {
	u.Scheme = strings.ToLower(u.Scheme)
}
// lowercaseHost converts the URL host to lower case in place
// (http://HOST -> http://host). An empty host stays empty.
func lowercaseHost(u *url.URL) {
	u.Host = strings.ToLower(u.Host)
}
198 func removeDefaultPort(u *url.URL) {
200 scheme := strings.ToLower(u.Scheme)
201 u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string {
202 if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) {
// removeTrailingSlash removes a single trailing slash from the path
// (http://host/path/ -> http://host/path). When the path is empty, the slash
// is removed from the end of the host instead.
func removeTrailingSlash(u *url.URL) {
	if len(u.Path) > 0 {
		u.Path = strings.TrimSuffix(u.Path, "/")
		return
	}
	u.Host = strings.TrimSuffix(u.Host, "/")
}
// addTrailingSlash makes sure the path ends with a slash
// (http://host/path -> http://host/path/). When the path is empty, the slash
// is appended to a non-empty host instead. A URL with neither is unchanged.
func addTrailingSlash(u *url.URL) {
	switch {
	case len(u.Path) > 0:
		if !strings.HasSuffix(u.Path, "/") {
			u.Path += "/"
		}
	case len(u.Host) > 0:
		if !strings.HasSuffix(u.Host, "/") {
			u.Host += "/"
		}
	}
}
// removeDotSegments resolves "." and ".." segments in the path
// (http://host/path/./a/b/../c -> http://host/path/a/c). A ".." segment drops
// the previously kept segment, if any. An empty path is left untouched.
func removeDotSegments(u *url.URL) {
	if len(u.Path) == 0 {
		return
	}

	var (
		dotFree   []string
		lastIsDot bool
	)
	for _, s := range strings.Split(u.Path, "/") {
		switch s {
		case "..":
			if len(dotFree) > 0 {
				dotFree = dotFree[:len(dotFree)-1]
			}
		case ".":
			// Current-directory segment: simply dropped.
		default:
			dotFree = append(dotFree, s)
		}
		lastIsDot = s == "." || s == ".."
	}

	u.Path = strings.Join(dotFree, "/")
	// Special case if host does not end with / and new path does not begin with /
	if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") {
		u.Path = "/" + u.Path
	}
	// Special case if the last segment was a dot, make sure the path ends with a slash
	if lastIsDot && !strings.HasSuffix(u.Path, "/") {
		u.Path += "/"
	}
}
262 func removeDirectoryIndex(u *url.URL) {
264 u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1")
// removeFragment clears the URL fragment
// (http://host/path#fragment -> http://host/path).
func removeFragment(u *url.URL) {
	u.Fragment = ""
}
// forceHTTP rewrites an https scheme (compared case-insensitively) to http
// (https://host -> http://host). Other schemes are left untouched.
func forceHTTP(u *url.URL) {
	if strings.ToLower(u.Scheme) == "https" {
		u.Scheme = "http"
	}
}
278 func removeDuplicateSlashes(u *url.URL) {
280 u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/")
// removeWWW strips a leading "www." (matched case-insensitively) from the host
// (http://www.host/ -> http://host/).
func removeWWW(u *url.URL) {
	if len(u.Host) > 0 && strings.HasPrefix(strings.ToLower(u.Host), "www.") {
		// "www." is four ASCII bytes, so slicing by byte offset is safe.
		u.Host = u.Host[4:]
	}
}
// addWWW prefixes a non-empty host with "www." unless it already starts with
// it, compared case-insensitively (http://host/ -> http://www.host/).
func addWWW(u *url.URL) {
	if len(u.Host) > 0 && !strings.HasPrefix(strings.ToLower(u.Host), "www.") {
		u.Host = "www." + u.Host
	}
}
296 func sortQuery(u *url.URL) {
300 arKeys := make([]string, len(q))
307 buf := new(bytes.Buffer)
308 for _, k := range arKeys {
310 for _, v := range q[k] {
314 buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v)))
318 // Rebuild the raw query string
319 u.RawQuery = buf.String()
323 func decodeDWORDHost(u *url.URL) {
325 if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 {
328 dword, _ := strconv.ParseInt(matches[1], 10, 0)
329 for i, shift := range []uint{24, 16, 8, 0} {
330 parts[i] = dword >> shift & 0xFF
332 u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2])
337 func decodeOctalHost(u *url.URL) {
339 if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 {
342 for i := 1; i <= 4; i++ {
343 parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0)
345 u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5])
350 func decodeHexHost(u *url.URL) {
352 if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 {
353 // Conversion is safe because of regex validation
354 parsed, _ := strconv.ParseInt(matches[1], 16, 0)
355 // Set host as DWORD (base 10) encoded host
356 u.Host = fmt.Sprintf("%d%s", parsed, matches[2])
357 // The rest is the same as decoding a DWORD host
363 func removeUnncessaryHostDots(u *url.URL) {
365 if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 {
366 // Trim the leading and trailing dots
367 u.Host = strings.Trim(matches[1], ".")
368 if len(matches) > 2 {
375 func removeEmptyPortSeparator(u *url.URL) {
377 u.Host = rxEmptyPort.ReplaceAllString(u.Host, "")