CernVM-FS  2.12.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
cvmfs_geo.py
Go to the documentation of this file.
1 from __future__ import print_function
2 
3 import math
4 import os
5 import re
6 import bisect
7 import socket
8 import cvmfs_api
9 import time
10 import threading
11 
# The maxminddb package is imported lazily, and only once, by open_geodb()
# because it is not available in the unit test.  Until the first call this
# name is just a placeholder.
maxminddb = None


def open_geodb(dbname):
    """Open the geo database file `dbname` and return its reader.

    The first call imports the maxminddb package and binds it to the
    module-level name `maxminddb`; later calls reuse that module.  The
    return value is whatever maxminddb.open_database() returns.
    """
    global maxminddb
    if maxminddb is None:
        # With the global declaration above, this import rebinds the
        # module-level placeholder to the real package.
        import maxminddb
    reader = maxminddb.open_database(dbname)
    return reader
20 
# Path of the geo database and the state of the reader opened on it.
gidb = "/var/lib/cvmfs-server/geo/GeoLite2-City.mmdb"
gireader = None      # reader currently used for lookups
oldgireader = None   # previous reader, kept until a later check closes it
gichecktime = 0      # time of the last check for a modified database
gimodtime = 0        # modification time of the database that is open

# How long a cached name lookup stays valid, and also how often the
# database file is checked for modification.
geo_cache_secs = 5 * 60  # 5 minutes

geo_cache_max_entries = 100000  # a ridiculously large but manageable number

# Number of name lookups that were not answered from a fresh cache entry.
namelookups = 0

# geo_cache entries are indexed by name and contain a tuple of
# (update time, geo record).  Caching DNS lookups is more important
# than caching geo information, but it is simpler and slightly more
# efficient to cache the geo information.
geo_cache = {}

# gilock is held while checking for and reopening a modified database;
# namelock is held while reading or updating geo_cache and namelookups.
gilock = threading.Lock()
namelock = threading.Lock()
40 
def lookup_geoinfo(now, addr):
    """Look up geo info for the address `addr` in the geo database.

    Also periodically checks whether the database file `gidb` was
    modified and reopens it if so.  The check runs when no reader is
    open yet, or when `now` is more than `geo_cache_secs` past the
    previous check.

    now  -- current time in seconds, used only to schedule the check
    addr -- address passed unchanged to the reader's get()

    Returns whatever the reader's get() returns for `addr`.
    """
    global gireader, oldgireader, gichecktime, gimodtime

    def check_is_due():
        return gireader is None or now > gichecktime + geo_cache_secs

    if check_is_due():
        with gilock:
            # gichecktime might have changed while this thread was waiting
            # for the lock, so look again before doing any work.
            if check_is_due():
                if oldgireader is not None:
                    # The reader replaced at an earlier check is closed
                    # only now.  Per the original author's note, by this
                    # time nobody is expected to still be using it, and
                    # the delay avoids having to acquire the lock for
                    # every lookup.
                    oldgireader.close()
                    oldgireader = None
                    print('cvmfs_geo: closed old ' + gidb)

                gichecktime = now
                current_mtime = os.stat(gidb).st_mtime
                if current_mtime != gimodtime:
                    # The database was modified: open it again and park
                    # the previous reader so a later check can close it.
                    oldgireader = gireader
                    gireader = open_geodb(gidb)
                    gimodtime = current_mtime
                    print('cvmfs_geo: opened ' + gidb)

    return gireader.get(addr)
73 
# function came from http://www.johndcook.com/python_longitude_latitude.html
def distance_on_unit_sphere(lat1, long1, lat2, long2):
    """Return the great-circle arc between two points on a unit sphere.

    lat1, long1 -- latitude and longitude of the first point, in degrees
    lat2, long2 -- latitude and longitude of the second point, in degrees

    The result is the arc length in radians (between 0 and pi).
    Multiply it by the radius of the earth in your favorite set of units
    to get a length.
    """
    if lat1 == lat2 and long1 == long2:
        return 0.0

    # Convert latitude and longitude to spherical coordinates in radians.
    degrees_to_radians = math.pi / 180.0

    # phi = 90 - latitude
    phi1 = (90.0 - lat1) * degrees_to_radians
    phi2 = (90.0 - lat2) * degrees_to_radians

    # theta = longitude
    theta1 = long1 * degrees_to_radians
    theta2 = long2 * degrees_to_radians

    # For two locations in spherical coordinates
    # (1, theta, phi) and (1, theta', phi')
    #   cosine(arc length) =
    #     sin phi sin phi' cos(theta - theta') + cos phi cos phi'
    cos_arc = (math.sin(phi1) * math.sin(phi2) * math.cos(theta1 - theta2) +
               math.cos(phi1) * math.cos(phi2))

    # Floating-point rounding can leave the sum a hair outside [-1, 1] for
    # points that are almost identical or almost antipodal, and math.acos
    # raises ValueError for such input.  Clamp to the valid domain first.
    cos_arc = max(-1.0, min(1.0, cos_arc))

    return math.acos(cos_arc)
107 
# Pattern including all allowed characters in addresses.
# The geoip api functions will further validate, but for paranoia's sake
# (because I'm not sure how good the functions' error checking is), make
# sure the names are limited to valid hostname characters.
# Include ':' for IPv6 addresses.
# The pattern is anchored with \Z rather than $ because $ also matches just
# before a trailing newline, which would let "name\n" through.  The empty
# string still matches.
addr_pattern = re.compile(r'^[0-9a-zA-Z.:-]*\Z')
114 
def addr_geoinfo(now, addr):
    """Look up geo info for an IPv4 or IPv6 address.

    now  -- current time in seconds, passed through to lookup_geoinfo()
    addr -- the address as a string

    Returns the 'location' part of the database record.  Returns None if
    `addr` is longer than 256 characters, contains characters outside
    addr_pattern, is not accepted as an IP address by the database
    reader, does not exist in the DB, or has a record without a
    'location' entry.
    """
    if len(addr) > 256 or not addr_pattern.search(addr):
        return None

    try:
        response = lookup_geoinfo(now, addr)
    except ValueError:
        # addr_pattern also admits host names and malformed addresses
        # (for example "unknown" in a forwarded-for header).  The maxminddb
        # reader rejects those by raising ValueError; treat that the same
        # as "not found" so the caller can fall back to another address.
        return None

    if response is None:
        return None

    # Not every record carries location data; name_geoinfo() treats a
    # missing 'location' as "nothing found" and this function does too.
    return response.get('location')
126 
def name_geoinfo(now, name):
    """Look up geo info by host name and cache the result.

    IPv4 is tried first since that DB is better and most servers today
    are dual stack if they have IPv6.  Wsgi is multithreaded, so every
    access to the shared cache is done while holding namelock.

    now  -- current time in seconds, used for cache expiry
    name -- host name (or address) to resolve

    Returns the 'location' part of the geo record, or None if none was
    found.  Names longer than 256 characters or containing characters
    outside addr_pattern return None without any lookup.
    """
    global namelookups

    if len(name) > 256 or not addr_pattern.search(name):
        return None

    with namelock:
        if name in geo_cache:
            (stamp, gir) = geo_cache[name]
            if now <= stamp + geo_cache_secs:
                # still good, use it
                return gir
            # Expired: update the timestamp so only one thread needs to
            # wait when a lookup is slow; the others get the old record.
            geo_cache[name] = (now, gir)
        elif len(geo_cache) >= geo_cache_max_entries:
            # avoid denial of service by removing one arbitrary entry
            # before we add one
            geo_cache.popitem()
        namelookups += 1

    # The resolver call is deliberately made without holding the lock.
    try:
        ai = socket.getaddrinfo(name, 80, 0, 0, socket.IPPROTO_TCP)
    except (socket.error, UnicodeError):
        # Resolution failures (socket.gaierror is a socket.error) and names
        # that cannot be encoded for the resolver count as "no address".
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        ai = ()

    gir = None
    # Use the first IPv4 address; fall back to the first IPv6 address only
    # if that produced no record.
    for family in (socket.AF_INET, socket.AF_INET6):
        for info in ai:
            if info[0] == family:
                gir = lookup_geoinfo(now, info[4][0])
                break
        if gir is not None:
            break

    if gir is not None:
        # Only the location part of the record is kept.
        gir = gir['location'] if 'location' in gir else None

    with namelock:
        if gir is None and name in geo_cache:
            # reuse expired entry
            gir = geo_cache[name][1]
        geo_cache[name] = (now, gir)

    return gir
187 
def geosort_servers(now, gir_rem, servers, trycdn=False):
    """Geo-sort a list of servers relative to the location gir_rem.

    now     -- current time in seconds, passed to name_geoinfo()
    gir_rem -- location record with 'latitude' and 'longitude' entries
    servers -- sequence of server names
    trycdn  -- if True, first try prepending "ip." to each name to get
               the real IP address instead of a Content Delivery Network
               front end

    Returns the list [onegood, indexes] where
      onegood - a boolean saying whether or not there was at least one
                valid looked up geolocation from the servers
      indexes - list of numbers specifying the order of the N given
                servers, numbered 0 to N-1, from geographically closest
                to furthest away compared to gir_rem
    Servers without a geolocation are put at the end of the list.
    """
    onegood = False
    distances = []  # arcs seen so far, kept in ascending order
    indexes = []    # server numbers, kept parallel to distances

    for position, server in enumerate(servers):
        location = name_geoinfo(now, "ip." + server) if trycdn else None
        if location is None:
            location = name_geoinfo(now, server)

        if location is not None:
            onegood = True
            arc = distance_on_unit_sphere(gir_rem['latitude'],
                                          gir_rem['longitude'],
                                          location['latitude'],
                                          location['longitude'])
        else:
            # put it on the end of the list
            arc = float("inf")

        # Insert to the right of equal distances so that ties keep the
        # order in which the servers were given.
        slot = bisect.bisect_right(distances, arc)
        distances.insert(slot, arc)
        indexes.insert(slot, position)

    return [onegood, indexes]
231 
# expected geo api URL:  /cvmfs/<repo_name>/api/v<version>/geo/<path_info>
#   <repo_name> is repository name
#   <version> is the api version number, typically "1.0"
#   <path_info> is <caching_string>/<serverlist>
#     <caching_string> can be anything to assist in ensuring that those
#       clients wanting the same answer get responses cached together;
#       typically the name of their shared proxy.  If this resolves to
#       a valid IP address, attempt to use that address as the source
#       IP rather than the address seen by the web server.  The reason for
#       that is so it isn't possible for someone to poison a cache by
#       using a name for someone else's proxy.
#     <serverlist> is a comma-separated list of N server names
# response: a comma-separated list of numbers specifying the order of the N
#   given servers numbered 1 to N from geographically closest to furthest
#   away from the requester that initiated the connection (the requester
#   is typically the proxy)

def api(path_info, repo_name, version, start_response, environ):
    """Handle one geo api request.

    path_info      -- "<caching_string>/<serverlist>" as described above
    repo_name      -- repository name (not used by this function)
    version        -- api version (not used by this function)
    start_response -- passed on to cvmfs_api.good_request()/bad_request()
    environ        -- request environment; the HTTP_CF_CONNECTING_IP,
                      HTTP_X_FORWARDED_FOR and REMOTE_ADDR entries are
                      consulted for the requester's address

    Returns the result of cvmfs_api.good_request() carrying the sorted
    1-based server indexes, or of cvmfs_api.bad_request() when the path
    has no slash, the requester cannot be located, or none of the
    servers can be located.
    """
    caching_string, slash, serverlist = path_info.partition('/')
    if not slash:
        return cvmfs_api.bad_request(start_response, 'no slash in geo path')
    servers = serverlist.split(',')

    if caching_string == "_namelookups_":
        # this is a special debugging URL
        return cvmfs_api.good_request(start_response, str(namelookups) + '\n')

    # TODO(jblomer): Can this be switched to monotonic time?
    now = int(time.time())

    # If the request is coming from the Cloudflare Content Delivery Network,
    # the servers are probably using Cloudflare too.
    trycdn = 'HTTP_CF_CONNECTING_IP' in environ

    gir_rem = None
    # str.find() returns -1 (true) when there is no dot and 0 (false) when
    # the dot comes first, so it cannot be used as the test here.
    if '.' in caching_string:
        # might be a FQDN, use it if it resolves to a geo record
        gir_rem = name_geoinfo(now, caching_string)

    if gir_rem is None:
        if 'HTTP_CF_CONNECTING_IP' in environ:
            # IP address of client connecting to Cloudflare
            gir_rem = addr_geoinfo(now, environ['HTTP_CF_CONNECTING_IP'])
        if gir_rem is None and 'HTTP_X_FORWARDED_FOR' in environ:
            # List of IP addresses forwarded through squid.
            # Try the last IP, in case there's a reverse proxy squid
            # in front of the web server.  The last entry starts after
            # the last space or, if there is no space, the last comma.
            forwarded_for = environ['HTTP_X_FORWARDED_FOR']
            start = forwarded_for.rfind(' ') + 1
            if start == 0:
                start = forwarded_for.rfind(',') + 1
            gir_rem = addr_geoinfo(now, forwarded_for[start:])
        if gir_rem is None and 'REMOTE_ADDR' in environ:
            # IP address connecting to web server
            gir_rem = addr_geoinfo(now, environ['REMOTE_ADDR'])

    if gir_rem is None:
        return cvmfs_api.bad_request(start_response, 'remote addr not found in database')

    if '+PXYSEP+' in servers:
        # first geosort the proxies after the separator and if at least one
        # is good, sort the hosts before the separator relative to that
        # proxy rather than the client
        pxysep = servers.index('+PXYSEP+')
        # assume backup proxies will not be behind a CDN
        onegood, pxyindexes = \
            geosort_servers(now, gir_rem, servers[pxysep+1:], False)
        if onegood:
            gir_pxy = name_geoinfo(now, servers[pxysep+1+pxyindexes[0]])
            if gir_pxy is not None:
                gir_rem = gir_pxy
        onegood, hostindexes = \
            geosort_servers(now, gir_rem, servers[0:pxysep], trycdn)
        indexes = hostindexes + [pxysep+1+i for i in pxyindexes]
        # Append the index of the separator for backward compatibility,
        # so the client can always expect the same number of indexes as
        # the number of elements in the request.
        indexes.append(pxysep)
    else:
        onegood, indexes = geosort_servers(now, gir_rem, servers, trycdn)

    if not onegood:
        # return a bad request only if all the server names were bad
        return cvmfs_api.bad_request(start_response, 'no server addr found in database')

    # Indexes are 0-based internally; the response numbers servers from 1.
    response_body = ','.join(str(i+1) for i in indexes) + '\n'

    return cvmfs_api.good_request(start_response, response_body)
325 
def open_geodb
Definition: cvmfs_geo.py:15
def geosort_servers
Definition: cvmfs_geo.py:197
def good_request
Definition: cvmfs_api.py:14
def distance_on_unit_sphere
Definition: cvmfs_geo.py:75
def lookup_geoinfo
Definition: cvmfs_geo.py:44
def addr_geoinfo
Definition: cvmfs_geo.py:117
def name_geoinfo
Definition: cvmfs_geo.py:132
def bad_request
Definition: cvmfs_api.py:7