CernVM-FS  2.11.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
cvmfs_geo.py
Go to the documentation of this file.
1 from __future__ import print_function
2 
3 import math
4 import os
5 import re
6 import bisect
7 import socket
8 import cvmfs_api
9 import time
10 import threading
11 
# Open the geodb.  maxminddb is imported lazily, and only once, because
# the module is not available in the unit test.
maxminddb = None


def open_geodb(dbname):
    """Open the MaxMind database file `dbname` and return its reader.

    The first call imports the maxminddb module and stores it in the
    module-level name `maxminddb`; later calls reuse that module.
    """
    global maxminddb
    if maxminddb is None:
        # 'global' above makes this import bind the module-level name
        import maxminddb
    reader = maxminddb.open_database(dbname)
    return reader
20 
# Path of the geo database and the bookkeeping used by lookup_geoinfo():
# the reader in use, the previous reader waiting to be closed, the time
# of the last modification check and the file mtime seen when the
# current reader was opened.
gidb = "/var/lib/cvmfs-server/geo/GeoLite2-City.mmdb"
gireader = oldgireader = None
gichecktime = gimodtime = 0

# How long a cached name lookup stays fresh; also the interval between
# checks for a modified database file.
geo_cache_secs = 5 * 60  # 5 minutes

geo_cache_max_entries = 100000  # a ridiculously large but manageable number
# Counter of name lookups that were not answered from the cache.
namelookups = 0

# geo_cache entries are indexed by name and contain a tuple of
# (update time, geo record).  Caching DNS lookups is more important
# than caching geo information but it's simpler and slightly more
# efficient to cache the geo information.
geo_cache = dict()

# gilock guards reopening of the database reader, namelock guards
# geo_cache and namelookups.
gilock = threading.Lock()
namelock = threading.Lock()
40 
41 # look up geo info for an address
42 # Also periodically check for an update to the database and
43 # reopen it if it changed
def lookup_geoinfo(now, addr):
    """Return the database reader's record for `addr`.

    now  -- current time in seconds, used to decide when to re-check
            the database file
    addr -- address string handed unchanged to the reader's get()

    When no reader is open yet, or more than geo_cache_secs have passed
    since the last check, the modification time of `gidb` is compared
    with the one recorded at the last open and the database is reopened
    if it differs.
    """
    global gireader, oldgireader
    global gichecktime
    global gimodtime

    def check_is_due():
        return gireader is None or now > gichecktime + geo_cache_secs

    if check_is_due():
        with gilock:
            # gichecktime might have changed while waiting for the lock,
            # so test again now that we hold it
            if check_is_due():
                if oldgireader is not None:
                    # A full check interval has passed since this reader
                    # was replaced, so nobody should still be using it.
                    # Closing it late avoids taking the lock on every
                    # lookup.
                    oldgireader.close()
                    oldgireader = None
                    print('cvmfs_geo: closed old ' + gidb)
                gichecktime = now
                current_mtime = os.stat(gidb).st_mtime
                if current_mtime != gimodtime:
                    # the file changed on disk: switch to a fresh reader
                    # and keep the previous one around until the next check
                    oldgireader = gireader
                    gireader = open_geodb(gidb)
                    gimodtime = current_mtime
                    print('cvmfs_geo: opened ' + gidb)

    return gireader.get(addr)
73 
74 # function came from http://www.johndcook.com/python_longitude_latitude.html
def distance_on_unit_sphere(lat1, long1, lat2, long2):
    """Return the great-circle arc between two points on a unit sphere.

    lat1, long1 -- latitude and longitude of the first point, in degrees
    lat2, long2 -- latitude and longitude of the second point, in degrees

    The result is the arc length in radians (0.0 for identical
    coordinates); multiply by a sphere radius to get a distance.
    """
    if (lat1 == lat2) and (long1 == long2):
        return 0.0

    # Convert latitude and longitude to
    # spherical coordinates in radians.
    degrees_to_radians = math.pi/180.0

    # phi = 90 - latitude
    phi1 = (90.0 - lat1)*degrees_to_radians
    phi2 = (90.0 - lat2)*degrees_to_radians

    # theta = longitude
    theta1 = long1*degrees_to_radians
    theta2 = long2*degrees_to_radians

    # For two locations in spherical coordinates
    # (1, theta, phi) and (1, theta', phi')
    # cosine( arc length ) =
    #    sin phi sin phi' cos(theta-theta') + cos phi cos phi'
    # distance = rho * arc length
    cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) +
           math.cos(phi1)*math.cos(phi2))

    # Rounding can leave the cosine a hair outside [-1, 1] for points
    # that are almost identical or almost antipodal, and math.acos()
    # raises ValueError for such values, so clamp first.
    cos = max(-1.0, min(1.0, cos))

    return math.acos(cos)
107 
108 # Pattern including all allowed characters in addresses.
109 # The geoip api functions will further validate, but for paranoia's sake
110 # (because I'm not sure how good the functions' error checking is), make
111 # sure the names are limited to valid hostname characters.
112 # Include ':' for IPv6 addresses.
# '\Z' rather than '$' so that a trailing newline is rejected too.
addr_pattern = re.compile(r'^[0-9a-zA-Z.:-]*\Z')


# Look up geo info for IPv4 or IPv6 address.
# Will return None if the address does not exist in the DB.
def addr_geoinfo(now, addr):
    """Return the 'location' entry of the geo record for `addr`, or None.

    now  -- current time in seconds, passed through to lookup_geoinfo()
    addr -- address string, typically taken from a request header

    None is returned when `addr` is empty, longer than 256 characters,
    contains characters outside addr_pattern, is rejected by the
    database reader, or has no record in the database.
    """
    if not addr or (len(addr) > 256) or not addr_pattern.search(addr):
        return None

    try:
        response = lookup_geoinfo(now, addr)
    except ValueError:
        # The value comes from request headers and may not be an IP
        # address at all; maxminddb reports that with ValueError
        # (NOTE(review): per the maxminddb Reader.get documentation).
        # Treat it like an address that is not in the database.
        return None

    if response is None:
        return None

    return response['location']
126 
127 # Look up geo info by name. Try IPv4 first since that DB is
128 # better and most servers today are dual stack if they have IPv6.
129 # Store results in a cache. Wsgi is multithreaded so need to lock
130 # accesses to the shared cache.
131 # Return geo info record or None if none found.
def name_geoinfo(now, name):
    """Resolve `name` and return the 'location' entry of its geo record.

    now  -- current time in seconds; used for cache expiry and passed
            on to lookup_geoinfo()
    name -- host name or address string

    Returns None when the name is longer than 256 characters, contains
    characters outside addr_pattern, or no geo record is found and no
    earlier result is cached.  Results, including None, are cached in
    geo_cache for geo_cache_secs seconds.
    """
    global geo_cache
    global namelookups

    if (len(name) > 256) or not addr_pattern.search(name):
        return None

    with namelock:
        if name in geo_cache:
            (stamp, gir) = geo_cache[name]
            if now <= stamp + geo_cache_secs:
                # still good, use it
                return gir
            # update the timestamp so only one thread needs to wait
            # when a lookup is slow
            geo_cache[name] = (now, gir)
        elif len(geo_cache) >= geo_cache_max_entries:
            # avoid denial of service by removing one arbitrary entry
            # before we add one
            geo_cache.popitem()
        namelookups += 1

    # The DNS lookup is done without holding the lock because it can be slow.
    ai = ()
    try:
        ai = socket.getaddrinfo(name, 80, 0, 0, socket.IPPROTO_TCP)
    except Exception:
        # Best effort: a name that cannot be resolved simply has no geo
        # record.  Unlike a bare 'except:', this lets KeyboardInterrupt
        # and SystemExit propagate.
        pass

    # Use the first IPv4 address; only if that gives no record fall
    # back to the first IPv6 address.
    gir = None
    for family in (socket.AF_INET, socket.AF_INET6):
        for info in ai:
            if info[0] == family:
                gir = lookup_geoinfo(now, info[4][0])
                break
        if gir is not None:
            break

    if gir is not None:
        gir = gir['location']

    with namelock:
        if gir is None and name in geo_cache:
            # reuse expired entry
            gir = geo_cache[name][1]

        geo_cache[name] = (now, gir)

    return gir
184 
185 # geo-sort list of servers relative to gir_rem
186 # If trycdn is True, first try prepending "ip." to the name to get the
187 # real IP address instead of a Content Delivery Network front end.
188 # return list of [onegood, indexes] where
189 # onegood - a boolean saying whether or not there was at least
190 # one valid looked up geolocation from the servers
191 # indexes - list of numbers specifying the order of the N given servers
192 # servers numbered 0 to N-1 from geographically closest to furthest
193 # away compared to gir_rem
def geosort_servers(now, gir_rem, servers, trycdn=False):
    """Order `servers` by distance from the location `gir_rem`.

    now     -- current time in seconds, passed to name_geoinfo()
    gir_rem -- mapping with 'latitude' and 'longitude' of the reference
               point
    servers -- sequence of server names
    trycdn  -- when true, "ip." + name is looked up first and the plain
               name is used only if that yields nothing

    Returns [onegood, indexes]: onegood tells whether at least one
    server had a geo record, indexes lists the 0-based positions of the
    servers from closest to furthest.  Servers without a geo record go
    to the end.
    """
    sorted_arcs = []
    indexes = []
    onegood = False

    for position, server in enumerate(servers):
        gir_server = name_geoinfo(now, "ip." + server) if trycdn else None
        if gir_server is None:
            gir_server = name_geoinfo(now, server)

        if gir_server is not None:
            onegood = True
            arc = distance_on_unit_sphere(gir_rem['latitude'],
                                          gir_rem['longitude'],
                                          gir_server['latitude'],
                                          gir_server['longitude'])
        else:
            # unknown location: infinitely far away, so it sorts last
            arc = float("inf")

        # Insert to the right of equal arcs so that servers at the same
        # distance keep their relative order from the request.
        slot = bisect.bisect(sorted_arcs, arc)
        sorted_arcs.insert(slot, arc)
        indexes.insert(slot, position)

    return [onegood, indexes]
228 
229 # expected geo api URL: /cvmfs/<repo_name>/api/v<version>/geo/<path_info>
230 # <repo_name> is repository name
231 # <version> is the api version number, typically "1.0"
232 # <path_info> is <caching_string>/<serverlist>
233 # <caching_string> can be anything to assist in ensuring that those
234 # clients wanting the same answer get responses cached together;
235 # typically the name of their shared proxy. If this resolves to
236 # a valid IP address, attempt to use that address as the source
237 # IP rather than the address seen by the web server. The reason for
238 # that is so it isn't possible for someone to poison a cache by
239 # using a name for someone else's proxy.
240 # <serverlist> is a comma-separated list of N server names
241 # response: a comma-separated list of numbers specifying the order of the N
242 # given servers numbered 1 to N from geographically closest to furthest
243 # away from the requester that initiated the connection (the requester
244 # is typically the proxy)
245 
def api(path_info, repo_name, version, start_response, environ):
    """Answer a geo API request.

    path_info      -- "<caching_string>/<serverlist>" part of the URL
    repo_name      -- repository name (not used by this function)
    version        -- api version (not used by this function)
    start_response -- WSGI start_response callable, handed to cvmfs_api
    environ        -- WSGI environment; the HTTP_CF_CONNECTING_IP,
                      HTTP_X_FORWARDED_FOR and REMOTE_ADDR keys are read

    Returns the result of cvmfs_api.good_request() with a
    comma-separated list of 1-based server indexes ordered from closest
    to furthest, or of cvmfs_api.bad_request() when the path has no
    slash, the requester cannot be located, or none of the servers can
    be located.
    """
    slash = path_info.find('/')
    if slash == -1:
        return cvmfs_api.bad_request(start_response, 'no slash in geo path')

    caching_string = path_info[0:slash]
    servers = path_info[slash+1:].split(',')

    if caching_string == "_namelookups_":
        # this is a special debugging URL
        return cvmfs_api.good_request(start_response, str(namelookups) + '\n')

    # TODO(jblomer): Can this be switched to monotonic time?
    now = int(time.time())

    # If the request is coming through the Cloudflare Content Delivery
    # Network, the servers are probably using Cloudflare too.
    trycdn = 'HTTP_CF_CONNECTING_IP' in environ

    gir_rem = None
    # str.find() returns -1 (which is true) when there is no match, so
    # test membership instead.  ':' is accepted as well so that an IPv6
    # literal is still looked up.
    if '.' in caching_string or ':' in caching_string:
        # might be a FQDN, use it if it resolves to a geo record
        gir_rem = name_geoinfo(now, caching_string)

    if gir_rem is None:
        if 'HTTP_CF_CONNECTING_IP' in environ:
            # IP address of client connecting to Cloudflare
            gir_rem = addr_geoinfo(now, environ['HTTP_CF_CONNECTING_IP'])
        if gir_rem is None and 'HTTP_X_FORWARDED_FOR' in environ:
            # List of IP addresses forwarded through squid
            # Try the last IP, in case there's a reverse proxy squid
            # in front of the web server.  Entries may be separated by
            # commas and/or blanks; splitting on both also copes with
            # trailing separators.
            hops = environ['HTTP_X_FORWARDED_FOR'].replace(',', ' ').split()
            if hops:
                gir_rem = addr_geoinfo(now, hops[-1])
        if gir_rem is None and 'REMOTE_ADDR' in environ:
            # IP address connecting to web server
            gir_rem = addr_geoinfo(now, environ['REMOTE_ADDR'])

    if gir_rem is None:
        return cvmfs_api.bad_request(start_response, 'remote addr not found in database')

    if '+PXYSEP+' in servers:
        # first geosort the proxies after the separator and if at least one
        # is good, sort the hosts before the separator relative to that
        # proxy rather than the client
        pxysep = servers.index('+PXYSEP+')
        # assume backup proxies will not be behind a CDN
        onegood, pxyindexes = \
            geosort_servers(now, gir_rem, servers[pxysep+1:], False)
        if onegood:
            gir_pxy = name_geoinfo(now, servers[pxysep+1+pxyindexes[0]])
            if gir_pxy is not None:
                gir_rem = gir_pxy
        onegood, hostindexes = \
            geosort_servers(now, gir_rem, servers[0:pxysep], trycdn)
        indexes = hostindexes + [pxysep+1+i for i in pxyindexes]
        # Append the index of the separator for backward compatibility,
        # so the client can always expect the same number of indexes as
        # the number of elements in the request.
        indexes.append(pxysep)
    else:
        onegood, indexes = geosort_servers(now, gir_rem, servers, trycdn)

    if not onegood:
        # return a bad request only if all the server names were bad
        return cvmfs_api.bad_request(start_response, 'no server addr found in database')

    response_body = ','.join(str(i+1) for i in indexes) + '\n'

    return cvmfs_api.good_request(start_response, response_body)
322 
def open_geodb
Definition: cvmfs_geo.py:15
def geosort_servers
Definition: cvmfs_geo.py:194
def good_request
Definition: cvmfs_api.py:14
def distance_on_unit_sphere
Definition: cvmfs_geo.py:75
def lookup_geoinfo
Definition: cvmfs_geo.py:44
def addr_geoinfo
Definition: cvmfs_geo.py:117
def name_geoinfo
Definition: cvmfs_geo.py:132
def bad_request
Definition: cvmfs_api.py:7