CernVM-FS  2.9.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
cvmfs_geo.py
Go to the documentation of this file.
1 import math
2 import string
3 import os
4 import re
5 import bisect
6 import socket
7 import cvmfs_api
8 import time
9 import threading
10 
# maxminddb is imported lazily, on the first call to open_geodb(), and
# only once, because the package is not available in the unit test.
maxminddb = None


# Open the geo database file `dbname` and return whatever
# maxminddb.open_database() returns for it.  The first call binds the
# module-level name `maxminddb` to the imported package.
def open_geodb(dbname):
    global maxminddb
    if maxminddb is not None:
        # package already imported by an earlier call
        return maxminddb.open_database(dbname)
    import maxminddb
    return maxminddb.open_database(dbname)
19 
# Path of the geo database file, plus the state lookup_geoinfo() uses to
# notice that the file changed and reopen it.
gidb = "/var/lib/cvmfs-server/geo/GeoLite2-City.mmdb"
gireader = None      # reader currently used for lookups
oldgireader = None   # previous reader, closed at a later check
gichecktime = 0      # `now` value of the most recent check of the file
gimodtime = 0        # st_mtime of the file when it was last opened

# Lifetime in seconds of a geo_cache entry; also the minimum interval
# between checks of the database file.
geo_cache_secs = 5 * 60  # 5 minutes

# Maximum number of geo_cache entries:
# a ridiculously large but manageable number
geo_cache_max_entries = 100000
# Count of name lookups that were not answered from a fresh cache entry.
namelookups = 0

# geo_cache entries are indexed by name and contain a tuple of
# (update time, geo record).  Caching DNS lookups is more important
# than caching geo information but it's simpler and slightly more
# efficient to cache the geo information.
geo_cache = {}

# gilock is held while the database reader is checked and reopened;
# namelock is held while geo_cache and namelookups are read or updated.
gilock = threading.Lock()
namelock = threading.Lock()
39 
# Look up geo info for an address.
# Also periodically check for an update to the database and
# reopen it if it changed.
#   now  - current time in seconds; compared against the time of the
#          last database check
#   addr - address string, passed unchanged to the reader's get()
# Returns whatever the reader's get() returns for addr.
# An exception from os.stat() or from opening the database (for example
# when the file is missing) is not handled here and propagates.
def lookup_geoinfo(now, addr):
    global gireader, oldgireader
    global gichecktime
    global gimodtime

    # This first test is done without the lock so that most lookups
    # never have to acquire it.
    if gireader is None or now > gichecktime + geo_cache_secs:
        # "with" releases the lock even if stat or open raises
        with gilock:
            # gichecktime might have changed before acquiring the lock,
            # look again
            if gireader is None or now > gichecktime + geo_cache_secs:
                if oldgireader is not None:
                    # By now we're sure nobody is still using the previous
                    # gireader, so close it.  This delay avoids having to
                    # acquire the lock for every lookup.
                    oldgireader.close()
                    oldgireader = None
                    print('cvmfs_geo: closed old ' + gidb)
                gichecktime = now
                modtime = os.stat(gidb).st_mtime
                if modtime != gimodtime:
                    # database was modified, reopen it; the previous
                    # reader is kept open until a later check
                    oldgireader = gireader
                    gireader = open_geodb(gidb)
                    gimodtime = modtime
                    print('cvmfs_geo: opened ' + gidb)

    return gireader.get(addr)
72 
# function came from http://www.johndcook.com/python_longitude_latitude.html
#
# Return the arc length, in radians, between two points on a unit sphere.
#   lat1, long1 - latitude and longitude of the first point, in degrees
#   lat2, long2 - latitude and longitude of the second point, in degrees
# Multiply the result by the radius of the earth in your favorite set of
# units to get a length.
def distance_on_unit_sphere(lat1, long1, lat2, long2):

    if (lat1 == lat2) and (long1 == long2):
        return 0.0

    # Convert latitude and longitude to
    # spherical coordinates in radians.
    degrees_to_radians = math.pi/180.0

    # phi = 90 - latitude
    phi1 = (90.0 - lat1)*degrees_to_radians
    phi2 = (90.0 - lat2)*degrees_to_radians

    # theta = longitude
    theta1 = long1*degrees_to_radians
    theta2 = long2*degrees_to_radians

    # Compute spherical distance from spherical coordinates.

    # For two locations in spherical coordinates
    # (1, theta, phi) and (1, theta', phi')
    # cosine( arc length ) =
    #    sin phi sin phi' cos(theta-theta') + cos phi cos phi'
    # distance = rho * arc length

    cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) +
           math.cos(phi1)*math.cos(phi2))
    # Floating point rounding can leave the sum marginally outside
    # [-1, 1] when the points are almost identical or almost antipodal,
    # and math.acos() raises ValueError for such input, so clamp first.
    cos = max(-1.0, min(1.0, cos))

    return math.acos(cos)
106 
# Pattern including all allowed characters in addresses.
# The geoip api functions will further validate, but for paranoia's sake
# (because I'm not sure how good the functions' error checking is), make
# sure the names are limited to valid hostname characters.
# Include ':' for IPv6 addresses.
# The end is anchored with \Z rather than $ because $ also matches just
# before a trailing newline, which would let "name\n" through.
addr_pattern = re.compile(r'^[0-9a-zA-Z.:-]*\Z')


# Look up geo info for IPv4 or IPv6 address.
#   now  - current time in seconds, handed on to lookup_geoinfo()
#   addr - address string; it may come straight from request headers
# Returns the 'location' entry of the database record, or None if the
# address is too long, contains disallowed characters, is not accepted
# by the database reader, or does not exist in the DB.
def addr_geoinfo(now, addr):
    if (len(addr) > 256) or not addr_pattern.search(addr):
        return None

    try:
        response = lookup_geoinfo(now, addr)
    except ValueError:
        # addr_pattern also admits strings that are not IP addresses
        # (a host name, or "unknown" in a forwarding header).
        # NOTE(review): maxminddb's get() is expected to raise
        # ValueError for those -- confirm against the installed version.
        return None
    if response is None:
        return None

    # A record without a 'location' entry is treated as not found
    # instead of raising KeyError.
    return response.get('location')
125 
# Look up geo info by name.  Try IPv4 first since that DB is
# better and most servers today are dual stack if they have IPv6.
# Store results in a cache.  Wsgi is multithreaded so need to lock
# accesses to the shared cache.
#   now  - current time in seconds, used for cache expiry
#   name - host name (or address) to resolve
# Return geo info record or None if none found.
def name_geoinfo(now, name):
    global geo_cache
    global namelookups
    if (len(name) > 256) or not addr_pattern.search(name):
        return None

    # "with" guarantees the lock is released on every path out
    with namelock:
        if name in geo_cache:
            (stamp, gir) = geo_cache[name]
            if now <= stamp + geo_cache_secs:
                # still good, use it
                return gir
            # update the timestamp so only one thread needs to wait
            # when a lookup is slow
            geo_cache[name] = (now, gir)
        elif len(geo_cache) >= geo_cache_max_entries:
            # avoid denial of service by removing one arbitrary entry
            # before we add one
            geo_cache.popitem()
        namelookups += 1

    # The DNS lookup is done without holding the lock.
    try:
        ai = socket.getaddrinfo(name, 80, 0, 0, socket.IPPROTO_TCP)
    except (socket.error, UnicodeError):
        # Resolution failed, or the name cannot be encoded for DNS;
        # treat both as "no addresses".  Unlike a bare except this lets
        # KeyboardInterrupt and SystemExit through.
        ai = ()

    gir = None
    # look for IPv4 address first, then for an IPv6 address if no IPv4
    # record was found; only the first address of each family is tried
    for family in (socket.AF_INET, socket.AF_INET6):
        for info in ai:
            if info[0] == family:
                gir = lookup_geoinfo(now, info[4][0])
                break
        if gir is not None:
            break
    if gir is not None:
        # a record without a 'location' entry counts as not found
        gir = gir.get('location')

    with namelock:
        if gir is None and name in geo_cache:
            # reuse expired entry
            gir = geo_cache[name][1]

        geo_cache[name] = (now, gir)

    return gir
183 
# Order a list of servers by geographic distance from gir_rem.
#   now     - current time in seconds, handed on to name_geoinfo()
#   gir_rem - geo record (with 'latitude' and 'longitude') to measure from
#   servers - list of N server names
#   trycdn  - if True, first try the name with "ip." prepended to get the
#             real IP address instead of a Content Delivery Network
#             front end, and fall back to the plain name
# Returns the list [onegood, indexes] where
#   onegood - True if a geolocation was found for at least one server
#   indexes - the numbers 0 to N-1 ordered from the geographically
#             closest server to the furthest; servers without a
#             geolocation go on the end, and servers at equal distance
#             keep the order they were given in
def geosort_servers(now, gir_rem, servers, trycdn=False):
    found_any = False
    distances = []

    for hostname in servers:
        location = name_geoinfo(now, "ip." + hostname) if trycdn else None
        if location is None:
            location = name_geoinfo(now, hostname)

        if location is None:
            # no geolocation: infinitely far away
            distances.append(float("inf"))
            continue

        found_any = True
        distances.append(distance_on_unit_sphere(gir_rem['latitude'],
                                                 gir_rem['longitude'],
                                                 location['latitude'],
                                                 location['longitude']))

    # sorted() is stable, so equal distances stay in request order
    order = sorted(range(len(distances)), key=distances.__getitem__)

    return [found_any, order]
227 
# expected geo api URL:  /cvmfs/<repo_name>/api/v<version>/geo/<path_info>
#   <repo_name> is repository name
#   <version> is the api version number, typically "1.0"
#   <path_info> is <caching_string>/<serverlist>
#     <caching_string> can be anything to assist in ensuring that those
#       clients wanting the same answer get responses cached together;
#       typically the name of their shared proxy.  If this resolves to
#       a valid IP address, attempt to use that address as the source
#       IP rather than the address seen by the web server.  The reason for
#       that is so it isn't possible for someone to poison a cache by
#       using a name for someone else's proxy.
#     <serverlist> is a comma-separated list of N server names
# response: a comma-separated list of numbers specifying the order of the N
#    given servers numbered 1 to N from geographically closest to furthest
#    away from the requester that initiated the connection (the requester
#    is typically the proxy)
#
# Parameters:
#   path_info      - the <path_info> part of the URL
#   repo_name      - repository name (not used here)
#   version        - api version (not used here)
#   start_response - passed on to the cvmfs_api response helpers
#   environ        - request environment; HTTP_CF_CONNECTING_IP,
#                    HTTP_X_FORWARDED_FOR and REMOTE_ADDR are read from it
# Returns the value of cvmfs_api.good_request() or cvmfs_api.bad_request().

def api(path_info, repo_name, version, start_response, environ):

    slash = path_info.find('/')
    if slash == -1:
        return cvmfs_api.bad_request(start_response, 'no slash in geo path')

    caching_string = path_info[0:slash]
    servers = path_info[slash+1:].split(",")

    if caching_string == "_namelookups_":
        # this is a special debugging URL
        return cvmfs_api.good_request(start_response, str(namelookups) + '\n')

    # TODO(jblomer): Can this be switched to monotonic time?
    now = int(time.time())

    # If the request is coming from Cloudflare Content Delivery Network,
    # servers are probably using Cloudflare too.
    trycdn = 'HTTP_CF_CONNECTING_IP' in environ

    gir_rem = None
    # str.find() returns -1 (which is true) when there is no match, so
    # test for membership to skip strings that cannot be a FQDN.
    if '.' in caching_string:
        # might be a FQDN, use it if it resolves to a geo record
        gir_rem = name_geoinfo(now, caching_string)

    if gir_rem is None:
        if 'HTTP_CF_CONNECTING_IP' in environ:
            # IP address of client connecting to Cloudflare
            gir_rem = addr_geoinfo(now, environ['HTTP_CF_CONNECTING_IP'])
        if gir_rem is None and 'HTTP_X_FORWARDED_FOR' in environ:
            # List of IP addresses forwarded through squid
            # Try the last IP, in case there's a reverse proxy squid
            #  in front of the web server.
            # Entries may be separated by commas, spaces or both; empty
            # entries (trailing separator, empty header) are skipped.
            hops = environ['HTTP_X_FORWARDED_FOR'].replace(',', ' ').split()
            if hops:
                gir_rem = addr_geoinfo(now, hops[-1])
        if gir_rem is None and 'REMOTE_ADDR' in environ:
            # IP address connecting to web server
            gir_rem = addr_geoinfo(now, environ['REMOTE_ADDR'])

    if gir_rem is None:
        return cvmfs_api.bad_request(start_response, 'remote addr not found in database')

    if '+PXYSEP+' in servers:
        # first geosort the proxies after the separator and if at least one
        # is good, sort the hosts before the separator relative to that
        # proxy rather than the client
        pxysep = servers.index('+PXYSEP+')
        # assume backup proxies will not be behind a CDN
        onegood, pxyindexes = \
            geosort_servers(now, gir_rem, servers[pxysep+1:], False)
        if onegood:
            gir_pxy = name_geoinfo(now, servers[pxysep+1+pxyindexes[0]])
            if gir_pxy is not None:
                gir_rem = gir_pxy
        # onegood now reflects the hosts only
        onegood, hostindexes = \
            geosort_servers(now, gir_rem, servers[0:pxysep], trycdn)
        indexes = hostindexes + [pxysep+1+i for i in pxyindexes]
        # Append the index of the separator for backward compatibility,
        # so the client can always expect the same number of indexes as
        # the number of elements in the request.
        indexes.append(pxysep)
    else:
        onegood, indexes = geosort_servers(now, gir_rem, servers, trycdn)

    if not onegood:
        # return a bad request only if all the server names were bad
        return cvmfs_api.bad_request(start_response, 'no server addr found in database')

    # indexes are 0-based internally, the response is numbered from 1
    response_body = ','.join(str(i+1) for i in indexes) + '\n'

    return cvmfs_api.good_request(start_response, response_body)
321 
def open_geodb
Definition: cvmfs_geo.py:14
def geosort_servers
Definition: cvmfs_geo.py:193
def good_request
Definition: cvmfs_api.py:14
def distance_on_unit_sphere
Definition: cvmfs_geo.py:74
def lookup_geoinfo
Definition: cvmfs_geo.py:43
def addr_geoinfo
Definition: cvmfs_geo.py:116
def name_geoinfo
Definition: cvmfs_geo.py:131
def bad_request
Definition: cvmfs_api.py:7