import tensorflow as tf
import tensorflow.contrib.eager as tfe

tfe.enable_eager_execution()

x = [[2.]]
m = tf.matmul(x, x)
print(m)
# The 1x1 matrix [[4.]]
# Tensor values can drive ordinary Python control flow; this prints the
# Collatz ("hailstone") sequence starting from 12.
a = tf.constant(12)
counter = 0
while not tf.equal(a, 1):
  if tf.equal(a % 2, 0):
    a = a / 2
  else:
    a = 3 * a + 1
  print(a)
def square(x):
  return tf.multiply(x, x)

grad = tfe.gradients_function(square)

print(square(3.))  # [9.]
print(grad(3.))    # [6.]
gradgrad = tfe.gradients_function(lambda x: grad(x)[0])

print(gradgrad(3.))  # [2.]
def abs(x):
  return x if x > 0. else -x

grad = tfe.gradients_function(abs)

print(grad(2.0))   # [1.]
print(grad(-2.0))  # [-1.]
def log1pexp(x):
  return tf.log(1 + tf.exp(x))

grad_log1pexp = tfe.gradients_function(log1pexp)

# The gradient computation works fine at x = 0.
print(grad_log1pexp(0.))
# [0.5]
# However it returns a `nan` at x = 100 due to numerical instability.
print(grad_log1pexp(100.))
# [nan]
@tfe.custom_gradient
def log1pexp(x):
  e = tf.exp(x)
  def grad(dy):
    return dy * (1 - 1 / (1 + e))
  return tf.log(1 + e), grad

grad_log1pexp = tfe.gradients_function(log1pexp)

# Gradient at x = 0 works as before.
print(grad_log1pexp(0.))
# [0.5]
# And now gradient computation at x = 100 works as well.
print(grad_log1pexp(100.))
# [1.0]
class MNISTModel(tfe.Network):
  def __init__(self):
    super(MNISTModel, self).__init__()
    self.layer1 = self.track_layer(tf.layers.Dense(units=10))
    self.layer2 = self.track_layer(tf.layers.Dense(units=10))

  def call(self, input):
    """Actually runs the model."""
    result = self.layer1(input)
    result = self.layer2(result)
    return result
# Let's make up a blank input image
model = MNISTModel()
batch = tf.zeros([1, 1, 784])
print(batch.shape)
# (1, 1, 784)
result = model(batch)
print(result)
# tf.Tensor([[[ 0. 0., ...., 0.]]], shape=(1, 1, 10), dtype=float32)
def loss_function(model, x, y):
  y_ = model(x)
  return tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
for (x, y) in tfe.Iterator(dataset):
  grads = tfe.implicit_gradients(loss_function)(model, x, y)
  optimizer.apply_gradients(grads)
with tf.device("/gpu:0"):
  for (x, y) in tfe.Iterator(dataset):
    optimizer.minimize(lambda: loss_function(model, x, y))
1. If we may be allowed one sentence for the experts: the primary function of OpenFermion is to encode the electronic structure problem in second quantization defined by various basis sets and active spaces, and then to transform those operators into spin Hamiltonians using various isomorphisms between qubit and fermion algebras.
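As a rough illustration of that footnote (this is not code from the original post), here is a minimal sketch using OpenFermion's FermionOperator and the Jordan-Wigner transform, one of the qubit/fermion isomorphisms it refers to; the particular operator chosen here is just an example:

from openfermion.ops import FermionOperator
from openfermion.transforms import jordan_wigner

# A single hopping term between fermionic modes 0 and 1: a^dag_0 a_1 + a^dag_1 a_0.
hopping = FermionOperator('0^ 1', 1.0) + FermionOperator('1^ 0', 1.0)

# Map the fermionic operator onto a qubit (spin) Hamiltonian.
qubit_hamiltonian = jordan_wigner(hopping)
print(qubit_hamiltonian)
# 0.5 [X0 X1] + 0.5 [Y0 Y1]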
Step 1: Generate an HDR+ image

Portrait mode starts with a picture where everything is sharp. For this we use HDR+, Google's computational photography technique for improving the quality of captured photographs, which runs on all recent Nexus/Pixel phones. It operates by capturing a burst of images that are underexposed to avoid blowing out highlights, aligning and averaging these frames to reduce noise in the shadows, and boosting these shadows in a way that preserves local contrast while judiciously reducing global contrast. The result is a picture with high dynamic range, low noise, and sharp details, even in dim lighting.

The idea of aligning and averaging frames to reduce noise has been known in astrophotography for decades. Google's implementation is a bit different, because we do it on bursts captured by a handheld camera, and we need to be careful not to produce ghosts (double images) if the photographer is not steady or if objects in the scene move. Below is an example of a scene with high dynamic range, captured using HDR+.

Photographs from the Pixel 2 without (left) and with (right) HDR+ enabled. Notice how HDR+ avoids blowing out the sky and courtyard while retaining detail in the dark arcade ceiling. Photo by Marc Levoy

Step 2: Machine learning-based foreground-background segmentation

Starting from an HDR+ picture, we next decide which pixels belong to the foreground (typically a person) and which belong to the background. This is a tricky problem, because unlike chroma keying (a.k.a. green-screening) in the movie industry, we can't assume that the background is green (or blue, or any other color). Instead, we apply machine learning.

In particular, we have trained a neural network, written in TensorFlow, that looks at the picture and produces an estimate of which pixels are people and which aren't. The specific network we use is a convolutional neural network (CNN) with skip connections. "Convolutional" means that the learned components of the network are in the form of filters (a weighted sum of the neighbors around each pixel), so you can think of the network as just filtering the image, then filtering the filtered image, and so on. The "skip connections" allow information to flow easily from the early stages of the network, where it reasons about low-level features (color and edges), to later stages, where it reasons about high-level features (faces and body parts). Combining stages like this is important when you need not just to determine whether a photo contains a person, but to identify exactly which pixels belong to that person. Our CNN was trained on almost a million pictures of people (and their hats, sunglasses, and ice cream cones). Inference to produce the mask runs on the phone using TensorFlow Mobile. Here's an example:

At left is a picture produced by our HDR+ pipeline, and at right is the smoothed output of our neural network. White parts of this mask are thought by the network to be part of the foreground, and black parts are thought to be background. Photo by Sam Kweskin

How good is this mask? Not too bad; our neural network recognizes the woman's hair and her teacup as being part of the foreground, so it can keep them sharp. If we blurred the photograph based on this mask, we would produce this image:

Synthetic shallow depth-of-field image generated using a mask.

There are several things to notice about this result. First, the amount of blur is uniform, even though the background contains objects at varying depths. Second, an SLR would also blur out the pastry on her plate (and the plate itself), since it's close to the camera. Our neural network knows the pastry isn't part of her (note that it's black in the mask image), but because it sits below her, it isn't likely to be part of the distant background either. We explicitly detect this situation and keep these pixels relatively sharp. Unfortunately, this solution isn't always correct, and in this situation we should have blurred these pixels more.
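To make the "CNN with skip connections" idea from Step 2 concrete, here is a hedged toy sketch. It is not the actual Portrait Mode network, and it uses today's tf.keras API rather than the contrib.eager style shown earlier: a miniature encoder-decoder that concatenates early, low-level features with later, high-level features before predicting a per-pixel foreground probability.

import tensorflow as tf

# Illustrative only: a tiny segmentation net with one skip connection.
inputs = tf.keras.Input(shape=(128, 128, 3))
low = tf.keras.layers.Conv2D(16, 3, padding="same", activation="relu")(inputs)   # low-level features
down = tf.keras.layers.MaxPooling2D()(low)
high = tf.keras.layers.Conv2D(32, 3, padding="same", activation="relu")(down)    # higher-level features
up = tf.keras.layers.UpSampling2D()(high)
skip = tf.keras.layers.Concatenate()([up, low])                  # the "skip connection"
mask = tf.keras.layers.Conv2D(1, 1, activation="sigmoid")(skip)  # per-pixel foreground probability
model = tf.keras.Model(inputs, mask)

A real segmentation network is far deeper and trained on the large person dataset described above, but the skip connection plays the same role: it lets edge- and color-level detail flow directly to the layers that decide exactly which pixels belong to the person.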
Step 3: From dual pixels to a depth map

To improve on this result, it helps to know the depth at each point in the scene. To compute depth we can use a stereo algorithm. The Pixel 2 doesn't have dual cameras, but it does have a technology called Phase-Detect Auto-Focus (PDAF) pixels, sometimes called dual-pixel autofocus (DPAF). That's a mouthful, but the idea is pretty simple. If one imagines splitting the (tiny) lens of the phone's rear-facing camera into two halves, the view of the world as seen through the left side of the lens and the view through the right side are slightly different. These two viewpoints are less than 1mm apart (roughly the diameter of the lens), but they're different enough to compute stereo and produce a depth map. Because of the way the camera's optics work, this is equivalent to splitting every pixel on the image sensor chip into two smaller side-by-side pixels and reading them from the chip separately, as shown here:

On the rear-facing camera of the Pixel 2, the right side of every pixel looks at the world through the left side of the lens, and the left side of every pixel looks at the world through the right side of the lens. Figure by Markus Kohlpaintner, reproduced with permission.

As the diagram shows, PDAF pixels give you views through the left and right sides of the lens in a single snapshot. Or, if you're holding your phone in portrait orientation, they give you views through the upper and lower halves of the lens. Here's what the upper image and lower image look like for our example scene (below). These images are monochrome because we only use the green pixels of our Bayer color filter sensor in our stereo algorithm, not the red or blue pixels. Having trouble telling the two images apart? Maybe the animated gif at right (below) will help. Look closely; the differences are very small indeed!

Views of our test scene through the upper half and lower half of the lens of a Pixel 2. In the animated gif at right, notice that she holds nearly still, because the camera is focused on her, while the background moves up and down. Objects in front of her, if we could see any, would move down when the background moves up (and vice versa).

PDAF technology can be found in many cameras, including SLRs, where it helps them focus faster when recording video. In our application, this technology is instead used to compute a depth map. Specifically, we use our left-side and right-side images (or top and bottom) as input to a stereo algorithm similar to that used in Google's Jump system panorama stitcher (called the Jump Assembler). This algorithm first performs subpixel-accurate tile-based alignment to produce a low-resolution depth map, then interpolates it to high resolution using a bilateral solver. This is similar to the technology formerly used in Google's Lens Blur feature.

One more detail: because the left-side and right-side views captured by the Pixel 2 camera are so close together, the depth information we get is inaccurate, especially in low light, due to the high noise in the images. To reduce this noise and improve depth accuracy, we capture a burst of left-side and right-side images, then align and average them before applying our stereo algorithm. Of course we need to be careful during this step to avoid wrong matches, just as in HDR+, or we'll get ghosts in our depth map (but that's the subject of another blog post). On the left below is a depth map generated from the example shown above using our stereo algorithm.

Left: depth map computed using stereo from the foregoing upper-half-of-lens and lower-half-of-lens images. Lighter means closer to the camera. Right: visualization of how much blur we apply to each pixel in the original. Black means don't blur at all, red denotes scene features behind the in-focus plane (which is her face), the brighter the red the more we blur, and blue denotes features in front of the in-focus plane (the pastry).
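As a hedged illustration of the kind of tile-based matching Step 3 describes (a toy stand-in, not the Jump Assembler's subpixel algorithm or its bilateral solver; all names and parameters here are invented), this sketch block-matches the upper-half-of-lens view against the lower-half view to estimate a coarse, low-resolution disparity map:

import numpy as np

def tile_disparity(upper, lower, tile=16, max_shift=4):
  """Toy block matching on two grayscale (green-channel) views.

  For each tile of the `upper` view, try a small range of vertical shifts of
  the `lower` view and keep the shift with the lowest squared error. The
  per-tile shift is a crude proxy for disparity, and hence for depth.
  """
  upper = upper.astype(np.float32)
  lower = lower.astype(np.float32)
  h, w = upper.shape
  disp = np.zeros((h // tile, w // tile), np.float32)
  for i in range(h // tile):
    for j in range(w // tile):
      ref = upper[i * tile:(i + 1) * tile, j * tile:(j + 1) * tile]
      best_shift, best_err = 0, np.inf
      for s in range(-max_shift, max_shift + 1):
        rows = np.clip(np.arange(i * tile, (i + 1) * tile) + s, 0, h - 1)
        cand = lower[rows, j * tile:(j + 1) * tile]
        err = np.sum((ref - cand) ** 2)
        if err < best_err:
          best_shift, best_err = s, err
      disp[i, j] = best_shift
  return disp

The production pipeline additionally refines these matches to subpixel precision and upsamples the low-resolution result to full resolution with an edge-aware (bilateral) solver, neither of which this sketch attempts.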
Step 4: Putting it all together to render the final image

The last step is to combine the segmentation mask we computed in step 2 with the depth map we computed in step 3 to decide how much to blur each pixel in the HDR+ picture from step 1. The way we combine the depth and mask is a bit of secret sauce, but the rough idea is that we want scene features we think belong to a person (white parts of the mask) to stay sharp, and features we think belong to the background (black parts of the mask) to be blurred in proportion to how far they are from the in-focus plane, where these distances are taken from the depth map. The red-colored image above is a visualization of how much to blur each pixel.

Actually applying the blur is conceptually the simplest part: each pixel is replaced with a translucent disk of the same color but varying size. If we composite all these disks in depth order, it's like the averaging we described earlier, and we get a nice approximation to real optical blur. One of the benefits of defocusing synthetically is that, because we're using software, we get a perfect disk-shaped bokeh without lugging around several pounds of glass camera lenses. Interestingly, in software there's no particular reason we need to stick to realism; we could make the bokeh shape anything we want!

For our example scene, here is the final portrait mode output. If you compare this result to the rightmost result in step 2, you'll see that the pastry is now slightly blurred, much as you would expect from an SLR.
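The exact way the mask and depth map are combined isn't public (it's the "secret sauce" mentioned above), but as a hedged sketch of the rough idea, one could compute a per-pixel blur radius along these lines (the function name, parameters, and scale factors are all made up for illustration):

import numpy as np

def blur_radius_map(depth, person_mask, focus_depth, max_radius=20.0):
  """Per-pixel blur radius: 0 keeps a pixel sharp, larger values blur it more.

  Pixels the mask calls "person" (mask == 1) stay sharp; everything else is
  blurred in proportion to its distance from the in-focus plane.
  """
  distance = np.abs(depth - focus_depth)
  radius = max_radius * distance / (distance.max() + 1e-6)
  return radius * (1.0 - person_mask)

Rendering would then replace each pixel with a translucent disk whose size is set by this map and composite the disks in depth order, as described above.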
Taking macro shots

If you're in portrait mode and you point the camera at a small object instead of a person (like a flower or food), then our neural network can't find a face and won't produce a useful segmentation mask. In other words, step 2 of our pipeline doesn't apply. Fortunately, we still have a depth map from PDAF data (step 3), so we can compute a shallow depth-of-field image based on the depth map alone. Because the baseline between the left and right sides of the lens is so small, this works well only for objects that are roughly less than a meter away, but for such scenes it produces nice pictures. You can think of this as a synthetic macro mode. Below are example straight and portrait mode shots of a macro-sized object, and here's an album with more macro shots, including more hard cases, like a water fountain with a thin wire fence behind it. Just be careful not to get too close; the Pixel 2 can't focus sharply on objects closer than about 10cm from the camera.

Macro picture without (left) and with (right) portrait mode. There's no person here, so background pixels are identified solely using the depth map. Photo by Marc Levoy

The selfie camera

The Pixel 2 offers portrait mode on the front-facing (selfie) camera as well as the rear-facing camera. This camera is 8Mpix instead of 12Mpix, and it doesn't have PDAF pixels, meaning that its pixels aren't split into left and right halves. In this case, step 3 of our pipeline doesn't apply, but if we can find a face, then we can still use our neural network (step 2) to produce a segmentation mask. This still lets us generate a shallow depth-of-field image, but because we don't know how far away objects are, we can't vary the amount of blur with depth. Nevertheless, the effect looks pretty good, especially for selfies shot against a cluttered background, where blurring helps suppress the clutter. Here are example straight and portrait mode selfies taken with the Pixel 2's selfie camera:

Selfie without (left) and with (right) portrait mode. The front-facing camera lacks PDAF pixels, so background pixels are identified using only machine learning. Photo by Marc Levoy
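The last two sections describe which signals drive the synthetic blur in each shooting mode. As a hedged toy summary of that case analysis (the real product logic is certainly more involved than this), it amounts to:

def blur_signals(face_mask_found, pdaf_depth_available):
  """Which inputs drive the synthetic blur, per the cases described above."""
  if face_mask_found and pdaf_depth_available:
    return "mask + depth"      # rear camera, person in the frame
  if pdaf_depth_available:
    return "depth only"        # rear camera macro shot, no person found
  if face_mask_found:
    return "mask only"         # front-facing (selfie) camera, no PDAF pixels
  return "no portrait effect"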