# Prepare the host for the NVIDIA driver installer: install the compiler and
# kernel headers, blacklist the nouveau driver, and rebuild the initramfs.
kver="$(uname -r)"  # running kernel release, reused by every step below

# NOTE: if the repositories no longer carry kernel-devel for the running
# kernel, yum installs only the newest kernel-devel, which does not match the
# running kernel (see the troubleshooting notes further down).
yum -y install gcc kernel-devel "kernel-devel-uname-r == ${kver}"

# Blacklist nouveau and disable its modesetting.
printf 'blacklist nouveau\noptions nouveau modeset=0\n' > /etc/modprobe.d/blacklist.conf

# Back up the current initramfs, then regenerate it for the running kernel.
mv "/boot/initramfs-${kver}.img" "/boot/initramfs-${kver}.img.bak"
dracut "/boot/initramfs-${kver}.img" "${kver}"
驱动下载地址:https://www.geforce.cn/drivers/results/121083
sh NVIDIA-Linux-x86_64-390.59.run
[root@103 home]# nvidia-smi
Sat Jul 7 03:34:44 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.59 Driver Version: 390.59 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 108... Off | 00000000:02:00.0 Off | N/A |
| 27% 48C P0 59W / 250W | 0MiB / 11178MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 GeForce GTX 108... Off | 00000000:03:00.0 Off | N/A |
| 23% 40C P0 52W / 250W | 0MiB / 11178MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 GeForce GTX 108... Off | 00000000:83:00.0 Off | N/A |
| 23% 27C P0 56W / 250W | 0MiB / 11178MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
执行安装驱动命令时提示:
ERROR: Unable to find the kernel source tree for the currently running kernel. Please
make sure you have installed the kernel source files for your kernel and that they
are properly configured; on Red Hat Linux systems, for example, be sure you have
the 'kernel-source' or 'kernel-devel' RPM installed. If you know the correct kernel
source files are installed, you may specify the kernel source path with the
'--kernel-source-path' command line option.
按提示指定内核源码目录:
sh NVIDIA-Linux-x86_64-390.59.run --kernel-source-path=/usr/src/kernels/3.10.0-862.6.3.el7.x86_64/
接着提示出错:
ERROR: Unable to load the kernel module 'nvidia.ko'. This happens most
frequently when this kernel module was built against the wrong or
improperly configured kernel sources, with a version of gcc that differs
from the one used to build the target kernel, or if a driver such as
rivafb/nvidiafb is present and prevents the NVIDIA kernel module from
obtaining ownership of the NVIDIA graphics device(s).
Please see the log entries 'Kernel module load error' and 'Kernel
messages' at the end of the file '/var/log/nvidia-installer.log' for
more information.
原因分析:
Loaded plugins: fastestmirror
Loading mirror speeds from cached hostfile
* base: mirror.sunnyvision.com
* elrepo: hkg.mirror.rackspace.com
* extras: ftp.cuhk.edu.hk
* updates: ftp.cuhk.edu.hk
Package gcc-4.8.5-28.el7_5.1.x86_64 already installed and latest version
No package kernel-devel-uname-r == 3.10.0-327.el7.x86_64 available.
Resolving Dependencies
--> Running transaction check
---> Package kernel-devel.x86_64 0:3.10.0-862.6.3.el7 will be installed
--> Finished Dependency Resolution
Dependencies Resolved
=====================================================================================================================================================================================
Package Arch Version Repository Size
=====================================================================================================================================================================================
Installing:
kernel-devel x86_64 3.10.0-862.6.3.el7 updates 16 M
Transaction Summary
=====================================================================================================================================================================================
Install 1 Package
Total download size: 16 M
Installed size: 37 M
Downloading packages:
Delta RPMs disabled because /usr/bin/applydeltarpm not installed.
因为使用的是CentOS7.2,内核版本为:
Linux 103.215.190.172 3.10.0-327.el7.x86_64 #1 SMP Thu Nov 19 22:10:57 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux
但是源已经更新了,没有对应的内核源码包,安装的是最新的
kernel-devel.x86_64 0:3.10.0-862.6.3.el7
执行安装驱动脚本时导致找不到内核源码目录,解决办法是卸载yum安装的内核源码包,手动下载对应的rpm源码,rpm安装
# Remove the kernel-devel package that yum installed; it is the newest version
# and does not match the running kernel.
yum remove kernel-devel
# Install the manually downloaded kernel-devel RPM for kernel 3.10.0-327.el7.
rpm -ivh kernel-devel-3.10.0-327.el7.x86_64.rpm
下载地址:https://developer.nvidia.com/cuda-downloads
选择runfile(rpm不知为何老是装不上)。
执行
# Mark the downloaded CUDA runfile as executable, then run it.
chmod +x cuda_9.2.88_396.26_linux.run
sh cuda_9.2.88_396.26_linux.run
# Build and run the deviceQuery sample to check whether CUDA detects the GPUs.
# NOTE(review): this path points at a cuda-8.0 install while the runfile above
# is 9.2; adjust it to the CUDA version actually installed.
cd /usr/local/cuda-8.0/samples/1_Utilities/deviceQuery
sudo make
./deviceQuery
输出:
测试时出现:
[root@103 deviceQuery]# ./deviceQuery
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
cudaGetDeviceCount returned 38
-> no CUDA-capable device is detected
Result = FAIL
原因分析:
开始安装的是cuda8.0,估计版本比较旧(与驱动版本相比),后来安装了cuda9.0(最新9.2,但是tensorflow不支持),问题解决
注意一定要下载cuda对应版本的cudnn
下载地址:https://developer.nvidia.com/rdp/cudnn-download
# Unpack the cuDNN archive; the commands below expect it to extract into ./cuda.
tar xvf cudnn-9.2-linux-x64-v7.1.tar
# Copy the header and the libraries into the CUDA installation.
# -d keeps the libcudnn symlinks as symlinks instead of copying their targets.
sudo cp cuda/include/cudnn.h /usr/local/cuda/include
sudo cp -d cuda/lib64/libcudnn* /usr/local/cuda/lib64/
# Make the copied files readable by all users.
sudo chmod a+r /usr/local/cuda/include/cudnn.h
sudo chmod a+r /usr/local/cuda/lib64/libcudnn*
pip install --upgrade https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
import tensorflow as tf
import numpy as np

# Smoke test for the TensorFlow install: fit a plane to synthetic data with
# gradient descent and print the parameters every 20 steps.

# Generate phony data with NumPy: 100 points in total.
x_data = np.float32(np.random.rand(2, 100))  # random input
y_data = np.dot([0.100, 0.200], x_data) + 0.300

# Build a linear model: y = W * x + b.
b = tf.Variable(tf.zeros([1]))
W = tf.Variable(tf.random_uniform([1, 2], -1.0, 1.0))
y = tf.matmul(W, x_data) + b

# Minimize the mean squared error.
loss = tf.reduce_mean(tf.square(y - y_data))
optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(loss)

# Initialize the variables (replaces the deprecated initialize_all_variables).
init = tf.global_variables_initializer()

# Launch the graph; the context manager closes the session on exit.
with tf.Session() as sess:
    sess.run(init)

    # Fit the plane.
    for step in range(0, 201):
        sess.run(train)
        if step % 20 == 0:
            # A single formatted string prints identically on Python 2 and 3.
            print("%s %s %s" % (step, sess.run(W), sess.run(b)))

# Expected best fit: W: [[0.100 0.200]], b: [0.300]
输出:
[root@103 home]# python test.py
Traceback (most recent call last):
File "test.py", line 1, in
import tensorflow as tf
File "/usr/lib/python2.7/site-packages/tensorflow/__init__.py", line 24, in
from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import
File "/usr/lib/python2.7/site-packages/tensorflow/python/__init__.py", line 49, in
from tensorflow.python import pywrap_tensorflow
File "/usr/lib/python2.7/site-packages/tensorflow/python/pywrap_tensorflow.py", line 74, in
raise ImportError(msg)
ImportError: Traceback (most recent call last):
File "/usr/lib/python2.7/site-packages/tensorflow/python/pywrap_tensorflow.py", line 58, in
from tensorflow.python.pywrap_tensorflow_internal import *
File "/usr/lib/python2.7/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 28, in
_pywrap_tensorflow_internal = swig_import_helper()
File "/usr/lib/python2.7/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 24, in swig_import_helper
_mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)
ImportError: libcublas.so.9.0: cannot open shared object file: No such file or directory
卸载numpy
ValueError: numpy.dtype has the wrong size, try recompiling
再次安装:
pip install --upgrade https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl --ignore-installed six