Table of Contents

It is worth pointing out at the very beginning that setting up live mirrors of websites like Wikipedia is strongly discouraged. As described here, live mirrors put additional strain on WMF servers. However, if the mirror is intended for private use only, here are the steps to create one on your own server. Use at your own risk.

Prerequisites

If you are using Debian Buster:

  • Run apt install nginx and everything should be set.

If you are using Arch Linux:

  • Run pacman -S nginx.
  • Create folders sites-enabled and sites-available at /etc/nginx, and add the include directive to nginx.conf.
  • Follow the instructions below to build an extra module.

Build required module (for Arch users)

Install build dependencies. Add missing entries if any.

# Compiler, make and git are needed to fetch and build the module below.
pacman -S gcc make git

Download the source code for Nginx and the substitutions filter module.

# The nginx-src package provides the Nginx source tree used in the build step (/usr/src/nginx).
pacman -S nginx-src
# Clone the substitutions filter module next to it so the relative path in ./configure resolves.
cd /usr/src && git clone https://github.com/yaoweibin/ngx_http_substitutions_filter_module.git

Compile the module. Here we build it as a dynamic module so that no binary replacement is needed.

cd /usr/src/nginx
# --with-compat builds the module in dynamic-module compatibility mode so the
# packaged nginx binary accepts it. If load_module still reports that the
# module is not binary compatible, compare against the flags from `nginx -V`.
./configure --with-compat --add-dynamic-module=../ngx_http_substitutions_filter_module
# Build only the dynamic modules; the installed nginx binary is left untouched.
make modules

Copy the compiled library…

# Place the built module where the load_module directive below expects it.
cp /usr/src/nginx/objs/ngx_http_subs_filter_module.so /usr/local/lib

…and add the load_module directive to nginx.conf.

# Must appear in the main (top-level) context of nginx.conf, before the events and http blocks.
load_module /usr/local/lib/ngx_http_subs_filter_module.so;

Obtain a certificate (optional)

Install certbot.

Run the following command to get a free wildcard certificate. Replace example.com with your domain name. Follow the prompts to modify DNS records.

# Request a wildcard certificate via a manual DNS-01 challenge.
# (The continuation lines must not start with "> "; that is only the shell's
# secondary prompt and would be parsed as an output redirection if pasted.)
certbot certonly -d "*.example.com" \
  --manual --preferred-challenges dns \
  --server https://acme-v02.api.letsencrypt.org/directory

If no errors occur, you should have the certificate at /etc/letsencrypt/live/example.com.

Configure Nginx

Begin by adding the certificate from the previous step. If your webserver serves no other domains, you can also place it inside the http block.

server {
    # ...
    # Certificate chain and private key obtained with certbot in the previous step.
    ssl_certificate /etc/letsencrypt/live/example.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/example.com/privkey.pem;
    # ...
}

A recommended SSL configuration can be found here.

Then insert the snippet that handles Wikipedia uploads.

# Mirror of upload.wikimedia.org (images and other media).
server {
    server_name upload.example.com;
    listen 443 ssl http2;
    # uncomment this if the server has ipv6 connectivity
    #listen [::]:443 ssl http2;

    # ...

    location / {
        proxy_pass https://upload.wikimedia.org;
        proxy_buffering off;
        proxy_http_version 1.1;
        # Conventional header name is X-Real-IP (hyphen, not underscore).
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header User-Agent $http_user_agent;
        # Present a Referer that points at the upstream host rather than the mirror.
        proxy_set_header referer https://$proxy_host$request_uri;
        # Rewrite cookie domains from the upstream host to this mirror.
        proxy_cookie_domain $proxy_host $server_name;
    }

    # ...
}

Next comes the server block for the main Wikipedia site…

# Mirror of the Wikipedia portal (www.wikipedia.org).
server {
    server_name www.example.com;
    # ... similar configs

    location / {
        proxy_pass https://www.wikipedia.org;
        # ... similar configs
        # Rewrite redirects that point at the upstream host back to this mirror.
        proxy_redirect https://$proxy_host https://$server_name;
        # Rewrite redirects to any Wikipedia subdomain to the matching mirror
        # subdomain. Dots are escaped so they match a literal "." only.
        proxy_redirect ~^https://([\w.]+)\.wikipedia\.org/(.*?)$ https://$1.example.com/$2;
        # Ask the upstream for uncompressed responses so subs_filter can rewrite them.
        proxy_set_header Accept-Encoding "";
        # text/html is always filtered; these are the additional MIME types.
        subs_filter_types text/css text/javascript text/xml application/json;
        subs_filter .wikipedia.org .example.com;
        subs_filter //wikipedia.org //example.com;
        subs_filter upload.wikimedia.org upload.example.com;
    }

    # ...
}

…and subdomains.

# Catch-all mirror for language subdomains (en.example.com -> en.wikipedia.org, ...).
server {
    # The named capture makes the first label available as $subdomain.
    server_name ~^(?<subdomain>[^.]+)\.example\.com$;
    # ... similar configs

    # proxy_pass below contains a variable, so nginx resolves the upstream
    # hostname at request time and requires a resolver. Replace the address
    # with your own DNS server if you prefer.
    resolver 1.1.1.1 valid=300s;

    location / {
        proxy_pass https://$subdomain.wikipedia.org;
        # ... similar configs
    }

    # ...
}

Finally, configure Nginx's local cache.

location / {
    # ...
    # Do not let the upstream's caching headers influence what nginx caches...
    proxy_ignore_headers X-Accel-Expires Expires Cache-Control;

    # ...and do not forward them to the client either.
    proxy_hide_header Cache-Control;
    proxy_hide_header Expires;
    proxy_hide_header X-Accel-Expires;

    # Send our own caching policy instead.
    add_header Cache-Control "public";
    # ...
}
http {
    # ...
    # On-disk cache under /var/tmp/nginx with a two-level directory layout,
    # a shared-memory key zone named "main" (16 MB), and eviction of entries
    # that have not been accessed for 720 hours. use_temp_path=off writes
    # temporary files directly into the cache directory.
    proxy_cache_path /var/tmp/nginx levels=1:2 use_temp_path=off keys_zone=main:16m inactive=720h;
    # ...
}
server {
    # ...
    # Use the "main" zone declared by proxy_cache_path in the http block.
    proxy_cache main;

    # How long responses are kept, by status code.
    proxy_cache_valid 404 10m;
    proxy_cache_valid 200 302 24h;

    # Let only one request populate a missing entry, revalidate expired entries
    # with conditional requests, and fall back to stale content on upstream failure.
    proxy_cache_lock on;
    proxy_cache_revalidate on;
    proxy_cache_use_stale error updating invalid_header timeout http_500 http_502 http_503 http_504;
    # ...
}

Finish

# Enable the site by linking it into sites-enabled.
ln -s /etc/nginx/sites-available/wikipedia /etc/nginx/sites-enabled
# Validate the configuration first so a typo does not take the running server down.
nginx -t && systemctl restart nginx

Example

Here is a complete configuration example.

/etc/nginx/nginx.conf:

http {
    # ... other configs go here
    # SSL
    # Only TLS 1.2 and 1.3 are enabled; the certificate is shared by every
    # server block because it is declared at the http level.
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
    ssl_certificate /etc/letsencrypt/live/example.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/example.com/privkey.pem;
    # Cache
    # Declares the "main" cache zone referenced by proxy_cache in common.conf.
    proxy_cache_path /var/tmp/nginx levels=1:2 use_temp_path=off keys_zone=main:16m inactive=720h;
}

/etc/nginx/conf.d/common.conf:

# Shared proxy settings included by every mirrored server block.

# Cache: use the "main" zone declared by proxy_cache_path in nginx.conf.
proxy_cache main;
proxy_cache_valid 200 302 24h;
proxy_cache_valid 404 10m;
proxy_cache_revalidate on;
proxy_cache_use_stale error updating invalid_header timeout http_500 http_502 http_503 http_504;
proxy_cache_lock on;

# Ignore and strip the upstream's caching headers, then send our own policy.
proxy_hide_header X-Accel-Expires;
proxy_hide_header Expires;
proxy_hide_header Cache-Control;
proxy_ignore_headers X-Accel-Expires;
proxy_ignore_headers Expires;
proxy_ignore_headers Cache-Control;
add_header Cache-Control "public";
# Ask the upstream for uncompressed responses so subs_filter can rewrite them.
proxy_set_header Accept-Encoding "";

# Buffering must stay enabled: nginx does not store responses in the cache
# when proxy_buffering is off, which would defeat the cache settings above.
proxy_buffering on;
proxy_http_version 1.1;
# Conventional header name is X-Real-IP (hyphen, not underscore).
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header User-Agent $http_user_agent;
proxy_cookie_domain $proxy_host $server_name;

/etc/nginx/conf.d/wiki-common.conf:

# Redirect and body rewriting shared by the Wikipedia server blocks.

# Rewrite redirects that point at the upstream host back to this mirror.
proxy_redirect https://$proxy_host https://$server_name;
# Rewrite redirects to any Wikipedia subdomain to the matching mirror subdomain.
# Dots are escaped so they match a literal "." only.
proxy_redirect ~^https://([\w.]+)\.wikipedia\.org/(.*?)$ https://$1.example.com/$2;
# text/html is always filtered; these are the additional MIME types.
subs_filter_types text/css text/javascript text/xml application/json;
subs_filter .wikipedia.org .example.com;
subs_filter //wikipedia.org //example.com;
subs_filter upload.wikimedia.org upload.example.com;

/etc/nginx/sites-available/wikipedia:

# Media mirror: upload.example.com -> upload.wikimedia.org
server {
    listen 443 ssl http2;
    server_name upload.example.com;

    location / {
        proxy_pass https://upload.wikimedia.org;
        include /etc/nginx/conf.d/common.conf;
        # Present a Referer that points at the upstream host rather than the mirror.
        proxy_set_header referer https://$proxy_host$request_uri;
    }
}

# Portal mirror: www.example.com -> www.wikipedia.org
server {
    listen 443 ssl http2;
    server_name www.example.com;

    location / {
        proxy_pass https://www.wikipedia.org;
        # Generic proxy/cache settings first, then the Wikipedia-specific rewrites.
        include /etc/nginx/conf.d/common.conf;
        include /etc/nginx/conf.d/wiki-common.conf;
    }
}

# Language subdomain mirror: <lang>.example.com -> <lang>.wikipedia.org
server {
    # The named capture makes the first label available as $subdomain.
    server_name ~^(?<subdomain>[^.]+)\.example\.com$;
    listen 443 ssl http2;

    # proxy_pass below contains a variable, so nginx resolves the upstream
    # hostname at request time and requires a resolver. Replace the address
    # with your own DNS server if you prefer.
    resolver 1.1.1.1 valid=300s;

    location / {
        proxy_pass https://$subdomain.wikipedia.org;
        # The file is named common.conf; including ".../common" fails at startup.
        include /etc/nginx/conf.d/common.conf;
        include /etc/nginx/conf.d/wiki-common.conf;
    }
}