Commit c0758ab8 authored by Håvard Vika Røen
Browse files

Initial

parent 6343b5ee
......@@ -3,3 +3,5 @@
################################################################################
/PubMedToCsv/bin/Debug/netcoreapp3.1
/PubMedToCsv/obj
*.suo

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30621.155
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PubMedToCsv", "PubMedToCsv\PubMedToCsv.csproj", "{69DB8E3C-19CE-404E-8642-E9DC91A64297}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{69DB8E3C-19CE-404E-8642-E9DC91A64297}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{69DB8E3C-19CE-404E-8642-E9DC91A64297}.Debug|Any CPU.Build.0 = Debug|Any CPU
{69DB8E3C-19CE-404E-8642-E9DC91A64297}.Release|Any CPU.ActiveCfg = Release|Any CPU
{69DB8E3C-19CE-404E-8642-E9DC91A64297}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {7648CFD8-9C2E-420C-8A0F-67B7A60F2FC9}
EndGlobalSection
EndGlobal
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using CommandLine;
using CsvHelper;
using HtmlAgilityPack;
using ScrapySharp.Extensions;
using ScrapySharp.Html;
using ScrapySharp.Html.Forms;
using ScrapySharp.Network;
namespace PubMedToCsv
{
/// <summary>
/// Command-line tool that submits a search to PubMed, walks every results page
/// and exports the parsed hits through <see cref="CsvHandler.SaveCsv"/>.
/// </summary>
class Program
{
    /// <summary>Address of the PubMed site; also prefixed to the relative links found in the results.</summary>
    private const string PubMedBaseUrl = "https://pubmed.ncbi.nlm.nih.gov";

    /// <summary>Number of results requested per results page (sent as the form's "size" field).</summary>
    private const int PageSize = 200;

    /// <summary>Command-line options understood by the tool.</summary>
    public class Options
    {
        /// <summary>Search term submitted to the PubMed search form (required).</summary>
        [Option('s', "search", Required = true, HelpText = "Search pubmed")]
        public string Search { get; set; }

        /// <summary>
        /// Output file path. Defaults to <c>PubMedExport_yyyyMMdd.csv</c> built from the local date;
        /// the invariant culture keeps the date Gregorian regardless of the machine's culture.
        /// </summary>
        [Option('f', "filename", Required = false, HelpText = "Output filename")]
        public string Filename { get; set; } =
            "PubMedExport_" + DateTime.Now.ToString("yyyyMMdd", CultureInfo.InvariantCulture) + ".csv";
    }

    /// <summary>
    /// Entry point: parses the arguments and runs the export when they are valid.
    /// When parsing fails nothing else is done here.
    /// </summary>
    /// <param name="args">Raw command-line arguments.</param>
    public static async Task Main(string[] args)
    {
        await Parser.Default.ParseArguments<Options>(args).MapResult(
            (Options o) => DoWork(o),
            errs => Task.CompletedTask);
    }

    /// <summary>
    /// Runs the search described by <paramref name="o"/>, collects the hits from every
    /// results page and saves them to <see cref="Options.Filename"/>.
    /// </summary>
    /// <param name="o">Parsed command-line options.</param>
    /// <exception cref="InvalidOperationException">The search form could not be found on the PubMed page.</exception>
    private static async Task DoWork(Options o)
    {
        // NOTE: the ScrapySharp calls below are the synchronous ones, so this method
        // completes synchronously even though it is declared async.
        ScrapingBrowser browser = new ScrapingBrowser();
        // Tip: set browser.UseDefaultCookiesParser = false if a website returns an invalid cookie format.
        WebPage homePage = browser.NavigateToPage(new Uri(PubMedBaseUrl));

        PageWebForm form = homePage.FindFormById("search-form");
        if (form == null)
        {
            throw new InvalidOperationException(
                "Could not find the form with id 'search-form' on " + PubMedBaseUrl);
        }

        form["term"] = o.Search;
        form["size"] = PageSize.ToString(CultureInfo.InvariantCulture);
        form.Method = HttpVerb.Get;
        WebPage resultsPage = form.Submit();

        // The total is needed to know how many pages to request. Without it there is
        // nothing to iterate over, so report that instead of failing on an empty sequence.
        HtmlNode resultCount = resultsPage.Html.CssSelect("div.results-amount span.value").FirstOrDefault();
        if (resultCount == null)
        {
            Console.WriteLine("could not find a result count on the results page; nothing exported");
            return;
        }

        // The count is rendered with thousands separators, e.g. "12,345".
        int count = Convert.ToInt32(resultCount.InnerText.Replace(",", ""), CultureInfo.InvariantCulture);

        // Integer ceiling of count / PageSize, never below one page.
        int pagecount = Math.Max(1, (count + PageSize - 1) / PageSize);
        if (pagecount > 1)
        {
            Console.WriteLine("parsing " + count + " results in " + pagecount + " pages");
        }

        List<PubMedResult> pubmedresults = new List<PubMedResult>();
        pubmedresults.AddRange(ParsePubmedResults(resultsPage));

        // The first page is already parsed; request the remaining ones by page number.
        for (int i = 2; i <= pagecount; i++)
        {
            Console.Write("\rparsing page " + i + "/" + pagecount);
            form["page"] = i.ToString(CultureInfo.InvariantCulture);
            resultsPage = form.Submit();
            pubmedresults.AddRange(ParsePubmedResults(resultsPage));
        }

        if (pagecount > 1)
        {
            // Terminate the "\r" progress line so the next message starts on its own line.
            Console.WriteLine();
        }

        CsvHandler.SaveCsv(pubmedresults, o.Filename);
        Console.WriteLine(o.Filename + " saved");
    }

    /// <summary>
    /// Extracts one <see cref="PubMedResult"/> per result entry found on a results page.
    /// </summary>
    /// <param name="resultsPage">A results page returned by the search form.</param>
    /// <returns>The parsed entries in page order; empty when the page has no entries.</returns>
    private static List<PubMedResult> ParsePubmedResults(WebPage resultsPage)
    {
        List<PubMedResult> pubmedresults = new List<PubMedResult>();

        foreach (HtmlNode divContent in resultsPage.Html.CssSelect("div.search-results-chunk.results-chunk div.docsum-content"))
        {
            HtmlNode lnk = divContent.ChildNodes.FindFirst("a");
            if (lnk == null)
            {
                // Without the title link there is neither a URL nor a title to export.
                continue;
            }

            string citation = InnerTextOrEmpty(divContent, "span.docsum-journal-citation.full-journal-citation");

            // Keep only the part starting at "doi:". When the citation carries no DOI the
            // whole citation text is kept, which is what earlier exports contained as well.
            int doiStart = citation.IndexOf("doi:", StringComparison.Ordinal);

            pubmedresults.Add(new PubMedResult
            {
                Url = PubMedBaseUrl + lnk.GetAttributeValue("href", string.Empty),
                Title = lnk.InnerText.Trim(),
                Authors = InnerTextOrEmpty(divContent, "span.docsum-authors.full-authors"),
                Doi = doiStart >= 0 ? citation.Substring(doiStart) : citation
            });
        }

        return pubmedresults;
    }

    /// <summary>
    /// Returns the trimmed inner text of the first node under <paramref name="parent"/> that matches
    /// <paramref name="selector"/>, or an empty string when nothing matches.
    /// </summary>
    private static string InnerTextOrEmpty(HtmlNode parent, string selector)
    {
        HtmlNode node = parent.CssSelect(selector).FirstOrDefault();
        return node == null ? string.Empty : node.InnerText.Trim();
    }
}
/// <summary>
/// One search hit as it is exported: a plain data holder with four text fields.
/// The property order is kept as declared because it may determine the column order of the export.
/// </summary>
public class PubMedResult
{
    /// <summary>Title text of the article.</summary>
    public string Title { get; set; }

    /// <summary>Absolute link to the article.</summary>
    public string Url { get; set; }

    /// <summary>Author list as a single string.</summary>
    public string Authors { get; set; }

    /// <summary>DOI text of the article.</summary>
    public string Doi { get; set; }
}
/// <summary>
/// Writes <see cref="PubMedResult"/> records to a delimited text file.
/// </summary>
public static class CsvHandler
{
    /// <summary>
    /// Writes all <paramref name="records"/> to the file at <paramref name="path"/>
    /// using a tab character as the field delimiter and the invariant culture.
    /// </summary>
    /// <param name="records">Records to write.</param>
    /// <param name="path">Path of the file the <see cref="StreamWriter"/> is opened on.</param>
    public static void SaveCsv(IEnumerable<PubMedResult> records, string path)
    {
        // Stacked using statements: the CSV writer is disposed first, then the underlying stream writer.
        using (StreamWriter output = new StreamWriter(path))
        using (CsvWriter tabWriter = new CsvWriter(output, CultureInfo.InvariantCulture))
        {
            tabWriter.Configuration.Delimiter = "\t";
            tabWriter.WriteRecords(records);
        }
    }
}
}
\ No newline at end of file
{
"profiles": {
"PubMedToCsv": {
"commandName": "Project",
"commandLineArgs": "--search \"covid\""
}
}
}
\ No newline at end of file
<!-- SDK-style project file: builds an executable that targets netcoreapp3.1. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<!-- Exe: the output is a runnable program rather than a class library. -->
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.1</TargetFramework>
</PropertyGroup>
<ItemGroup>
<!-- NuGet package dependencies, each pinned to an exact version. -->
<!-- NOTE(review): presumably CommandLineParser supplies the CommandLine namespace used for argument parsing; confirm. -->
<PackageReference Include="CommandLineParser" Version="2.8.0" />
<!-- The code sets csv.Configuration.Delimiter after construction; verify this still compiles before bumping CsvHelper. -->
<PackageReference Include="CsvHelper" Version="16.1.0" />
<!-- NOTE(review): HtmlAgilityPack is used by the code but not referenced here; presumably it arrives transitively through ScrapySharp. Confirm. -->
<PackageReference Include="ScrapySharp" Version="3.0.0" />
</ItemGroup>
</Project>
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment